In [7]:
# Data manipulation and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (
    mean_squared_error,
    r2_score,
    accuracy_score,
    classification_report,
)
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

# Artificial Neural Networks
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Explainability tools
import shap
from lime.lime_tabular import LimeTabularExplainer

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings("ignore")

# For Jupyter notebook
%matplotlib inline


In [8]:
# Read the housing CSV (path is relative to the notebook's working directory)
housing = pd.read_csv('datasets/housing.csv')

# Preview the top five rows as this cell's output
housing.head(n=5)


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,,36.2


In [9]:
# Choose the target column: prefer 'MEDV', then 'Price'; otherwise ask the user.
if 'MEDV' in housing.columns:
    target_column = 'MEDV'
elif 'Price' in housing.columns:
    target_column = 'Price'
else:
    print("Available columns:", housing.columns.tolist())
    # Strip surrounding whitespace so a valid name typed with a stray space still matches.
    target_column = input("Enter the target column name from the above list: ").strip()

# Reject an unknown name here with a clear message instead of failing inside drop().
if target_column not in housing.columns:
    raise KeyError(f"Target column {target_column!r} is not a column of the housing dataset")

# Separate features and target
X_housing = housing.drop(columns=[target_column])
y_housing = housing[target_column]

# Display shapes
print(f"Features shape: {X_housing.shape}")
print(f"Target shape: {y_housing.shape}")


Features shape: (506, 13)
Target shape: (506,)


In [10]:
# Read the vehicles CSV (path is relative to the notebook's working directory)
vehicles = pd.read_csv('datasets/vehicles.csv')

# Preview the top five rows as this cell's output
vehicles.head(n=5)


Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,...,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb
0,15.695714,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
1,29.964545,0.0,0.0,0.0,9,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
2,12.207778,0.0,0.0,0.0,23,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
3,29.964545,0.0,0.0,0.0,10,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
4,17.347895,0.0,0.0,0.0,17,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0


In [12]:
# Choose the target column: prefer 'mpg', then 'emission'; otherwise ask the user.
if 'mpg' in vehicles.columns:
    target_column = 'mpg'
elif 'emission' in vehicles.columns:
    target_column = 'emission'
else:
    print("Available columns:", vehicles.columns.tolist())
    # Strip surrounding whitespace so a valid name typed with a stray space still matches.
    target_column = input("Enter the target column name from the above list: ").strip()

# Reject an unknown name here with a clear message instead of failing inside drop().
if target_column not in vehicles.columns:
    raise KeyError(f"Target column {target_column!r} is not a column of the vehicles dataset")

# Separate features and target
X_vehicles = vehicles.drop(columns=[target_column])
y_vehicles = vehicles[target_column]

# One-hot encode the categorical feature columns, dropping the first level of each.
# NOTE(review): this encodes every object-typed column, which includes text columns
# such as 'createdOn' and 'modifiedOn' — confirm those are meant to be features.
X_vehicles = pd.get_dummies(X_vehicles, drop_first=True)

# Display shapes
print(f"Features shape: {X_vehicles.shape}")
print(f"Target shape: {y_vehicles.shape}")


Available columns: ['barrels08', 'barrelsA08', 'charge120', 'charge240', 'city08', 'city08U', 'cityA08', 'cityA08U', 'cityCD', 'cityE', 'cityUF', 'co2', 'co2A', 'co2TailpipeAGpm', 'co2TailpipeGpm', 'comb08', 'comb08U', 'combA08', 'combA08U', 'combE', 'combinedCD', 'combinedUF', 'cylinders', 'displ', 'drive', 'engId', 'eng_dscr', 'feScore', 'fuelCost08', 'fuelCostA08', 'fuelType', 'fuelType1', 'ghgScore', 'ghgScoreA', 'highway08', 'highway08U', 'highwayA08', 'highwayA08U', 'highwayCD', 'highwayE', 'highwayUF', 'hlv', 'hpv', 'id', 'lv2', 'lv4', 'make', 'model', 'mpgData', 'phevBlended', 'pv2', 'pv4', 'range', 'rangeCity', 'rangeCityA', 'rangeHwy', 'rangeHwyA', 'trany', 'UCity', 'UCityA', 'UHighway', 'UHighwayA', 'VClass', 'year', 'youSaveSpend', 'guzzler', 'trans_dscr', 'tCharger', 'sCharger', 'atvType', 'fuelType2', 'rangeA', 'evMotor', 'mfrCode', 'c240Dscr', 'charge240b', 'c240bDscr', 'createdOn', 'modifiedOn', 'startStop', 'phevCity', 'phevHwy', 'phevComb']
Features shape: (40081, 561

In [13]:
# The file is read with header=None, so column names are supplied here:
# an ID, the diagnosis label, then 30 generically named feature columns.
column_names = ['ID', 'Diagnosis', *(f'Feature_{idx}' for idx in range(1, 31))]

# Read the data, discard the 'ID' column, and encode the diagnosis label
# as binary values ('M' for malignant -> 1, 'B' for benign -> 0).
breast_cancer = (
    pd.read_csv('datasets/wdbc.data', header=None, names=column_names)
    .drop(columns='ID')
    .assign(Diagnosis=lambda frame: frame['Diagnosis'].map({'M': 1, 'B': 0}))
)

# Split into target and features
y_cancer = breast_cancer['Diagnosis']
X_cancer = breast_cancer.drop(columns='Diagnosis')

# Report the resulting shapes
print(f"Features shape: {X_cancer.shape}")
print(f"Target shape: {y_cancer.shape}")


Features shape: (569, 30)
Target shape: (569,)


In [14]:
def preprocess_data(X, y, task="regression"):
    """Clean, encode, split and scale a feature frame and its target.

    Rows of ``X`` containing any missing value are dropped and ``y`` is
    re-selected by the surviving index labels. If any column is non-numeric
    the frame is one-hot encoded. The data is then split 80/20 with
    ``random_state=42`` and a ``StandardScaler`` fitted on the training split
    is applied to both splits.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame.
    y : pandas.Series
        Target values; must be indexable by ``X``'s index labels.
    task : str, default "regression"
        Currently unused by this function.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled,
        scaler)`` where the ``*_scaled`` entries are the scaler's outputs for
        the corresponding unscaled splits and ``scaler`` is the fitted scaler.
    """
    # Handle missing values: drop incomplete feature rows, keep the matching targets
    X = X.dropna()
    y = y.loc[X.index]

    # Ensure all data is numeric. Every column is inspected: checking only the
    # first column's dtype skipped encoding whenever a numeric column happened
    # to come first, leaving string columns for the scaler to choke on.
    has_non_numeric = any(
        not pd.api.types.is_numeric_dtype(dtype) for dtype in X.dtypes
    )
    if has_non_numeric:
        X = pd.get_dummies(X, drop_first=True)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Scaling: fit on the training split only, then reuse for the test split
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return (
        X_train,
        X_test,
        y_train,
        y_test,
        X_train_scaled,
        X_test_scaled,
        scaler,
    )


In [15]:
def train_linear_model(X_train, y_train, task="regression"):
    """Fit and return a linear baseline model.

    Uses ``LinearRegression`` when ``task`` is ``"regression"``; for any other
    value uses ``LogisticRegression`` with ``max_iter=1000``.
    """
    estimator = (
        LinearRegression()
        if task == "regression"
        else LogisticRegression(max_iter=1000)
    )
    estimator.fit(X_train, y_train)
    return estimator


In [16]:
def train_random_forest(X_train, y_train, task="regression"):
    """Fit and return a 100-tree random forest with ``random_state=42``.

    Uses ``RandomForestRegressor`` when ``task`` is ``"regression"``; for any
    other value uses ``RandomForestClassifier``.
    """
    if task == "regression":
        forest_cls = RandomForestRegressor
    else:
        forest_cls = RandomForestClassifier
    forest = forest_cls(n_estimators=100, random_state=42)
    forest.fit(X_train, y_train)
    return forest


In [17]:
def train_ann_model(X_train_scaled, y_train, task="regression"):
    """Build, train and return a small fully connected network.

    The network has two ReLU hidden layers (64 and 32 units) sized to the
    number of columns in ``X_train_scaled``. When ``task`` is ``"regression"``
    the output is a single linear unit trained with MSE (tracking MAE); for
    any other value it is a single sigmoid unit trained with binary
    cross-entropy (tracking accuracy). Training runs for 50 epochs with a
    batch size of 16 and no progress output.
    """
    n_features = X_train_scaled.shape[1]
    network = Sequential(
        [
            Dense(64, activation="relu", input_shape=(n_features,)),
            Dense(32, activation="relu"),
        ]
    )

    # Output layer and loss/metric settings depend on the task
    if task == "regression":
        output_layer = Dense(1)
        compile_kwargs = {"loss": "mse", "metrics": ["mae"]}
    else:
        output_layer = Dense(1, activation="sigmoid")
        compile_kwargs = {"loss": "binary_crossentropy", "metrics": ["accuracy"]}

    network.add(output_layer)
    network.compile(optimizer="adam", **compile_kwargs)

    # fit() returns a training history; the trained model itself is what callers get
    network.fit(X_train_scaled, y_train, epochs=50, batch_size=16, verbose=0)
    return network
