In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, model_selection, linear_model, ensemble
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
import sklearn.base as skb
from sklearn.impute import SimpleImputer


In [None]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/'Studies/Practical Machine Learning/Assignment 3'

Mounted at /content/drive
/content/drive/MyDrive/Studies/Practical Machine Learning/Assignment 3


In [None]:
# Function to load dataset and partition it into train, dev, and test sets
def load_dataset(filename):
    data = pd.read_pickle(filename, compression='infer')

    # Splitting dataset into train, dev, and test sets
    train_set = data['train'].drop('target', axis=1), data['train']['target']
    dev_set = data['dev'].drop('target', axis=1), data['dev']['target']
    test_set = data['test'].drop('target', axis=1), data['test']['target']

    return train_set, dev_set, test_set

train, dev, test = load_dataset('ass3.pickle')


In [29]:
# Show dataset details
print(f"Number of train samples: {train[0].shape[0]}")
print(f"Number of dev samples: {dev[0].shape[0]}")
print(f"Number of test samples: {test[0].shape[0]}")
print(f"Number of features: {train[0].shape[1]}")

print("\nFirst few records of the training dataset:")
print(train[0].head())
print("\nStatistical Summary:")
print(train[0].describe())
print("\nMissing Values:")
print(train[0].isnull().sum())

Number of train samples: 12384
Number of dev samples: 4128
Number of test samples: 4128
Number of features: 8

First few records of the training dataset:
           f0    f1        f2        f3      f4        f5     f6      f7
14981  4.0391  15.0  6.297710  0.992366   334.0  2.549618  32.72 -116.99
6614   4.7241  46.0  5.375758  0.954545   753.0  2.281818  34.17 -118.10
14233  3.3553   7.0  5.229213  1.101124  1304.0  2.930337  32.70 -117.01
1802   1.3929  52.0  5.000000  0.953488   126.0  2.930233  37.92 -122.36
6030   1.6006  52.0  4.427083  1.017361  1246.0  2.163194  34.07 -117.75

Statistical Summary:
                 f0            f1            f2            f3            f4  \
count  12210.000000  12244.000000  12226.000000  12228.000000  12215.000000   
mean       3.872771     28.630595      5.420978      1.096626   1426.830618   
std        1.919183     12.566127      2.382548      0.471398   1103.528284   
min        0.499900      1.000000      0.846154      0.500000      3.0

In [None]:
# Function to preprocess data: impute missing values and scale the features
def preprocess_data(train, dev, test):
    # Impute missing values
    imputer = SimpleImputer(strategy='mean')
    imputer.fit(train[0])

    train_imputed = imputer.transform(train[0]), train[1]
    dev_imputed = imputer.transform(dev[0]), dev[1]
    test_imputed = imputer.transform(test[0]), test[1]

    # Scaling the data
    scaler = preprocessing.StandardScaler().fit(train_imputed[0])
    train_scaled = scaler.transform(train_imputed[0]), train_imputed[1]
    dev_scaled = scaler.transform(dev_imputed[0]), dev_imputed[1]
    test_scaled = scaler.transform(test_imputed[0]), test_imputed[1]

    return train_scaled, dev_scaled, test_scaled


In [None]:
# Preprocess data
train_scaled, dev_scaled, test_scaled = preprocess_data(train, dev, test)


In [None]:
# Define function to train and validate models
def run_experiment(name, train, dev, regressors, metrics_dict):
    print(f'****** {name} ******')
    for regressor_name, regressor in regressors:
        model = skb.clone(regressor)
        # Perform 5-fold cross-validation
        cv_results = model_selection.cross_validate(model, *train, cv=5, scoring=metrics_dict, return_train_score=True)

        # Get mean of the scores from cross-validation for both train and test
        mean_train_scores = {name: np.mean(scores) for name, scores in cv_results.items() if name.startswith('train_')}
        mean_test_scores = {name: np.mean(scores) for name, scores in cv_results.items() if name.startswith('test_')}
        print(f'{regressor_name} model achieved {mean_train_scores} on training data and {mean_test_scores} on validation data')

        model.fit(*train)
        dev_pred = model.predict(dev[0])

        dev_scores = {name: scorer._score_func(dev[1], dev_pred) for name, scorer in metrics_dict.items()}
        print(f'Dev Results: {dev_scores}')


In [None]:
regressors = [
    ('LR', linear_model.LinearRegression()),
    ('KNR', KNeighborsRegressor()),
    ('SVR', SVR()),
    ('RFR', ensemble.RandomForestRegressor(n_estimators=400)),
    ('XGBR', XGBRegressor())
]


In [None]:
metrics_dict = {
    'MSE': make_scorer(mean_squared_error, greater_is_better=False),
    'MAE': make_scorer(mean_absolute_error),
    'R2': make_scorer(r2_score)
}


In [None]:
run_experiment("After scaling", train_scaled, dev_scaled, regressors, metrics_dict)


****** After scaling ******
LR model achieved {'train_MSE': -0.5462225250920477, 'train_MAE': 0.5435008895158063, 'train_R2': 0.5854222776391872} on training data and {'test_MSE': -0.5533573566623324, 'test_MAE': 0.5441936203890856, 'test_R2': 0.5797157883272115} on validation data
Dev Results: {'MSE': 0.522904334901607, 'MAE': 0.5231571750489642, 'R2': 0.6008414936534726}
KNR model achieved {'train_MSE': -0.3055871694785036, 'train_MAE': 0.37987147372654095, 'train_R2': 0.7680644462796943} on training data and {'test_MSE': -0.467385240865742, 'test_MAE': 0.47015424484436136, 'test_R2': 0.6451492193180905} on validation data
Dev Results: {'MSE': 0.44115472085138757, 'MAE': 0.4567328284883721, 'R2': 0.6632449806026308}
SVR model achieved {'train_MSE': -0.379053913330189, 'train_MAE': 0.4078369025808728, 'train_R2': 0.7122964087306171} on training data and {'test_MSE': -0.3935236972857756, 'test_MAE': 0.4196603163135103, 'test_R2': 0.7011749255289784} on validation data
Dev Results: {'MS

In [None]:
param_grid = {
    'n_estimators': [400, 800],
    'max_depth': [20,30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rfr = ensemble.RandomForestRegressor()

grid_search = GridSearchCV(estimator=rfr, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)

grid_search.fit(train_scaled[0], train_scaled[1])


Fitting 3 folds for each of 54 candidates, totalling 162 fits




In [28]:
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# Using the best model
best_grid = grid_search.best_estimator_

# Predicting on the dev set
dev_pred = best_grid.predict(dev_scaled[0])

mse = mean_squared_error(dev_scaled[1], dev_pred)
mae = mean_absolute_error(dev_scaled[1], dev_pred)
r2 = r2_score(dev_scaled[1], dev_pred)
print(f"MSE of the best model on dev set: {mse}")
print(f"MAE of the best model on dev set: {mae}")
print(f"R2 score of the best model on dev set: {r2}")

# Evaluate on the test set
test_pred = best_grid.predict(test_scaled[0])
mse_test = mean_squared_error(test_scaled[1], test_pred)
mae_test = mean_absolute_error(test_scaled[1], test_pred)
r2_test = r2_score(test_scaled[1], test_pred)


Best parameters: {'max_depth': 40, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 800}
MSE of the best model on dev set: 0.267295816013345
MAE of the best model on dev set: 0.34064462633023634
R2 score of the best model on dev set: 0.7959600034820153
