In [1]:
import os
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from sklearn import decomposition
from sklearn import manifold
from sklearn import metrics
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.metrics import r2_score, mean_squared_error, explained_variance_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder
from skopt import BayesSearchCV
from tqdm.auto import tqdm

import warnings
# Silence every warning category for the rest of the session
warnings.filterwarnings(action='ignore')

# Seed NumPy's global random number generator for reproducibility
SEED = 42
np.random.seed(SEED)

In [2]:
def dataset_split(X, Y, ratio):
    """Randomly partition features and targets into train and test subsets.

    `ratio` is forwarded to `train_test_split` as `test_size`.
    Returns the tuple (X_train, X_test, y_train, y_test).
    """
    parts = train_test_split(X, Y, test_size=ratio)
    return tuple(parts)

def score(y_true, y_pred):
    """Print the MSE, R2 and explained variance of predictions against the truth."""
    # Evaluate every metric first, then report them in a fixed order
    results = (
        ("MSE is: ", mean_squared_error(y_true, y_pred)),
        ("R2 is: ", r2_score(y_true, y_pred)),
        ("Explained variance is:", explained_variance_score(y_true, y_pred)),
    )
    for label, value in results:
        print(label, value)
    
def model_tune(model_name, params, X_train, y_train):
    """Tune a regressor with Bayesian optimisation and return the best fitted model.

    Parameters
    ----------
    model_name : str
        One of 'knn', 'rf' or 'regression'.
    params : dict
        Search space handed to BayesSearchCV for the chosen model.
    X_train, y_train
        Training features and targets passed to the search's `fit`.

    Returns
    -------
    The `best_estimator_` of the fitted search (refit on all of X_train).

    Raises
    ------
    ValueError
        If `model_name` is not one of the recognised names.
    """
    if model_name == 'knn':
        model = KNeighborsRegressor(algorithm='auto')
    elif model_name == 'rf':
        model = RandomForestRegressor()
    elif model_name == 'regression':
        model = ElasticNet()
    else:
        # Fail loudly: continuing would only crash later on the unbound `model`
        raise ValueError(
            f"Model unrecognised: {model_name!r} "
            "(expected 'knn', 'rf' or 'regression')"
        )
    # Tune the model with Bayesian optimisation, using the caller's search
    # space rather than whatever global `param_grid` happens to exist
    opt = BayesSearchCV(model, params, n_iter=30, cv=5, verbose=1)
    opt.fit(X_train, y_train)
    # With the following parameter combination being optimal
    print("Best parameter combo:", opt.best_params_)
    # No `scoring` is passed, so best_score_ is the estimator's default
    # score (R2 for these regressors), not an MSE
    print("Best validation score (R2):", opt.best_score_)
    return opt.best_estimator_

# Load the data

In [3]:
# Directory holding the dataset
PATH = '/cdtshared/wearables/students/group5/'

# Biobank feature table; pickle files should only be loaded from a trusted location
features = pd.read_pickle(os.path.join(PATH, "imputed_dataset.pkl"))

In [4]:
# Column we are predicting
TARGET = 'acc.overall.avg'

# Every other column is a predictor. A list comprehension (instead of a set
# difference) keeps the original column order, so the feature matrix is laid
# out identically on every run.
features_of_interest = [col for col in features.columns if col != TARGET]

Y = features[TARGET]
X = features[features_of_interest]

In [5]:
# Columns stored with the generic object dtype are treated as categorical
categorical_features = [
    name for name in X.columns if features[name].dtype == 'object'
]

# Data preparation for training

In [6]:
# One-hot encode each categorical column
X_enc = pd.get_dummies(data=X, columns=categorical_features)

In [7]:
# np.asarray leaves an existing ndarray untouched, so this cell can be re-run
# safely (Series.to_numpy would fail the second time because Y is rebound)
Y = np.asarray(Y)

In [8]:
# Overall split is 70:20:10. First hold out 30% of the rows; the rest is training
X_train, X_holdout, y_train, y_holdout = train_test_split(X_enc, Y, test_size=0.3)

# Divide the held-out rows into validation and test (20:10 of the full data)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.1 / (0.1 + 0.2)
)

for caption, subset in (
    ("Shape of training set:", X_train),
    ("Shape of validation set:", X_val),
    ("Shape of test set:", X_test),
):
    print(caption, subset.shape)

Shape of training set: (69589, 1139)
Shape of validation set: (19882, 1139)
Shape of test set: (9942, 1139)


# Model Tuning

In [9]:
# Hyperparameter search space for the kNN regressor.
# It is declared by hand so the same cell can be adapted to another model
# (e.g. C for a regularised regression, or the kernel for an SVM).

# Number of neighbours in kNN
n_neighbors = [3, 5, 7, 10]
# Leaf size passed to BallTree or KDTree
leaf_size = [1, 20, 30, 40]
# Power of the Minkowski metric: 1 is Manhattan distance, 2 is Euclidean
p = [1, 2]
# How neighbours are weighted: equally, or by inverse distance
weights = ['uniform', 'distance']
# The distance metric to use for the tree
metric = ['minkowski', 'chebyshev']

# Assemble the search space
param_grid = dict(
    n_neighbors=n_neighbors,
    leaf_size=leaf_size,
    p=p,
    weights=weights,
    metric=metric,
)

In [10]:
# Base estimator whose hyperparameters are searched below
model = KNeighborsRegressor(algorithm="auto")

In [11]:
# Build a single fold that reproduces our own train/validation split:
# -1 marks rows that are never used for validation, 0 marks the validation fold.
# NOTE: this rebinds X (previously the un-encoded feature frame) to a stacked array.
X = np.vstack([X_train, X_val])
y = np.concatenate((y_train, y_val))
test_fold = [-1] * X_train.shape[0] + [0] * X_val.shape[0]
ps = PredefinedSplit(test_fold)

In [None]:
# Exhaustive search scored on the predefined validation fold.
# Skip to the Bayesian search below if this takes too long to compute.
clf = GridSearchCV(estimator=model, param_grid=param_grid, cv=ps, refit=False)
clf.fit(X, y)

In [None]:
# With the following parameter combination being optimal
print("Best parameter combo:", clf.best_params_)
# No `scoring` was given to the search, so best_score_ is the estimator's
# default score (R2 for a regressor), not an MSE
print("Best validation R2:", clf.best_score_)

In [None]:
# The grid search ran with refit=False, so it holds no fitted best estimator;
# rebuild one from the winning parameters and fit it on the training split
model_best = KNeighborsRegressor(algorithm='auto', **clf.best_params_)
model_best.fit(X_train, y_train)

# Get the test set performance
score(y_test, model_best.predict(X_test))

In [12]:
# Bayesian optimisation evaluates 30 sampled candidates instead of the full
# grid, which makes tuning faster
opt = BayesSearchCV(
    estimator=model,
    search_spaces=param_grid,
    n_iter=30,
    cv=ps,
    refit=False,
    verbose=1,
)
opt.fit(X, y)

Fitting 1 folds for each of 1 candidates, totalling 1 fits
Fitting 1 folds for each of 1 candidates, totalling 1 fits
Fitting 1 folds for each of 1 candidates, totalling 1 fits
Fitting 1 folds for each of 1 candidates, totalling 1 fits
Fitting 1 folds for each of 1 candidates, totalling 1 fits
Fitting 1 folds for each of 1 candidates, totalling 1 fits
Fitting 1 folds for each of 1 candidates, totalling 1 fits
Fitting 1 folds for each of 1 candidates, totalling 1 fits
Fitting 1 folds for each of 1 candidates, totalling 1 fits
Fitting 1 folds for each of 1 candidates, totalling 1 fits
Fitting 1 folds for each of 1 candidates, totalling 1 fits
Fitting 1 folds for each of 1 candidates, totalling 1 fits
Fitting 1 folds for each of 1 candidates, totalling 1 fits
Fitting 1 folds for each of 1 candidates, totalling 1 fits
Fitting 1 folds for each of 1 candidates, totalling 1 fits
Fitting 1 folds for each of 1 candidates, totalling 1 fits
Fitting 1 folds for each of 1 candidates, totalling 1 fi

BayesSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
              estimator=KNeighborsRegressor(), n_iter=30, refit=False,
              search_spaces={'leaf_size': [1, 20, 30, 40],
                             'metric': ['minkowski', 'chebyshev'],
                             'n_neighbors': [3, 5, 7, 10], 'p': [1, 2],
                             'weights': ['uniform', 'distance']},
              verbose=1)

In [13]:
# With the following parameter combination being optimal
print("Best parameter combo:", opt.best_params_)
# No `scoring` was given to the search, so best_score_ is the estimator's
# default score (R2 for a regressor), not an MSE
print("Best validation R2:", opt.best_score_)

Best parameter combo: OrderedDict([('leaf_size', 1), ('metric', 'minkowski'), ('n_neighbors', 10), ('p', 2), ('weights', 'distance')])
Best validation MSE: -6.225981106621581e-02


Best parameters: <br>
**leaf_size**: 1 <br>
**metric**: minkowski <br>
**n_neighbors**: 10 <br>
**p**: 2 <br>
**weights**: distance <br>

In [15]:
# Refit kNN on the training split with the best parameters listed above
best_knn_params = dict(
    leaf_size=1,
    metric='minkowski',
    n_neighbors=10,
    p=2,
    weights='distance',
)
model = KNeighborsRegressor(algorithm='auto', **best_knn_params)
model.fit(X_train, y_train)

KNeighborsRegressor(leaf_size=1, n_neighbors=10, weights='distance')

In [20]:
# Validation-set performance of the refitted model
y_val_pred = model.predict(X_val)
score(y_val, y_val_pred)

MSE is:  95.89231094639497
R2 is:  -0.062259811066194715
Explained variance is: -0.06215603125256419


In [21]:
# Test-set performance of the refitted model
y_test_pred = model.predict(X_test)
score(y_test, y_test_pred)

MSE is:  105.02607930506684
R2 is:  -0.19641985837995635
Explained variance is: -0.19620384288202142
