In [9]:
import os
from glob import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
from sklearn import decomposition
from sklearn import preprocessing
from sklearn import manifold
from sklearn import metrics
from tqdm.auto import tqdm
from joblib import Parallel, delayed
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from skopt import BayesSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score, mean_squared_error, explained_variance_score

import warnings
warnings.filterwarnings('ignore')

# For reproducibility
np.random.seed(42)

In [2]:
def load_data(datafile):
    """ Utility function to load the data files with correct dtypes """
    data = pd.read_csv(
        datafile
    )
    return data

def dataset_split(X, Y, ratio):
    """ Function to split the dataset into train and test """
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=ratio)
    return X_train, X_test, y_train, y_test

def score(y_true,y_pred): 
    """ Function to print the metrics of interest of the model """
    mse = mean_squared_error(y_true, y_pred) #set score here and not below if using MSE in GridCV
    r2 = r2_score(y_true, y_pred)
    ev = explained_variance_score(y_true, y_pred)
    print("MSE is: ", mse)
    print("R2 is: ", r2)
    print("Explained variance is:", ev)
    
def model_tune(model_name, params, X_train, y_train):
    if model_name == 'knn':
        model = KNeighborsRegressor(algorithm='auto')
    elif model_name == 'rf':
        model = RandomForestRegressor()
    elif model_name == 'regression':
        model = ElasticNet()
    else:
        print('Model unrecognised')
    # Tune the model with Bayesian optimisation
    opt = BayesSearchCV(model, param_grid, n_iter=30, cv=5, verbose=1)
    opt.fit(X_train, y_train)
    # With the following parameter combination being optimal
    print("Best parameter combo:", opt.best_params_)
    # Having the following score
    print("Best validation MSE:", opt.best_score_)
    return opt.best_estimator_

# Load the data

In [3]:
# Path to dataset
PATH = '/cdtshared/wearables/health_data_files/'

# Features from biobank
features = load_data(PATH+'dataset-with-preprocessing-done.csv')

features_of_interest = ['age_entry_years', 'sex', 'smoking', 'BMI', 'inc_ihd']
categorical_features = ['sex', 'smoking']
numeric_features = list(set(features_of_interest) - set(categorical_features))
Y = features['acc.overall.avg']
X = features[features_of_interest]

#Check for null values
X[X.isnull().any(axis=1)]

Unnamed: 0,age_entry_years,sex,smoking,BMI,inc_ihd


# Data preparation for training

In [4]:
# Encode the categorical variables
X_enc = pd.get_dummies(X, columns=categorical_features)

In [5]:
Y = Y.to_numpy()

In [6]:
# Split into training and testing, 80:20
X_train, X_test, y_train, y_test = train_test_split(X_enc, Y, test_size=0.2)

print("Shape of training set:", X_train.shape)
print("Shape of test set:", X_test.shape)

Shape of training set: (72994, 8)
Shape of test set: (18249, 8)


# Model Tuning

In [7]:
# Define the hyperparameters you want to sweep through (important it is manual for generalisation)
# C for regularisation if doing regression
# kernel if doing SVM for example

# In this case we are tuning for kNN hyperparameters
# Number of neighbours in kNN
n_neighbors = [3, 5, 7, 10]
# Leaf size passed to BallTree or KDTree
leaf_size = [1, 20, 30, 40]
# Whether using Minkowski or Euclidean distance
p = [1, 2]
# How to weigh the distance proximity
weights = ['uniform', 'distance']
# The distance metric to use for the tree.
metric = ['minkowski', 'chebyshev']

# Create the grid
param_grid = {'n_neighbors': n_neighbors,
               'leaf_size': leaf_size,
               'p': p,
               'weights': weights,
               'metric': metric}

In [10]:
# Define the model you are interested in
model = KNeighborsRegressor(algorithm='auto')

In [15]:
# Skip to bayesian below if taking too long to compute
clf = GridSearchCV(model, param_grid, cv=5)
clf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsRegressor(),
             param_grid={'leaf_size': [1, 20, 30, 40],
                         'metric': ['minkowski', 'chebyshev'],
                         'n_neighbors': [3, 5, 7, 10], 'p': [1, 2],
                         'weights': ['uniform', 'distance']})

In [18]:
# Get the best model
model_best = clf.best_estimator_
# With the following parameter combination being optimal
print("Best parameter combo:", clf.best_params_)
# Having the following score
print("Best validation MSE:", clf.best_score_)

Best parameter combo: {'leaf_size': 30, 'metric': 'minkowski', 'n_neighbors': 10, 'p': 2, 'weights': 'uniform'}
Best validation MSE: 0.062439485405776395


In [19]:
# Get the test set performance
score(y_test, model_best.predict(X_test))

MSE is:  61.86407733705071
R2 is:  -2.975302502760989
Explained variance is: -2.974453448169699


In [12]:
# Try with Bayesian optimisation for faster computation of tuning
opt = BayesSearchCV(model, param_grid, n_iter=30, cv=5, verbose=1)
opt.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits


BayesSearchCV(cv=5, estimator=KNeighborsRegressor(), n_iter=30,
              search_spaces={'leaf_size': [1, 20, 30, 40],
                             'metric': ['minkowski', 'chebyshev'],
                             'n_neighbors': [3, 5, 7, 10], 'p': [1, 2],
                             'weights': ['uniform', 'distance']},
              verbose=1)

In [13]:
# Get the best model
model_best = opt.best_estimator_
# With the following parameter combination being optimal
print("Best parameter combo:", opt.best_params_)
# Having the following score
print("Best validation MSE:", opt.best_score_)

Best parameter combo: OrderedDict([('leaf_size', 40), ('metric', 'minkowski'), ('n_neighbors', 10), ('p', 2), ('weights', 'uniform')])
Best validation MSE: 6.2439485405776395e-02


Best parameters: <br>
**leaf_size**:40 <br>
**metric**: minkowski <br>
**n_neighbors**: 10 <br>
**p**: 2 <br>
**weights**: uniform <br>

In [14]:
# Get the test set performance
score(y_test, model_best.predict(X_test))

MSE is:  61.86407733705071
R2 is:  -2.975302502760989
Explained variance is: -2.974453448169699
