In [78]:
import os
from glob import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn import decomposition
from sklearn import preprocessing
from sklearn import manifold
from sklearn import metrics
from tqdm.auto import tqdm
from joblib import Parallel, delayed
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from skopt import BayesSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score, mean_squared_error, explained_variance_score

# For reproducibility
np.random.seed(42)

In [79]:
def load_data(datafile):
    """ Utility function to load the data files with correct dtypes """
    data = pd.read_csv(
        datafile
    )
    return data

def dataset_split(X, Y, ratio):
    """ Function to split the dataset into train and test """
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=ratio)
    return X_train, X_test, y_train, y_test

def score(y_true,y_pred): 
    """ Function to print the metrics of interest of the model """
    mse = mean_squared_error(y_true, y_pred) #set score here and not below if using MSE in GridCV
    r2 = r2_score(y_true, y_pred)
    ev = explained_variance_score(y_true, y_pred)
    print("MSE is: ", mse)
    print("R2 is: ", r2)
    print("Explained variance is:", ev)
    
def model_tune(model_name, params, X_train, y_train):
    if model_name == 'knn':
        model = KNeighborsRegressor(algorithm='auto')
    elif model_name == 'rf':
        model = RandomForestRegressor()
    else:
        print('Model unrecognised')
    # Tune the model with Bayesian optimisation
    opt = BayesSearchCV(model, param_grid, n_iter=30, cv=5, verbose=1)
    opt.fit(X_train, y_train)
    # With the following parameter combination being optimal
    print("Best parameter combo:", opt.best_params_)
    # Having the following score
    print("Best validation MSE:", opt.best_score_)
    return opt.best_estimator_

# Load the data

In [66]:
# Path to dataset
PATH = '/cdtshared/wearables/health_data_files/'

# Features from biobank
features = load_data(PATH+'dataset-with-preprocessing-done.csv')

features_of_interest = ['age_entry_years', 'sex', 'smoking', 'BMI', 'inc_ihd']
categorical_features = ['sex', 'smoking']
numeric_features = list(set(features_of_interest) - set(categorical_features))
Y = features['acc.overall.avg']
X = features[features_of_interest]

#Check for null values
X[X.isnull().any(axis=1)]

Content of /cdtshared/wearables/health_data_files/
['death_cause.txt', 'base_cohort_wearables.csv', 'accelerometer.csv', 'death.txt', 'dataset-with-preprocessing-done.csv', 'hesin_all.csv']


Unnamed: 0,age_entry_years,sex,smoking,BMI,inc_ihd


# Data preparation for training

In [67]:
# Encode the categorical variables
X_enc = pd.get_dummies(X, columns=categorical_features)

In [68]:
Y = Y.to_numpy()

In [70]:
# Split into training and testing, 80:20
X_train, X_test, y_train, y_test = train_test_split(X_enc, Y, test_size=0.2)

print("Shape of training set:", X_train.shape)
print("Shape of test set:", X_test.shape)

Shape of training set: (72994, 8)
Shape of test set: (18249, 8)


# Model Tuning

In [74]:
# Define the hyperparameters you want to sweep through (important it is manual for generalisation)
# C for regularisation if doing regression
# kernel if doing SVM for example

# In this case we are tuning for RF hyperparameters
# Number of trees in random forest
n_estimators = [10, 25, 50, 100, 150]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [10, 50]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [2, 4, 10]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the grid
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [75]:
# Define the model you are interested in
model = RandomForestRegressor()

In [76]:
# Skip to bayesian below if taking too long to compute
clf = GridSearchCV(model, param_grid, cv=5)
clf.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# Get the best model
model_best = clf.best_estimator_
# With the following parameter combination being optimal
print("Best parameter combo:", clf.best_params_)
# Having the following score
print("Best validation MSE:", clf.best_score_)

In [None]:
# Get the test set performance
score(model_best.predict(X_test), y_test)

In [81]:
# Try with Bayesian optimisation for faster computation of tuning
opt = BayesSearchCV(model, param_grid, n_iter=30, cv=5, verbose=1)
opt.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi



Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits


BayesSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=30,
              search_spaces={'bootstrap': [True, False],
                             'max_depth': [10, 50, None],
                             'max_features': ['auto', 'sqrt'],
                             'min_samples_leaf': [2, 4, 10],
                             'min_samples_split': [5, 10],
                             'n_estimators': [10, 25, 50, 100, 150]},
              verbose=1)

In [84]:
# Get the best model
model_best = opt.best_estimator_
# With the following parameter combination being optimal
print("Best parameter combo:", opt.best_params_)
# Having the following score
print("Best validation MSE:", opt.best_score_)

Best parameter combo: OrderedDict([('bootstrap', True), ('max_depth', 10), ('max_features', 'sqrt'), ('min_samples_leaf', 10), ('min_samples_split', 5), ('n_estimators', 150)])
Best validation MSE: 0.1385479767331151


Best parameters: <br>
**bootstrap**:True <br>
**max_depth**: 10 <br>
**max_features**: 'sqrt' <br>
**min_samples_leaf**: 10 <br>
**min_samples_split**: 5 <br>
**n_estimators**: 150 <br>

In [85]:
# Get the test set performance
score(model_best.predict(X_test), y_test)

MSE is:  56.25138955734784
R2 is:  -5.044278570442262
Explained variance is: -5.043435824514083
