In [1]:
import os
from glob import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import ElasticNet
from sklearn import decomposition
from sklearn import preprocessing
from sklearn import manifold
from sklearn import metrics
from tqdm.auto import tqdm
from joblib import Parallel, delayed
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from skopt import BayesSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score, mean_squared_error, explained_variance_score

import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas diagnostics (for example
# chained-assignment warnings) that could point at real problems in later cells.
warnings.filterwarnings('ignore')

# For reproducibility
# Seeds NumPy's legacy global random number generator with a fixed value.
np.random.seed(42)

In [2]:
def load_data(datafile, **read_csv_kwargs):
    """Load a CSV data file into a pandas DataFrame.

    Parameters
    ----------
    datafile : str, path object or file-like
        Anything accepted by ``pandas.read_csv`` as its first argument.
    **read_csv_kwargs
        Optional keyword arguments forwarded unchanged to ``pandas.read_csv``
        (for example ``dtype={...}`` to pin column dtypes). When none are
        given, pandas infers the dtypes from the file contents.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    return pd.read_csv(datafile, **read_csv_kwargs)

In [3]:
# Directory holding the dataset files (note the trailing slash)
PATH = '/cdtshared/wearables/students/group5/'

# Biobank feature table for the reduced cohort
cohort_csv = PATH + 'reduced-cohort.csv'
features = load_data(cohort_csv)

In [4]:
# Display only the rows that contain at least one missing value
rows_with_missing = features.isnull().any(axis=1)
features[rows_with_missing]

Unnamed: 0.1,Unnamed: 0,acc.overall.avg,Participant ID,MET minutes per week for vigorous activity | Instance 0,Tea intake | Instance 0,Sex,Type of accommodation lived in | Instance 0,Duration of walks | Instance 0,Frequency of stair climbing in last 4 weeks | Instance 0,Getting up in morning | Instance 0,...,Attendance/disability/mobility allowance | Instance 0,Ethnic background | Instance 0,Crime score,Education score,Employment score,Health score,Housing score,Income score,Index of Multiple Deprivation,Living environment
0,0,24.20244,1000052,960.0,4,Female,A house or bungalow,40,1-5 times a day,Fairly easy,...,None of the above,British,-0.55,23.89,0.12,-0.33,6.90,0.19,18.76,8.70
4,4,18.83354,1000384,3600.0,2,Male,A house or bungalow,20,11-15 times a day,Very easy,...,None of the above,Any other white background,-1.18,1.49,0.06,-0.32,5.69,0.04,4.60,5.79
7,7,24.84570,1000508,360.0,0,Male,A house or bungalow,,1-5 times a day,Fairly easy,...,None of the above,Any other white background,0.92,22.55,0.14,0.70,33.87,0.31,38.32,38.68
9,9,20.47546,1000582,,3,Female,A house or bungalow,180,6-10 times a day,Not very easy,...,None of the above,British,-1.96,3.57,0.13,0.25,18.61,0.07,13.42,1.32
10,10,29.43438,1000617,0.0,4,Female,A house or bungalow,20,11-15 times a day,Not very easy,...,None of the above,British,-0.24,2.04,0.03,-0.68,25.73,0.02,5.77,12.84
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99407,103635,23.51913,6024948,0.0,3,Female,A house or bungalow,20,More than 20 times a day,Fairly easy,...,None of the above,British,0.16,0.02,0.02,-0.81,29.96,0.03,6.68,11.52
99408,103636,25.19081,6024961,160.0,7,Male,A house or bungalow,60,6-10 times a day,Fairly easy,...,None of the above,British,1.06,18.76,0.12,0.90,12.93,0.18,28.20,25.69
99410,103639,57.62625,6025037,3360.0,Less than one,Male,A house or bungalow,30,16-20 times a day,Fairly easy,...,None of the above,British,-1.14,6.33,0.04,-0.63,15.89,0.04,5.46,18.59
99411,103640,34.77740,6025041,1440.0,5,Female,A house or bungalow,45,6-10 times a day,Not very easy,...,None of the above,British,-2.08,1.57,0.04,-1.04,29.54,0.04,5.27,5.08


In [5]:
# Spot-check two individual cells of the raw table by row label and column name
(
    features.loc[99412, 'MET minutes per week for vigorous activity | Instance 0'],
    features.loc[103, 'Body mass index (BMI) | Instance 0'],
)

(nan, nan)

In [6]:
# Treat every column that pandas loaded with dtype 'object' as categorical
categorical_features = [
    name for name in features if features[name].dtype == 'object'
]

In [7]:
# Columns that must not be used as predictors: the stale CSV index, the
# participant identifier and the regression target itself. Leaving the target
# in X would leak it into the features and duplicate it when X and Y are
# concatenated for export.
target_column = 'acc.overall.avg'
excluded_columns = {'Unnamed: 0', 'Participant ID', target_column}
categorical_lookup = set(categorical_features)

# Filter the column list in its original order; set arithmetic would give an
# arbitrary column order that can differ from one kernel session to the next.
features_of_interest = [col for col in features.columns if col not in excluded_columns]
numeric_features = [col for col in features_of_interest if col not in categorical_lookup]

Y = features[target_column]
# Work on an explicit copy: in-place fills on a column selected from a slice of
# `features` are chained assignments, which pandas warns about and which have
# no effect once Copy-on-Write is active.
X = features[features_of_interest].copy()

# Impute the missing values, mean for numeric, mode for categorical
fill_values = {}
for col in features_of_interest:
    if col in categorical_lookup:
        modes = X[col].mode()
        # mode() is empty when the column has no observed value; nothing to fill with
        if not modes.empty:
            fill_values[col] = modes.iloc[0]
    else:
        fill_values[col] = X[col].mean()

# One fill per column; columns without an entry in the mapping are left untouched
X = X.fillna(value=fill_values)

In [8]:
# Confirm the imputation left no gaps: this selection should come back empty
has_missing = X.isnull().any(axis=1)
X[has_missing]

Unnamed: 0,Chest pain or discomfort | Instance 0,Age started wearing glasses or contact lenses | Instance 0,Employment score,Frequency of tiredness / lethargy in last 2 weeks | Instance 0,Attendance/disability/mobility allowance | Instance 0,Worrier / anxious feelings | Instance 0,Mouth/teeth dental problems | Instance 0,Index of Multiple Deprivation,Age at recruitment,Salad / raw vegetable intake | Instance 0,...,Ethnic background | Instance 0,Length of mobile phone use | Instance 0,Tea intake | Instance 0,Breastfed as a baby | Instance 0,Diabetes diagnosed by doctor | Instance 0,Number of vehicles in household | Instance 0,Above moderate/vigorous recommendation | Instance 0,Getting up in morning | Instance 0,Frequency of stair climbing in last 4 weeks | Instance 0,Townsend deprivation index at recruitment


In [9]:
# Re-read the two previously missing cells to see the imputed values
(
    X.loc[99412, 'MET minutes per week for vigorous activity | Instance 0'],
    X.loc[103, 'Body mass index (BMI) | Instance 0'],
)

(667.5238248625268, 26.73620294093929)

In [10]:
# Export the imputed features together with the target.
# Drop any copy of the target that is still inside X before appending Y;
# otherwise the exported frame would carry two columns with the same name.
dataset = pd.concat([X.drop(columns=[Y.name], errors='ignore'), Y], axis=1)
# Reuse PATH so the output lands in the same directory the input was read from.
dataset.to_pickle(PATH + 'imputed_dataset.pkl')