In [None]:
import os
from glob import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import ElasticNet
from sklearn import decomposition
from sklearn import preprocessing
from sklearn import manifold
from sklearn import metrics
from tqdm.auto import tqdm
from joblib import Parallel, delayed
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from skopt import BayesSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score, mean_squared_error, explained_variance_score

import warnings
# Silence every warning category for the rest of the session
warnings.filterwarnings(action='ignore')

# Seed NumPy's global random generator so runs are repeatable
np.random.seed(seed=42)

In [None]:
def load_data(datafile):
    """Read the CSV file at ``datafile`` and return it as a DataFrame.

    The file is parsed with the default ``pd.read_csv`` settings, so the
    column dtypes are whatever pandas infers from the file contents.
    """
    return pd.read_csv(datafile)

In [None]:
# Directory containing the dataset files (the trailing slash is required,
# because file names are appended to it directly)
PATH = '/cdtshared/wearables/students/group5/'

# Biobank feature table
features = load_data(f'{PATH}reduced-cohort.csv')

In [None]:
# Collect the names of the columns stored with the 'object' dtype;
# these are the columns treated as categorical
categorical_features = [
    name for name in features if features[name].dtype == 'object'
]

In [None]:
# Column used as the regression target. It is kept out of X so that the target
# does not leak into the predictors and is not duplicated when X and Y are
# concatenated again later.
TARGET = 'acc.overall.avg'
NON_FEATURE_COLUMNS = {'Unnamed: 0', 'Participant ID', TARGET}

# Build the column lists with comprehensions instead of set arithmetic so the
# original column order is preserved and identical on every run.
features_of_interest = [
    col for col in features.columns if col not in NON_FEATURE_COLUMNS
]
categorical_lookup = set(categorical_features)
numeric_features = [
    col for col in features_of_interest if col not in categorical_lookup
]
# Categorical columns that are actually part of X (an excluded column such as
# an identifier may itself have the 'object' dtype).
categorical_in_X = [
    col for col in features_of_interest if col in categorical_lookup
]

# NOTE(review): Y is taken as-is; missing target values are not imputed here.
Y = features[TARGET]
# Work on an explicit copy so the imputation never touches `features`.
X = features[features_of_interest].copy()

# Impute the missing values, mean for numeric, mode for categorical.
# numeric columns: fillna with a Series fills each column with its OWN mean.
# (Passing a single scalar here would write one column's mean into every
# missing cell of the frame, categorical columns included.)
if numeric_features:
    X[numeric_features] = X[numeric_features].fillna(X[numeric_features].mean())

# categorical columns: the first row of .mode() holds the most frequent value
# of each column. Skipped when there is nothing to compute a mode from.
if categorical_in_X:
    column_modes = X[categorical_in_X].mode()
    if not column_modes.empty:
        X[categorical_in_X] = X[categorical_in_X].fillna(column_modes.iloc[0])

In [None]:
# Display the rows of X that still contain at least one missing value
# (an empty result means the imputation left no gaps)
X.loc[X.isnull().any(axis='columns')]

In [None]:
# Export the imputed features together with the target column.
# The output goes next to the input file: PATH (which ends with a slash) is
# reused so the input and output locations cannot drift apart.
dataset = pd.concat([X, Y], axis=1)
dataset.to_pickle(PATH + "imputed_dataset.pkl")