# Models

### ⬇️ Step 1: Import Libraries

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os
from pathlib import Path
from ipywidgets import interact
import ipywidgets as widgets

# Visualization parameters
%matplotlib inline
a4_dims = (9.7, 3.27)
plt.rcParams['figure.figsize'] = (7, 5)
plt.rcParams['figure.dpi'] = 100
plt.rcParams['savefig.dpi'] = 500
plt.rcParams["figure.autolayout"] = True
sns.set_theme()

### 🍱 Step 2: Load Development Data

In [53]:
baseline_c_stop_go = pd.read_csv("../output/dev_baseline_c_stop_go.csv")
baseline_i_stop_go = pd.read_csv("../output/dev_baseline_i_stop_go.csv")
followup_c_stop_go = pd.read_csv("../output/dev_followup_c_stop_go.csv")
followup_i_stop_go = pd.read_csv("../output/dev_followup_i_stop_go.csv")

c_stop_go_features = ["tfsstabwdp_" + str(i) for i in range(149, 297)]
i_stop_go_features = ["tfsstabwdp_" + str(i) for i in range(297, 444 + 1)]

# 0 = Male, 1 = Female
baseline_c_stop_go["demo_sex_v2"] = baseline_c_stop_go["demo_sex_v2"].replace({1.0: 0, 2.0: 1})
baseline_i_stop_go["demo_sex_v2"] = baseline_i_stop_go["demo_sex_v2"].replace({1.0: 0, 2.0: 1})
followup_c_stop_go["demo_sex_v2"] = followup_c_stop_go["demo_sex_v2"].replace({1.0: 0, 2.0: 1})
followup_i_stop_go["demo_sex_v2"] = followup_i_stop_go["demo_sex_v2"].replace({1.0: 0, 2.0: 1})

dataframes = [baseline_c_stop_go, baseline_i_stop_go, followup_c_stop_go, followup_i_stop_go]

# Drop rows with missing values (row number 2944, 5046)
baseline_c_stop_go = baseline_c_stop_go.drop([2944, 5046], axis=0)

In [54]:
baseline_c_stop_go

Unnamed: 0,src_subject_id,eventname,tfsstabwdp_149,tfsstabwdp_150,tfsstabwdp_151,tfsstabwdp_152,tfsstabwdp_153,tfsstabwdp_154,tfsstabwdp_155,tfsstabwdp_156,...,race_ethnicity,acs_raked_propensity_score,site_id_l,rel_family_id,rel_birth_id,school_id,district_id,interview_date,interview_age,visit_type
0,NDAR_INV00CY2MDM,baseline_year_1_arm_1,-0.072277,0.147229,0.122712,0.019083,0.259733,0.001988,0.025479,0.084555,...,1.0,1433.061575,site20,5355.0,53551.0,,,08/22/2017,130.0,1.0
1,NDAR_INV00HEV6HB,baseline_year_1_arm_1,-0.557458,-0.126797,-0.166221,0.008992,-0.089459,0.156079,-0.171020,-0.081916,...,2.0,650.876929,site12,2257.0,22571.0,,,07/08/2017,124.0,1.0
2,NDAR_INV00U4FTRU,baseline_year_1_arm_1,0.161269,-0.371111,-0.111674,-0.034581,0.160800,0.180306,-0.031762,-0.043941,...,5.0,1778.916737,site04,2464.0,24641.0,,6815.0,05/19/2018,130.0,1.0
3,NDAR_INV00X2TBWJ,baseline_year_1_arm_1,0.032588,0.270389,-0.188393,0.166681,0.422957,0.122249,0.181219,0.045643,...,3.0,907.279771,site14,3692.0,36921.0,,13889.0,05/12/2017,130.0,1.0
4,NDAR_INV010ZM3H9,baseline_year_1_arm_1,-2.946458,-1.448809,1.205950,0.333213,-5.486657,-0.247205,0.756608,-0.225100,...,1.0,550.058750,site12,1862.0,18621.0,,14043.0,05/05/2018,112.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5629,NDAR_INVWJ6WE11U,baseline_year_1_arm_1,0.116447,0.125323,0.021918,0.131663,-0.027328,0.071242,0.086107,0.052696,...,1.0,648.201245,site19,7223.0,72232.0,,,11/14/2017,118.0,1.0
5630,NDAR_INVWJF0H153,baseline_year_1_arm_1,0.245009,0.205302,-0.070323,0.053832,0.081435,0.145198,0.168372,0.139038,...,1.0,414.643009,site21,5282.0,52821.0,1712.0,7923.0,02/20/2018,116.0,1.0
5631,NDAR_INVWJJGB131,baseline_year_1_arm_1,0.127794,0.216799,0.058234,0.183327,0.301677,0.227671,0.215983,0.109781,...,3.0,1319.816277,site11,8726.0,87261.0,,,12/06/2017,107.0,1.0
5632,NDAR_INVWJTDFK2X,baseline_year_1_arm_1,0.055666,0.036851,0.039585,0.087713,0.042274,-0.012551,0.109308,0.091458,...,2.0,904.238929,site05,810.0,8101.0,,4079.0,06/15/2018,121.0,1.0


In [55]:
c_stop_go_features = ["tfsstabwdp_" + str(i) for i in range(149, 297)]

X = baseline_c_stop_go[c_stop_go_features]
y = baseline_c_stop_go["demo_sex_v2"]

# Check for null values
X[X.isnull().any(axis=1)]

Unnamed: 0,tfsstabwdp_149,tfsstabwdp_150,tfsstabwdp_151,tfsstabwdp_152,tfsstabwdp_153,tfsstabwdp_154,tfsstabwdp_155,tfsstabwdp_156,tfsstabwdp_157,tfsstabwdp_158,...,tfsstabwdp_287,tfsstabwdp_288,tfsstabwdp_289,tfsstabwdp_290,tfsstabwdp_291,tfsstabwdp_292,tfsstabwdp_293,tfsstabwdp_294,tfsstabwdp_295,tfsstabwdp_296


In [56]:
print("X shape: ", X.shape)
print("Y shape: ", y.shape)

X shape:  (5632, 148)
Y shape:  (5632,)


In [60]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score

names = [
     "SVC",
     "RandomForestClassifier",
     "KNeighborsClassifier",
     "MLPClassifier",
     "LogisticRegression"
]

standard_models = [
    SVC(),
    RandomForestClassifier(),
    KNeighborsClassifier(),
    MLPClassifier(hidden_layer_sizes=(50,50), max_iter=600, random_state=42),
    LogisticRegression(penalty="l2"),
]

def run_models(title, X_train, y_train):
    
    print(f"Running {title} experiment")
    print("=============================================")
    
    for name, model in zip(names, standard_models):
        print(f"Trying {name}...")
        model.fit(X_train, y_train)
        cv_scores = cross_val_score(model, X_train, y_train, cv=5)
        average_cv_score = cv_scores.mean()
        print(f"\tAverage score: {average_cv_score*100:.3f}%")

## No preprocessing whatsoever

In [61]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

run_models("No preprocessing", X_train, y_train)

Running No preprocessing experiment
Trying SVC...
	Average score: 56.426%
Trying RandomForestClassifier...
	Average score: 54.406%
Trying KNeighborsClassifier...
	Average score: 51.743%
Trying MLPClassifier...
	Average score: 53.185%
Trying LogisticRegression...
	Average score: 57.114%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Z-score normalization

In [65]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().set_output(transform="pandas")

X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

run_models("Z-score normalization", X_train, y_train)

Running Z-score normalization experiment
Trying SVC...
	Average score: 56.426%
Trying RandomForestClassifier...
	Average score: 54.118%
Trying KNeighborsClassifier...
	Average score: 51.743%
Trying MLPClassifier...
	Average score: 53.185%
Trying LogisticRegression...
	Average score: 57.114%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## PCA

In [64]:
from sklearn.decomposition import PCA

In [78]:
pca_model = PCA(n_components = 0.99)
principalComponents_x = pca_model.fit_transform(X_scaled)

In [80]:
principal_x_Df = pd.DataFrame(data = principalComponents_x)
#columns = ['principal component 1', 'principal component 2', 'principal component 3', 'principal component 4','principal component 5', 'principal component 6','principal component 7', 'principal component 8']
principal_x_Df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,119,120,121,122,123,124,125,126,127,128
0,-2.438902,-0.081381,0.30257,1.660215,-0.730309,-1.832513,2.163258,-1.683594,0.425666,-0.900723,...,0.263147,-0.188126,0.504105,-0.494976,-0.156791,-0.584291,0.206979,0.027041,0.085427,-0.60638
1,6.686924,-1.701196,-2.8036,-2.595759,-0.256681,1.771217,0.146556,0.66908,2.698933,0.28687,...,0.274596,-0.162548,-0.066379,-1.0073,0.381069,0.307679,-0.1571,0.550337,-0.139428,0.158223
2,6.34878,-3.661005,-3.001991,-0.205058,-0.487784,1.396309,3.775856,-1.518023,2.298411,0.294357,...,0.293397,0.239663,-0.285015,0.002206,0.154415,-0.188736,-0.235095,0.117004,-0.131533,-0.047712
3,-3.97732,-0.653434,-4.695355,2.652876,-2.693426,-2.19706,-1.440089,-0.042601,-0.158821,-0.494799,...,-0.552558,-0.496398,-0.638031,0.048161,0.456548,-0.007385,-0.246985,-0.118537,0.321201,-0.056139
4,-9.388287,17.536073,2.996492,-6.265555,10.109628,5.318212,7.460991,-4.742807,7.150336,-9.810679,...,0.777824,-1.585909,-1.055587,0.904424,-0.68315,-1.001465,0.164643,-2.497156,-0.592113,-0.717085


In [81]:
X_train, X_test, y_train, y_test = train_test_split(principalComponents_x, y, test_size=0.2, random_state=42)

run_models("PCA", X_train, y_train)

Running PCA experiment
Trying SVC...
	Average score: 56.781%
Trying RandomForestClassifier...
	Average score: 55.250%
Trying KNeighborsClassifier...
	Average score: 52.697%
Trying MLPClassifier...
	Average score: 54.806%
Trying LogisticRegression...
	Average score: 57.603%
