In [1]:
import pandas as pd


In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score
import numpy as np
from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
import joblib
import xlsxwriter
import os


In [3]:
import plotly.io as pio
pio.renderers.default = 'notebook'
import plotly.express as px

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import seaborn as sns

from scipy.cluster import hierarchy
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics import silhouette_score

In [4]:
def load_data_old():
    """Read the old training tables and the test tables from disk.

    Paths are relative to the current working directory (``TRAIN_OLD/`` and
    ``TEST/``).

    Returns:
        A 7-tuple, in this order: training categorical metadata, training
        connectome matrices, training quantitative metadata, training
        solutions, test categorical metadata, test connectome matrices and
        test quantitative metadata.
    """
    print('load old files')
    train_tables = (
        pd.read_excel('TRAIN_OLD/TRAIN_CATEGORICAL_METADATA.xlsx'),
        pd.read_csv('TRAIN_OLD/TRAIN_FUNCTIONAL_CONNECTOME_MATRICES.csv'),
        pd.read_excel('TRAIN_OLD/TRAIN_QUANTITATIVE_METADATA.xlsx'),
        pd.read_excel('TRAIN_OLD/TRAINING_SOLUTIONS.xlsx'),
    )
    print('load test files')
    test_tables = (
        pd.read_excel('TEST/TEST_CATEGORICAL.xlsx'),
        pd.read_csv('TEST/TEST_FUNCTIONAL_CONNECTOME_MATRICES.csv'),
        pd.read_excel('TEST/TEST_QUANTITATIVE_METADATA.xlsx'),
    )
    # Tuple concatenation keeps the original order: four training tables, then three test tables.
    return train_tables + test_tables

In [5]:
def load_data_new():
    """Read the new training tables and the test tables from disk.

    Paths are relative to the current working directory (``TRAIN_NEW/`` and
    ``TEST/``).

    Returns:
        A 7-tuple, in this order: training categorical metadata, training
        connectome matrices, training quantitative metadata, training
        solutions, test categorical metadata, test connectome matrices and
        test quantitative metadata.
    """
    print('load new files')
    train_tables = (
        pd.read_excel('TRAIN_NEW/TRAIN_CATEGORICAL_METADATA_new.xlsx'),
        pd.read_csv('TRAIN_NEW/TRAIN_FUNCTIONAL_CONNECTOME_MATRICES_new_36P_Pearson.csv'),
        pd.read_excel('TRAIN_NEW/TRAIN_QUANTITATIVE_METADATA_new.xlsx'),
        pd.read_excel('TRAIN_NEW/TRAINING_SOLUTIONS.xlsx'),
    )
    print('load test files')
    test_tables = (
        pd.read_excel('TEST/TEST_CATEGORICAL.xlsx'),
        pd.read_csv('TEST/TEST_FUNCTIONAL_CONNECTOME_MATRICES.csv'),
        pd.read_excel('TEST/TEST_QUANTITATIVE_METADATA.xlsx'),
    )
    # Tuple concatenation keeps the original order: four training tables, then three test tables.
    return train_tables + test_tables

In [6]:
def join_data(categorical,matrices,quantitative,solutions=None):
    """Inner-join the participant tables on ``participant_id``.

    The merge order is categorical -> quantitative -> matrices, followed by
    ``solutions`` when it is supplied as a DataFrame.

    Args:
        categorical: table with a ``participant_id`` column.
        matrices: table with a ``participant_id`` column.
        quantitative: table with a ``participant_id`` column.
        solutions: optional table with a ``participant_id`` column; anything
            that is not a ``pd.DataFrame`` (e.g. ``None``) is ignored.

    Returns:
        The merged DataFrame; only participants present in every merged table
        are kept (inner join).
    """
    print('joining data frames')
    joined = categorical
    for table in (quantitative, matrices):
        joined = pd.merge(joined, table, on='participant_id', how='inner')
    if isinstance(solutions, pd.DataFrame):
        joined = pd.merge(joined, solutions, on='participant_id', how='inner')
    return joined

In [7]:
def save_model(model,name,accuracy,y_test,y_pred):
    """Dump a model together with its evaluation artefacts to ``name`` via joblib.

    The file holds a dict with the keys ``'model'``, ``'y_test'``, ``'y_pred'``
    and ``'accuracy'``.
    """
    bundle = {
        'model': model,
        'y_test': y_test,
        'y_pred': y_pred,
        'accuracy': accuracy,
    }
    joblib.dump(bundle, name)
    print('saved the model')

def save_exploration(model,name):
    """Dump ``model`` to ``name`` via joblib, wrapped in a dict under the key ``'model'``."""
    wrapped = {'model': model}
    joblib.dump(wrapped, name)
    print('save_exploration')

def get_model(name):
    """Load and return whatever object joblib stored at ``name``; the object is also printed.

    NOTE: ``joblib.load`` unpickles the file, so only load files you trust.
    """
    loaded = joblib.load(name)
    print(loaded)
    return loaded

In [8]:
def loaddata_joinframes_splittestandtrain(df_categorical_new,df_matrices_new,df_quant_new,df_solutions_new):
    """Join the four training tables and split them into train/test features and targets.

    Features are every joined column except ``ADHD_Outcome`` and ``Sex_F``;
    targets are those two columns. ``participant_id`` is kept in both.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as produced by ``split_data``.
    """
    target_columns = ['ADHD_Outcome','Sex_F']
    merged = join_data(df_categorical_new,df_matrices_new,df_quant_new,df_solutions_new)
    features = merged.drop(columns=target_columns)
    targets = merged[['participant_id'] + target_columns]
    features_train, features_test, targets_train, targets_test = split_data(features, targets)
    return features_train, features_test, targets_train, targets_test

### Parts of the sections below are adapted from the datathon slides:
### https://colab.research.google.com/drive/1texL3JnRdTHyevP3_GzousIFKjTj0LmY#scrollTo=IGFYIQSmlUMb

In [9]:
def xgboost_classifer():
    """Build an unfitted multi-output classifier backed by XGBoost.

    Returns:
        A ``MultiOutputClassifier`` wrapping an ``XGBClassifier`` configured
        with a binary logistic objective, 100 estimators, learning rate 0.1
        and max depth 5.
    """
    print('xgboost_classifer')
    # Hyper-parameters of the base estimator that MultiOutputClassifier wraps.
    base_params = {
        'objective': 'binary:logistic',
        'n_estimators': 100,
        'learning_rate': 0.1,
        'max_depth': 5,
    }
    return MultiOutputClassifier(XGBClassifier(**base_params))

In [10]:
def train(model,X_train, y_train):
    """Fit ``model`` in place on the given training split.

    Args:
        model: estimator exposing ``fit(X, y)``.
        X_train: feature DataFrame containing a ``participant_id`` column.
        y_train: target DataFrame containing a ``participant_id`` column.

    Returns:
        None. ``model`` is fitted in place.
    """
    print('train the model')
    # Use the arguments rather than the notebook-level X_train_data / y_train_data
    # globals, so the function trains on what the caller actually passed in.
    features = X_train.drop(columns=['participant_id'])
    targets = y_train.drop(columns=['participant_id'])
    model.fit(features, targets)

In [11]:
def predict(model,X_test):
    """Predict both targets for ``X_test`` and return them as a labelled DataFrame.

    Args:
        model: fitted estimator exposing ``predict(X)`` that returns one
            column per target.
        X_test: feature DataFrame containing a ``participant_id`` column,
            which is dropped before predicting.

    Returns:
        DataFrame with the columns ``Predicted_ADHD`` and ``Predicted_Gender``
        and a fresh 0..n-1 index.
    """
    print('predict with the model')
    features = X_test.drop(columns=['participant_id'])
    y_pred = model.predict(features)
    # The notebook fits on targets ordered ['ADHD_Outcome', 'Sex_F'], so the first
    # prediction column is ADHD and the second is sex. The labels follow that order.
    predictions_df = pd.DataFrame(
        y_pred,
        columns=['Predicted_ADHD', 'Predicted_Gender']
    )
    return predictions_df

In [12]:
def calculate_score(y_test,y_pred):
    """Print and return the accuracy of ``y_pred`` against ``y_test``.

    Args:
        y_test: target DataFrame containing a ``participant_id`` column,
            which is dropped before scoring.
        y_pred: predictions with one column per target, in the same column
            order as the targets.

    Returns:
        The value returned by ``accuracy_score`` for the two tables.
    """
    print('calculate score with prediction vs true values')
    y_test_results = y_test.drop(columns=['participant_id'])
    accuracy = accuracy_score(y_test_results, y_pred)
    print(f"Accuracy: {accuracy * 100:.2f}%")
    # Callers assign the result and pass it on to save_model, so it must be returned.
    return accuracy

In [None]:
def split_data(X,Y):
    """Split features and targets into train and test parts.

    Uses ``train_test_split`` with ``test_size=0.2`` and ``random_state=42``.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    print('split the train and test data')
    parts = train_test_split(X, Y, test_size=0.2, random_state=42)
    features_train, features_test, targets_train, targets_test = parts
    return features_train, features_test, targets_train, targets_test

In [13]:
def multi_output_accuracy(y_true, y_pred):
    """Return the mean of the per-target accuracies.

    Both inputs are converted to NumPy arrays and indexed as 2-D
    (rows x targets); each target column is scored separately with
    ``accuracy_score`` and the scores are averaged.
    """
    truth = np.array(y_true)
    predicted = np.array(y_pred)
    per_target_scores = []
    for column in range(truth.shape[1]):
        per_target_scores.append(accuracy_score(truth[:, column], predicted[:, column]))
    return np.mean(per_target_scores)

In [14]:
def do_cross_validation(X,Y,model):
    """Run 5-fold cross-validation of ``model`` and print the fold scores and their mean.

    ``participant_id`` is dropped from both ``X`` and ``Y`` before scoring.
    Each fold is scored with ``multi_output_accuracy`` wrapped by ``make_scorer``.
    Nothing is returned.
    """
    id_column = ['participant_id']
    features = X.drop(columns=id_column)
    targets = Y.drop(columns=id_column)

    # Wrap the notebook's per-target mean accuracy so cross_val_score can use it.
    scorer = make_scorer(multi_output_accuracy)
    fold_scores = cross_val_score(model, features, targets, cv=5, scoring=scorer)

    print("Cross-validation scores for each fold:", fold_scores)
    mean_text = f'Mean Accuracy: {np.mean(fold_scores) * 100:.2f}%'
    print("Mean CV score:", mean_text)

## Train with old dataset (only for reference)

In [15]:
# Load the old training tables and the test tables (read from TRAIN_OLD/ and TEST/,
# relative to the working directory), then inner-join the training tables,
# including the solutions, on participant_id.
df_categorical,df_matrices,df_quant,df_solutions,df_categorical_test,df_matrices_test,df_quant_test = load_data_old()
joined_training_data = join_data(df_categorical,df_matrices,df_quant,df_solutions)

load old files


FileNotFoundError: [Errno 2] No such file or directory: 'TRAIN_OLD/TRAIN_CATEGORICAL_METADATA.xlsx'

In [None]:
# Preview the first two rows of the joined training table.
joined_training_data.head(2)

Unnamed: 0,participant_id,Basic_Demos_Enroll_Year,Basic_Demos_Study_Site,PreInt_Demos_Fam_Child_Ethnicity,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ,...,195throw_198thcolumn,195throw_199thcolumn,196throw_197thcolumn,196throw_198thcolumn,196throw_199thcolumn,197throw_198thcolumn,197throw_199thcolumn,198throw_199thcolumn,ADHD_Outcome,Sex_F
0,UmrK0vMLopoR,2016,1,0.0,0,1,21,45,21,45,...,-0.058396,-0.041544,0.142806,-0.006377,0.108005,0.148327,0.09323,-0.004984,1,1
1,CPaeQkhcjg7d,2019,3,1.0,2,3,15,15,0,0,...,-0.025624,-0.031863,0.162011,0.067439,0.017155,0.088893,0.064094,0.194381,1,0


In [None]:
# Feature table: every joined column except the two targets (participant_id is kept).
X = joined_training_data.drop(columns = ['ADHD_Outcome','Sex_F'] )

In [None]:
# Preview the first two feature rows.
X.head(2)

Unnamed: 0,participant_id,Basic_Demos_Enroll_Year,Basic_Demos_Study_Site,PreInt_Demos_Fam_Child_Ethnicity,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ,...,195throw_196thcolumn,195throw_197thcolumn,195throw_198thcolumn,195throw_199thcolumn,196throw_197thcolumn,196throw_198thcolumn,196throw_199thcolumn,197throw_198thcolumn,197throw_199thcolumn,198throw_199thcolumn
0,UmrK0vMLopoR,2016,1,0.0,0,1,21,45,21,45,...,-0.03763,-0.072599,-0.058396,-0.041544,0.142806,-0.006377,0.108005,0.148327,0.09323,-0.004984
1,CPaeQkhcjg7d,2019,3,1.0,2,3,15,15,0,0,...,0.014106,-0.001084,-0.025624,-0.031863,0.162011,0.067439,0.017155,0.088893,0.064094,0.194381


In [None]:
# Target table: the two outcome columns, with participant_id kept alongside them.
Y = joined_training_data[['participant_id','ADHD_Outcome','Sex_F']]

In [None]:
# Preview the first two target rows.
Y.head(2)

Unnamed: 0,participant_id,ADHD_Outcome,Sex_F
0,UmrK0vMLopoR,1,1
1,CPaeQkhcjg7d,1,0


In [None]:
# Hold-out split via split_data (test_size=0.2, random_state=42).
X_train_data, X_test_data, y_train_data, y_test_data = split_data(X, Y)

split the train and test data


## training with old dataset (only for reference)

In [None]:
# Fresh, unfitted multi-output XGBoost classifier.
classifier = xgboost_classifer()

xgboost_classifer


In [None]:
# Fit the classifier on the training split (fitted in place; train returns nothing).
train(classifier,X_train_data, y_train_data)

train the model


In [None]:
# Predict both targets for the held-out rows.
y_pred = predict(classifier,X_test_data)

predict with the model


In [None]:
# Print the hold-out accuracy. `accuracy` receives whatever calculate_score returns
# and is later handed to save_model.
accuracy = calculate_score(y_test_data,y_pred)

calculate score with prediction vs true values
Accuracy: 56.38%


In [None]:
# 5-fold cross-validation on the full (unsplit) table, using a separate unfitted classifier.
classifier_cv = xgboost_classifer()
do_cross_validation(X,Y,classifier_cv)

xgboost_classifer
Cross-validation scores for each fold: [0.81893004 0.78600823 0.69753086 0.66322314 0.32644628]
Mean CV score: 0.6584277114580145


In [None]:
# Persist the fitted classifier together with the hold-out targets, predictions and accuracy.
save_model(classifier,'old_data_xgb_basic_nochanges',accuracy,y_test_data,y_pred)

saved the model


In [None]:
#get_model('old_data_xgb_basic_nochanges')

##  ****** training with new dataset ******

## run this step once to load new dataset

In [None]:
# Load the new training tables and the test tables (read from TRAIN_NEW/ and TEST/,
# relative to the working directory).
df_categorical_new,df_matrices_new,df_quant_new,df_solutions_new,df_categorical_test,df_matrices_test,df_quant_test = load_data_new()

load new files
load test files


In [None]:
# Preview the first two rows of the new connectome matrices table.
df_matrices_new.head(2)

Unnamed: 0,participant_id,0throw_1thcolumn,0throw_2thcolumn,0throw_3thcolumn,0throw_4thcolumn,0throw_5thcolumn,0throw_6thcolumn,0throw_7thcolumn,0throw_8thcolumn,0throw_9thcolumn,...,195throw_196thcolumn,195throw_197thcolumn,195throw_198thcolumn,195throw_199thcolumn,196throw_197thcolumn,196throw_198thcolumn,196throw_199thcolumn,197throw_198thcolumn,197throw_199thcolumn,198throw_199thcolumn
0,70z8Q2xdTXM3,0.22293,0.527903,0.429966,0.060457,0.566489,0.315342,0.508408,-0.07829,0.525692,...,0.224985,0.397448,0.422966,0.184642,0.305549,0.420349,0.016328,0.561864,0.47117,0.365221
1,WHWymJu6zNZi,0.614765,0.577255,0.496127,0.496606,0.404686,0.439724,0.12259,-0.085452,0.120673,...,0.217546,-0.014549,0.00044,-0.096451,0.454501,0.343916,0.167313,0.607656,0.550623,0.503176


In [None]:
# Preview the first two rows of the new categorical metadata table.
df_categorical_new.head(2)

Unnamed: 0,participant_id,Basic_Demos_Enroll_Year,Basic_Demos_Study_Site,PreInt_Demos_Fam_Child_Ethnicity,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ
0,00aIpNTbG5uh,2019,4,1.0,0.0,3.0,21.0,45.0,,
1,00fV0OyyoLfw,2017,1,0.0,9.0,2.0,21.0,0.0,21.0,45.0


In [None]:
# Preview the first two rows of the new quantitative metadata table.
df_quant_new.head(2)

Unnamed: 0,participant_id,EHQ_EHQ_Total,ColorVision_CV_Score,APQ_P_APQ_P_CP,APQ_P_APQ_P_ID,APQ_P_APQ_P_INV,APQ_P_APQ_P_OPD,APQ_P_APQ_P_PM,APQ_P_APQ_P_PP,SDQ_SDQ_Conduct_Problems,SDQ_SDQ_Difficulties_Total,SDQ_SDQ_Emotional_Problems,SDQ_SDQ_Externalizing,SDQ_SDQ_Generating_Impact,SDQ_SDQ_Hyperactivity,SDQ_SDQ_Internalizing,SDQ_SDQ_Peer_Problems,SDQ_SDQ_Prosocial,MRI_Track_Age_at_Scan
0,00aIpNTbG5uh,100.0,13.0,3.0,15.0,44.0,14.0,20.0,27.0,3.0,17.0,4.0,11.0,5.0,8.0,6.0,2.0,9.0,14.274127
1,00fV0OyyoLfw,92.27,14.0,3.0,12.0,35.0,25.0,28.0,30.0,5.0,20.0,4.0,13.0,5.0,8.0,7.0,3.0,8.0,


In [None]:
# Preview the first two rows of the new solutions (targets) table.
df_solutions_new.head(2)

Unnamed: 0,participant_id,ADHD_Outcome,Sex_F
0,UmrK0vMLopoR,1,1
1,CPaeQkhcjg7d,1,0


## train raw data with basic xgb classifier and verify accuracy

In [None]:
# Baseline on the new dataset: join the raw tables, split, fit, score, cross-validate, save.
joined_training_data = join_data(df_categorical_new,df_matrices_new,df_quant_new,df_solutions_new)
# Features keep participant_id; the two target columns are removed.
X = joined_training_data.drop(columns = ['ADHD_Outcome','Sex_F'] )
# Targets keep participant_id alongside the two outcome columns.
Y = joined_training_data[['participant_id','ADHD_Outcome','Sex_F']]
X_train_data, X_test_data, y_train_data, y_test_data = split_data(X, Y)
# Hold-out evaluation on the 20% test split.
classifier = xgboost_classifer()
train(classifier,X_train_data, y_train_data)
y_pred = predict(classifier,X_test_data)
accuracy = calculate_score(y_test_data,y_pred)
# Cross-validation uses a separate, unfitted classifier on the full table.
classifier_cv = xgboost_classifer()
do_cross_validation(X,Y,classifier_cv)
# Persist the hold-out classifier with its evaluation artefacts.
save_model(classifier,'new_data_xgb_basic_nochanges',accuracy,y_test_data,y_pred)

split the train and test data
xgboost_classifer
train the model


## Train with pca data instead of raw connectome matrices data

In [None]:
# PCA of the new connectome matrices. The row index is preserved and participant_id
# is carried along as a column so the result can be merged with the other tables.

# participant_id is an identifier, not a feature: set it aside and scale/decompose
# only the connectome columns.
participant_ids = df_matrices_new['participant_id']
connectome_features = df_matrices_new.drop(columns=['participant_id'])
original_index = df_matrices_new.index

# 1. Standardize the data (excluding the participant_id column)
scaler = StandardScaler()
scaled_data = scaler.fit_transform(connectome_features)

# 2. Apply PCA
# Start with a smaller number of components for exploration.
# PCA cannot extract more components than min(n_samples, n_features), so cap the request.
n_components_explore = min(1000, *scaled_data.shape)
# Fixed random_state keeps the decomposition reproducible if a randomized solver is chosen.
pca = PCA(n_components=n_components_explore, random_state=42)
pca_result = pca.fit_transform(scaled_data)

# 3. Analyze explained variance
explained_variance = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)

# 4. Plot the explained variance to help choose number of components
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o', linestyle='-')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Explained Variance vs. Number of Components')
plt.grid(True)
plt.axhline(y=0.8, color='r', linestyle='-', label='80% Variance Threshold')
plt.legend()
plt.show()

# 5. Find number of components for desired variance (e.g., 80%)
reaches_80 = cumulative_variance >= 0.8
if reaches_80.any():
    n_components_80 = int(np.argmax(reaches_80)) + 1
else:
    # np.argmax on an all-False mask returns 0, which would silently select a single
    # component; keep every explored component instead and say so.
    n_components_80 = len(cumulative_variance)
    print(f"80% variance not reached with {n_components_explore} components; keeping all of them")
print(f"Number of components needed for 80% variance: {n_components_80}")

# 6. Re-run PCA with the optimal number of components
pca_final = PCA(n_components=n_components_80, random_state=42)
pca_result_final = pca_final.fit_transform(scaled_data)

# 7. Create a DataFrame with the PCA results
pca_df = pd.DataFrame(
    data=pca_result_final,
    columns=[f'PC{i+1}' for i in range(n_components_80)],
    index=original_index
)

# 8. Add participant_id back as the first column: join_data merges on that column.
pca_df.insert(0, 'participant_id', participant_ids)

# 9. Now you can use pca_df for your machine learning models
print(f"Original data shape: {df_matrices_new.shape}")
print(f"Reduced data shape: {pca_df.shape}")

NameError: name 'df_matrices_new' is not defined

In [None]:
# Train and evaluate on the PCA components instead of the raw connectome columns.
# Author's notes / TODO:
#   - save the PCA results with joblib via save_exploration(model, name)
#   - load the saved PCA results here instead of using df_matrices_new (keyed by participant_id)
# NOTE(review): join_data merges on 'participant_id', so pca_df must expose that column.

joined_training_data = join_data(df_categorical_new,pca_df,df_quant_new,df_solutions_new)
X = joined_training_data.drop(columns = ['ADHD_Outcome','Sex_F'] )
Y = joined_training_data[['participant_id','ADHD_Outcome','Sex_F']]
X_train_data, X_test_data, y_train_data, y_test_data = split_data(X, Y)

# Hold-out evaluation, then cross-validation with a separate unfitted classifier.
classifier = xgboost_classifer()
train(classifier,X_train_data, y_train_data)
y_pred = predict(classifier,X_test_data)
accuracy = calculate_score(y_test_data,y_pred)
classifier_cv = xgboost_classifer()
do_cross_validation(X,Y,classifier_cv)
save_model(classifier,'new_data_xgb_basic_pca',accuracy,y_test_data,y_pred)

In [None]:
# Hand-picked connectome columns for a reduced-feature experiment.
selected_matrix_columns = ['96throw_194thcolumn','14throw_15thcolumn','19throw_122thcolumn', '26throw_129thcolumn']

# participant_id has to stay in the frame because join_data merges on it.
# The columns are taken from the new dataset, matching the 'new_data_...' name
# under which the model trained on them is saved.
sub_matrices_df = df_matrices_new[['participant_id'] + selected_matrix_columns]

# Show a small preview instead of printing the whole frame.
sub_matrices_df.head()


In [None]:
# Train and evaluate on the selected connectome columns only.
# The metadata and solutions come from the new dataset: this cell sits in the
# new-dataset section and saves its model under a 'new_data_...' name.
joined_training_data = join_data(df_categorical_new,sub_matrices_df,df_quant_new,df_solutions_new)
X = joined_training_data.drop(columns = ['ADHD_Outcome','Sex_F'] )
Y = joined_training_data[['participant_id','ADHD_Outcome','Sex_F']]
X_train_data, X_test_data, y_train_data, y_test_data = split_data(X, Y)

# Hold-out evaluation, then cross-validation with a separate unfitted classifier.
classifier = xgboost_classifer()
train(classifier,X_train_data, y_train_data)
y_pred = predict(classifier,X_test_data)
accuracy = calculate_score(y_test_data,y_pred)
classifier_cv = xgboost_classifer()
do_cross_validation(X,Y,classifier_cv)
save_model(classifier,'new_data_xgb_basic_select_matrice_col',accuracy,y_test_data,y_pred)

## Quantitative data modified