In [1]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score
import numpy as np
from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
import joblib

In [3]:
def load_data_old():
    print('load old files')
    df_categorical = pd.read_excel('TRAIN_OLD/TRAIN_CATEGORICAL_METADATA.xlsx')
    df_matrices = pd.read_csv('TRAIN_OLD/TRAIN_FUNCTIONAL_CONNECTOME_MATRICES.csv')
    df_quant = pd.read_excel('TRAIN_OLD/TRAIN_QUANTITATIVE_METADATA.xlsx')
    df_solutions = pd.read_excel('TRAIN_OLD/TRAINING_SOLUTIONS.xlsx')  
    print('load test files')
    df_categorical_test = pd.read_excel('TEST/TEST_CATEGORICAL.xlsx')
    df_matrices_test = pd.read_csv('TEST/TEST_FUNCTIONAL_CONNECTOME_MATRICES.csv')
    df_quant_test = pd.read_excel('TEST/TEST_QUANTITATIVE_METADATA.xlsx')
    return df_categorical,df_matrices,df_quant,df_solutions,df_categorical_test,df_matrices_test,df_quant_test

In [38]:
def load_data_new():
    print('load new files')
    df_categorical = pd.read_excel('TRAIN_NEW/TRAIN_CATEGORICAL_METADATA_new.xlsx')
    df_matrices = pd.read_csv('TRAIN_NEW/TRAIN_FUNCTIONAL_CONNECTOME_MATRICES_new_36P_Pearson.csv')
    df_quant = pd.read_excel('TRAIN_NEW/TRAIN_QUANTITATIVE_METADATA_new.xlsx')
    df_solutions = pd.read_excel('TRAIN_NEW/TRAINING_SOLUTIONS.xlsx')  
    print('load test files')
    df_categorical_test = pd.read_excel('TEST/TEST_CATEGORICAL.xlsx')
    df_matrices_test = pd.read_csv('TEST/TEST_FUNCTIONAL_CONNECTOME_MATRICES.csv')
    df_quant_test = pd.read_excel('TEST/TEST_QUANTITATIVE_METADATA.xlsx')
    return df_categorical,df_matrices,df_quant,df_solutions,df_categorical_test,df_matrices_test,df_quant_test

In [5]:
def join_data(categorical,matrices,quantitative,solutions=None):
    print('joining data frames')
    cat_quant = pd.merge(categorical, quantitative , on ='participant_id', how ='inner')
    cat_quant_mat = pd.merge(cat_quant, matrices , on ='participant_id', how ='inner')
    if isinstance(solutions, pd.DataFrame):
        cat_quant_mat_sols = pd.merge(cat_quant_mat, solutions , on ='participant_id', how ='inner')
        return cat_quant_mat_sols
    else:    
        return cat_quant_mat

In [34]:
def save_model(model,name,accuracy,y_test,y_pred):
    joblib.dump({'model': model, 'y_test': y_test,'y_pred': y_pred, 'accuracy': accuracy}, name)
    print('saved the model')

def save_exploration(model,name):
    joblib.dump({'model': model}, name)
    print('save_exploration')

def get_model(name):
    model = joblib.load(name)
    print(model)
    return model

In [35]:
def loaddata_joinframes_splittestandtrain(df_categorical_new,df_matrices_new,df_quant_new,df_solutions_new):
    joined_training_data = join_data(df_categorical_new,df_matrices_new,df_quant_new,df_solutions_new)
    X = joined_training_data.drop(columns = ['ADHD_Outcome','Sex_F'] )
    Y = joined_training_data[['participant_id','ADHD_Outcome','Sex_F']]
    X_train_data, X_test_data, y_train_data, y_test_data = split_data(X, Y)
    return X_train_data, X_test_data, y_train_data, y_test_data

### Reference for parts of below sections taken from datathon slides
### https://colab.research.google.com/drive/1texL3JnRdTHyevP3_GzousIFKjTj0LmY#scrollTo=IGFYIQSmlUMb

In [14]:
def xgboost_classifer():
    print('xgboost_classifer')
    # Initialize the base classifier
    classifier = XGBClassifier(objective='binary:logistic', n_estimators=100, learning_rate=0.1, max_depth=5)
    multioutput_classifier = MultiOutputClassifier(classifier)
    return multioutput_classifier

In [15]:
def train(model,X_train, y_train):
    print('train the model')
    X_train  = X_train_data.drop(columns = ['participant_id'] )
    y_train  = y_train_data.drop(columns = ['participant_id'] )
    model.fit(X_train, y_train)

In [16]:
def predict(model,X_test):
    print('predict with the model')
    X_test_data  = X_test.drop(columns = ['participant_id'] )
    y_pred = model.predict(X_test_data)
    predictions_df = pd.DataFrame(
        y_pred,
        columns=['Predicted_Gender', 'Predicted_ADHD']
    )
    return predictions_df

In [17]:
def calculate_score(y_test,y_pred):
    print('calculate score with prediction vs true values')
    y_test_results  = y_test.drop(columns = ['participant_id'] )
    accuracy = accuracy_score(y_test_results, y_pred)
    print(f"Accuracy: {accuracy * 100:.2f}%")

In [18]:
def split_data(X,Y):
    print('split the train and test data')
    X_train_data, X_test_data, y_train_data, y_test_data = train_test_split(X, Y, test_size=0.2, random_state=42)
    return X_train_data, X_test_data, y_train_data, y_test_data

In [31]:
def multi_output_accuracy(y_true, y_pred):
    # Ensure y_true and y_pred are NumPy arrays
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    # Compute accuracy for each target variable and return the mean
    return np.mean([accuracy_score(y_true[:, i], y_pred[:, i]) for i in range(y_true.shape[1])])

In [32]:
def do_cross_validation(X,Y,model):
    # Perform cross-validation on the training data
    X_train_cv  = X.drop(columns = ['participant_id'] )
    y_train_cv  = Y.drop(columns = ['participant_id'] )
    # Create a scorer using scikit-learn's make_scorer
    multi_output_scorer = make_scorer(multi_output_accuracy)
    cv_scores = cross_val_score(model, X_train_cv, y_train_cv, cv=5, scoring=multi_output_scorer)
    
    # Output the cross-validation results
    print("Cross-validation scores for each fold:", cv_scores)
    print("Mean CV score:", np.mean(cv_scores))

## start

In [7]:
df_categorical,df_matrices,df_quant,df_solutions,df_categorical_test,df_matrices_test,df_quant_test = load_data_old()
joined_training_data = join_data(df_categorical,df_matrices,df_quant,df_solutions)

In [8]:
joined_training_data.head(2)

Unnamed: 0,participant_id,Basic_Demos_Enroll_Year,Basic_Demos_Study_Site,PreInt_Demos_Fam_Child_Ethnicity,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ,...,195throw_198thcolumn,195throw_199thcolumn,196throw_197thcolumn,196throw_198thcolumn,196throw_199thcolumn,197throw_198thcolumn,197throw_199thcolumn,198throw_199thcolumn,ADHD_Outcome,Sex_F
0,UmrK0vMLopoR,2016,1,0.0,0,1,21,45,21,45,...,-0.058396,-0.041544,0.142806,-0.006377,0.108005,0.148327,0.09323,-0.004984,1,1
1,CPaeQkhcjg7d,2019,3,1.0,2,3,15,15,0,0,...,-0.025624,-0.031863,0.162011,0.067439,0.017155,0.088893,0.064094,0.194381,1,0


In [9]:
X = joined_training_data.drop(columns = ['ADHD_Outcome','Sex_F'] )

In [10]:
X.head(2)

Unnamed: 0,participant_id,Basic_Demos_Enroll_Year,Basic_Demos_Study_Site,PreInt_Demos_Fam_Child_Ethnicity,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ,...,195throw_196thcolumn,195throw_197thcolumn,195throw_198thcolumn,195throw_199thcolumn,196throw_197thcolumn,196throw_198thcolumn,196throw_199thcolumn,197throw_198thcolumn,197throw_199thcolumn,198throw_199thcolumn
0,UmrK0vMLopoR,2016,1,0.0,0,1,21,45,21,45,...,-0.03763,-0.072599,-0.058396,-0.041544,0.142806,-0.006377,0.108005,0.148327,0.09323,-0.004984
1,CPaeQkhcjg7d,2019,3,1.0,2,3,15,15,0,0,...,0.014106,-0.001084,-0.025624,-0.031863,0.162011,0.067439,0.017155,0.088893,0.064094,0.194381


In [11]:
Y = joined_training_data[['participant_id','ADHD_Outcome','Sex_F']]

In [12]:
Y.head(2)

Unnamed: 0,participant_id,ADHD_Outcome,Sex_F
0,UmrK0vMLopoR,1,1
1,CPaeQkhcjg7d,1,0


In [19]:
X_train_data, X_test_data, y_train_data, y_test_data = split_data(X, Y)

split the train and test data


## training

In [20]:
classifier = xgboost_classifer()

xgboost_classifer


In [21]:
train(classifier,X_train_data, y_train_data)

train the model


In [22]:
y_pred = predict(classifier,X_test_data)

predict with the model


In [24]:
accuracy = calculate_score(y_test_data,y_pred)

calculate score with prediction vs true values
Accuracy: 56.38%


In [33]:
classifier_cv = xgboost_classifer()
do_cross_validation(X,Y,classifier_cv)

xgboost_classifer
Cross-validation scores for each fold: [0.81893004 0.78600823 0.69753086 0.66322314 0.32644628]
Mean CV score: 0.6584277114580145


In [25]:
save_model(classifier,'old_data_xgb_basic_nochanges',accuracy,y_test_data,y_pred)

saved the model


In [27]:
#get_model('old_data_xgb_basic_nochanges')

## training with new dataset

In [None]:
df_categorical_new,df_matrices_new,df_quant_new,df_solutions_new,df_categorical_test,df_matrices_test,df_quant_test = load_data_new()

X_train_data, X_test_data, y_train_data, y_test_data = \
    loaddata_joinframes_splittestandtrain (df_categorical_new,df_matrices_new,df_quant_new,df_solutions_new)

classifier = xgboost_classifer()
train(classifier,X_train_data, y_train_data)
y_pred = predict(classifier,X_test_data)
accuracy = calculate_score(y_test_data,y_pred)
classifier_cv = xgboost_classifer()
do_cross_validation(X,Y,classifier_cv)
save_model(classifier,'new_data_xgb_basic_nochanges',accuracy,y_test_data,y_pred)

load new files
load test files
split the train and test data
xgboost_classifer
train the model


## PCA

In [None]:
df_categorical_new,df_matrices_new,df_quant_new,df_solutions_new,df_categorical_test,df_matrices_test,df_quant_test = load_data_new()

## pca calculation
## pca results save to joblib  def save_exploration(model,name):
## pca model load >> df_matrices_new [participant_id ]

X_train_data, X_test_data, y_train_data, y_test_data = \
    loaddata_joinframes_splittestandtrain (df_categorical_new,df_matrices_new,df_quant_new,df_solutions_new)

classifier = xgboost_classifer()
train(classifier,X_train_data, y_train_data)
y_pred = predict(classifier,X_test_data)
accuracy = calculate_score(y_test_data,y_pred)
classifier_cv = xgboost_classifer()
do_cross_validation(X,Y,classifier_cv)
save_model(classifier,'new_data_xgb_basic_pca',accuracy,y_test_data,y_pred)

## quantitivate data modified