In [143]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score
import numpy as np
from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
import joblib
import xlsxwriter
import os
import plotly.io as pio
pio.renderers.default = 'notebook'
import plotly.express as px

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import seaborn as sns

from scipy.cluster import hierarchy
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics import silhouette_score

# @changes from inna
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
import warnings
warnings.filterwarnings("ignore")

In [27]:
def load_data(folderPathTrain, folderPathTest):
    print('load new files')
    df_categorical = pd.read_excel(f'{folderPathTrain}/TRAIN_CATEGORICAL_METADATA_new.xlsx')
    df_matrices = pd.read_csv(f'{folderPathTrain}/TRAIN_FUNCTIONAL_CONNECTOME_MATRICES_new_36P_Pearson.csv')
    df_quant = pd.read_excel(f'{folderPathTrain}/TRAIN_QUANTITATIVE_METADATA_new.xlsx')
    df_solutions = pd.read_excel(f'{folderPathTrain}/TRAINING_SOLUTIONS.xlsx')  
    print('load test files')
    df_categorical_test = pd.read_excel(f'{folderPathTest}/TEST_CATEGORICAL.xlsx')
    df_matrices_test = pd.read_csv(f'{folderPathTest}/TEST_FUNCTIONAL_CONNECTOME_MATRICES.csv')
    df_quant_test = pd.read_excel(f'{folderPathTest}/TEST_QUANTITATIVE_METADATA.xlsx')
    return df_categorical,df_matrices,df_quant,df_solutions,df_categorical_test,df_matrices_test,df_quant_test

In [78]:
def join_data(categorical,matrices,quantitative,solutions=None):
    print('joining data frames')
    cat_quant = pd.merge(categorical, quantitative , on ='participant_id', how ='inner')
    cat_quant_mat = pd.merge(cat_quant, matrices , on ='participant_id', how ='inner')
    if isinstance(solutions, pd.DataFrame):
        cat_quant_mat_sols = pd.merge(cat_quant_mat, solutions , on ='participant_id', how ='inner')
        return cat_quant_mat_sols
    else:    
        return cat_quant_mat

In [29]:
def xgboost_classifer(objective = 'binary:logistic', max_depth=5,learning_rate=0.1,n_estimators=100):
    print('xgboost_classifer')
    # Initialize the base classifier
    classifier = XGBClassifier(objective=objective, \
                               n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)
    multioutput_classifier = MultiOutputClassifier(classifier)
    return multioutput_classifier

In [31]:
def predict(model,X_test):
    print('predict with the model')
    X_test_data  = X_test.drop(columns = ['participant_id'] )
    y_pred = model.predict(X_test_data)
    predictions_df = pd.DataFrame(
        y_pred,
        columns=['Predicted_Gender', 'Predicted_ADHD']
    )
    return predictions_df

In [32]:
def calculate_score(y_test,y_pred):
    print('calculate score with prediction vs true values')
    y_test_results  = y_test.drop(columns = ['participant_id'] )
    accuracy = accuracy_score(y_test_results, y_pred)
    print(f"Accuracy: {accuracy * 100:.2f}%")

In [33]:
def split_train_data(X,Y):
    print('split the train and test data')
    X_train_data, X_test_data, y_train_data, y_test_data = train_test_split(X, Y, test_size=0.2, random_state=42)
    return X_train_data, X_test_data, y_train_data, y_test_data

In [34]:
def multi_output_accuracy(y_true, y_pred):
    print('multi_output_accuracy')
    # Ensure y_true and y_pred are NumPy arrays
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    # Compute accuracy for each target variable and return the mean
    return np.mean([accuracy_score(y_true[:, i], y_pred[:, i]) for i in range(y_true.shape[1])])

In [35]:
def do_cross_validation(X,Y,model):
    # Perform cross-validation on the training data
    X_train_cv  = X.drop(columns = ['participant_id'] )
    y_train_cv  = Y.drop(columns = ['participant_id'] )
    # Create a scorer using scikit-learn's make_scorer
    multi_output_scorer = make_scorer(multi_output_accuracy)
    cv_scores = cross_val_score(model, X_train_cv, y_train_cv, cv=5, scoring=multi_output_scorer)
    
    # Output the cross-validation results
    print("Cross-validation scores for each fold:", cv_scores)
    print("Mean CV score:", f'Mean Accuracy: {np.mean(cv_scores) * 100:.2f}%')

In [256]:
def transform_matrices_data(df_matrices_new,n_components = 1000):
    print('starting pca analysis')
    print(df_matrices_new.shape)
    df_matrices_for_pca = df_matrices_new.drop(columns = ['participant_id'] )
    # PCA df with index preserved as index
    
    original_index = df_matrices_for_pca.index
    
    # 1. Standardize the data (excluding the first column)
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(df_matrices_for_pca)
    
    # 2. Apply PCA
    # Start with a smaller number of components for exploration
    pca = PCA(n_components=n_components)  # Adjust based on your needs
    pca_result = pca.fit_transform(scaled_data)
    
    # 3. Analyze explained variance
    explained_variance = pca.explained_variance_ratio_
    cumulative_variance = np.cumsum(explained_variance)
    
    # 5. Find number of components for desired variance (e.g., 80%)
    n_components_80 = np.argmax(cumulative_variance >= 0.8) + 1
    print(f"Number of components needed for 80% variance: {n_components_80}")
    
    # 6. Re-run PCA with the optimal number of components
    pca_final = PCA(n_components=n_components_80)
    pca_result_final = pca_final.fit_transform(scaled_data)
    
    # 7. Create a DataFrame with the PCA results
    pca_df = pd.DataFrame(
        data=pca_result_final,
        columns=[f'PC{i+1}' for i in range(n_components_80)],
        index=original_index
    )

    # 8 
    pca_df['participant_id'] = df_matrices_new['participant_id']
    
    return pca_df

In [72]:
def transform_quant_data(df_quant_new):
    print('starting quant data scaling')
    df_quant_scaled_dropped = df_quant_new.drop(columns = ['participant_id'] )
    df_quant_scaled_dropped = pd.DataFrame(df_quant_scaled_dropped)
    scaler = StandardScaler()
    df_quant_scaled = scaler.fit_transform(df_quant_scaled_dropped)
    df_quant_scaled = pd.DataFrame(df_quant_scaled)
    df_quant_scaled['participant_id'] = df_quant_new['participant_id']

    #select specific columns only for classifier
    df_quant_scaled_selected = df_quant_scaled.iloc[:,4:]
    return df_quant_scaled_selected

In [214]:
def transform_categorical_data(df_categorical_new):
    print('starting categorical data encoding')
    # One-Hot Encoding (nominal)
    # encoder with fixed categoried added for fixing unknown data in train vs test
    onehot_encoder = OneHotEncoder(categories = [[0,1,2,3]], handle_unknown='ignore', sparse_output=False, drop='first') #drop first to prevent multicollinearity
    #nominal_cols = ['MRI_Track_Scan_Location', 'Basic_Demos_Study_Site', 'PreInt_Demos_Fam_Child_Race', 'PreInt_Demos_Fam_Child_Ethnicity']
    nominal_cols = ['PreInt_Demos_Fam_Child_Ethnicity']

    
    onehot_encoded = onehot_encoder.fit_transform(df_categorical_new[nominal_cols])
    onehot_df = pd.DataFrame(onehot_encoded, columns=onehot_encoder.get_feature_names_out(nominal_cols))
    encoded_df = pd.concat([df_categorical_new, onehot_df], axis=1)
    encoded_df = encoded_df.drop(nominal_cols, axis=1)
    
    onehot_encoder = OneHotEncoder(categories = [[0,1,2,3,4,5,6,7,8,9,10,11]], handle_unknown='ignore', sparse_output=False, drop='first') #drop first to prevent multicollinearity
    nominal_cols = ['PreInt_Demos_Fam_Child_Race']
    onehot_encoded = onehot_encoder.fit_transform(df_categorical_new[nominal_cols])
    onehot_df = pd.DataFrame(onehot_encoded, columns=onehot_encoder.get_feature_names_out(nominal_cols))
    encoded_df = pd.concat([encoded_df, onehot_df], axis=1)
    encoded_df = encoded_df.drop(nominal_cols, axis=1)    

    onehot_encoder = OneHotEncoder(categories = [[0,1,2,3,4]], handle_unknown='ignore', sparse_output=False, drop='first') #drop first to prevent multicollinearity
    nominal_cols = ['MRI_Track_Scan_Location']
    onehot_encoded = onehot_encoder.fit_transform(df_categorical_new[nominal_cols])
    onehot_df = pd.DataFrame(onehot_encoded, columns=onehot_encoder.get_feature_names_out(nominal_cols))
    encoded_df = pd.concat([encoded_df, onehot_df], axis=1)
    encoded_df = encoded_df.drop(nominal_cols, axis=1)  
    
    # Handle NaN and 0.0 values (imputation example)
    encoded_df['Barratt_Barratt_P1_Edu'] = encoded_df['Barratt_Barratt_P1_Edu'].fillna(encoded_df['Barratt_Barratt_P1_Edu'].median())
    encoded_df['Barratt_Barratt_P2_Edu'] = encoded_df['Barratt_Barratt_P2_Edu'].fillna(encoded_df['Barratt_Barratt_P2_Edu'].median())
    
    encoded_df['Barratt_Barratt_P1_Edu'] = encoded_df['Barratt_Barratt_P1_Edu'].replace(0.0, encoded_df['Barratt_Barratt_P1_Edu'].median())
    encoded_df['Barratt_Barratt_P2_Edu'] = encoded_df['Barratt_Barratt_P2_Edu'].replace(0.0, encoded_df['Barratt_Barratt_P2_Edu'].median())

    # Ordinal Encoding (ordinal)
    ordinal_encoder = OrdinalEncoder(categories=[[ 3, 6, 9, 12, 15, 18, 21],[ 3, 6, 9, 12, 15, 18, 21] ])
    ordinal_cols = ['Barratt_Barratt_P1_Edu', 'Barratt_Barratt_P2_Edu']
    encoded_df[ordinal_cols] = ordinal_encoder.fit_transform(encoded_df[ordinal_cols])

    # Handle NaN and 0.0 values (imputation example)
    encoded_df['Barratt_Barratt_P1_Occ'] = encoded_df['Barratt_Barratt_P1_Occ'].fillna(encoded_df['Barratt_Barratt_P1_Occ'].median())
    encoded_df['Barratt_Barratt_P2_Occ'] = encoded_df['Barratt_Barratt_P2_Occ'].fillna(encoded_df['Barratt_Barratt_P2_Occ'].median())
    encoded_df['Barratt_Barratt_P1_Occ'] = encoded_df['Barratt_Barratt_P1_Occ'].replace(0.0, encoded_df['Barratt_Barratt_P1_Occ'].median())
    encoded_df['Barratt_Barratt_P2_Occ'] = encoded_df['Barratt_Barratt_P2_Occ'].replace(0.0, encoded_df['Barratt_Barratt_P2_Occ'].median())
    
    # Ordinal Encoding (ordinal)
    ordinal_encoder = OrdinalEncoder(categories=[[0, 5, 10, 15, 20, 25, 30, 35, 40, 45], [0, 5, 10, 15, 20, 25, 30, 35, 40, 45]])
    ordinal_cols = ['Barratt_Barratt_P1_Occ', 'Barratt_Barratt_P2_Occ']
    encoded_df[ordinal_cols] = ordinal_encoder.fit_transform(encoded_df[ordinal_cols])

    encoded_df_modified = encoded_df.drop(columns = ['Basic_Demos_Enroll_Year'])
    return encoded_df_modified

In [109]:
def xgboost_train(X = None, Y =None,max_depth= None,learning_rate = None,n_estimators = None):
    print('starting training')
    print('setting tuning params')
    classifier = xgboost_classifer(max_depth=max_depth,learning_rate=learning_rate,n_estimators=n_estimators)
    print('splitting to test and train')
    X_train_data, X_test_data, y_train_data, y_test_data = split_train_data(X, Y)
    
    print('training the model')
    X_train  = X_train_data.drop(columns = ['participant_id'] )
    y_train  = y_train_data.drop(columns = ['participant_id'] )
    classifier.fit(X_train, y_train)
    
    print('setting cross validation classifier with tuning params')
    classifier_cv = xgboost_classifer(max_depth=max_depth,learning_rate=learning_rate,n_estimators=n_estimators)
    print('start cross validation')
    do_cross_validation(X,Y,classifier_cv)
    
    print('check accuracy')
    y_pred = predict(classifier,X_test_data)
    print('calculate score')
    accuracy = calculate_score(y_test_data,y_pred)
    return classifier

In [40]:
def xgboost_test(classifier = None,X = None):
    print('start testing')
    Y = predict(classifier,X)
    return Y

## train

In [172]:
folderPathTrain, folderPathTest = 'Datafiles/TRAIN_NEW/' , 'Datafiles/TEST/'
df_categorical_train,df_matrices_train,df_quant_train,df_solutions_train,df_categorical_test,df_matrices_test,df_quant_test = load_data(folderPathTrain, folderPathTest)

load new files
load test files


In [262]:
pca_df_train = transform_matrices_data(df_matrices_train,n_components = 200)
quant_df_train = transform_quant_data(df_quant_train)
cat_df_train = transform_categorical_data(df_categorical_train)

starting pca analysis
(1213, 19901)
Number of components needed for 80% variance: 1
starting quant data scaling
starting categorical data encoding


In [263]:
# using df_matrices_train instead of pca_df_train
joined_training_data = join_data(cat_df_train,df_matrices_train,quant_df_train,df_solutions_train)
X = joined_training_data.drop(columns = ['ADHD_Outcome','Sex_F'] )
Y = joined_training_data[['participant_id','ADHD_Outcome','Sex_F']]

joining data frames


In [264]:
X.shape

(1213, 19937)

In [265]:
Y.count()

participant_id    1213
ADHD_Outcome      1213
Sex_F             1213
dtype: int64

In [266]:
classifier_trained = xgboost_train(X = X, Y =Y,max_depth= 5,learning_rate = 0.1,n_estimators = 100)

starting training
setting tuning params
xgboost_classifer
splitting to test and train
split the train and test data
training the model
setting cross validation classifier with tuning params
xgboost_classifer
start cross validation
multi_output_accuracy
multi_output_accuracy
multi_output_accuracy
multi_output_accuracy
multi_output_accuracy
Cross-validation scores for each fold: [0.73045267 0.76131687 0.70576132 0.74586777 0.7231405 ]
Mean CV score: Mean Accuracy: 73.33%
check accuracy
predict with the model
calculate score
calculate score with prediction vs true values
Accuracy: 59.26%


## test

In [284]:
df_matrices_test.shape

(304, 19901)

In [285]:
df_quant_test.shape

(304, 19)

In [286]:
df_categorical_test.shape

(304, 10)

In [267]:
pca_df_test = transform_matrices_data(df_matrices_test,n_components = 200)
quant_df_test = transform_quant_data(df_quant_test)
cat_df_test = transform_categorical_data(df_categorical_test)

starting pca analysis
(304, 19901)
Number of components needed for 80% variance: 162
starting quant data scaling
starting categorical data encoding


In [249]:
#cat_df_test.columns

In [268]:
cat_df_test.rename(columns={'MRI_Track_Scan_Location_1': 'MRI_Track_Scan_Location_1.0'}, inplace=True)
cat_df_test.rename(columns={'MRI_Track_Scan_Location_2': 'MRI_Track_Scan_Location_2.0'}, inplace=True)
cat_df_test.rename(columns={'MRI_Track_Scan_Location_3': 'MRI_Track_Scan_Location_3.0'}, inplace=True)
cat_df_test.rename(columns={'MRI_Track_Scan_Location_4': 'MRI_Track_Scan_Location_4.0'}, inplace=True)

In [269]:
# using df_matrices_new instead of pca_df_test
X_test = join_data(cat_df_test,df_matrices_test,quant_df_test)

joining data frames


In [270]:
#cat_df_train.columns

In [271]:
#cat_df_test.columns

In [272]:
#X_test.head(2)

In [273]:
#X_test.count()

In [274]:
#df_matrices_new.count()

In [275]:
#df_matrices_test.count()

In [276]:
set1 = set(list(X.columns))
set2 = set(list(X_test.columns))

In [277]:
difference1 = list(set1 - set2)
print(f"Elements in list1 but not in list2: {difference1}")

Elements in list1 but not in list2: []


In [278]:
#pca_df_train.count()

In [279]:
#pca_df_test.count()

In [300]:
Y_pred = xgboost_test(classifier = classifier_trained,X = X_test)

start testing
predict with the model


In [301]:
Y_pred.head(2)

Unnamed: 0,Predicted_Gender,Predicted_ADHD
0,1,0
1,0,0


In [302]:
Y_pred.shape

(304, 2)

In [303]:
Y_pred_final = pd.DataFrame()
Y_pred_final.insert(0, 'participant_id' , df_categorical_test['participant_id'])
Y_pred_final.insert(1, 'ADHD_Outcome' , Y_pred['Predicted_ADHD'])
Y_pred_final.insert(2, 'Sex_F' , Y_pred['Predicted_Gender'])

In [305]:
Y_pred_final.shape

(304, 3)

In [304]:
Y_pred_final.head(2)

Unnamed: 0,participant_id,ADHD_Outcome,Sex_F
0,Cfwaf5FX7jWK,0,1
1,vhGrzmvA3Hjq,0,0


In [307]:
Y_pred_final.to_excel('output.xlsx', sheet_name='Sheet1', index=False)