In [1]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score
import numpy as np
from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
import joblib
import xlsxwriter
import os
import plotly.io as pio
pio.renderers.default = 'notebook'
import plotly.express as px

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import seaborn as sns

from scipy.cluster import hierarchy
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics import silhouette_score

# @changes from inna
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
import warnings
warnings.filterwarnings("ignore")

In [39]:
def load_data(folderPathTrain, folderPathTest):
    """Read the training and test input files and print the shape of each.

    Parameters
    ----------
    folderPathTrain : str
        Folder holding the training categorical/quantitative Excel files,
        the connectome CSV and the solutions Excel file.
    folderPathTest : str
        Folder holding the test categorical/quantitative Excel files and
        the connectome CSV.

    Returns
    -------
    tuple
        (train categorical, train matrices, train quantitative, train
        solutions, test categorical, test matrices, test quantitative).
        Both connectome frames are read with their first column as index.
    """
    print('load new files')
    train_cat = pd.read_excel(f'{folderPathTrain}/TRAIN_CATEGORICAL_METADATA_new.xlsx')
    train_fc = pd.read_csv(
        f'{folderPathTrain}/TRAIN_FUNCTIONAL_CONNECTOME_MATRICES_new_36P_Pearson.csv',
        header=[0],
        index_col=[0],
    )
    train_quant = pd.read_excel(f'{folderPathTrain}/TRAIN_QUANTITATIVE_METADATA_new.xlsx')
    train_sol = pd.read_excel(f'{folderPathTrain}/TRAINING_SOLUTIONS.xlsx')

    train_report = (
        ('train categorical data count', train_cat),
        ('train quantitative data count', train_quant),
        ('train matrices data count', train_fc),
        ('train solutions data count', train_sol),
    )
    for caption, frame in train_report:
        print(caption, frame.shape)

    print('load test files')
    test_cat = pd.read_excel(f'{folderPathTest}/TEST_CATEGORICAL.xlsx')
    test_fc = pd.read_csv(
        f'{folderPathTest}/TEST_FUNCTIONAL_CONNECTOME_MATRICES.csv',
        header=[0],
        index_col=[0],
    )
    test_quant = pd.read_excel(f'{folderPathTest}/TEST_QUANTITATIVE_METADATA.xlsx')

    test_report = (
        ('test categorical data count', test_cat),
        ('test quantitative data count', test_quant),
        ('test matrices data count', test_fc),
    )
    for caption, frame in test_report:
        print(caption, frame.shape)

    return train_cat, train_fc, train_quant, train_sol, test_cat, test_fc, test_quant

In [5]:
def join_data(categorical,matrices,quantitative,solutions=None):
    """Inner-join the feature tables (and optionally the targets) on participant_id.

    Parameters
    ----------
    categorical, matrices, quantitative : pandas.DataFrame
        Feature tables that each expose a ``participant_id`` key.
    solutions : pandas.DataFrame, optional
        Target table; it is joined last, and only when it is a DataFrame.

    Returns
    -------
    pandas.DataFrame
        categorical joined with quantitative, then matrices, then (if given)
        solutions. Only participants present in every table are kept.
    """
    print('joining data frames')
    print('categorical data count', categorical.shape)
    print('quantitative data count', quantitative.shape)
    print('matrices data count', matrices.shape)

    merge_opts = {'on': 'participant_id', 'how': 'inner'}
    features = pd.merge(categorical, quantitative, **merge_opts)
    features = pd.merge(features, matrices, **merge_opts)

    # Without a solutions frame the feature table is the final result.
    if not isinstance(solutions, pd.DataFrame):
        return features

    with_targets = pd.merge(features, solutions, **merge_opts)
    print('solutions data count', solutions.shape)
    return with_targets

In [6]:
def xgboost_classifer(objective = 'binary:logistic', max_depth=5,learning_rate=0.1,n_estimators=100):
    """Build an unfitted multi-output classifier backed by XGBoost.

    The arguments are forwarded unchanged to ``XGBClassifier``; that base
    estimator is then wrapped in ``MultiOutputClassifier`` and returned.
    """
    print('xgboost_classifer')
    base_params = dict(
        objective=objective,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
    )
    return MultiOutputClassifier(XGBClassifier(**base_params))

In [7]:
def predict(model,X_test):
    """Run ``model.predict`` on X_test without its participant_id column.

    Returns a new DataFrame with columns ``ADHD_Outcome`` and ``Sex_F`` and a
    default integer index; row i holds the prediction for row i of X_test.
    """
    print('predict with the model')
    features = X_test.drop(columns=['participant_id'])
    return pd.DataFrame(model.predict(features), columns=['ADHD_Outcome', 'Sex_F'])

In [8]:
def calculate_score(y_test,y_pred):
    """Print and return the accuracy of y_pred against y_test.

    Parameters
    ----------
    y_test : pandas.DataFrame
        True targets including a ``participant_id`` column, which is dropped
        before scoring.
    y_pred : array-like
        Predicted targets in the same row and column order as the remaining
        y_test columns.

    Returns
    -------
    float
        Value of ``accuracy_score``. For multi-column targets scikit-learn
        documents this as subset accuracy (a row counts only when every
        column matches), so it is stricter than a mean of per-target
        accuracies.
    """
    print('calculate score with prediction vs true values')
    y_test_results  = y_test.drop(columns = ['participant_id'] )
    accuracy = accuracy_score(y_test_results, y_pred)
    print(f"Accuracy: {accuracy * 100:.2f}%")
    # Hand the value back so callers that assign the result get a number.
    return accuracy

In [9]:
def split_train_data(X,Y):
    """Split X and Y into an 80/20 train/test partition with a fixed seed.

    Returns the tuple (X_train, X_test, y_train, y_test).
    """
    print('split the train and test data')
    # random_state is fixed so repeated runs produce the same partition.
    return tuple(train_test_split(X, Y, test_size=0.2, random_state=42))

In [10]:
def multi_output_accuracy(y_true, y_pred):
    """Mean of the per-target accuracies for 2-D true/predicted labels.

    Both inputs are converted to NumPy arrays; each column is scored
    separately with ``accuracy_score`` and the scores are averaged.
    """
    print('multi_output_accuracy')
    truth = np.array(y_true)
    guess = np.array(y_pred)

    per_target = []
    for col in range(truth.shape[1]):
        per_target.append(accuracy_score(truth[:, col], guess[:, col]))
    return np.mean(per_target)

In [11]:
def do_cross_validation(X,Y,model):
    """Run 5-fold cross-validation of model and print the fold scores.

    The participant_id column is removed from both X and Y first. Folds are
    scored with ``multi_output_accuracy`` wrapped by ``make_scorer``. Nothing
    is returned; the per-fold scores and their mean are printed.
    """
    features = X.drop(columns=['participant_id'])
    targets = Y.drop(columns=['participant_id'])

    fold_scores = cross_val_score(
        model,
        features,
        targets,
        cv=5,
        scoring=make_scorer(multi_output_accuracy),
    )

    print("Cross-validation scores for each fold:", fold_scores)
    mean_pct = np.mean(fold_scores) * 100
    print("Mean CV score:", f'Mean Accuracy: {mean_pct:.2f}%')

In [40]:
def transform_matrices_data(df_matrices_new, n_components=200, scaler=None, pca=None,
                            return_transformers=False, random_state=42):
    """Standardize the connectome features and reduce them with PCA.

    Parameters
    ----------
    df_matrices_new : pandas.DataFrame
        Numeric feature table; its index is carried over to the result.
    n_components : int, default 200
        Number of principal components to keep when a new PCA is fitted.
        Ignored when ``pca`` is supplied.
    scaler : fitted StandardScaler, optional
        When given, it is applied with ``transform`` instead of fitting a new
        scaler. Pass the scaler fitted on the training data when transforming
        test data so both use the same statistics.
    pca : fitted PCA, optional
        When given, it is applied with ``transform`` instead of fitting a new
        PCA, so the output lives in the same component basis as the data the
        PCA was fitted on.
    return_transformers : bool, default False
        When True, return ``(pca_df, scaler, pca)`` so the fitted objects can
        be reused; otherwise only ``pca_df`` is returned.
    random_state : int or None, default 42
        Seed for a newly created PCA, so repeated fits give the same result
        if a randomized solver is selected.

    Returns
    -------
    pandas.DataFrame or tuple
        Frame with columns ``PC1..PCk`` indexed like the input, optionally
        followed by the scaler and PCA that produced it.
    """
    print('starting pca analysis')
    print(df_matrices_new.shape)

    if scaler is None:
        scaler = StandardScaler()
        scaled_data = scaler.fit_transform(df_matrices_new)
    else:
        scaled_data = scaler.transform(df_matrices_new)

    # A single PCA is fitted (or reused); n_components is honoured here.
    if pca is None:
        pca = PCA(n_components=n_components, random_state=random_state)
        pca_result = pca.fit_transform(scaled_data)
    else:
        pca_result = pca.transform(scaled_data)

    pca_df = pd.DataFrame(
        data=pca_result,
        # Name columns from the actual output width so a reused PCA also works.
        columns=[f'PC{i+1}' for i in range(pca_result.shape[1])],
        index=df_matrices_new.index
    )

    print(f"Original data shape: {df_matrices_new.shape}")
    print(f"Reduced data shape: {pca_df.shape}")

    if return_transformers:
        return pca_df, scaler, pca
    return pca_df

In [22]:
def transform_quant_data(df_quant_new):
    """Standardize the quantitative metadata and keep a positional subset.

    Every column except ``participant_id`` is scaled with a newly fitted
    ``StandardScaler``. The scaled array is wrapped without column names, so
    the scaled features are labelled by integer position (0, 1, 2, ...).
    ``participant_id`` is appended as the last column, and the first four
    scaled columns are then dropped by position.

    Parameters
    ----------
    df_quant_new : pandas.DataFrame
        Quantitative table containing a ``participant_id`` column.

    Returns
    -------
    pandas.DataFrame
        Scaled columns from position 4 onward plus ``participant_id``, with a
        default integer index.
    """
    print('starting quant data scaling')
    numeric_part = df_quant_new.drop(columns=['participant_id'])
    scaled = pd.DataFrame(StandardScaler().fit_transform(numeric_part))

    # Attach ids by position: `scaled` has a fresh RangeIndex, so an
    # index-aligned assignment would mismatch or blank the ids whenever the
    # input frame carries a non-default index.
    scaled['participant_id'] = df_quant_new['participant_id'].to_numpy()

    #select specific columns only for classifier
    return scaled.iloc[:, 4:]

In [23]:
def _one_hot_fixed_categories(source_df, column, categories):
    """One-hot encode one column against a fixed category list, dropping the first level."""
    encoder = OneHotEncoder(
        categories=[categories],
        handle_unknown='ignore',
        sparse_output=False,
        drop='first',  # drop first to prevent multicollinearity
    )
    encoded = encoder.fit_transform(source_df[[column]])
    # Name the columns explicitly as '<column>_<level as float>' so the labels
    # do not depend on whether the input column happens to be int or float.
    names = [f'{column}_{float(level)}' for level in categories[1:]]
    return pd.DataFrame(encoded, columns=names, index=source_df.index)


def _fill_missing_and_zero_with_median(series):
    """Replace NaN with the median, then replace 0.0 with the median of the filled series."""
    filled = series.fillna(series.median())
    return filled.replace(0.0, filled.median())


def transform_categorical_data(df_categorical_new):
    """Encode the categorical metadata into numeric model features.

    Steps, in order:
    1. One-hot encode ethnicity, race and scan location against fixed
       category lists (first level dropped) and remove the source columns.
       Values outside the list, including missing ones, become all-zero rows.
    2. For the two education and the two occupation columns, fill missing
       values and 0.0 with the column median, then ordinal-encode against a
       fixed level list.
    3. Drop ``Basic_Demos_Enroll_Year``.

    Parameters
    ----------
    df_categorical_new : pandas.DataFrame
        Raw categorical table; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Encoded table. One-hot columns are always named with a float suffix
        (for example ``MRI_Track_Scan_Location_1.0``) so train and test
        frames get identical column names.
    """
    print('starting categorical data encoding')
    # encoder with fixed categories added for fixing unknown data in train vs test
    nominal_specs = (
        ('PreInt_Demos_Fam_Child_Ethnicity', [0, 1, 2, 3]),
        ('PreInt_Demos_Fam_Child_Race', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
        ('MRI_Track_Scan_Location', [0, 1, 2, 3, 4]),
    )
    encoded_df = df_categorical_new
    for column, categories in nominal_specs:
        dummies = _one_hot_fixed_categories(df_categorical_new, column, categories)
        # dummies share the input index, so the concat is row-aligned.
        encoded_df = pd.concat([encoded_df, dummies], axis=1).drop(columns=[column])

    ordinal_specs = (
        (['Barratt_Barratt_P1_Edu', 'Barratt_Barratt_P2_Edu'],
         [3, 6, 9, 12, 15, 18, 21]),
        (['Barratt_Barratt_P1_Occ', 'Barratt_Barratt_P2_Occ'],
         [0, 5, 10, 15, 20, 25, 30, 35, 40, 45]),
    )
    for ordinal_cols, levels in ordinal_specs:
        # Handle NaN and 0.0 values (median imputation)
        for column in ordinal_cols:
            encoded_df[column] = _fill_missing_and_zero_with_median(encoded_df[column])
        # NOTE(review): the encoder only knows the listed levels, so an imputed
        # median that is not one of them is presumably rejected — confirm.
        ordinal_encoder = OrdinalEncoder(categories=[levels] * len(ordinal_cols))
        encoded_df[ordinal_cols] = ordinal_encoder.fit_transform(encoded_df[ordinal_cols])

    return encoded_df.drop(columns=['Basic_Demos_Enroll_Year'])

In [24]:
def xgboost_train(X = None, Y =None,max_depth= None,learning_rate = None,n_estimators = None):
    """Fit the multi-output XGBoost model and report its scores.

    X and Y must both contain a ``participant_id`` column. The data is split
    with ``split_train_data``; a classifier is fitted on the training part.
    A second, unfitted classifier with the same parameters is cross-validated
    on the full X/Y, and the fitted classifier is then scored on the held-out
    part. Returns the fitted classifier.
    """
    print('starting training')
    print('setting tuning params')
    tuning = dict(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators)
    classifier = xgboost_classifer(**tuning)

    print('splitting to test and train')
    X_fit, X_holdout, y_fit, y_holdout = split_train_data(X, Y)

    print('training the model')
    classifier.fit(
        X_fit.drop(columns=['participant_id']),
        y_fit.drop(columns=['participant_id']),
    )

    print('setting cross validation classifier with tuning params')
    cv_classifier = xgboost_classifer(**tuning)
    print('start cross validation')
    do_cross_validation(X, Y, cv_classifier)

    print('check accuracy')
    holdout_pred = predict(classifier, X_holdout)
    print('calculate score')
    calculate_score(y_holdout, holdout_pred)
    return classifier

In [25]:
def xgboost_test(classifier = None,X = None):
    """Predict targets for X with a fitted classifier by delegating to ``predict``."""
    print('start testing')
    return predict(classifier, X)

## train

In [41]:
# Input folders, relative to the notebook's working directory.
folderPathTrain = 'Datafiles/TRAIN_NEW/'
folderPathTest = 'Datafiles/TEST/'

# Load every train and test table in one call.
(
    df_categorical_train,
    df_matrices_train,
    df_quant_train,
    df_solutions_train,
    df_categorical_test,
    df_matrices_test,
    df_quant_test,
) = load_data(folderPathTrain, folderPathTest)

load new files
train categorical data count (1213, 10)
train quantitative data count (1213, 19)
train matrices data count (1213, 19900)
train solutions data count (1213, 3)
load test files
test categorical data count (304, 10)
test quantitative data count (304, 19)
test matrices data count (304, 19900)


In [64]:
# Summary statistics of the raw training categorical metadata, before any encoding.
df_categorical_train.describe()

Unnamed: 0,Basic_Demos_Enroll_Year,Basic_Demos_Study_Site,PreInt_Demos_Fam_Child_Ethnicity,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ
count,1213.0,1213.0,1170.0,1159.0,1210.0,1198.0,1182.0,1015.0,991.0
mean,2017.652102,2.014839,0.435897,2.177739,2.294215,17.86227,25.545685,16.876847,30.257316
std,1.122522,1.135147,0.693174,3.204782,0.75066,3.505608,16.757043,3.929558,13.901144
min,2015.0,1.0,0.0,0.0,1.0,3.0,0.0,3.0,0.0
25%,2017.0,1.0,0.0,0.0,2.0,15.0,5.0,15.0,20.0
50%,2018.0,1.0,0.0,1.0,2.0,18.0,30.0,18.0,35.0
75%,2019.0,3.0,1.0,2.0,3.0,21.0,40.0,21.0,40.0
max,2020.0,4.0,3.0,11.0,4.0,21.0,45.0,21.0,45.0


In [35]:
# Reads the training connectome CSV a second time, with the first column as index.
# NOTE(review): df_matrices_1 is not referenced by any other cell visible in this notebook — confirm it is still needed.
df_matrices_1 = pd.read_csv(f'{folderPathTrain}/TRAIN_FUNCTIONAL_CONNECTOME_MATRICES_new_36P_Pearson.csv',header=[0], index_col=[0])

In [42]:
# Peek at the first two connectome rows; the frame was loaded with its first column as the index.
df_matrices_train.head(2)

Unnamed: 0_level_0,0throw_1thcolumn,0throw_2thcolumn,0throw_3thcolumn,0throw_4thcolumn,0throw_5thcolumn,0throw_6thcolumn,0throw_7thcolumn,0throw_8thcolumn,0throw_9thcolumn,0throw_10thcolumn,...,195throw_196thcolumn,195throw_197thcolumn,195throw_198thcolumn,195throw_199thcolumn,196throw_197thcolumn,196throw_198thcolumn,196throw_199thcolumn,197throw_198thcolumn,197throw_199thcolumn,198throw_199thcolumn
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
70z8Q2xdTXM3,0.22293,0.527903,0.429966,0.060457,0.566489,0.315342,0.508408,-0.07829,0.525692,0.470063,...,0.224985,0.397448,0.422966,0.184642,0.305549,0.420349,0.016328,0.561864,0.47117,0.365221
WHWymJu6zNZi,0.614765,0.577255,0.496127,0.496606,0.404686,0.439724,0.12259,-0.085452,0.120673,0.27635,...,0.217546,-0.014549,0.00044,-0.096451,0.454501,0.343916,0.167313,0.607656,0.550623,0.503176


In [37]:
# NOTE(review): same expression as the preceding cell, with a lower execution count. The saved
# output shows participant_id as a regular column, so it looks like it was produced by an earlier
# load without index_col — confirm and consider removing this cell.
df_matrices_train.head(2)

Unnamed: 0,participant_id,0throw_1thcolumn,0throw_2thcolumn,0throw_3thcolumn,0throw_4thcolumn,0throw_5thcolumn,0throw_6thcolumn,0throw_7thcolumn,0throw_8thcolumn,0throw_9thcolumn,...,195throw_196thcolumn,195throw_197thcolumn,195throw_198thcolumn,195throw_199thcolumn,196throw_197thcolumn,196throw_198thcolumn,196throw_199thcolumn,197throw_198thcolumn,197throw_199thcolumn,198throw_199thcolumn
0,70z8Q2xdTXM3,0.22293,0.527903,0.429966,0.060457,0.566489,0.315342,0.508408,-0.07829,0.525692,...,0.224985,0.397448,0.422966,0.184642,0.305549,0.420349,0.016328,0.561864,0.47117,0.365221
1,WHWymJu6zNZi,0.614765,0.577255,0.496127,0.496606,0.404686,0.439724,0.12259,-0.085452,0.120673,...,0.217546,-0.014549,0.00044,-0.096451,0.454501,0.343916,0.167313,0.607656,0.550623,0.503176


In [43]:
# Build the three training feature tables: PCA-reduced connectome,
# standardized quantitative metadata, and encoded categorical metadata.
pca_df_train = transform_matrices_data(df_matrices_train,n_components = 200)
quant_df_train = transform_quant_data(df_quant_train)
cat_df_train = transform_categorical_data(df_categorical_train)

starting pca analysis
(1213, 19900)
Original data shape: (1213, 19900)
Reduced data shape: (1213, 200)
starting quant data scaling
starting categorical data encoding


In [65]:
# Summary statistics of the encoded training categorical features.
cat_df_train.describe()

Unnamed: 0,Basic_Demos_Study_Site,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ,PreInt_Demos_Fam_Child_Ethnicity_1.0,PreInt_Demos_Fam_Child_Ethnicity_2.0,PreInt_Demos_Fam_Child_Ethnicity_3.0,PreInt_Demos_Fam_Child_Race_1.0,PreInt_Demos_Fam_Child_Race_2.0,...,PreInt_Demos_Fam_Child_Race_6.0,PreInt_Demos_Fam_Child_Race_7.0,PreInt_Demos_Fam_Child_Race_8.0,PreInt_Demos_Fam_Child_Race_9.0,PreInt_Demos_Fam_Child_Race_10.0,PreInt_Demos_Fam_Child_Race_11.0,MRI_Track_Scan_Location_1.0,MRI_Track_Scan_Location_2.0,MRI_Track_Scan_Location_3.0,MRI_Track_Scan_Location_4.0
count,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,...,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0
mean,2.014839,4.954658,6.546579,4.686727,6.600165,0.244023,0.063479,0.016488,0.149217,0.105523,...,0.0,0.001649,0.160758,0.018961,0.009068,0.004946,0.147568,0.438582,0.381698,0.029678
std,1.135147,1.161293,1.710147,1.206062,2.064368,0.429684,0.243923,0.127395,0.356449,0.307353,...,0.0,0.040589,0.36746,0.136444,0.094835,0.070186,0.354818,0.496418,0.486004,0.169769
min,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,4.0,6.0,4.0,6.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,5.0,6.0,5.0,7.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,3.0,6.0,8.0,6.0,8.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
max,4.0,6.0,9.0,6.0,9.0,1.0,1.0,1.0,1.0,1.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [44]:
# Shapes of the three training feature tables before they are joined.
print('pca train', pca_df_train.shape)
print('cat train', cat_df_train.shape)
print('quant train', quant_df_train.shape)

pca train (1213, 200)
cat train (1213, 24)
quant train (1213, 15)


In [46]:
# Join the encoded categorical, PCA-reduced connectome, scaled quantitative
# and solutions tables into one training frame. To train on the raw connectome
# columns instead of the PCA components, pass df_matrices_train in place of
# pca_df_train.
joined_training_data = join_data(
    cat_df_train,
    pca_df_train,
    quant_df_train,
    df_solutions_train,
)

# X keeps participant_id plus every feature; Y keeps participant_id plus the two targets.
X = joined_training_data.drop(['ADHD_Outcome', 'Sex_F'], axis=1)
Y = joined_training_data.loc[:, ['participant_id', 'ADHD_Outcome', 'Sex_F']]

joining data frames
categorical data count (1213, 24)
quantitative data count (1213, 15)
matrices data count (1213, 200)
solutions data count (1213, 3)


In [48]:
# Rows and columns of the training feature frame (participant_id included).
X.shape

(1213, 238)

In [49]:
# Non-null counts per target column.
Y.count()

participant_id    1213
ADHD_Outcome      1213
Sex_F             1213
dtype: int64

In [50]:
# Fit the model and print both the cross-validation and the hold-out scores.
# The two printed numbers are different metrics: the cross-validation score averages the
# per-target accuracies (multi_output_accuracy), while the final "Accuracy" line comes from
# accuracy_score applied to both target columns at once.
# NOTE(review): scikit-learn documents accuracy_score as exact-match (subset) accuracy for
# multilabel input, which would explain the lower hold-out value — confirm.
classifier_trained = xgboost_train(X = X, Y =Y,max_depth= 5,learning_rate = 0.1,n_estimators = 100)

starting training
setting tuning params
xgboost_classifer
splitting to test and train
split the train and test data
training the model
setting cross validation classifier with tuning params
xgboost_classifer
start cross validation
multi_output_accuracy
multi_output_accuracy
multi_output_accuracy
multi_output_accuracy
multi_output_accuracy
Cross-validation scores for each fold: [0.72839506 0.76337449 0.72016461 0.74793388 0.72727273]
Mean CV score: Mean Accuracy: 73.74%
check accuracy
predict with the model
calculate score
calculate score with prediction vs true values
Accuracy: 57.20%


## test

In [51]:
# Build the three test feature tables with the same functions used for training.
# NOTE(review): these calls pass nothing that was fitted on the training data, so the scaler,
# PCA and encoders applied here are fitted on the test frames themselves. The resulting
# principal components are therefore not guaranteed to share a basis with the training ones.
pca_df_test = transform_matrices_data(df_matrices_test,n_components = 200)
quant_df_test = transform_quant_data(df_quant_test)
cat_df_test = transform_categorical_data(df_categorical_test)

starting pca analysis
(304, 19900)
Original data shape: (304, 19900)
Reduced data shape: (304, 200)
starting quant data scaling
starting categorical data encoding


In [52]:
# Shapes of the three test feature tables before they are joined.
print('pca test', pca_df_test.shape)
print('cat test', cat_df_test.shape)
print('quant test', quant_df_test.shape)

pca test (304, 200)
cat test (304, 24)
quant test (304, 15)


In [53]:
# Give the scan-location dummy columns the float-suffixed names used in the training frame.
# Columns that are not present are left untouched by rename.
cat_df_test.rename(
    columns={
        'MRI_Track_Scan_Location_1': 'MRI_Track_Scan_Location_1.0',
        'MRI_Track_Scan_Location_2': 'MRI_Track_Scan_Location_2.0',
        'MRI_Track_Scan_Location_3': 'MRI_Track_Scan_Location_3.0',
        'MRI_Track_Scan_Location_4': 'MRI_Track_Scan_Location_4.0',
    },
    inplace=True,
)

In [54]:
# Join the encoded categorical, PCA-reduced connectome and scaled quantitative test tables.
# Alternative kept for reference (raw connectome columns instead of the PCA components):
# X_test = join_data(cat_df_test,df_matrices_test,quant_df_test)
X_test = join_data(cat_df_test,pca_df_test,quant_df_test)

joining data frames
categorical data count (304, 24)
quantitative data count (304, 15)
matrices data count (304, 200)


In [55]:
# Rows and columns of the test feature frame (participant_id included).
X_test.shape

(304, 238)

In [56]:
# Compare the train and test feature columns in both directions: a column
# missing from either side means the model would see a different feature set.
set1 = set(list(X.columns))
set2 = set(list(X_test.columns))
difference1 = list(set1 - set2)
difference2 = list(set2 - set1)
print(f"Comparison of train vs test cols in list1 but not in list2: {difference1}")
print(f"Comparison of train vs test cols in list2 but not in list1: {difference2}")

Comparison of train vs test cols in list1 but not in list2: []


In [57]:
# Predict both targets for the test feature frame with the trained classifier.
Y_pred = xgboost_test(classifier = classifier_trained,X = X_test)

start testing
predict with the model


In [58]:
# First two predictions, before participant ids are attached.
Y_pred.head(2)

Unnamed: 0,ADHD_Outcome,Sex_F
0,1,0
1,1,0


In [59]:
# Attach the ids from X_test, the frame the predictions were computed from.
# Row i of Y_pred is the prediction for row i of X_test, so the ids are
# assigned by position. Taking them from X_test rather than the raw categorical
# table keeps ids and predictions in step even if the inner joins dropped or
# reordered participants. The guard makes the cell safe to re-run.
if 'participant_id' not in Y_pred.columns:
    Y_pred.insert(0, 'participant_id', X_test['participant_id'].to_numpy())

In [60]:
# First two predictions with participant ids attached.
Y_pred.head(2)

Unnamed: 0,participant_id,ADHD_Outcome,Sex_F
0,Cfwaf5FX7jWK,1,0
1,vhGrzmvA3Hjq,1,0


In [61]:
# Expect one row per test participant and three columns (id plus two targets).
Y_pred.shape

(304, 3)

In [62]:
# Write the predictions to output.csv in the working directory, without the index column.
Y_pred.to_csv('output.csv', index=False)