In [26]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score
import numpy as np
from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
import joblib
import xlsxwriter
import os
import plotly.io as pio
pio.renderers.default = 'notebook'
import plotly.express as px

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import seaborn as sns

from scipy.cluster import hierarchy
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics import silhouette_score

# @changes from inna
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [27]:
def load_data(folderPathTrain, folderPathTest):
    """Read the training and test tables from their folders.

    Parameters
    ----------
    folderPathTrain : str
        Folder containing the four training files (categorical metadata,
        connectome matrices, quantitative metadata, solutions).
    folderPathTest : str
        Folder containing the three test files (categorical metadata,
        connectome matrices, quantitative metadata).

    Returns
    -------
    tuple
        Seven DataFrames in the order: train categorical, train matrices,
        train quantitative, train solutions, test categorical, test matrices,
        test quantitative.
    """
    print('load new files')
    # Tuple elements are evaluated left to right, so the files are read in the listed order.
    train_frames = (
        pd.read_excel(f'{folderPathTrain}/TRAIN_CATEGORICAL_METADATA_new.xlsx'),
        pd.read_csv(f'{folderPathTrain}/TRAIN_FUNCTIONAL_CONNECTOME_MATRICES_new_36P_Pearson.csv'),
        pd.read_excel(f'{folderPathTrain}/TRAIN_QUANTITATIVE_METADATA_new.xlsx'),
        pd.read_excel(f'{folderPathTrain}/TRAINING_SOLUTIONS.xlsx'),
    )
    print('load test files')
    test_frames = (
        pd.read_excel(f'{folderPathTest}/TEST_CATEGORICAL.xlsx'),
        pd.read_csv(f'{folderPathTest}/TEST_FUNCTIONAL_CONNECTOME_MATRICES.csv'),
        pd.read_excel(f'{folderPathTest}/TEST_QUANTITATIVE_METADATA.xlsx'),
    )
    return train_frames + test_frames

In [78]:
def join_data(categorical,matrices,quantitative,solutions=None):
    """Inner-join the feature tables (and optionally the solutions) on ``participant_id``.

    The join order is categorical -> quantitative -> matrices -> solutions,
    so the output columns appear in that order. ``solutions`` is only joined
    when it is a DataFrame; any other value (including the default ``None``)
    is ignored.

    Returns
    -------
    pandas.DataFrame
        One row per participant present in every joined table.
    """
    print('joining data frames')
    joined = categorical
    for other in (quantitative, matrices):
        joined = pd.merge(joined, other, on='participant_id', how='inner')
    if isinstance(solutions, pd.DataFrame):
        joined = pd.merge(joined, solutions, on='participant_id', how='inner')
    return joined

In [29]:
def xgboost_classifer(objective = 'binary:logistic', max_depth=5,learning_rate=0.1,n_estimators=100):
    """Build an unfitted multi-output classifier around a single XGBoost configuration.

    The four arguments are forwarded unchanged to ``XGBClassifier``; the
    resulting estimator is wrapped in ``MultiOutputClassifier`` and returned.
    """
    print('xgboost_classifer')
    base_estimator = XGBClassifier(
        objective=objective,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
    )
    return MultiOutputClassifier(base_estimator)

In [31]:
def predict(model,X_test,output_columns=('Predicted_ADHD', 'Predicted_Gender')):
    """Run ``model`` on ``X_test`` and return its predictions as a labelled DataFrame.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict`` and return one column per target.
    X_test : pandas.DataFrame
        Feature table including a ``participant_id`` column, which is dropped
        before predicting.
    output_columns : sequence of str, optional
        Labels for the prediction columns, in the order the model was fitted
        on its targets. The default matches this notebook's training targets,
        which are ordered ``ADHD_Outcome`` then ``Sex_F``.

    Returns
    -------
    pandas.DataFrame
        Predictions with a fresh 0..n-1 index, in the same row order as ``X_test``.
    """
    print('predict with the model')
    features = X_test.drop(columns=['participant_id'])
    raw_predictions = model.predict(features)
    # Column 0 is the ADHD prediction and column 1 the sex prediction because the
    # targets are fitted in that order; the labels were previously swapped.
    return pd.DataFrame(raw_predictions, columns=list(output_columns))

In [32]:
def calculate_score(y_test,y_pred):
    """Print and return the accuracy of ``y_pred`` against ``y_test``.

    Parameters
    ----------
    y_test : pandas.DataFrame
        True targets plus a ``participant_id`` column, which is dropped first.
    y_pred : array-like
        Predictions with one column per target, in the same row order.

    Returns
    -------
    float
        The value returned by ``accuracy_score``. Previously nothing was
        returned, so callers that stored the result got ``None``.

    Notes
    -----
    With several binary target columns scikit-learn documents this metric as
    subset accuracy: a row counts as correct only when every target matches.
    It is therefore not comparable to ``multi_output_accuracy``, which averages
    the per-target accuracies.
    """
    print('calculate score with prediction vs true values')
    y_true = y_test.drop(columns=['participant_id'])
    accuracy = accuracy_score(y_true, y_pred)
    print(f"Accuracy: {accuracy * 100:.2f}%")
    return accuracy

In [33]:
def split_train_data(X,Y):
    """Split ``X`` and ``Y`` into train and hold-out parts (80/20, fixed seed 42).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as produced by ``train_test_split``.
    """
    print('split the train and test data')
    parts = train_test_split(X, Y, test_size=0.2, random_state=42)
    X_fit, X_holdout, y_fit, y_holdout = parts
    return (X_fit, X_holdout, y_fit, y_holdout)

In [34]:
def multi_output_accuracy(y_true, y_pred):
    """Average the per-target accuracies of a multi-output prediction.

    Both arguments are converted to NumPy arrays and indexed as 2-D
    (rows = samples, columns = targets); each column is scored separately
    with ``accuracy_score`` and the mean of those scores is returned.
    """
    print('multi_output_accuracy')
    truth = np.array(y_true)
    guess = np.array(y_pred)
    per_target = []
    for column in range(truth.shape[1]):
        per_target.append(accuracy_score(truth[:, column], guess[:, column]))
    return np.mean(per_target)

In [35]:
def do_cross_validation(X,Y,model):
    """Run 5-fold cross-validation of ``model`` and print the fold scores and their mean.

    ``participant_id`` is dropped from both ``X`` and ``Y`` before scoring.
    Folds are scored with ``multi_output_accuracy`` wrapped by ``make_scorer``.
    Nothing is returned; results are only printed.
    """
    id_column = ['participant_id']
    features = X.drop(columns=id_column)
    targets = Y.drop(columns=id_column)

    fold_scores = cross_val_score(
        model,
        features,
        targets,
        cv=5,
        scoring=make_scorer(multi_output_accuracy),
    )

    print("Cross-validation scores for each fold:", fold_scores)
    mean_percent = np.mean(fold_scores) * 100
    print("Mean CV score:", f'Mean Accuracy: {mean_percent:.2f}%')

In [95]:
def transform_matrices_data(df_matrices_new,n_components = 1000,variance_threshold = 0.8,
                            scaler = None,pca = None,return_transformers = False):
    """Standardise the connectome features and project them onto principal components.

    With no transformers supplied (the default), a ``StandardScaler`` and an
    exploratory ``PCA`` are fitted on ``df_matrices_new``. The smallest number of
    leading components whose cumulative explained-variance ratio reaches
    ``variance_threshold`` is selected, and a final ``PCA`` with that many
    components is fitted and applied.

    With ``scaler`` and ``pca`` supplied (already fitted, e.g. on the training
    set), they are only applied. This keeps another dataset in the same feature
    space instead of refitting on it, which would give a different number of
    ``PC*`` columns with a different meaning.

    Parameters
    ----------
    df_matrices_new : pandas.DataFrame
        One row per participant; a ``participant_id`` column plus the features.
    n_components : int, optional
        Upper bound on components for the exploratory PCA. Integer values larger
        than ``min(n_samples, n_features)`` are reduced to that limit instead of
        making PCA raise. Ignored when ``pca`` is supplied.
    variance_threshold : float, optional
        Cumulative explained-variance ratio to reach (default 0.8). Ignored when
        ``pca`` is supplied.
    scaler, pca : fitted transformers, optional
        Must be passed together.
    return_transformers : bool, optional
        When true, return ``(pca_df, scaler, pca)`` so the fitted objects can be
        reused on another dataset.

    Returns
    -------
    pandas.DataFrame or tuple
        Columns ``PC1..PCk`` plus ``participant_id``, indexed like the input.

    Raises
    ------
    ValueError
        If only one of ``scaler`` / ``pca`` is given.
    """
    print('starting pca analysis')
    if (scaler is None) != (pca is None):
        raise ValueError('scaler and pca must be passed together or not at all')

    features = df_matrices_new.drop(columns=['participant_id'])

    if scaler is None:
        # 1. Standardise the features.
        scaler = StandardScaler()
        scaled = scaler.fit_transform(features)

        # 2. Exploratory PCA. It cannot extract more components than
        #    min(n_samples, n_features), so clamp integer requests to that limit.
        n_explore = n_components
        if isinstance(n_components, (int, np.integer)):
            max_components = min(scaled.shape)
            if n_components > max_components:
                print(f'n_components={n_components} exceeds min(n_samples, n_features)='
                      f'{max_components}; using {max_components}')
                n_explore = max_components
        explorer = PCA(n_components=n_explore)
        explorer.fit(scaled)

        # 3. Smallest number of components reaching the variance threshold.
        cumulative_variance = np.cumsum(explorer.explained_variance_ratio_)
        reached = cumulative_variance >= variance_threshold
        if reached.any():
            n_keep = int(np.argmax(reached)) + 1
            print(f"Number of components needed for {variance_threshold:.0%} variance: {n_keep}")
        else:
            # argmax over an all-False mask is 0, which would silently keep a
            # single component; keep every explored component instead.
            n_keep = len(cumulative_variance)
            print(f'{variance_threshold:.0%} variance not reached; keeping all {n_keep} explored components')

        # 4. Final PCA with the selected number of components.
        pca = PCA(n_components=n_keep)
        projected = pca.fit_transform(scaled)
    else:
        # Reuse the supplied transformers: transform only, never refit.
        projected = pca.transform(scaler.transform(features))

    # 5. Wrap the projection in a DataFrame that keeps the input index so the
    #    participant_id assignment below aligns row by row.
    pca_df = pd.DataFrame(
        data=projected,
        columns=[f'PC{i + 1}' for i in range(projected.shape[1])],
        index=features.index,
    )
    pca_df['participant_id'] = df_matrices_new['participant_id']

    if return_transformers:
        return pca_df, scaler, pca
    return pca_df

In [72]:
def transform_quant_data(df_quant_new,scaler = None,return_scaler = False):
    """Standardise the quantitative metadata and keep the columns used by the classifier.

    Parameters
    ----------
    df_quant_new : pandas.DataFrame
        A ``participant_id`` column plus numeric feature columns.
    scaler : fitted scaler, optional
        When given it is only applied (``transform``), e.g. to scale the test
        set with training-set statistics. When omitted a new ``StandardScaler``
        is fitted on ``df_quant_new``.
    return_scaler : bool, optional
        When true, return ``(frame, scaler)`` so the fitted scaler can be reused.

    Returns
    -------
    pandas.DataFrame or tuple
        Scaled columns from position 4 onward plus ``participant_id``, indexed
        like the input. Scaled columns keep positional integer labels
        (``4, 5, ...``), not the original column names.
    """
    print('starting quant data scaling')
    features = df_quant_new.drop(columns=['participant_id'])
    if scaler is None:
        scaler = StandardScaler()
        scaled_values = scaler.fit_transform(features)
    else:
        scaled_values = scaler.transform(features)

    # Keep the input index: the participant_id assignment below aligns on index
    # labels, so a fresh 0..n-1 index would yield NaN / misaligned ids whenever
    # the input frame is not default-indexed.
    df_quant_scaled = pd.DataFrame(scaled_values, index=df_quant_new.index)
    df_quant_scaled['participant_id'] = df_quant_new['participant_id']

    # Select specific columns only for the classifier: the first four scaled
    # columns are discarded. NOTE(review): which features those are depends on
    # the input column order — confirm the selection is intended.
    df_quant_scaled_selected = df_quant_scaled.iloc[:, 4:]
    if return_scaler:
        return df_quant_scaled_selected, scaler
    return df_quant_scaled_selected

In [38]:
def _impute_missing_and_zero_with_median(series):
    """Fill NaN with the column median, then replace 0.0 with the median of the filled column."""
    filled = series.fillna(series.median())
    return filled.replace(0.0, filled.median())


def transform_categorical_data(df_categorical_new,onehot_encoder = None,return_encoder = False):
    """Encode the categorical metadata for the classifier.

    Steps:

    1. One-hot encode the four nominal columns, dropping the first level of
       each to prevent multicollinearity.
    2. Impute the four Barratt education / occupation columns (NaN, then 0.0,
       replaced by the column median) and ordinal-encode them against fixed
       level lists.
    3. Drop ``Basic_Demos_Enroll_Year``.

    Parameters
    ----------
    df_categorical_new : pandas.DataFrame
        Must contain ``participant_id``, ``Basic_Demos_Enroll_Year``, the four
        nominal columns and the four Barratt columns.
    onehot_encoder : fitted OneHotEncoder, optional
        When given it is only applied (``transform``), so another dataset gets
        exactly the one-hot columns of the data it was fitted on. When omitted
        a new encoder is fitted on ``df_categorical_new``.
    return_encoder : bool, optional
        When true, return ``(frame, onehot_encoder)`` so the fitted encoder can
        be reused.

    Returns
    -------
    pandas.DataFrame or tuple
        Encoded frame indexed like the input.

    Notes
    -----
    NOTE(review): 0 is listed as a valid occupation level yet is replaced by
    the median before encoding — confirm that is intended. The ordinal encoder
    rejects values outside its level lists, so a median that is not itself a
    listed level will make it raise.
    """
    print('starting categorical data encoding')

    # One-Hot Encoding (nominal)
    nominal_cols = ['MRI_Track_Scan_Location', 'Basic_Demos_Study_Site', 'PreInt_Demos_Fam_Child_Race', 'PreInt_Demos_Fam_Child_Ethnicity']
    nominal_values = df_categorical_new[nominal_cols]
    if onehot_encoder is None:
        # handle_unknown='ignore' has no effect while fitting; it lets the fitted
        # encoder later encode unseen levels as all zeros instead of raising.
        onehot_encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
        onehot_encoded = onehot_encoder.fit_transform(nominal_values)
    else:
        onehot_encoded = onehot_encoder.transform(nominal_values)

    # Reuse the input index: concat(axis=1) aligns on index labels, so a fresh
    # 0..n-1 index would misalign rows for any non-default-indexed input.
    onehot_df = pd.DataFrame(
        onehot_encoded,
        columns=onehot_encoder.get_feature_names_out(nominal_cols),
        index=df_categorical_new.index,
    )
    encoded_df = pd.concat([df_categorical_new, onehot_df], axis=1).drop(nominal_cols, axis=1)

    # Ordinal Encoding (ordinal): each group shares one ordered level list.
    ordinal_groups = (
        (['Barratt_Barratt_P1_Edu', 'Barratt_Barratt_P2_Edu'],
         [3, 6, 9, 12, 15, 18, 21]),
        (['Barratt_Barratt_P1_Occ', 'Barratt_Barratt_P2_Occ'],
         [0, 5, 10, 15, 20, 25, 30, 35, 40, 45]),
    )
    for ordinal_cols, levels in ordinal_groups:
        # Handle NaN and 0.0 values before encoding.
        for col in ordinal_cols:
            encoded_df[col] = _impute_missing_and_zero_with_median(encoded_df[col])
        ordinal_encoder = OrdinalEncoder(categories=[list(levels) for _ in ordinal_cols])
        encoded_df[ordinal_cols] = ordinal_encoder.fit_transform(encoded_df[ordinal_cols])

    encoded_df_modified = encoded_df.drop(columns=['Basic_Demos_Enroll_Year'])
    if return_encoder:
        return encoded_df_modified, onehot_encoder
    return encoded_df_modified

In [47]:
def xgboost_train(X = None, Y =None,max_depth= None,learning_rate = None,n_estimators = None):
    """Fit a classifier on an 80/20 split, cross-validate, and report hold-out accuracy.

    ``X`` and ``Y`` must both carry a ``participant_id`` column. One classifier
    is fitted on the training part of the split and scored on the hold-out
    part. A second, unfitted classifier with the same tuning parameters is
    cross-validated on the full ``X`` / ``Y``.

    Returns
    -------
    The classifier fitted on the training part of the split.
    """
    tuning = {'max_depth': max_depth, 'learning_rate': learning_rate, 'n_estimators': n_estimators}
    id_column = ['participant_id']

    print('starting training')
    print('setting tuning params')
    holdout_model = xgboost_classifer(**tuning)

    print('splitting to test and train')
    X_fit, X_holdout, y_fit, y_holdout = split_train_data(X, Y)

    print('training the model')
    fit_features = X_fit.drop(columns=id_column)
    fit_targets = y_fit.drop(columns=id_column)
    holdout_model.fit(fit_features, fit_targets)

    print('setting cross validation classifier with tuning params')
    cv_model = xgboost_classifer(**tuning)
    print('start cross validation')
    do_cross_validation(X, Y, cv_model)

    print('check accuracy')
    holdout_predictions = predict(holdout_model, X_holdout)
    print('calculate score')
    calculate_score(y_holdout, holdout_predictions)
    return holdout_model

In [40]:
def xgboost_test(classifier = None,X = None):
    """Predict with a fitted ``classifier`` on the feature table ``X``.

    Thin wrapper that logs a message and delegates to ``predict``, returning
    whatever ``predict`` returns.
    """
    print('start testing')
    return predict(classifier, X)

## train

In [41]:
# Folders holding the raw training and test files (relative paths).
# Both end with '/', and load_data inserts another '/' before each file name.
folderPathTrain, folderPathTest = 'Datafiles/TRAIN_NEW/' , 'Datafiles/TEST/'
# Read all seven tables in one call: four training tables, then three test tables.
df_categorical_new,df_matrices_new,df_quant_new,df_solutions_new,df_categorical_test,df_matrices_test,df_quant_test = load_data(folderPathTrain, folderPathTest)

load new files
load test files


In [73]:
# Build the three training feature tables: PCA of the connectome matrices,
# scaled quantitative metadata, and encoded categorical metadata.
pca_df_train = transform_matrices_data(df_matrices_new)
quant_df_train = transform_quant_data(df_quant_new)
cat_df_train = transform_categorical_data(df_categorical_new)

starting pca analysis
Number of components needed for 80% variance: 464
starting quant data scaling
starting categorical data encoding


In [79]:
# Inner-join the training feature tables and the solutions on participant_id.
joined_training_data = join_data(cat_df_train,pca_df_train,quant_df_train,df_solutions_new)
# Features: every column except the two targets (participant_id is kept; the helpers drop it later).
X = joined_training_data.drop(columns = ['ADHD_Outcome','Sex_F'] )
# Targets plus participant_id; note the target order is ADHD_Outcome first, then Sex_F.
Y = joined_training_data[['participant_id','ADHD_Outcome','Sex_F']]

joining data frames


In [81]:
# (rows, columns) of the training feature table, participant_id included.
X.shape

(1213, 504)

In [88]:
# Non-null count per column of the target table.
Y.count()

participant_id    1213
ADHD_Outcome      1213
Sex_F             1213
dtype: int64

In [89]:
# Fit on the 80/20 split, run 5-fold cross-validation, print the hold-out accuracy,
# and keep the classifier fitted on the training part of the split.
classifier_trained = xgboost_train(X = X, Y =Y,max_depth= 5,learning_rate = 0.1,n_estimators = 100)

starting training
setting tuning params
xgboost_classifer
splitting to test and train
split the train and test data
training the model
setting cross validation classifier with tuning params
xgboost_classifer
start cross validation
multi_output_accuracy
multi_output_accuracy
multi_output_accuracy
multi_output_accuracy
multi_output_accuracy
Cross-validation scores for each fold: [0.72222222 0.75925926 0.69958848 0.75826446 0.71694215]
Mean CV score: Mean Accuracy: 73.13%
check accuracy
predict with the model
calculate score
calculate score with prediction vs true values
Accuracy: 58.44%


## test

In [91]:
# Non-null count per column of the test connectome matrices.
df_matrices_test.count()

participant_id          304
0throw_1thcolumn        304
0throw_2thcolumn        304
0throw_3thcolumn        304
0throw_4thcolumn        304
                       ... 
196throw_198thcolumn    304
196throw_199thcolumn    304
197throw_198thcolumn    304
197throw_199thcolumn    304
198throw_199thcolumn    304
Length: 19901, dtype: int64

In [92]:
# Non-null count per column of the test quantitative metadata (shows which columns have gaps).
df_quant_test.count()

participant_id                304
EHQ_EHQ_Total                 303
ColorVision_CV_Score          295
APQ_P_APQ_P_CP                289
APQ_P_APQ_P_ID                289
APQ_P_APQ_P_INV               289
APQ_P_APQ_P_OPD               289
APQ_P_APQ_P_PM                289
APQ_P_APQ_P_PP                289
SDQ_SDQ_Conduct_Problems      274
SDQ_SDQ_Difficulties_Total    274
SDQ_SDQ_Emotional_Problems    274
SDQ_SDQ_Externalizing         274
SDQ_SDQ_Generating_Impact     274
SDQ_SDQ_Hyperactivity         274
SDQ_SDQ_Internalizing         274
SDQ_SDQ_Peer_Problems         274
SDQ_SDQ_Prosocial             274
MRI_Track_Age_at_Scan         304
dtype: int64

In [93]:
# Non-null count per column of the test categorical metadata (shows which columns have gaps).
df_categorical_test.count()

participant_id                      304
Basic_Demos_Enroll_Year             304
Basic_Demos_Study_Site              304
PreInt_Demos_Fam_Child_Ethnicity    301
PreInt_Demos_Fam_Child_Race         298
MRI_Track_Scan_Location             304
Barratt_Barratt_P1_Edu              303
Barratt_Barratt_P1_Occ              303
Barratt_Barratt_P2_Edu              268
Barratt_Barratt_P2_Occ              262
dtype: int64

In [96]:
# Build the test feature tables with the same helpers used for training.
# NOTE(review): each helper refits its scaler / PCA / encoders on the data it is
# given, so these test features do not share the training feature space. The
# recorded run reports 162 PCs here versus 464 for training, and different
# one-hot columns. That is what the feature_names mismatch raised by the final
# prediction cell reports.
# n_components is presumably lowered from the default 1000 because the test set
# has only 304 rows — confirm.
pca_df_test = transform_matrices_data(df_matrices_test,n_components = 200)
quant_df_test = transform_quant_data(df_quant_test)
cat_df_test = transform_categorical_data(df_categorical_test)

starting pca analysis
Number of components needed for 80% variance: 162
starting quant data scaling
starting categorical data encoding


In [98]:
# Inner-join the test feature tables on participant_id (no solutions for the test set).
X_test = join_data(cat_df_test,pca_df_test,quant_df_test)

joining data frames


In [101]:
# Encoded categorical columns produced for the training set (compare with the test set below).
cat_df_train.columns

Index(['participant_id', 'Barratt_Barratt_P1_Edu', 'Barratt_Barratt_P1_Occ',
       'Barratt_Barratt_P2_Edu', 'Barratt_Barratt_P2_Occ',
       'MRI_Track_Scan_Location_2.0', 'MRI_Track_Scan_Location_3.0',
       'MRI_Track_Scan_Location_4.0', 'MRI_Track_Scan_Location_nan',
       'Basic_Demos_Study_Site_2', 'Basic_Demos_Study_Site_3',
       'Basic_Demos_Study_Site_4', 'PreInt_Demos_Fam_Child_Race_1.0',
       'PreInt_Demos_Fam_Child_Race_2.0', 'PreInt_Demos_Fam_Child_Race_3.0',
       'PreInt_Demos_Fam_Child_Race_4.0', 'PreInt_Demos_Fam_Child_Race_7.0',
       'PreInt_Demos_Fam_Child_Race_8.0', 'PreInt_Demos_Fam_Child_Race_9.0',
       'PreInt_Demos_Fam_Child_Race_10.0', 'PreInt_Demos_Fam_Child_Race_11.0',
       'PreInt_Demos_Fam_Child_Race_nan',
       'PreInt_Demos_Fam_Child_Ethnicity_1.0',
       'PreInt_Demos_Fam_Child_Ethnicity_2.0',
       'PreInt_Demos_Fam_Child_Ethnicity_3.0',
       'PreInt_Demos_Fam_Child_Ethnicity_nan'],
      dtype='object')

In [102]:
# Encoded categorical columns produced for the test set; the one-hot columns differ from training.
cat_df_test.columns

Index(['participant_id', 'Barratt_Barratt_P1_Edu', 'Barratt_Barratt_P1_Occ',
       'Barratt_Barratt_P2_Edu', 'Barratt_Barratt_P2_Occ',
       'MRI_Track_Scan_Location_4', 'Basic_Demos_Study_Site_5',
       'PreInt_Demos_Fam_Child_Race_1.0', 'PreInt_Demos_Fam_Child_Race_2.0',
       'PreInt_Demos_Fam_Child_Race_3.0', 'PreInt_Demos_Fam_Child_Race_4.0',
       'PreInt_Demos_Fam_Child_Race_7.0', 'PreInt_Demos_Fam_Child_Race_8.0',
       'PreInt_Demos_Fam_Child_Race_9.0', 'PreInt_Demos_Fam_Child_Race_11.0',
       'PreInt_Demos_Fam_Child_Race_nan',
       'PreInt_Demos_Fam_Child_Ethnicity_1.0',
       'PreInt_Demos_Fam_Child_Ethnicity_2.0',
       'PreInt_Demos_Fam_Child_Ethnicity_3.0',
       'PreInt_Demos_Fam_Child_Ethnicity_nan'],
      dtype='object')

In [99]:
# Peek at the first two rows of the joined test feature table.
X_test.head(2)

Unnamed: 0,participant_id,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ,MRI_Track_Scan_Location_4,Basic_Demos_Study_Site_5,PreInt_Demos_Fam_Child_Race_1.0,PreInt_Demos_Fam_Child_Race_2.0,PreInt_Demos_Fam_Child_Race_3.0,...,PC153,PC154,PC155,PC156,PC157,PC158,PC159,PC160,PC161,PC162
0,Cfwaf5FX7jWK,6.0,6.0,5.0,6.0,1.0,0.0,0.0,0.0,0.0,...,4.36106,-1.470873,3.497982,-0.968178,14.226648,-4.248896,1.594867,8.371786,-0.43767,1.445551
1,vhGrzmvA3Hjq,6.0,9.0,5.0,6.0,1.0,0.0,0.0,0.0,0.0,...,-3.249966,14.663355,-11.107535,-1.141988,-8.864034,6.600151,-4.32654,5.802841,-10.878062,-3.332385


In [103]:
# Non-null count per column of the joined test feature table.
X_test.count()

participant_id            304
Barratt_Barratt_P1_Edu    304
Barratt_Barratt_P1_Occ    304
Barratt_Barratt_P2_Edu    304
Barratt_Barratt_P2_Occ    304
                         ... 
PC158                     304
PC159                     304
PC160                     304
PC161                     304
PC162                     304
Length: 196, dtype: int64

In [105]:
# Predict on the test features with the trained classifier.
# NOTE(review): in the recorded run this raises ValueError (feature_names mismatch)
# because X_test's columns differ from the columns the classifier was trained on.
xgboost_test(classifier = classifier_trained,X = X_test)

start testing
predict with the model


ValueError: feature_names mismatch: ['Barratt_Barratt_P1_Edu', 'Barratt_Barratt_P1_Occ', 'Barratt_Barratt_P2_Edu', 'Barratt_Barratt_P2_Occ', 'MRI_Track_Scan_Location_2.0', 'MRI_Track_Scan_Location_3.0', 'MRI_Track_Scan_Location_4.0', 'MRI_Track_Scan_Location_nan', 'Basic_Demos_Study_Site_2', 'Basic_Demos_Study_Site_3', 'Basic_Demos_Study_Site_4', 'PreInt_Demos_Fam_Child_Race_1.0', 'PreInt_Demos_Fam_Child_Race_2.0', 'PreInt_Demos_Fam_Child_Race_3.0', 'PreInt_Demos_Fam_Child_Race_4.0', 'PreInt_Demos_Fam_Child_Race_7.0', 'PreInt_Demos_Fam_Child_Race_8.0', 'PreInt_Demos_Fam_Child_Race_9.0', 'PreInt_Demos_Fam_Child_Race_10.0', 'PreInt_Demos_Fam_Child_Race_11.0', 'PreInt_Demos_Fam_Child_Race_nan', 'PreInt_Demos_Fam_Child_Ethnicity_1.0', 'PreInt_Demos_Fam_Child_Ethnicity_2.0', 'PreInt_Demos_Fam_Child_Ethnicity_3.0', 'PreInt_Demos_Fam_Child_Ethnicity_nan', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10', 'PC11', 'PC12', 'PC13', 'PC14', 'PC15', 'PC16', 'PC17', 'PC18', 'PC19', 'PC20', 'PC21', 'PC22', 'PC23', 'PC24', 'PC25', 'PC26', 'PC27', 'PC28', 'PC29', 'PC30', 'PC31', 'PC32', 'PC33', 'PC34', 'PC35', 'PC36', 'PC37', 'PC38', 'PC39', 'PC40', 'PC41', 'PC42', 'PC43', 'PC44', 'PC45', 'PC46', 'PC47', 'PC48', 'PC49', 'PC50', 'PC51', 'PC52', 'PC53', 'PC54', 'PC55', 'PC56', 'PC57', 'PC58', 'PC59', 'PC60', 'PC61', 'PC62', 'PC63', 'PC64', 'PC65', 'PC66', 'PC67', 'PC68', 'PC69', 'PC70', 'PC71', 'PC72', 'PC73', 'PC74', 'PC75', 'PC76', 'PC77', 'PC78', 'PC79', 'PC80', 'PC81', 'PC82', 'PC83', 'PC84', 'PC85', 'PC86', 'PC87', 'PC88', 'PC89', 'PC90', 'PC91', 'PC92', 'PC93', 'PC94', 'PC95', 'PC96', 'PC97', 'PC98', 'PC99', 'PC100', 'PC101', 'PC102', 'PC103', 'PC104', 'PC105', 'PC106', 'PC107', 'PC108', 'PC109', 'PC110', 'PC111', 'PC112', 'PC113', 'PC114', 'PC115', 'PC116', 'PC117', 'PC118', 'PC119', 'PC120', 'PC121', 'PC122', 'PC123', 'PC124', 'PC125', 'PC126', 'PC127', 'PC128', 'PC129', 
'PC130', 'PC131', 'PC132', 'PC133', 'PC134', 'PC135', 'PC136', 'PC137', 'PC138', 'PC139', 'PC140', 'PC141', 'PC142', 'PC143', 'PC144', 'PC145', 'PC146', 'PC147', 'PC148', 'PC149', 'PC150', 'PC151', 'PC152', 'PC153', 'PC154', 'PC155', 'PC156', 'PC157', 'PC158', 'PC159', 'PC160', 'PC161', 'PC162', 'PC163', 'PC164', 'PC165', 'PC166', 'PC167', 'PC168', 'PC169', 'PC170', 'PC171', 'PC172', 'PC173', 'PC174', 'PC175', 'PC176', 'PC177', 'PC178', 'PC179', 'PC180', 'PC181', 'PC182', 'PC183', 'PC184', 'PC185', 'PC186', 'PC187', 'PC188', 'PC189', 'PC190', 'PC191', 'PC192', 'PC193', 'PC194', 'PC195', 'PC196', 'PC197', 'PC198', 'PC199', 'PC200', 'PC201', 'PC202', 'PC203', 'PC204', 'PC205', 'PC206', 'PC207', 'PC208', 'PC209', 'PC210', 'PC211', 'PC212', 'PC213', 'PC214', 'PC215', 'PC216', 'PC217', 'PC218', 'PC219', 'PC220', 'PC221', 'PC222', 'PC223', 'PC224', 'PC225', 'PC226', 'PC227', 'PC228', 'PC229', 'PC230', 'PC231', 'PC232', 'PC233', 'PC234', 'PC235', 'PC236', 'PC237', 'PC238', 'PC239', 'PC240', 'PC241', 'PC242', 'PC243', 'PC244', 'PC245', 'PC246', 'PC247', 'PC248', 'PC249', 'PC250', 'PC251', 'PC252', 'PC253', 'PC254', 'PC255', 'PC256', 'PC257', 'PC258', 'PC259', 'PC260', 'PC261', 'PC262', 'PC263', 'PC264', 'PC265', 'PC266', 'PC267', 'PC268', 'PC269', 'PC270', 'PC271', 'PC272', 'PC273', 'PC274', 'PC275', 'PC276', 'PC277', 'PC278', 'PC279', 'PC280', 'PC281', 'PC282', 'PC283', 'PC284', 'PC285', 'PC286', 'PC287', 'PC288', 'PC289', 'PC290', 'PC291', 'PC292', 'PC293', 'PC294', 'PC295', 'PC296', 'PC297', 'PC298', 'PC299', 'PC300', 'PC301', 'PC302', 'PC303', 'PC304', 'PC305', 'PC306', 'PC307', 'PC308', 'PC309', 'PC310', 'PC311', 'PC312', 'PC313', 'PC314', 'PC315', 'PC316', 'PC317', 'PC318', 'PC319', 'PC320', 'PC321', 'PC322', 'PC323', 'PC324', 'PC325', 'PC326', 'PC327', 'PC328', 'PC329', 'PC330', 'PC331', 'PC332', 'PC333', 'PC334', 'PC335', 'PC336', 'PC337', 'PC338', 'PC339', 'PC340', 'PC341', 'PC342', 'PC343', 'PC344', 'PC345', 'PC346', 'PC347', 'PC348', 'PC349', 'PC350', 'PC351', 
'PC352', 'PC353', 'PC354', 'PC355', 'PC356', 'PC357', 'PC358', 'PC359', 'PC360', 'PC361', 'PC362', 'PC363', 'PC364', 'PC365', 'PC366', 'PC367', 'PC368', 'PC369', 'PC370', 'PC371', 'PC372', 'PC373', 'PC374', 'PC375', 'PC376', 'PC377', 'PC378', 'PC379', 'PC380', 'PC381', 'PC382', 'PC383', 'PC384', 'PC385', 'PC386', 'PC387', 'PC388', 'PC389', 'PC390', 'PC391', 'PC392', 'PC393', 'PC394', 'PC395', 'PC396', 'PC397', 'PC398', 'PC399', 'PC400', 'PC401', 'PC402', 'PC403', 'PC404', 'PC405', 'PC406', 'PC407', 'PC408', 'PC409', 'PC410', 'PC411', 'PC412', 'PC413', 'PC414', 'PC415', 'PC416', 'PC417', 'PC418', 'PC419', 'PC420', 'PC421', 'PC422', 'PC423', 'PC424', 'PC425', 'PC426', 'PC427', 'PC428', 'PC429', 'PC430', 'PC431', 'PC432', 'PC433', 'PC434', 'PC435', 'PC436', 'PC437', 'PC438', 'PC439', 'PC440', 'PC441', 'PC442', 'PC443', 'PC444', 'PC445', 'PC446', 'PC447', 'PC448', 'PC449', 'PC450', 'PC451', 'PC452', 'PC453', 'PC454', 'PC455', 'PC456', 'PC457', 'PC458', 'PC459', 'PC460', 'PC461', 'PC462', 'PC463', 'PC464'] ['Barratt_Barratt_P1_Edu', 'Barratt_Barratt_P1_Occ', 'Barratt_Barratt_P2_Edu', 'Barratt_Barratt_P2_Occ', 'MRI_Track_Scan_Location_4', 'Basic_Demos_Study_Site_5', 'PreInt_Demos_Fam_Child_Race_1.0', 'PreInt_Demos_Fam_Child_Race_2.0', 'PreInt_Demos_Fam_Child_Race_3.0', 'PreInt_Demos_Fam_Child_Race_4.0', 'PreInt_Demos_Fam_Child_Race_7.0', 'PreInt_Demos_Fam_Child_Race_8.0', 'PreInt_Demos_Fam_Child_Race_9.0', 'PreInt_Demos_Fam_Child_Race_11.0', 'PreInt_Demos_Fam_Child_Race_nan', 'PreInt_Demos_Fam_Child_Ethnicity_1.0', 'PreInt_Demos_Fam_Child_Ethnicity_2.0', 'PreInt_Demos_Fam_Child_Ethnicity_3.0', 'PreInt_Demos_Fam_Child_Ethnicity_nan', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10', 'PC11', 'PC12', 'PC13', 'PC14', 'PC15', 'PC16', 'PC17', 'PC18', 'PC19', 'PC20', 'PC21', 'PC22', 'PC23', 'PC24', 'PC25', 'PC26', 'PC27', 'PC28', 'PC29', 'PC30', 'PC31', 'PC32', 'PC33', 'PC34', 
'PC35', 'PC36', 'PC37', 'PC38', 'PC39', 'PC40', 'PC41', 'PC42', 'PC43', 'PC44', 'PC45', 'PC46', 'PC47', 'PC48', 'PC49', 'PC50', 'PC51', 'PC52', 'PC53', 'PC54', 'PC55', 'PC56', 'PC57', 'PC58', 'PC59', 'PC60', 'PC61', 'PC62', 'PC63', 'PC64', 'PC65', 'PC66', 'PC67', 'PC68', 'PC69', 'PC70', 'PC71', 'PC72', 'PC73', 'PC74', 'PC75', 'PC76', 'PC77', 'PC78', 'PC79', 'PC80', 'PC81', 'PC82', 'PC83', 'PC84', 'PC85', 'PC86', 'PC87', 'PC88', 'PC89', 'PC90', 'PC91', 'PC92', 'PC93', 'PC94', 'PC95', 'PC96', 'PC97', 'PC98', 'PC99', 'PC100', 'PC101', 'PC102', 'PC103', 'PC104', 'PC105', 'PC106', 'PC107', 'PC108', 'PC109', 'PC110', 'PC111', 'PC112', 'PC113', 'PC114', 'PC115', 'PC116', 'PC117', 'PC118', 'PC119', 'PC120', 'PC121', 'PC122', 'PC123', 'PC124', 'PC125', 'PC126', 'PC127', 'PC128', 'PC129', 'PC130', 'PC131', 'PC132', 'PC133', 'PC134', 'PC135', 'PC136', 'PC137', 'PC138', 'PC139', 'PC140', 'PC141', 'PC142', 'PC143', 'PC144', 'PC145', 'PC146', 'PC147', 'PC148', 'PC149', 'PC150', 'PC151', 'PC152', 'PC153', 'PC154', 'PC155', 'PC156', 'PC157', 'PC158', 'PC159', 'PC160', 'PC161', 'PC162']
expected PC291, PC187, PC306, MRI_Track_Scan_Location_4.0, PC186, PC248, PC259, PC246, PC197, PC435, PC373, PC304, PC404, PC270, PC312, PC379, PC361, PC393, PC345, PC434, PC211, PC198, PC414, PC381, PC464, PC265, PC334, PC438, PC463, PC325, PC430, PC289, PC300, PC459, PC451, PC324, PC380, PC346, PC167, PC191, PC398, PC242, PC169, PC320, PC204, PC275, PC341, PC200, PC253, PC433, PC262, Basic_Demos_Study_Site_4, PC372, PC188, PC256, PC303, PC422, PC338, PC412, PC295, PC362, PC458, PC301, PC311, PC282, PC368, PC456, PC382, PC209, PC420, PC170, PC194, PC443, PC316, PC205, PC297, PC330, PC449, PC390, PC360, PC278, PC192, PC230, PC457, PC225, PC171, PC228, Basic_Demos_Study_Site_2, PC214, PC337, PC288, PC442, PC279, PC377, PC224, PC431, PC196, PC173, PC222, PC437, PC402, PC409, PC355, PC240, PC395, PC203, PC370, PC183, PC215, PC445, PC329, PC235, PC353, PC453, PC314, PC285, PC182, PC417, PC231, PC365, PC429, PC206, PC274, PC273, PC400, PC251, PC333, PC213, PC227, PC375, PC344, PC268, PC210, PC331, PC318, PC315, PC276, PC233, PC328, PC349, PC195, PC175, PC166, PC356, PC221, PC369, MRI_Track_Scan_Location_3.0, PC207, PC234, PC179, PC418, PC343, PC460, PC359, PC441, PC386, PC351, PC322, PC243, PC263, PC352, PC226, PC461, PC439, PC371, PC326, PC428, PC220, PC238, PC247, PC335, PC302, PC339, PC413, PC452, PC313, PC244, PC432, PC384, PC387, PC174, PC332, PC426, PC212, PC252, PC280, PC165, PC350, PC411, PC354, PC208, PC410, PC319, PC317, PC425, PC266, PC239, PC385, PC255, PC440, PC184, PC423, PC396, PC261, PC181, PC218, PC378, PC364, PC249, PC232, PC293, MRI_Track_Scan_Location_2.0, PC454, PC397, PC164, PC327, PC340, PC283, PC336, PC424, PC347, PC436, PC348, PC450, Basic_Demos_Study_Site_3, PC202, PC245, PC366, PC416, PC383, PC462, PC219, PC415, PC176, PC163, PC236, PC323, PC272, PC405, PC308, PreInt_Demos_Fam_Child_Race_10.0, PC391, PC298, PC185, PC216, PC180, PC419, PC407, PC172, PC190, PC199, PC269, PC399, MRI_Track_Scan_Location_nan, PC241, PC305, PC403, PC342, PC254, 
PC296, PC427, PC358, PC193, PC367, PC310, PC376, PC257, PC447, PC290, PC292, PC307, PC299, PC258, PC444, PC284, PC237, PC267, PC374, PC363, PC446, PC217, PC264, PC168, PC271, PC260, PC250, PC277, PC455, PC229, PC448, PC294, PC189, PC389, PC406, PC281, PC201, PC287, PC401, PC408, PC223, PC357, PC388, PC286, PC177, PC321, PC394, PC421, PC309, PC392, PC178 in input data
training data did not have the following fields: Basic_Demos_Study_Site_5, MRI_Track_Scan_Location_4