In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

import os
import matplotlib.pyplot as plt

import sklearn
from sklearn.svm import SVC
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, accuracy_score, confusion_matrix, roc_curve
from scipy.stats import zscore, pearsonr, uniform
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold, StratifiedKFold, RandomizedSearchCV

from scipy.io import loadmat

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

# Step 1: Load Data

### Load Categorical Data

In [2]:
train_cat = pd.read_excel('TRAIN_CATEGORICAL_METADATA_new.xlsx')
train_cat = pd.DataFrame(train_cat)
train_cat.head()

Unnamed: 0,participant_id,Basic_Demos_Enroll_Year,Basic_Demos_Study_Site,PreInt_Demos_Fam_Child_Ethnicity,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ
0,00aIpNTbG5uh,2019,4,1.0,0.0,3.0,21.0,45.0,,
1,00fV0OyyoLfw,2017,1,0.0,9.0,2.0,21.0,0.0,21.0,45.0
2,04X1eiS79T4B,2017,1,1.0,2.0,2.0,9.0,0.0,,
3,05ocQutkURd6,2018,1,3.0,8.0,2.0,18.0,10.0,18.0,0.0
4,06YUNBA9ZRLq,2018,1,0.0,1.0,2.0,12.0,0.0,,


In [None]:
train_cat.columns

### Load Functional Connectome Matrices

In [3]:
train_fcm = pd.read_csv('TRAIN_FUNCTIONAL_CONNECTOME_MATRICES_new_36P_Pearson.csv')
train_fcm.head()

Unnamed: 0,participant_id,0throw_1thcolumn,0throw_2thcolumn,0throw_3thcolumn,0throw_4thcolumn,0throw_5thcolumn,0throw_6thcolumn,0throw_7thcolumn,0throw_8thcolumn,0throw_9thcolumn,...,195throw_196thcolumn,195throw_197thcolumn,195throw_198thcolumn,195throw_199thcolumn,196throw_197thcolumn,196throw_198thcolumn,196throw_199thcolumn,197throw_198thcolumn,197throw_199thcolumn,198throw_199thcolumn
0,70z8Q2xdTXM3,0.22293,0.527903,0.429966,0.060457,0.566489,0.315342,0.508408,-0.07829,0.525692,...,0.224985,0.397448,0.422966,0.184642,0.305549,0.420349,0.016328,0.561864,0.47117,0.365221
1,WHWymJu6zNZi,0.614765,0.577255,0.496127,0.496606,0.404686,0.439724,0.12259,-0.085452,0.120673,...,0.217546,-0.014549,0.00044,-0.096451,0.454501,0.343916,0.167313,0.607656,0.550623,0.503176
2,4PAQp1M6EyAo,-0.116833,0.458408,0.260703,0.639031,0.769337,0.442528,0.63711,0.19201,0.520379,...,0.342487,-0.021141,-0.037836,0.075069,0.412712,0.292708,0.391005,0.461544,0.508912,0.624232
3,obEacy4Of68I,0.199688,0.752714,0.658283,0.575096,0.692867,0.645789,0.52275,0.412188,0.530843,...,0.103562,-0.178313,0.210983,-0.018666,0.436313,0.592982,0.216205,0.341272,0.440313,0.558193
4,s7WzzDcmDOhF,0.227321,0.613268,0.621447,0.562673,0.736709,0.589813,0.266676,0.359668,0.300771,...,-0.164956,0.007064,-0.120904,-0.488095,0.493575,-0.215361,0.210685,0.05585,0.119065,0.108273


In [None]:
train_fcm.columns

### Load Quantitative Data

In [4]:
train_qt = pd.read_excel('TRAIN_QUANTITATIVE_METADATA_new.xlsx')
train_qt = pd.DataFrame(train_qt)
train_qt.head()

Unnamed: 0,participant_id,EHQ_EHQ_Total,ColorVision_CV_Score,APQ_P_APQ_P_CP,APQ_P_APQ_P_ID,APQ_P_APQ_P_INV,APQ_P_APQ_P_OPD,APQ_P_APQ_P_PM,APQ_P_APQ_P_PP,SDQ_SDQ_Conduct_Problems,SDQ_SDQ_Difficulties_Total,SDQ_SDQ_Emotional_Problems,SDQ_SDQ_Externalizing,SDQ_SDQ_Generating_Impact,SDQ_SDQ_Hyperactivity,SDQ_SDQ_Internalizing,SDQ_SDQ_Peer_Problems,SDQ_SDQ_Prosocial,MRI_Track_Age_at_Scan
0,00aIpNTbG5uh,100.0,13.0,3.0,15.0,44.0,14.0,20.0,27.0,3.0,17.0,4.0,11.0,5.0,8.0,6.0,2.0,9.0,14.274127
1,00fV0OyyoLfw,92.27,14.0,3.0,12.0,35.0,25.0,28.0,30.0,5.0,20.0,4.0,13.0,5.0,8.0,7.0,3.0,8.0,
2,04X1eiS79T4B,86.67,14.0,3.0,21.0,37.0,18.0,26.0,28.0,3.0,24.0,7.0,10.0,10.0,7.0,14.0,7.0,7.0,13.463381
3,05ocQutkURd6,93.34,14.0,3.0,11.0,42.0,15.0,20.0,28.0,0.0,5.0,0.0,3.0,0.0,3.0,2.0,2.0,6.0,9.572553
4,06YUNBA9ZRLq,0.0,14.0,8.0,12.0,35.0,22.0,12.0,24.0,6.0,23.0,7.0,15.0,8.0,9.0,8.0,1.0,4.0,6.654574


In [None]:
train_qt.columns

### Load Solutions for Training Data

In [5]:
train_sol = pd.read_excel('TRAINING_SOLUTIONS.xlsx')
train_sol = pd.DataFrame(train_sol)
train_sol.head()

Unnamed: 0,participant_id,ADHD_Outcome,Sex_F
0,UmrK0vMLopoR,1,1
1,CPaeQkhcjg7d,1,0
2,Nb4EetVPm3gs,1,0
3,p4vPhVu91o4b,1,1
4,M09PXs7arQ5E,1,1


In [None]:
train_sol.columns

# Step 2: Exploratory Data Analysis

### Basic Information of Categorical Data

In [None]:
train_cat.info()

In [None]:
train_cat.describe()

In [None]:
# enroll year
sns.countplot(x='Basic_Demos_Enroll_Year', data=train_cat[['Basic_Demos_Enroll_Year']])
plt.title(f"Distribution of Basic_Demos_Enroll_Year")
plt.show()

In [None]:
# study site
sns.countplot(x='Basic_Demos_Study_Site', data=train_cat[['Basic_Demos_Study_Site']])
plt.title(f"Distribution of Basic_Demos_Study_Site")
plt.show()

Basic_Demos_Study_Site Mapping: 1=Staten Island; 2=MRV; 3=Midtown; 4=Harlem; 5=SI RUMC

In [None]:
# child ethnicity
sns.countplot(x='PreInt_Demos_Fam_Child_Ethnicity', data=train_cat[['PreInt_Demos_Fam_Child_Ethnicity']])
plt.title(f"Distribution of PreInt_Demos_Fam_Child_Ethnicity")
plt.show()

PreInt_Demos_Fam_Child_Ethnicity Mapping: 0= Not Hispanic or Latino; 1= Hispanic or Latino; 2= Decline to specify; 3= Unknown

In [None]:
# child race
sns.countplot(x='PreInt_Demos_Fam_Child_Race', data=train_cat[['PreInt_Demos_Fam_Child_Race']])
plt.title(f"Distribution of PreInt_Demos_Fam_Child_Race")
plt.show()

PreInt_Demos_Fam_Child_Race Mapping: 0= White/Caucasian; 1= Black/African American; 2= Hispanic; 3= Asian; 4= Indian; 5= Native American Indian; 6= American Indian/Alaskan Native; 7= Native Hawaiian/Other Pacific Islander; 8= Two or more races; 9= Other race; 10= Unknown; 11=Choose not to specify

In [None]:
# scan location
sns.countplot(x='MRI_Track_Scan_Location', data=train_cat[['MRI_Track_Scan_Location']])
plt.title(f"Distribution of MRI_Track_Scan_Location")
plt.show()

MRI_Track_Scan_Location Mapping: 1=Staten Island; 2=RUBIC; 3=CBIC; 4=CUNY

In [None]:
# parent 1 education
sns.countplot(x='Barratt_Barratt_P1_Edu', data=train_cat[['Barratt_Barratt_P1_Edu']])
plt.title(f"Distribution of Barratt_Barratt_P1_Edu")
plt.show()

Barratt_Barratt_P1_Edu Mapping: 3=Less than 7th grade; 6=Junior high/Middle school (9th grade); 9=Partial high school (10th or 11th grade); 12=High school graduate; 15=Partial college (at least one year); 18=College education; 21=Graduate degree

In [None]:
# parent 1 occupation
sns.countplot(x='Barratt_Barratt_P1_Occ', data=train_cat[['Barratt_Barratt_P1_Occ']])
plt.title(f"Distribution of Barratt_Barratt_P1_Occ")
plt.show()

Barratt_Barratt_P1_Occ Mapping: 0=Homemaker, stay at home parent.  
5=Day laborer, janitor, house cleaner, farm worker, food counter sales, food preparation worker, busboy.  
10=Garbage collector, short-order cook, cab driver, shoe sales, assembly line workers, masons, baggage porter.   
15=Painter, skilled construction trade, sales clerk, truck driver, cook, sales counter or general office clerk.   
20=Automobile mechanic, typist, locksmith, farmer, carpenter, receptionist, construction laborer, hairdresser.  
25=Machinist, musician, bookkeeper, secretary, insurance sales, cabinet maker, personnel specialist, welder.  
30=Supervisor, librarian, aircraft mechanic, artist and artisan, electrician, administrator, military enlisted personnel, buyer.  
35=Nurse, skilled technician, medical technician, counselor, manager, police and fire personnel, financial manager, physical, occupational, speech therapist.  
40=Mechanical, nuclear, and electrical engineer, educational administrator, veterinarian, military officer, elementary, high school and special education teacher.  
45=Physician, attorney, professor, chemical and aerospace engineer, judge, CEO, senior manager, public official, psychologist, pharmacist, accountant.  

In [None]:
# parent 2 education
train_cat['Barratt_Barratt_P2_Edu'].value_counts()
sns.countplot(x='Barratt_Barratt_P2_Edu', data=train_cat[['Barratt_Barratt_P2_Edu']])
plt.title(f"Distribution of Barratt_Barratt_P2_Edu")
plt.show()

Barratt_Barratt_P2_Edu Mapping: 3=Less than 7th grade; 6=Junior high/Middle school (9th grade); 9=Partial high school (10th or 11th grade); 12=High school graduate; 15=Partial college (at least one year); 18=College education; 21=Graduate degree

In [None]:
# parent 2 occupation
train_cat['Barratt_Barratt_P2_Occ'].value_counts()
sns.countplot(x='Barratt_Barratt_P2_Occ', data=train_cat[['Barratt_Barratt_P2_Occ']])
plt.title(f"Distribution of Barratt_Barratt_P2_Occ")
plt.show()

Barratt_Barratt_P2_Occ Mapping: 0=Homemaker, stay at home parent.
5=Day laborer, janitor, house cleaner, farm worker, food counter sales, food preparation worker, busboy.
10=Garbage collector, short-order cook, cab driver, shoe sales, assembly line workers, masons, baggage porter.
15=Painter, skilled construction trade, sales clerk, truck driver, cook, sales counter or general office clerk.
20=Automobile mechanic, typist, locksmith, farmer, carpenter, receptionist, construction laborer, hairdresser.
25=Machinist, musician, bookkeeper, secretary, insurance sales, cabinet maker, personnel specialist, welder.
30=Supervisor, librarian, aircraft mechanic, artist and artisan, electrician, administrator, military enlisted personnel, buyer.
35=Nurse, skilled technician, medical technician, counselor, manager, police and fire personnel, financial manager, physical, occupational, speech therapist.
40=Mechanical, nuclear, and electrical engineer, educational administrator, veterinarian, military officer, elementary, high school and special education teacher.
45=Physician, attorney, professor, chemical and aerospace engineer, judge, CEO, senior manager, public official, psychologist, pharmacist, accountant.

Observation: samples of parents with lower education level or certain types of occupations are small, may lead to bias 

### Basic Information of Quantitative Data

In [None]:
train_qt.info()

In [None]:
train_qt.describe()

In [None]:
# total score
sns.histplot(train_qt['EHQ_EHQ_Total'], bins=40, kde = True)
plt.suptitle("EHQ_EHQ_Total Distributions")
plt.xlabel('EHQ_EHQ_Total')
plt.ylabel('Frequency Count')
plt.show()

In [None]:
# color vision score
train_qt['ColorVision_CV_Score'].hist(figsize=(12, 10), bins=14)
plt.suptitle("ColorVision_CV_Score Distributions")
plt.xlabel('ColorVision_CV_Score')
plt.ylabel('Frequency Count')
plt.show()

In [None]:
# corporal punishment score
train_qt['APQ_P_APQ_P_CP'].hist(figsize=(12, 10), bins=12)
plt.suptitle("APQ_P_APQ_P_CP Distributions")
plt.xlabel('APQ_P_APQ_P_CP')
plt.ylabel('Frequency Count')
plt.show()

In [None]:
# inconsistent discipline score
train_qt['APQ_P_APQ_P_ID'].hist(figsize=(12, 10), bins=28)
plt.suptitle("APQ_P_APQ_P_ID Distributions")
plt.xlabel('APQ_P_APQ_P_ID')
plt.ylabel('Frequency Count')
plt.show()

In [None]:
# involvement score
train_qt['APQ_P_APQ_P_INV'].hist(figsize=(12, 10), bins=50)
plt.suptitle("APQ_P_APQ_P_INV Distributions")
plt.xlabel('APQ_P_APQ_P_INV')
plt.ylabel('Frequency Count')
plt.show()

In [None]:
# other discipline practices score
train_qt['APQ_P_APQ_P_OPD'].hist(figsize=(12, 10), bins=28)
plt.suptitle("APQ_P_APQ_P_OPD Distributions")
plt.xlabel('APQ_P_APQ_P_OPD')
plt.ylabel('Frequency Count')
plt.show()

In [None]:
# poor monitoring / supervision score
train_qt['APQ_P_APQ_P_PM'].hist(figsize=(12, 10), bins=37)
plt.suptitle("APQ_P_APQ_P_PM Distributions")
plt.xlabel('APQ_P_APQ_P_PM')
plt.ylabel('Frequency Count')
plt.show()

In [None]:
# positive parenting score
train_qt['APQ_P_APQ_P_PP'].hist(figsize=(12, 10), bins=30)
plt.suptitle("APQ_P_APQ_P_PP Distributions")
plt.xlabel('APQ_P_APQ_P_PP')
plt.ylabel('Frequency Count')
plt.show()

In [None]:
# conduct problems scale
train_qt['SDQ_SDQ_Conduct_Problems'].hist(figsize=(12, 10), bins=10)
plt.suptitle("SDQ_SDQ_Conduct_Problems Distributions")
plt.xlabel('SDQ_SDQ_Conduct_Problems')
plt.ylabel('Frequency Count')
plt.show()

In [None]:
# total difficulties score
train_qt['SDQ_SDQ_Difficulties_Total'].hist(figsize=(12, 10), bins=34)
plt.suptitle("SDQ_SDQ_Difficulties_Total Distributions")
plt.xlabel('SDQ_SDQ_Difficulties_Total')
plt.ylabel('Frequency Count')
plt.show()

In [None]:
# emotional problems score
train_qt['SDQ_SDQ_Emotional_Problems'].hist(figsize=(12, 10), bins=10)
plt.suptitle("SDQ_SDQ_Emotional_Problems Distributions")
plt.xlabel('SDQ_SDQ_Emotional_Problems')
plt.ylabel('Frequency Count')
plt.show()

In [None]:
# externalizing score
train_qt['SDQ_SDQ_Externalizing'].hist(figsize=(12, 10), bins=20)
plt.suptitle("SDQ_SDQ_Externalizing Distributions")
plt.xlabel('SDQ_SDQ_Externalizing')
plt.ylabel('Frequency Count')
plt.show()

In [None]:
# generating impact score
train_qt['SDQ_SDQ_Generating_Impact'].hist(figsize=(12, 10), bins=10)
plt.suptitle("SDQ_SDQ_Generating_Impact Distributions")
plt.xlabel('SDQ_SDQ_Generating_Impact')
plt.ylabel('Frequency Count')
plt.show()

In [None]:
# hyperactivity scale
train_qt['SDQ_SDQ_Hyperactivity'].hist(figsize=(12, 10), bins=10)
plt.suptitle("SDQ_SDQ_Hyperactivity Distributions")
plt.xlabel('SDQ_SDQ_Hyperactivity')
plt.ylabel('Frequency Count')
plt.show()

In [None]:
# internalizing score
train_qt['SDQ_SDQ_Internalizing'].hist(figsize=(12, 10), bins=17)
plt.suptitle("SDQ_SDQ_Internalizing Distributions")
plt.xlabel('SDQ_SDQ_Internalizing')
plt.ylabel('Frequency Count')
plt.show()

In [None]:
# peer problems scale
train_qt['SDQ_SDQ_Peer_Problems'].hist(figsize=(12, 10), bins=9)
plt.suptitle("SDQ_SDQ_Peer_Problems Distributions")
plt.xlabel('SDQ_SDQ_Peer_Problems')
plt.ylabel('Frequency Count')
plt.show()

In [None]:
# prosocial scale
train_qt['SDQ_SDQ_Prosocial'].hist(figsize=(12, 10), bins=10)
plt.suptitle("SDQ_SDQ_Prosocial Distributions")
plt.xlabel('SDQ_SDQ_Prosocial')
plt.ylabel('Frequency Count')
plt.show()

In [None]:
# age of scan
train_qt['MRI_Track_Age_at_Scan'].hist(figsize=(12, 10), bins=np.arange(0, 22, 1))
plt.suptitle("MRI_Track_Age_at_Scan Distributions")
plt.xlabel('MRI_Track_Age_at_Scan')
plt.ylabel('Frequency Count')
plt.show()

### Basic Information of Functional Connectome Matrices

In [None]:
train_fcm.info()

In [None]:
train_fcm.describe()

In [None]:
train_fcm.isna().sum().sum()

### Basic Information of Solutions

In [None]:
train_sol.info()

In [None]:
train_sol.describe()

In [None]:
# ADHD outcome
train_sol['ADHD_Outcome'].value_counts().plot(kind='bar')
plt.title('ADHD Outcome')
plt.xlabel('Outcome (0 = No, 1 = Yes)')
plt.ylabel('Count')
plt.show()

In [None]:
# ADHD outcome
train_sol['Sex_F'].value_counts().plot(kind='bar')
plt.title('Sex_F')
plt.xlabel('Sex (0 = Male, 1 = Female)')
plt.ylabel('Count')
plt.show()

Observation: there exists gender inbalance.

In [None]:
# ADHD outcome by gender
sns.countplot(data=train_sol, x='Sex_F', hue=train_sol['ADHD_Outcome'])
plt.title('ADHD Outcome by Gender')
plt.show()

Observation: A higher ADHD diagnosed rate is found is males, which may not be strongly reliable since the sample size of female candidates is only half of male candidates.

### Correlation Information

#### Quantitive Data vs Solutions

In [None]:
train_qt_copy = train_qt.copy()
train_qt_copy['ADHD_Outcome'] = train_sol['ADHD_Outcome']
train_qt_copy['Sex_F'] = train_sol['Sex_F']

In [None]:
corr_matrix = train_qt_copy.corr()
sns.heatmap(corr_matrix[18:],
    cmap=sns.diverging_palette(220, 10, as_cmap=True),
    vmin=-1.0, vmax=1.0)

In [None]:
# Laterality Index (Score) vs ADHD Outcome
plt.figure(figsize=(8, 6))
sns.boxplot(x='ADHD_Outcome', y='EHQ_EHQ_Total', data=train_qt_copy)
plt.title('Laterality Index (Score) vs ADHD Outcome')
plt.xlabel('ADHD Outcome')
plt.ylabel('Laterality Index (Score)')
plt.show()

Observation: ADHD children tends to have a higher median on the laterality index (score) than non-ADHD children. In addition, the laterality indexes (scores) for ADHD children are more concentrated to positive range. 

In [None]:
# Poor Monitoring/Supervision Score vs ADHD Outcome
plt.figure(figsize=(8, 6))
sns.boxplot(x='ADHD_Outcome', y='APQ_P_APQ_P_PM', data=train_qt_copy)
plt.title('Poor Monitoring/Supervision Score vs ADHD Outcome')
plt.xlabel('ADHD Outcome')
plt.ylabel('Poor Monitoring/Supervision Score')
plt.show()

In [None]:
# Positive Parenting Score vs ADHD Outcome
plt.figure(figsize=(8, 6))
sns.boxplot(x='ADHD_Outcome', y='APQ_P_APQ_P_PP', data=train_qt_copy)
plt.title('Positive Parenting Score vs ADHD Outcome')
plt.xlabel('ADHD Outcome')
plt.ylabel('Positive Parenting Score')
plt.show()

Observation: The correlation between parenting scores and ADHD outcome are not very strong.

In [None]:
# Emotional Problems Scale vs ADHD Outcome
plt.figure(figsize=(8, 6))
sns.boxplot(x='ADHD_Outcome', y='SDQ_SDQ_Emotional_Problems', data=train_qt_copy)
plt.title('Emotional Problems Scale vs ADHD Outcome')
plt.xlabel('ADHD Outcome')
plt.ylabel('Emotional Problems Scale')
plt.show()

Observation: Candidates diagnosed with ADHD tend to have a higher median on the SDQ Emotional Problems scale compared to non-ADHD candidates. Additionally, ADHD group displays greater variability, as evidenced by its extended range. This suggests that emotional problems are not only more pronounced but also more diverse within the ADHD group.

In [None]:
# Emotional Problems Scale vs Gender
plt.figure(figsize=(8, 6))
sns.boxplot(x='Sex_F', y='SDQ_SDQ_Emotional_Problems', data=train_qt_copy)
plt.title('Emotional Problems Scale vs ADHD Outcome')
plt.xlabel('Sex_F')
plt.ylabel('Emotional Problems Scale')
plt.show()

Observation: The middle 50% SDQ Emotional Problems scale is higher in female candidates than male candidates. Additionally, female group displays greater variability, as evidenced by its extended range. This suggests that emotional problems are more diverse within females.

#### Categorical vs Solutions

In [None]:
train_cat_copy = train_cat.copy()
train_cat_copy['ADHD_Outcome'] = train_sol['ADHD_Outcome']
train_cat_copy['ADHD_Outcome'] = train_sol['Sex_F']

In [None]:
corr_matrix = train_cat_copy.corr()
sns.heatmap(corr_matrix[9:],
    cmap=sns.diverging_palette(220, 10, as_cmap=True),
    vmin=-1.0, vmax=1.0)

In [None]:
# Child Race vs ADHD outcome
sns.countplot(data=train_cat, x='PreInt_Demos_Fam_Child_Race', hue=train_sol['ADHD_Outcome'])
plt.title('Child Race vs ADHD Outcome')
plt.show()

In [None]:
adhd_percentages = train_cat_copy.groupby('PreInt_Demos_Fam_Child_Race')['ADHD_Outcome'].mean()
print(adhd_percentages)

Observation: The samples from races with index 4,7,10,11 are too small. Smaller sample sizes can lead to greater variability and make the proportions more susceptible to outliers. Children with Black/African American races tends to have a significantly higher diagnose rate.

In [None]:
# Parent1 Education Level vs ADHD outcome
sns.countplot(data=train_cat, x='Barratt_Barratt_P1_Edu', hue=train_sol['ADHD_Outcome'])
plt.title('Parent1 Education Level vs ADHD Outcome')
plt.show()

In [None]:
parent1_edu_adhd_percentages = train_cat_copy.groupby('Barratt_Barratt_P1_Edu')['ADHD_Outcome'].mean()
print(parent1_edu_adhd_percentages)

In [None]:
# Parent2 Education Level vs ADHD outcome
sns.countplot(data=train_cat, x='Barratt_Barratt_P2_Edu', hue=train_sol['ADHD_Outcome'])
plt.title('Parent1 Education Level vs ADHD Outcome')
plt.show()

In [None]:
parent2_edu_adhd_percentages = train_cat_copy.groupby('Barratt_Barratt_P2_Edu')['ADHD_Outcome'].mean()
print(parent2_edu_adhd_percentages)

In [None]:
# Both Parents Education Levels vs ADHD outcome
train_cat_copy['Edu_Sum'] = train_cat_copy['Barratt_Barratt_P1_Edu'] + train_cat_copy['Barratt_Barratt_P2_Edu']
sns.countplot(data=train_cat_copy, x='Edu_Sum', hue=train_sol['ADHD_Outcome'])
plt.title('Both Parents Education Levels vs ADHD Outcome')
plt.show()

In [None]:
parents_edu_adhd_percentages = train_cat_copy.groupby('Edu_Sum')['ADHD_Outcome'].mean()
print(parents_edu_adhd_percentages)

Observation: The proportion of diagnose various a lot within differnt education level groups, especially for groups with smaller indexes, since the sample sizes are too small in these groups.

# Step 3: Data Pre-processing

### Encode Categorical Training Data (One - Hot Coding)

In [6]:
# change integer into category
for col in train_cat.select_dtypes(include=['int', 'float']).columns:
    train_cat[col] = train_cat[col].astype('category')
    train_cat[col] = train_cat[col].cat.rename_categories(lambda x: int(x) if not pd.isna(x) else x)

In [7]:
# encode
columns_to_encode = train_cat.columns[1:].tolist()
train_encoded = pd.get_dummies(train_cat[columns_to_encode], drop_first=True)
train_encoded = train_encoded.applymap(lambda x: 1 if x is True else (0 if x is False else x))

  train_encoded = train_encoded.applymap(lambda x: 1 if x is True else (0 if x is False else x))


In [8]:
# encodes dataframe
train_cat_encoded = pd.concat([train_cat.drop(columns=columns_to_encode), train_encoded], axis=1)
train_cat_encoded.head()

Unnamed: 0,participant_id,Basic_Demos_Enroll_Year_2016,Basic_Demos_Enroll_Year_2017,Basic_Demos_Enroll_Year_2018,Basic_Demos_Enroll_Year_2019,Basic_Demos_Enroll_Year_2020,Basic_Demos_Study_Site_2,Basic_Demos_Study_Site_3,Basic_Demos_Study_Site_4,PreInt_Demos_Fam_Child_Ethnicity_1,...,Barratt_Barratt_P2_Edu_21,Barratt_Barratt_P2_Occ_5,Barratt_Barratt_P2_Occ_10,Barratt_Barratt_P2_Occ_15,Barratt_Barratt_P2_Occ_20,Barratt_Barratt_P2_Occ_25,Barratt_Barratt_P2_Occ_30,Barratt_Barratt_P2_Occ_35,Barratt_Barratt_P2_Occ_40,Barratt_Barratt_P2_Occ_45
0,00aIpNTbG5uh,0,0,0,1,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,00fV0OyyoLfw,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,04X1eiS79T4B,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,05ocQutkURd6,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,06YUNBA9ZRLq,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Encode Categorical Testing Data (One - Hot Coding)

In [9]:
# load test data
test_cat = pd.read_excel('TEST_CATEGORICAL.xlsx')
test_cat = pd.DataFrame(test_cat)
test_cat.head()

Unnamed: 0,participant_id,Basic_Demos_Enroll_Year,Basic_Demos_Study_Site,PreInt_Demos_Fam_Child_Ethnicity,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ
0,Cfwaf5FX7jWK,2022,4,0.0,0.0,4,21.0,30.0,18.0,30.0
1,vhGrzmvA3Hjq,2023,4,0.0,0.0,4,21.0,45.0,,30.0
2,ULliyEXjy4OV,2022,4,0.0,0.0,4,21.0,40.0,18.0,40.0
3,LZfeAb1xMtql,2022,4,0.0,0.0,3,21.0,45.0,21.0,45.0
4,EnFOUv0YK1RG,2022,4,2.0,0.0,4,18.0,0.0,21.0,45.0


In [10]:
test_qt = pd.read_excel('TEST_QUANTITATIVE_METADATA.xlsx')
test_qt = pd.DataFrame(test_qt)
test_qt.head()

Unnamed: 0,participant_id,EHQ_EHQ_Total,ColorVision_CV_Score,APQ_P_APQ_P_CP,APQ_P_APQ_P_ID,APQ_P_APQ_P_INV,APQ_P_APQ_P_OPD,APQ_P_APQ_P_PM,APQ_P_APQ_P_PP,SDQ_SDQ_Conduct_Problems,SDQ_SDQ_Difficulties_Total,SDQ_SDQ_Emotional_Problems,SDQ_SDQ_Externalizing,SDQ_SDQ_Generating_Impact,SDQ_SDQ_Hyperactivity,SDQ_SDQ_Internalizing,SDQ_SDQ_Peer_Problems,SDQ_SDQ_Prosocial,MRI_Track_Age_at_Scan
0,Cfwaf5FX7jWK,60.03,14.0,5.0,16.0,41.0,19.0,11.0,26.0,2.0,12.0,3.0,9.0,2.0,7.0,3.0,0.0,8.0,8.992813
1,vhGrzmvA3Hjq,86.71,12.0,3.0,13.0,43.0,18.0,15.0,28.0,2.0,16.0,8.0,5.0,7.0,3.0,11.0,3.0,9.0,12.324093
2,ULliyEXjy4OV,26.68,13.0,3.0,14.0,36.0,16.0,14.0,25.0,1.0,7.0,1.0,6.0,1.0,5.0,1.0,0.0,9.0,7.770933
3,LZfeAb1xMtql,93.38,13.0,3.0,19.0,41.0,17.0,18.0,27.0,4.0,15.0,4.0,10.0,8.0,6.0,5.0,1.0,6.0,9.304814
4,EnFOUv0YK1RG,-93.38,14.0,3.0,13.0,42.0,19.0,16.0,28.0,2.0,18.0,6.0,12.0,5.0,10.0,6.0,0.0,10.0,8.26135


In [11]:
test_fcm = pd.read_csv('TEST_FUNCTIONAL_CONNECTOME_MATRICES.csv')
test_fcm.head()

Unnamed: 0,participant_id,0throw_1thcolumn,0throw_2thcolumn,0throw_3thcolumn,0throw_4thcolumn,0throw_5thcolumn,0throw_6thcolumn,0throw_7thcolumn,0throw_8thcolumn,0throw_9thcolumn,...,195throw_196thcolumn,195throw_197thcolumn,195throw_198thcolumn,195throw_199thcolumn,196throw_197thcolumn,196throw_198thcolumn,196throw_199thcolumn,197throw_198thcolumn,197throw_199thcolumn,198throw_199thcolumn
0,Cfwaf5FX7jWK,0.54848,0.713607,0.557319,0.524369,0.693364,0.770032,0.724406,0.390118,0.547912,...,0.080423,-0.054581,-0.088163,-0.028574,0.444847,0.350149,-0.012601,0.66575,0.560565,0.555732
1,vhGrzmvA3Hjq,0.42774,0.363022,0.402862,0.363003,0.534558,0.345347,0.409471,0.303328,0.402515,...,0.198009,-0.000724,0.083122,0.033043,0.687497,0.306229,0.717485,0.461809,0.559632,0.350027
2,ULliyEXjy4OV,0.139572,0.390106,-0.087041,0.196852,0.088148,0.023843,0.381782,0.068979,0.377488,...,0.051319,0.02363,-0.056819,0.117396,0.576086,0.517831,0.527044,0.605038,0.609856,0.750987
3,LZfeAb1xMtql,0.133561,0.778326,0.416355,0.47184,0.56846,0.63366,0.501113,0.345461,0.467943,...,0.046183,-0.238962,0.121868,-0.26097,0.646818,0.594902,0.608156,0.595459,0.683189,0.542296
4,EnFOUv0YK1RG,0.126699,0.575446,0.509422,0.363193,0.427544,0.449924,0.451796,0.223927,0.298248,...,0.315734,0.002234,0.290791,0.344149,0.480214,0.539824,0.447322,0.293088,0.148529,0.539823


In [12]:
# encoding
for col in test_cat.select_dtypes(include=['int', 'float']).columns:
    test_cat[col] = test_cat[col].astype('category')
    test_cat[col] = test_cat[col].cat.rename_categories(lambda x: int(x) if not pd.isna(x) else x)

test_encoded = pd.get_dummies(test_cat[columns_to_encode], drop_first=True)
test_encoded = test_encoded.applymap(lambda x: 1 if x is True else (0 if x is False else x))

# Add missing columns with 0 values
missing_cols = set(train_encoded.columns) - set(test_encoded.columns)
for col in missing_cols:
    test_encoded[col] = 0 

# Ensure test_encoded columns are in the same order as train_encoded
test_encoded = test_encoded.reindex(columns=train_encoded.columns, fill_value=0)

test_cat_encoded = pd.concat([test_cat.drop(columns=columns_to_encode), test_encoded], axis=1)
test_cat_encoded.head()

  test_encoded = test_encoded.applymap(lambda x: 1 if x is True else (0 if x is False else x))


Unnamed: 0,participant_id,Basic_Demos_Enroll_Year_2016,Basic_Demos_Enroll_Year_2017,Basic_Demos_Enroll_Year_2018,Basic_Demos_Enroll_Year_2019,Basic_Demos_Enroll_Year_2020,Basic_Demos_Study_Site_2,Basic_Demos_Study_Site_3,Basic_Demos_Study_Site_4,PreInt_Demos_Fam_Child_Ethnicity_1,...,Barratt_Barratt_P2_Edu_21,Barratt_Barratt_P2_Occ_5,Barratt_Barratt_P2_Occ_10,Barratt_Barratt_P2_Occ_15,Barratt_Barratt_P2_Occ_20,Barratt_Barratt_P2_Occ_25,Barratt_Barratt_P2_Occ_30,Barratt_Barratt_P2_Occ_35,Barratt_Barratt_P2_Occ_40,Barratt_Barratt_P2_Occ_45
0,Cfwaf5FX7jWK,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,vhGrzmvA3Hjq,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,ULliyEXjy4OV,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,LZfeAb1xMtql,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,EnFOUv0YK1RG,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1


In [13]:
# check whether the columns are the same
columns_A = set(train_cat_encoded.columns)
columns_B = set(test_cat_encoded.columns)
extra_columns = columns_A - columns_B
print(extra_columns)

set()


### Merge Training Data

In [14]:
train_cat_fcm = pd.merge(train_cat_encoded, train_fcm, on = 'participant_id')
train = pd.merge(train_cat_fcm, train_qt, on = 'participant_id')

In [15]:
train.head()

Unnamed: 0,participant_id,Basic_Demos_Enroll_Year_2016,Basic_Demos_Enroll_Year_2017,Basic_Demos_Enroll_Year_2018,Basic_Demos_Enroll_Year_2019,Basic_Demos_Enroll_Year_2020,Basic_Demos_Study_Site_2,Basic_Demos_Study_Site_3,Basic_Demos_Study_Site_4,PreInt_Demos_Fam_Child_Ethnicity_1,...,SDQ_SDQ_Conduct_Problems,SDQ_SDQ_Difficulties_Total,SDQ_SDQ_Emotional_Problems,SDQ_SDQ_Externalizing,SDQ_SDQ_Generating_Impact,SDQ_SDQ_Hyperactivity,SDQ_SDQ_Internalizing,SDQ_SDQ_Peer_Problems,SDQ_SDQ_Prosocial,MRI_Track_Age_at_Scan
0,00aIpNTbG5uh,0,0,0,1,0,0,0,1,1,...,3.0,17.0,4.0,11.0,5.0,8.0,6.0,2.0,9.0,14.274127
1,00fV0OyyoLfw,0,1,0,0,0,0,0,0,0,...,5.0,20.0,4.0,13.0,5.0,8.0,7.0,3.0,8.0,
2,04X1eiS79T4B,0,1,0,0,0,0,0,0,1,...,3.0,24.0,7.0,10.0,10.0,7.0,14.0,7.0,7.0,13.463381
3,05ocQutkURd6,0,0,1,0,0,0,0,0,0,...,0.0,5.0,0.0,3.0,0.0,3.0,2.0,2.0,6.0,9.572553
4,06YUNBA9ZRLq,0,0,1,0,0,0,0,0,0,...,6.0,23.0,7.0,15.0,8.0,9.0,8.0,1.0,4.0,6.654574


### Merge Testing Data

In [16]:
test_cat_fcm = pd.merge(test_cat_encoded, test_fcm, on = 'participant_id')
test = pd.merge(test_cat_fcm, test_qt, on = 'participant_id')

In [17]:
test.head()

Unnamed: 0,participant_id,Basic_Demos_Enroll_Year_2016,Basic_Demos_Enroll_Year_2017,Basic_Demos_Enroll_Year_2018,Basic_Demos_Enroll_Year_2019,Basic_Demos_Enroll_Year_2020,Basic_Demos_Study_Site_2,Basic_Demos_Study_Site_3,Basic_Demos_Study_Site_4,PreInt_Demos_Fam_Child_Ethnicity_1,...,SDQ_SDQ_Conduct_Problems,SDQ_SDQ_Difficulties_Total,SDQ_SDQ_Emotional_Problems,SDQ_SDQ_Externalizing,SDQ_SDQ_Generating_Impact,SDQ_SDQ_Hyperactivity,SDQ_SDQ_Internalizing,SDQ_SDQ_Peer_Problems,SDQ_SDQ_Prosocial,MRI_Track_Age_at_Scan
0,Cfwaf5FX7jWK,0,0,0,0,0,0,0,0,0,...,2.0,12.0,3.0,9.0,2.0,7.0,3.0,0.0,8.0,8.992813
1,vhGrzmvA3Hjq,0,0,0,0,0,0,0,0,0,...,2.0,16.0,8.0,5.0,7.0,3.0,11.0,3.0,9.0,12.324093
2,ULliyEXjy4OV,0,0,0,0,0,0,0,0,0,...,1.0,7.0,1.0,6.0,1.0,5.0,1.0,0.0,9.0,7.770933
3,LZfeAb1xMtql,0,0,0,0,0,0,0,0,0,...,4.0,15.0,4.0,10.0,8.0,6.0,5.0,1.0,6.0,9.304814
4,EnFOUv0YK1RG,0,0,0,0,0,0,0,0,0,...,2.0,18.0,6.0,12.0,5.0,10.0,6.0,0.0,10.0,8.26135


### Fill-in NAs

In [18]:
print(train.isna().sum())

participant_id                    0
Basic_Demos_Enroll_Year_2016      0
Basic_Demos_Enroll_Year_2017      0
Basic_Demos_Enroll_Year_2018      0
Basic_Demos_Enroll_Year_2019      0
                               ... 
SDQ_SDQ_Hyperactivity             9
SDQ_SDQ_Internalizing             9
SDQ_SDQ_Peer_Problems             9
SDQ_SDQ_Prosocial                 9
MRI_Track_Age_at_Scan           360
Length: 19972, dtype: int64


In [19]:
print(test.isna().sum())

participant_id                   0
Basic_Demos_Enroll_Year_2016     0
Basic_Demos_Enroll_Year_2017     0
Basic_Demos_Enroll_Year_2018     0
Basic_Demos_Enroll_Year_2019     0
                                ..
SDQ_SDQ_Hyperactivity           30
SDQ_SDQ_Internalizing           30
SDQ_SDQ_Peer_Problems           30
SDQ_SDQ_Prosocial               30
MRI_Track_Age_at_Scan            0
Length: 19972, dtype: int64


In [20]:
# fill with avg for training
for col in train.columns:
    if train[col].isna().sum() > 0:  
        if train[col].dtype in ['float64', 'int64']:  
            train[col] = train[col].fillna(train[col].mean())  
print(train.isna().sum().sum())

0


In [21]:
# fill with avg for test
for col in test.columns:
    if test[col].isna().sum() > 0:  
        if test[col].dtype in ['float64', 'int64']:  
            test[col] = test[col].fillna(test[col].mean())  
            
print(train.isna().sum().sum())

0


# Step 4: Feature Selection and Reduction

### Set X, Y Variables

In [49]:
X_train = train.drop(columns = ['participant_id'])
Y_train = train_sol.drop(columns = ['participant_id'])
X_test = test.drop(columns = ['participant_id'])

In [50]:
X_train.to_csv('X_train.csv', index=False)
Y_train.to_csv('Y_train.csv', index=False)
X_test.to_csv('X_test.csv', index=False)

### Dimension Reduction

#### Using PCA

In [51]:
from sklearn.decomposition import PCA

In [52]:
# standarization
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [53]:
# apply PCA
pca = PCA(n_components=300, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_train_pca = pd.DataFrame(X_train_pca, columns=[f"PC_{i+1}" for i in range(300)])

In [54]:
# handle testing data
X_test_scaled = scaler.transform(X_test) 
X_test_pca = pca.transform(X_test_scaled)
X_test_pca = pd.DataFrame(X_test_pca, columns=[f"PC_{i+1}" for i in range(300)])

In [55]:
X_train_pca.head()

Unnamed: 0,PC_1,PC_2,PC_3,PC_4,PC_5,PC_6,PC_7,PC_8,PC_9,PC_10,...,PC_291,PC_292,PC_293,PC_294,PC_295,PC_296,PC_297,PC_298,PC_299,PC_300
0,-26.339203,-27.243252,7.868414,8.058895,5.694577,1.079606,-14.752361,-7.394403,-5.968838,8.274657,...,-3.432478,-1.833952,0.133211,2.26315,0.23731,-6.762984,1.534908,-0.824387,-3.537631,-2.179431
1,-18.76294,-22.289053,-0.61907,8.944426,-5.221229,6.41975,0.123195,1.180756,3.418873,-10.951874,...,-1.800813,1.02165,10.657346,4.876396,3.488631,1.34447,-4.39755,2.317746,3.271732,0.488517
2,-30.373937,-28.125211,12.391833,-22.567269,6.277987,-7.603991,-6.419119,20.331909,7.994128,-3.200669,...,-4.012935,-0.270318,0.502623,1.356906,0.77868,3.225318,-0.099661,-3.496665,-0.642613,-2.590956
3,-16.897256,-37.439588,5.52611,17.926885,-13.946446,-1.964109,-16.182728,1.301811,6.863134,4.437317,...,5.319254,-2.558152,-3.095462,9.001796,4.787021,-0.771864,-3.814739,-7.628804,0.813089,-0.836619
4,12.554654,-23.709594,-16.291883,16.155963,-10.449767,-7.567259,-19.68493,24.211969,8.241264,8.568894,...,-6.151301,-0.27555,-3.717248,-2.785805,1.463325,0.234121,0.654377,2.378198,-2.692094,2.813503


In [56]:
X_test_pca.head()

Unnamed: 0,PC_1,PC_2,PC_3,PC_4,PC_5,PC_6,PC_7,PC_8,PC_9,PC_10,...,PC_291,PC_292,PC_293,PC_294,PC_295,PC_296,PC_297,PC_298,PC_299,PC_300
0,16.356892,-12.538896,22.783273,6.888916,7.54646,4.181382,4.59431,-8.056939,8.936694,21.145523,...,-3.17196,-1.882782,-0.218939,1.944184,2.481831,-1.894065,2.817681,-0.744162,-0.094929,-3.507715
1,-9.151662,-22.926872,12.714822,-18.758953,19.173206,3.442495,3.408281,-11.135979,-9.270568,-10.154949,...,0.563609,2.191283,-0.251642,-0.006796,-0.820546,-0.77162,3.100774,2.044627,2.185971,4.06494
2,10.870549,11.898799,-16.805128,-1.557039,-15.867192,0.927583,-6.638887,-12.702801,-7.173258,-6.750784,...,0.357754,1.363342,1.541419,-2.550857,4.097598,-0.998787,-0.292918,5.584846,1.687718,-0.392405
3,14.263327,-21.136999,-2.228268,-4.658938,-2.978643,-4.333417,-18.287837,-8.415411,6.535633,0.134165,...,0.03122,-0.554693,1.836577,0.486553,2.845753,-0.65735,1.521727,1.208765,-2.396945,-2.46189
4,31.119625,-11.501271,-5.843186,1.9488,-7.918959,4.355121,8.272574,10.803943,1.196001,1.999562,...,-2.527769,1.132403,-3.011622,-1.403938,-0.455596,-0.110453,-0.167067,2.037502,3.756784,-2.52849


#### Using Coefficients

In [34]:
from sklearn.linear_model import LogisticRegression

In [None]:
# use logistic model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, Y_train['Sex_F'])

# Get coefficients for gender prediction
coefficients = pd.Series(model.coef_[0], index=X_train.columns)

# Select top features for gender prediction
top_features = coefficients.abs().nlargest(10)
print(top_features)

#Plot the top 10 coefficents for gender outcome
plt.figure(figsize=(10,6))
top_features.sort_values().plot(kind='barh', color='skyblue')
plt.title('Top 10 Features for Gender Outcome')
plt.ylabel('Features')
plt.xlabel('Absolute Coefficient Value')
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# target on ADHD outcome
# use logistic model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, Y_train['ADHD_Outcome'])

# Get coefficients for gender prediction
coefficients = pd.Series(model.coef_[0], index=X_train.columns)

# Select top features for gender prediction
top_features = coefficients.abs().nlargest(10)
print(top_features)

#Plot the top 10 coefficents for ADHD outcome
plt.figure(figsize=(10,6))
top_features.sort_values().plot(kind='barh', color='skyblue')
plt.title('Top 10 Features for ADHD Outcome')
plt.ylabel('Features')
plt.xlabel('Absolute Coefficient Value')
plt.xticks(rotation=45, ha='right')
plt.show()

#### Using L1 regularization

In [None]:
# for gender
model = LogisticRegression(penalty='l1', solver='liblinear')
model.fit(X_train, Y_train['Sex_F'])

selected_features_gender = X_train.columns[model.coef_[0] != 0]
print(selected_features_gender)

In [None]:
# for ADHD
model = LogisticRegression(penalty='l1', solver='liblinear')
model.fit(X_train, Y_train['ADHD_Outcome'])

selected_features_ADHD = X_train.columns[model.coef_[0] != 0]
print(selected_features_ADHD)

In [None]:
# keep the common selected features
common_features = list(set(selected_features_ADHD) & set(selected_features_gender))
X_train_reduced = X_train[common_features]
X_test_reduced = X_test[common_features]

## Step 5: Modeling

In [75]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.utils.class_weight import compute_sample_weight
from xgboost import XGBClassifier

### Random Forest (unweighted)

In [None]:
rf_model = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42))
rf_model.fit(X_train, Y_train)
rf_preds = rf_model.predict(X_test)

In [None]:
results_rf = pd.DataFrame({
    "ADHD_Outcome": rf_preds[:, 0], 
    "Sex_F": rf_preds[:, 1]  
})
results_rf.insert(0, test.columns[0], test.iloc[:, 0])

In [None]:
results_rf.describe()

### Logistic Regression (unweighted)

In [58]:
logistic_model = MultiOutputClassifier(LogisticRegression(max_iter=1000, random_state=42))
logistic_model.fit(X_train, Y_train)
logistic_preds = logistic_model.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [61]:
results_logistic = pd.DataFrame({
    "ADHD_Outcome": logistic_preds[:, 0], 
    "Sex_F": logistic_preds[:, 1]  
})
results_logistic.insert(0, test.columns[0], test.iloc[:, 0])
results_logistic.set_index(results_logistic.columns[0], inplace=True)

In [62]:
results_logistic.head()

Unnamed: 0_level_0,ADHD_Outcome,Sex_F
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1
Cfwaf5FX7jWK,1,0
vhGrzmvA3Hjq,1,1
ULliyEXjy4OV,1,0
LZfeAb1xMtql,1,1
EnFOUv0YK1RG,1,1


In [63]:
results_logistic.to_csv("log_pred.csv", index = 'False')

### XGBoost (unweighted)

In [76]:
xgb_model = MultiOutputClassifier(XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))
xgb_model.fit(X_train, Y_train)
xgb_preds = xgb_model.predict(X_test)

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



In [79]:
results_xgb = pd.DataFrame({
    "ADHD_Outcome": xgb_preds[:, 0], 
    "Sex_F": xgb_preds[:, 1]  
})
results_xgb.insert(0, test.columns[0], test.iloc[:, 0])
results_xgb.set_index(results_xgb.columns[0], inplace=True)

In [80]:
results_xgb.to_csv("xgb_pred.csv", index = 'False')

In [78]:
results_xgb.describe()

Unnamed: 0,ADHD_Outcome,Sex_F
count,304.0,304.0
mean,0.898026,0.098684
std,0.303113,0.298729
min,0.0,0.0
25%,1.0,0.0
50%,1.0,0.0
75%,1.0,0.0
max,1.0,1.0


### Random Forest (weighted)

In [65]:
sample_weights = [2 if y[1] == 1 and y[0] == 1 else 1 for y in Y_train.values]

In [None]:
weighted_rf_model = RandomForestClassifier(random_state=42)
weighted_rf_model.fit(X_train, Y_train, sample_weight=sample_weights)
weighted_rf_preds = weighted_rf_model.predict(X_test)

In [None]:
weighted_results_rf = pd.DataFrame({
    "ADHD_Outcome": weighted_rf_preds[:, 0], 
    "Sex_F": weighted_rf_preds[:, 1]  
})
weighted_results_rf.insert(0, test.columns[0], test.iloc[:, 0])

In [None]:
weighted_results_rf.describe()

### Logistic Regression (weighted)

In [66]:
weighted_logistic_model = MultiOutputClassifier(LogisticRegression(max_iter=1000, random_state=42))
weighted_logistic_model.fit(X_train, Y_train, sample_weight=sample_weights)
weighted_logistic_pred = weighted_logistic_model.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [68]:
weighted_results_logistic = pd.DataFrame({
    "ADHD_Outcome": weighted_logistic_pred[:, 0], 
    "Sex_F": weighted_logistic_pred[:, 1]  
})

weighted_results_logistic.insert(0, test.columns[0], test.iloc[:, 0])
weighted_results_logistic.set_index(weighted_results_logistic.columns[0], inplace=True)

In [None]:
weighted_results_logistic.describe()

In [69]:
weighted_results_logistic.to_csv("weighted_log_pred.csv", index = 'False')

### XGBoost (weighted)

In [None]:
weighted_xgb_model = MultiOutputClassifier(XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))
weighted_xgb_model.fit(X_train, Y_train, sample_weight=sample_weights)
weighted_xgb_preds = weighted_xgb_model.predict(X_test)

In [None]:
weighted_results_xgb = pd.DataFrame({
    "ADHD_Outcome": weighted_xgb_preds[:, 0], 
    "Sex_F": weighted_xgb_preds[:, 1]  
})
weighted_results_xgb.insert(0, test.columns[0], test.iloc[:, 0])

In [None]:
weighted_results_xgb.describe()

### Logistic (separate)

In [82]:
Y_train_ADHD = Y_train.ADHD_Outcome
Y_train_gender = Y_train.Sex_F

In [83]:
model = LogisticRegression()
model.fit(X_train, Y_train_ADHD)
predictions_ADHD = model.predict(X_test)
predictions = pd.DataFrame(predictions_ADHD, columns=['ADHD_Outcome'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [84]:
model = LogisticRegression()
model.fit(X_train, Y_train_gender)
predictions_gender = model.predict(X_test)
predictions['Sex_F'] = predictions_gender

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [86]:
predictions.describe()

Unnamed: 0,ADHD_Outcome,Sex_F
count,304.0,304.0
mean,0.6875,0.266447
std,0.464277,0.44283
min,0.0,0.0
25%,0.0,0.0
50%,1.0,0.0
75%,1.0,1.0
max,1.0,1.0


In [89]:
predictions.insert(0, test.columns[0], test.iloc[:, 0])
predictions.set_index(weighted_results_logistic.columns[0], inplace=True)

In [90]:
predictions.to_csv("separate_log_pred.csv", index = 'False')

### Deep Learning

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

def build_model(input_dim):
    # input
    inputs = Input(shape=(input_dim,), name="Input_Layer")

    # network
    x = Dense(512, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(inputs)
    x = BatchNormalization()(x)
    x = Dropout(0.3)(x)
    
    x = Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(x)
    x = BatchNormalization()(x)
    x = Dropout(0.3)(x)

    # gender
    gender_output = Dense(1, activation='sigmoid', name="Gender_Output")(x)

    # ADHD 预测分支
    adhd_output = Dense(1, activation='sigmoid', name="ADHD_Output")(x)

    # 构建模型
    model = Model(inputs=inputs, outputs=[gender_output, adhd_output])
    return model

# 构建模型
input_dim = 19900
model = build_model(input_dim)

# 编译模型，使用自定义权重的损失函数
losses = {
    "Gender_Output": "binary_crossentropy",  # 性别的损失函数
    "ADHD_Output": "binary_crossentropy"    # ADHD 的损失函数
}
loss_weights = {
    "Gender_Output": 1.0,  # 性别预测的权重
    "ADHD_Output": 2.0     # ADHD 的权重更高，反映其重要性
}

model.compile(optimizer=Adam(learning_rate=0.001), loss=losses, loss_weights=loss_weights, metrics=["accuracy"])

# 打印模型结构
model.summary()

# 定义早停回调函数
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# 训练模型
history = model.fit(
    x=X_train,
    y={"Gender_Output": Y_train[:, 1], "ADHD_Output": Y_train[:, 0]},  # 分别对应性别和 ADHD 的 Y 值
    validation_split=0.2,
    batch_size=32,
    epochs=100,
    callbacks=[early_stopping],
    verbose=1
)

# 在测试集上进行预测
predictions = model.predict(X_test)

# 将预测结果处理为二进制分类结果 (0 或 1)
gender_predictions = (predictions[0] > 0.5).astype(int).flatten()  # 性别预测
adhd_predictions = (predictions[1] > 0.5).astype(int).flatten()   # ADHD 预测

# 将结果保存为提交文件
submission = np.vstack((adhd_predictions, gender_predictions)).T  # 组合结果
np.savetxt("submission.csv", submission, fmt='%d', delimiter=',', header="ADHD,Gender", comments='')

print("预测完成，结果已保存为 submission.csv 文件")