# Importing libraries

In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

#  Load Datasets

In [5]:
matrices_df = pd.read_csv("/Users/pavansharma/Downloads/widsdatathon2025/TRAIN_NEW/TRAIN_FUNCTIONAL_CONNECTOME_MATRICES_new_36P_Pearson.csv")
matrices_df.set_index('participant_id', inplace=True)
print(matrices_df.head(5))

                0throw_1thcolumn  0throw_2thcolumn  0throw_3thcolumn  \
participant_id                                                         
70z8Q2xdTXM3            0.222930          0.527903          0.429966   
WHWymJu6zNZi            0.614765          0.577255          0.496127   
4PAQp1M6EyAo           -0.116833          0.458408          0.260703   
obEacy4Of68I            0.199688          0.752714          0.658283   
s7WzzDcmDOhF            0.227321          0.613268          0.621447   

                0throw_4thcolumn  0throw_5thcolumn  0throw_6thcolumn  \
participant_id                                                         
70z8Q2xdTXM3            0.060457          0.566489          0.315342   
WHWymJu6zNZi            0.496606          0.404686          0.439724   
4PAQp1M6EyAo            0.639031          0.769337          0.442528   
obEacy4Of68I            0.575096          0.692867          0.645789   
s7WzzDcmDOhF            0.562673          0.736709          0.5

In [6]:
categorical_df = pd.read_excel('/Users/pavansharma/Downloads/widsdatathon2025/TRAIN_NEW/TRAIN_CATEGORICAL_METADATA_new.xlsx')
categorical_df.set_index('participant_id', inplace=True)
print(categorical_df.head(5))

                Basic_Demos_Enroll_Year  Basic_Demos_Study_Site  \
participant_id                                                    
00aIpNTbG5uh                       2019                       4   
00fV0OyyoLfw                       2017                       1   
04X1eiS79T4B                       2017                       1   
05ocQutkURd6                       2018                       1   
06YUNBA9ZRLq                       2018                       1   

                PreInt_Demos_Fam_Child_Ethnicity  PreInt_Demos_Fam_Child_Race  \
participant_id                                                                  
00aIpNTbG5uh                                 1.0                          0.0   
00fV0OyyoLfw                                 0.0                          9.0   
04X1eiS79T4B                                 1.0                          2.0   
05ocQutkURd6                                 3.0                          8.0   
06YUNBA9ZRLq                                

In [7]:
quantitative_df = pd.read_excel('/Users/pavansharma/Downloads/widsdatathon2025/TRAIN_NEW/TRAIN_QUANTITATIVE_METADATA_new.xlsx')
quantitative_df.set_index('participant_id', inplace=True)
print(quantitative_df.head(5))

                EHQ_EHQ_Total  ColorVision_CV_Score  APQ_P_APQ_P_CP  \
participant_id                                                        
00aIpNTbG5uh           100.00                  13.0             3.0   
00fV0OyyoLfw            92.27                  14.0             3.0   
04X1eiS79T4B            86.67                  14.0             3.0   
05ocQutkURd6            93.34                  14.0             3.0   
06YUNBA9ZRLq             0.00                  14.0             8.0   

                APQ_P_APQ_P_ID  APQ_P_APQ_P_INV  APQ_P_APQ_P_OPD  \
participant_id                                                     
00aIpNTbG5uh              15.0             44.0             14.0   
00fV0OyyoLfw              12.0             35.0             25.0   
04X1eiS79T4B              21.0             37.0             18.0   
05ocQutkURd6              11.0             42.0             15.0   
06YUNBA9ZRLq              12.0             35.0             22.0   

                APQ_P_APQ

In [8]:
solutions_df = pd.read_excel('/Users/pavansharma/Downloads/widsdatathon2025/TRAIN_NEW/TRAINING_SOLUTIONS.xlsx')
solutions_df.set_index('participant_id', inplace=True)
print(quantitative_df.head(5))

                EHQ_EHQ_Total  ColorVision_CV_Score  APQ_P_APQ_P_CP  \
participant_id                                                        
00aIpNTbG5uh           100.00                  13.0             3.0   
00fV0OyyoLfw            92.27                  14.0             3.0   
04X1eiS79T4B            86.67                  14.0             3.0   
05ocQutkURd6            93.34                  14.0             3.0   
06YUNBA9ZRLq             0.00                  14.0             8.0   

                APQ_P_APQ_P_ID  APQ_P_APQ_P_INV  APQ_P_APQ_P_OPD  \
participant_id                                                     
00aIpNTbG5uh              15.0             44.0             14.0   
00fV0OyyoLfw              12.0             35.0             25.0   
04X1eiS79T4B              21.0             37.0             18.0   
05ocQutkURd6              11.0             42.0             15.0   
06YUNBA9ZRLq              12.0             35.0             22.0   

                APQ_P_APQ

# Check for missing values in each DataFrame

In [10]:


print("🔍 Missing values in matrices_df:")
print(matrices_df.isnull().sum().sum())

print("\n🔍 Missing values in categorical_df:")
print(categorical_df.isnull().sum().sum())

print("\n🔍 Missing values in quantitative_df:")
print(quantitative_df.isnull().sum().sum())

print("\n🔍 Missing values in solutions_df:")
print(solutions_df.isnull().sum().sum())


🔍 Missing values in matrices_df:
0

🔍 Missing values in categorical_df:
566

🔍 Missing values in quantitative_df:
549

🔍 Missing values in solutions_df:
0


# Handling missing values
### using mode imputation column-wise for categorical data
### using knn imputer for quantitaive instead of mean or median

In [12]:
categorical_df_imputed = categorical_df.apply(lambda col: col.fillna(col.mode().iloc[0]))
print(categorical_df_imputed.head(5))

                Basic_Demos_Enroll_Year  Basic_Demos_Study_Site  \
participant_id                                                    
00aIpNTbG5uh                       2019                       4   
00fV0OyyoLfw                       2017                       1   
04X1eiS79T4B                       2017                       1   
05ocQutkURd6                       2018                       1   
06YUNBA9ZRLq                       2018                       1   

                PreInt_Demos_Fam_Child_Ethnicity  PreInt_Demos_Fam_Child_Race  \
participant_id                                                                  
00aIpNTbG5uh                                 1.0                          0.0   
00fV0OyyoLfw                                 0.0                          9.0   
04X1eiS79T4B                                 1.0                          2.0   
05ocQutkURd6                                 3.0                          8.0   
06YUNBA9ZRLq                                

In [13]:
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5)
quantitative_filled_array = imputer.fit_transform(quantitative_df)
quantitative_df_filled = pd.DataFrame(quantitative_filled_array, 
                                      columns=quantitative_df.columns, 
                                      index=quantitative_df.index)
print(quantitative_df_filled.head(5))

                EHQ_EHQ_Total  ColorVision_CV_Score  APQ_P_APQ_P_CP  \
participant_id                                                        
00aIpNTbG5uh           100.00                  13.0             3.0   
00fV0OyyoLfw            92.27                  14.0             3.0   
04X1eiS79T4B            86.67                  14.0             3.0   
05ocQutkURd6            93.34                  14.0             3.0   
06YUNBA9ZRLq             0.00                  14.0             8.0   

                APQ_P_APQ_P_ID  APQ_P_APQ_P_INV  APQ_P_APQ_P_OPD  \
participant_id                                                     
00aIpNTbG5uh              15.0             44.0             14.0   
00fV0OyyoLfw              12.0             35.0             25.0   
04X1eiS79T4B              21.0             37.0             18.0   
05ocQutkURd6              11.0             42.0             15.0   
06YUNBA9ZRLq              12.0             35.0             22.0   

                APQ_P_APQ

In [14]:
print("🔍 Missing values in matrices_df:")
print(matrices_df.isnull().sum().sum())

print("\n🔍 Missing values in categorical_df:")
print(categorical_df_imputed.isnull().sum().sum())

print("\n🔍 Missing values in quantitative_df:")
print(quantitative_df_filled.isnull().sum().sum())

print("\n🔍 Missing values in solutions_df:")
print(solutions_df.isnull().sum().sum())

🔍 Missing values in matrices_df:
0

🔍 Missing values in categorical_df:
0

🔍 Missing values in quantitative_df:
0

🔍 Missing values in solutions_df:
0


#  Merging Datasets into one 

In [16]:
full_df = pd.concat([matrices_df, categorical_df, quantitative_df, solutions_df], axis=1)
full_df.fillna(0, inplace=True)

#  Correlation with ADHD_Outcome 

In [18]:
adhd_corr = full_df.drop(columns=['Sex_F'], errors='ignore').corr()['ADHD_Outcome'].drop('ADHD_Outcome')
top_features = adhd_corr.abs().sort_values(ascending=False).index.tolist()

# Iterate and compute cumulative R²

In [20]:
X = full_df[top_features]
y = full_df['ADHD_Outcome']

r2_values = []
selected_features = []

for feature in top_features:
    selected_features.append(feature)
    model = LinearRegression()
    model.fit(X[selected_features], y)
    y_pred = model.predict(X[selected_features])
    r2 = r2_score(y, y_pred)
    r2_values.append(r2)
    if r2 >= 0.85:
        break

# Final features that explain at least 85% of ADHD_Outcome variance

In [22]:
significant_features = selected_features
print(f"\n Selected {len(significant_features)} features explaining ≥85% of the variance:")
print(significant_features)


 Selected 887 features explaining ≥85% of the variance:
['SDQ_SDQ_Hyperactivity', 'SDQ_SDQ_Externalizing', 'SDQ_SDQ_Difficulties_Total', 'SDQ_SDQ_Generating_Impact', 'SDQ_SDQ_Conduct_Problems', 'SDQ_SDQ_Internalizing', 'SDQ_SDQ_Peer_Problems', 'SDQ_SDQ_Emotional_Problems', 'SDQ_SDQ_Prosocial', 'Basic_Demos_Enroll_Year', 'MRI_Track_Scan_Location', 'APQ_P_APQ_P_OPD', 'APQ_P_APQ_P_ID', '166throw_184thcolumn', '78throw_170thcolumn', '1throw_16thcolumn', '2throw_166thcolumn', '11throw_166thcolumn', '78throw_189thcolumn', '58throw_163thcolumn', '76throw_170thcolumn', '105throw_166thcolumn', '53throw_127thcolumn', '50throw_197thcolumn', '159throw_166thcolumn', '0throw_166thcolumn', '50throw_189thcolumn', '34throw_130thcolumn', '5throw_166thcolumn', '98throw_162thcolumn', '24throw_162thcolumn', '143throw_199thcolumn', '96throw_169thcolumn', '50throw_192thcolumn', '0throw_58thcolumn', '123throw_143thcolumn', '73throw_185thcolumn', '104throw_166thcolumn', '166throw_174thcolumn', '76throw_189thc