In [15]:
import torch
import random
import numpy as np
import pandas as pd
import scipy
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import f1_score, accuracy_score, recall_score
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, recall_score, f1_score, confusion_matrix
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!unzip '/content/drive/MyDrive/School/2025 Spring/Advanced ML/AML Project/Data/new/preprocessed_selected_features.zip'

Archive:  /content/drive/MyDrive/School/2025 Spring/Advanced ML/AML Project/Data/new/preprocessed_selected_features.zip
   creating: preprocessed_selected_features/
   creating: preprocessed_selected_features/test/
   creating: preprocessed_selected_features/train/
  inflating: preprocessed_selected_features/test/connectome_matrices.csv  
  inflating: preprocessed_selected_features/test/aux.csv  
  inflating: preprocessed_selected_features/train/labels.csv  
  inflating: preprocessed_selected_features/train/connectome_matrices.csv  
  inflating: preprocessed_selected_features/train/aux.csv  


In [4]:
seed = 42  # Choose any fixed number
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)  # If using CUDA

In [31]:
def compute_leaderboard_f1(y_true, y_pred):

    true_adhd = np.array(y_true)[:, 0]
    true_sex_f = np.array(y_true)[:, 1]
    pred_adhd = np.array(y_pred)[:, 0]
    pred_sex_f = np.array(y_pred)[:, 1]

    weights = np.where((true_adhd == 1) & (true_sex_f == 1), 2, 1)
    f1_adhd = f1_score(true_adhd, pred_adhd, sample_weight=weights, average='binary')
    f1_sex_f = f1_score(true_sex_f, pred_sex_f)
    leaderboard_score = (f1_adhd + f1_sex_f) / 2

    return leaderboard_score

In [22]:
preprocessed_data = "preprocessed_selected_features"
aux_file_name = "aux.csv"
connectome_matrices_file_name = "connectome_matrices.csv"

def get_feats(mode="train"):

    feats = pd.read_csv(f"{preprocessed_data}/{mode}/{aux_file_name}")
    conns = pd.read_csv(f"{preprocessed_data}/{mode}/{connectome_matrices_file_name}")
    feats = feats.merge(conns, on="participant_id", how="left")

    if mode == "train":
        labels = pd.read_csv(f"{preprocessed_data}/{mode}/labels.csv")
        feats = feats.merge(labels, on="participant_id", how="left")
        return feats, labels

    return feats


X, y = get_feats(mode="train")

In [23]:
X.head()

Unnamed: 0,participant_id,Barratt_Barratt_P1_Occ_20.0,Basic_Demos_Enroll_Year_2019,Barratt_Barratt_P1_Occ_35.0,Barratt_Barratt_P2_Occ_10.0,APQ_P_APQ_P_PM,Barratt_Barratt_P1_Edu_12.0,Barratt_Barratt_P2_Edu_21.0,Barratt_Barratt_P2_Edu_9.0,Barratt_Barratt_P1_Edu_15.0,...,195throw_198thcolumn,195throw_199thcolumn,196throw_197thcolumn,196throw_198thcolumn,196throw_199thcolumn,197throw_198thcolumn,197throw_199thcolumn,198throw_199thcolumn,ADHD_Outcome,Sex_F
0,00aIpNTbG5uh,0,1,0,0,0.46875,0,1,0,0,...,-0.35847,0.046387,0.475778,0.279849,0.405671,0.440184,0.364975,0.596694,1,0
1,00fV0OyyoLfw,0,0,0,0,0.71875,0,1,0,0,...,-0.425571,-0.410904,0.626374,0.548932,0.517863,0.50105,0.347897,0.659732,1,0
2,04X1eiS79T4B,0,0,0,0,0.65625,0,1,0,0,...,-0.002727,-0.216852,0.763858,0.335067,0.587222,0.279215,0.640004,0.669921,0,1
3,05ocQutkURd6,0,0,0,0,0.46875,0,0,0,0,...,-0.255222,-0.267329,0.583788,0.34464,0.501296,0.559732,0.402026,0.494388,0,1
4,06YUNBA9ZRLq,0,0,0,0,0.21875,1,1,0,0,...,-0.18033,-0.003538,0.579396,0.387985,0.381801,0.483829,0.197092,0.418744,1,0


In [24]:
y.head()

Unnamed: 0,participant_id,ADHD_Outcome,Sex_F
0,UmrK0vMLopoR,1,1
1,CPaeQkhcjg7d,1,0
2,Nb4EetVPm3gs,1,0
3,p4vPhVu91o4b,1,1
4,M09PXs7arQ5E,1,1


In [25]:
X.set_index('participant_id',inplace=True)
y.set_index('participant_id',inplace=True)
y = y.reindex(X.index)
targets = ['ADHD_Outcome','Sex_F']
features = X.columns

In [26]:
X.head()

Unnamed: 0_level_0,Barratt_Barratt_P1_Occ_20.0,Basic_Demos_Enroll_Year_2019,Barratt_Barratt_P1_Occ_35.0,Barratt_Barratt_P2_Occ_10.0,APQ_P_APQ_P_PM,Barratt_Barratt_P1_Edu_12.0,Barratt_Barratt_P2_Edu_21.0,Barratt_Barratt_P2_Edu_9.0,Barratt_Barratt_P1_Edu_15.0,Barratt_Barratt_P1_Edu_18.0,...,195throw_198thcolumn,195throw_199thcolumn,196throw_197thcolumn,196throw_198thcolumn,196throw_199thcolumn,197throw_198thcolumn,197throw_199thcolumn,198throw_199thcolumn,ADHD_Outcome,Sex_F
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00aIpNTbG5uh,0,1,0,0,0.46875,0,1,0,0,0,...,-0.35847,0.046387,0.475778,0.279849,0.405671,0.440184,0.364975,0.596694,1,0
00fV0OyyoLfw,0,0,0,0,0.71875,0,1,0,0,0,...,-0.425571,-0.410904,0.626374,0.548932,0.517863,0.50105,0.347897,0.659732,1,0
04X1eiS79T4B,0,0,0,0,0.65625,0,1,0,0,0,...,-0.002727,-0.216852,0.763858,0.335067,0.587222,0.279215,0.640004,0.669921,0,1
05ocQutkURd6,0,0,0,0,0.46875,0,0,0,0,1,...,-0.255222,-0.267329,0.583788,0.34464,0.501296,0.559732,0.402026,0.494388,0,1
06YUNBA9ZRLq,0,0,0,0,0.21875,1,1,0,0,0,...,-0.18033,-0.003538,0.579396,0.387985,0.381801,0.483829,0.197092,0.418744,1,0


In [27]:
y.head()

Unnamed: 0_level_0,ADHD_Outcome,Sex_F
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1
00aIpNTbG5uh,1,0
00fV0OyyoLfw,1,0
04X1eiS79T4B,0,1
05ocQutkURd6,0,1
06YUNBA9ZRLq,1,0


In [28]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

## **Models**

### **Ridge Classifier**

In [34]:
f1_scores = []
for fold, (train_index, test_index) in enumerate(kf.split(X)):
    X_train, X_test = X.iloc[train_index].drop(targets, axis=1), X.iloc[test_index].drop(targets, axis=1)
    y_train, y_test = y.iloc[train_index][targets], y.iloc[test_index][targets]
    model = MultiOutputClassifier(make_pipeline(RidgeClassifier(alpha=100)))
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    f1 = compute_leaderboard_f1(y_test, y_pred)
    f1_scores.append(f1)
    print(f"Fold {fold+1} F1 Score: {f1}")
print(f"Mean F1 Score: {np.mean(f1_scores)}")

Fold 1 F1 Score: 0.678251139523786
Fold 2 F1 Score: 0.6750358680057389
Fold 3 F1 Score: 0.6714004103471859
Fold 4 F1 Score: 0.6619306184012066
Fold 5 F1 Score: 0.6824432695734822
Mean F1 Score: 0.6738122611702799


### **Logistic Regression**

In [35]:
f1_scores = []
for fold, (train_index, test_index) in enumerate(kf.split(X)):
    X_train, X_test = X.iloc[train_index].drop(targets, axis=1), X.iloc[test_index].drop(targets, axis=1)
    y_train, y_test = y.iloc[train_index][targets], y.iloc[test_index][targets]
    model = MultiOutputClassifier(make_pipeline(LogisticRegression(max_iter=10000)))
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    f1 = compute_leaderboard_f1(y_test, y_pred)
    f1_scores.append(f1)
    print(f"Fold {fold+1} F1 Score: {f1}")
print(f"Mean F1 Score: {np.mean(f1_scores)}")


Fold 1 F1 Score: 0.6535688536409516
Fold 2 F1 Score: 0.6750447494033414
Fold 3 F1 Score: 0.6909854851031321
Fold 4 F1 Score: 0.683632960497517
Fold 5 F1 Score: 0.6868008948545862
Mean F1 Score: 0.6780065886999056


### **Random Forest**

In [36]:
f1_scores = []
for fold, (train_index, test_index) in enumerate(kf.split(X)):
    X_train, X_test = X.iloc[train_index].drop(targets, axis=1), X.iloc[test_index].drop(targets, axis=1)
    y_train, y_test = y.iloc[train_index][targets], y.iloc[test_index][targets]
    rf_model = MultiOutputClassifier(make_pipeline(RandomForestClassifier(n_estimators=100, random_state=42)))
    rf_model.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    f1 = compute_leaderboard_f1(y_test, y_pred)
    f1_scores.append(f1)
    print(f"Fold {fold+1} F1 Score: {f1}")
print(f"Mean F1 Score: {np.mean(f1_scores)}")


Fold 1 F1 Score: 0.4596604495456719
Fold 2 F1 Score: 0.41866408934707905
Fold 3 F1 Score: 0.43258426966292135
Fold 4 F1 Score: 0.42769829771821805
Fold 5 F1 Score: 0.46825600273434165
Mean F1 Score: 0.4413726218016464


### **Kernel SVM**

In [37]:
f1_scores = []
for fold, (train_index, test_index) in enumerate(kf.split(X)):
    X_train, X_test = X.iloc[train_index].drop(targets, axis=1), X.iloc[test_index].drop(targets, axis=1)
    y_train, y_test = y.iloc[train_index][targets], y.iloc[test_index][targets]
    svm_model = MultiOutputClassifier(make_pipeline(SVC(kernel='rbf', probability=True, random_state=42)  ))

    svm_model.fit(X_train, y_train)

    y_pred = svm_model.predict(X_test)
    f1 = compute_leaderboard_f1(y_test, y_pred)
    f1_scores.append(f1)
    print(f"Fold {fold+1} F1 Score: {f1}")
print(f"Mean F1 Score: {np.mean(f1_scores)}")

Fold 1 F1 Score: 0.5058121857092738
Fold 2 F1 Score: 0.43209557636422347
Fold 3 F1 Score: 0.512315951525678
Fold 4 F1 Score: 0.4531353135313531
Fold 5 F1 Score: 0.49851809254180307
Mean F1 Score: 0.4803754239344663


## XGBoost

In [38]:
f1_scores = []
for fold, (train_index, test_index) in enumerate(kf.split(X)):
    X_train, X_test = X.iloc[train_index].drop(targets, axis=1), X.iloc[test_index].drop(targets, axis=1)
    y_train, y_test = y.iloc[train_index][targets], y.iloc[test_index][targets]
    xgb_model = MultiOutputClassifier(make_pipeline(

        XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    ))

    xgb_model.fit(X_train, y_train)

    y_pred = xgb_model.predict(X_test)
    f1 = compute_leaderboard_f1(y_test, y_pred)
    f1_scores.append(f1)
    print(f"Fold {fold+1} F1 Score: {f1}")
print(f"Mean F1 Score: {np.mean(f1_scores)}")


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fold 1 F1 Score: 0.6267526397784318


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fold 2 F1 Score: 0.5503393665158371


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fold 3 F1 Score: 0.6287527459116427


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fold 4 F1 Score: 0.5344735077129443


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fold 5 F1 Score: 0.6254391722327324
Mean F1 Score: 0.5931514864303177
