In [1]:
import os
import random

import torch
import numpy as np
import pandas as pd
import scipy
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

In [2]:
seed = 42  # Any fixed integer works; every RNG seeded below shares it.

# Seed Python's, NumPy's and PyTorch's generators (including all CUDA devices), in that order.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_rng(seed)

In [3]:
def compute_leaderboard_f1(y_true, y_pred):
    """Return the mean of the ADHD F1 and the Sex_F F1 for two-column label arrays.

    Column 0 of both inputs is treated as the ADHD outcome and column 1 as the
    Sex_F flag. The ADHD F1 is sample-weighted: rows whose true labels are both 1
    (ADHD and female) get weight 2, every other row gets weight 1. The Sex_F F1
    is unweighted.

    Parameters
    ----------
    y_true, y_pred : array-like with at least two columns
        Ground-truth and predicted labels; anything ``np.asarray`` can turn
        into a 2-D array (e.g. a DataFrame or a list of rows).

    Returns
    -------
    float
        ``(f1_adhd + f1_sex_f) / 2``.
    """
    truth = np.asarray(y_true)
    preds = np.asarray(y_pred)

    adhd_true, sex_true = truth[:, 0], truth[:, 1]
    adhd_pred, sex_pred = preds[:, 0], preds[:, 1]

    # Rows that are truly ADHD-positive *and* female count twice in the ADHD score.
    is_female_adhd = (adhd_true == 1) & (sex_true == 1)
    sample_weights = np.where(is_female_adhd, 2, 1)

    adhd_f1 = f1_score(adhd_true, adhd_pred, sample_weight=sample_weights, average='binary')
    sex_f1 = f1_score(sex_true, sex_pred)

    return (adhd_f1 + sex_f1) / 2

In [4]:
# Root folder holding the preprocessed train/ and test/ split directories.
# The default is machine-specific; set the PREPROCESSED_DATA_DIR environment
# variable to point the notebook at another location without editing this cell.
preprocessed_data = os.environ.get(
    "PREPROCESSED_DATA_DIR",
    "/Users/reza/School/2025/1-Spring/Advanced ML/Project/data/new/preprocessed_selected_features",
)
# File names read from inside each split directory.
aux_file_name = "aux.csv"
connectome_matrices_file_name = "connectome_matrices.csv"

def get_feats(mode="train"):
    """Load the feature table for one split directory under ``preprocessed_data``.

    Reads the auxiliary CSV and the connectome-matrices CSV from the ``mode``
    sub-folder, left-joins the connectome columns onto the auxiliary rows via
    ``participant_id`` and indexes the result by that column.

    Parameters
    ----------
    mode : str, default "train"
        Name of the sub-folder to read from. For ``"train"`` the ``labels.csv``
        file in the same folder is loaded as well.

    Returns
    -------
    pandas.DataFrame
        The merged features, when ``mode`` is not ``"train"``.
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(features, labels)`` when ``mode == "train"``; the labels are
        re-indexed to follow the row order of the features.
    """
    split_dir = f"{preprocessed_data}/{mode}"

    aux = pd.read_csv(f"{split_dir}/{aux_file_name}")
    connectomes = pd.read_csv(f"{split_dir}/{connectome_matrices_file_name}")
    merged = aux.merge(connectomes, on="participant_id", how="left").set_index("participant_id")

    if mode != "train":
        return merged

    # Align the label rows with the feature rows (first CSV column is the index).
    labels = pd.read_csv(f"{split_dir}/labels.csv", index_col=0).reindex(merged.index)
    return merged, labels


# Training features and their row-aligned labels.
X, y = get_feats("train")

In [5]:
# Preview the first five rows of the merged training features.
X.head(n=5)

Unnamed: 0_level_0,Basic_Demos_Enroll_Year_2016,PreInt_Demos_Fam_Child_Ethnicity_2.0,Barratt_Barratt_P2_Occ_25.0,SDQ_SDQ_Prosocial,PreInt_Demos_Fam_Child_Race_4.0,PreInt_Demos_Fam_Child_Ethnicity_1.0,Barratt_Barratt_P2_Occ_20.0,Basic_Demos_Enroll_Year_2017,Barratt_Barratt_P1_Edu_18.0,Barratt_Barratt_P2_Edu_15.0,...,195throw_196thcolumn,195throw_197thcolumn,195throw_198thcolumn,195throw_199thcolumn,196throw_197thcolumn,196throw_198thcolumn,196throw_199thcolumn,197throw_198thcolumn,197throw_199thcolumn,198throw_199thcolumn
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00aIpNTbG5uh,0,0,0,0.9,0,1,0,0,0,0,...,-0.064414,-0.141627,-0.35847,0.046387,0.475778,0.279849,0.405671,0.440184,0.364975,0.596694
00fV0OyyoLfw,0,0,0,0.8,0,0,0,1,0,0,...,-0.194309,-0.405958,-0.425571,-0.410904,0.626374,0.548932,0.517863,0.50105,0.347897,0.659732
04X1eiS79T4B,0,0,0,0.7,0,1,0,1,0,0,...,-0.181503,-0.15945,-0.002727,-0.216852,0.763858,0.335067,0.587222,0.279215,0.640004,0.669921
05ocQutkURd6,0,0,0,0.6,0,0,0,0,1,0,...,-0.131125,-0.060821,-0.255222,-0.267329,0.583788,0.34464,0.501296,0.559732,0.402026,0.494388
06YUNBA9ZRLq,0,0,0,0.4,0,0,0,0,0,0,...,-0.215715,-0.402365,-0.18033,-0.003538,0.579396,0.387985,0.381801,0.483829,0.197092,0.418744


In [6]:
# Write the merged training features into the train/ folder they were read from.
X.to_csv(path_or_buf=f"{preprocessed_data}/train/merged.csv")

In [31]:
# Preview the first five label rows (one column per target).
y.head(n=5)

Unnamed: 0_level_0,ADHD_Outcome,Sex_F
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1
00aIpNTbG5uh,1,0
00fV0OyyoLfw,1,0
04X1eiS79T4B,0,1
05ocQutkURd6,0,1
06YUNBA9ZRLq,1,0


In [32]:
# Feature column names come from the merged frame; the two label columns are listed explicitly.
features = X.columns
targets = ["ADHD_Outcome", "Sex_F"]

In [33]:
# Same five-row feature preview as shown after loading.
X.head(n=5)

Unnamed: 0_level_0,Basic_Demos_Enroll_Year_2016,PreInt_Demos_Fam_Child_Ethnicity_2.0,Barratt_Barratt_P2_Occ_25.0,SDQ_SDQ_Prosocial,PreInt_Demos_Fam_Child_Race_4.0,PreInt_Demos_Fam_Child_Ethnicity_1.0,Barratt_Barratt_P2_Occ_20.0,Basic_Demos_Enroll_Year_2017,Barratt_Barratt_P1_Edu_18.0,Barratt_Barratt_P2_Edu_15.0,...,195throw_196thcolumn,195throw_197thcolumn,195throw_198thcolumn,195throw_199thcolumn,196throw_197thcolumn,196throw_198thcolumn,196throw_199thcolumn,197throw_198thcolumn,197throw_199thcolumn,198throw_199thcolumn
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00aIpNTbG5uh,0,0,0,0.9,0,1,0,0,0,0,...,-0.064414,-0.141627,-0.35847,0.046387,0.475778,0.279849,0.405671,0.440184,0.364975,0.596694
00fV0OyyoLfw,0,0,0,0.8,0,0,0,1,0,0,...,-0.194309,-0.405958,-0.425571,-0.410904,0.626374,0.548932,0.517863,0.50105,0.347897,0.659732
04X1eiS79T4B,0,0,0,0.7,0,1,0,1,0,0,...,-0.181503,-0.15945,-0.002727,-0.216852,0.763858,0.335067,0.587222,0.279215,0.640004,0.669921
05ocQutkURd6,0,0,0,0.6,0,0,0,0,1,0,...,-0.131125,-0.060821,-0.255222,-0.267329,0.583788,0.34464,0.501296,0.559732,0.402026,0.494388
06YUNBA9ZRLq,0,0,0,0.4,0,0,0,0,0,0,...,-0.215715,-0.402365,-0.18033,-0.003538,0.579396,0.387985,0.381801,0.483829,0.197092,0.418744


In [34]:
# Same five-row label preview as shown after loading.
y.head(n=5)

Unnamed: 0_level_0,ADHD_Outcome,Sex_F
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1
00aIpNTbG5uh,1,0
00fV0OyyoLfw,1,0
04X1eiS79T4B,0,1
05ocQutkURd6,0,1
06YUNBA9ZRLq,1,0


In [36]:
# Hold out 20% of the labelled rows for local evaluation; the fixed
# random_state keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## **Models**

### **Ridge Classifier**

In [37]:
# Ridge baseline: MultiOutputClassifier fits one RidgeClassifier pipeline per target column.
model = MultiOutputClassifier(make_pipeline(RidgeClassifier())).fit(X_train, y_train)

# Score the predictions for X_test with the leaderboard metric.
y_pred = model.predict(X_test)
f1 = compute_leaderboard_f1(y_true=y_test, y_pred=y_pred)
print(f"F1 Score: {f1}")

F1 Score: 0.6672173550271172


### **Logistic Regression**

In [38]:
# Logistic-regression baseline (default hyper-parameters), one pipeline per target column.
model = MultiOutputClassifier(make_pipeline(LogisticRegression())).fit(X_train, y_train)

# Score the predictions for X_test with the leaderboard metric.
y_pred = model.predict(X_test)
f1 = compute_leaderboard_f1(y_true=y_test, y_pred=y_pred)
print(f"F1 Score: {f1}")


F1 Score: 0.6535688536409516


### **Random Forest**

In [39]:
# Random-forest baseline with a fixed random_state, one pipeline per target column.
model = MultiOutputClassifier(
    make_pipeline(RandomForestClassifier(random_state=42))
).fit(X_train, y_train)

# Score the predictions for X_test with the leaderboard metric.
y_pred = model.predict(X_test)
f1 = compute_leaderboard_f1(y_true=y_test, y_pred=y_pred)
print(f"F1 Score: {f1}")


F1 Score: 0.4496386630532972


### **Kernel SVM**

In [40]:
# RBF-kernel SVM baseline, one pipeline per target column.
# `probability=True` was dropped: this cell only calls `predict`, which does not
# use the probability model, while enabling it makes every `fit` run an extra
# internal 5-fold cross-validation. Predictions (and the score) are unaffected.
model = MultiOutputClassifier(
    make_pipeline(SVC(kernel="rbf", random_state=42))
)

model.fit(X_train, y_train)

# Score the predictions for X_test with the leaderboard metric.
y_pred = model.predict(X_test)
f1 = compute_leaderboard_f1(y_test, y_pred)
print(f"F1 Score: {f1}")

F1 Score: 0.5058121857092738


### **XGBoost**

In [41]:
# Gradient-boosted trees (XGBoost, log-loss eval metric, fixed random_state),
# one pipeline per target column.
model = MultiOutputClassifier(
    make_pipeline(XGBClassifier(eval_metric="logloss", random_state=42))
).fit(X_train, y_train)

# Score the predictions for X_test with the leaderboard metric.
y_pred = model.predict(X_test)
f1 = compute_leaderboard_f1(y_true=y_test, y_pred=y_pred)
print(f"F1 Score: {f1}")

F1 Score: 0.6238644304682041


## Best models for the Kaggle submission

In [42]:
# Load the unlabelled test-split features.
# NOTE(review): this rebinds X_test, which previously held the hold-out split from
# train_test_split; y_test still refers to the hold-out labels, so the evaluation
# cells above will no longer line up if re-run after this cell.
X_test = get_feats("test")

In [45]:
# Preview the first five rows of the test-split features.
X_test.head(n=5)

Unnamed: 0_level_0,Basic_Demos_Enroll_Year_2016,PreInt_Demos_Fam_Child_Ethnicity_2.0,Barratt_Barratt_P2_Occ_25.0,SDQ_SDQ_Prosocial,PreInt_Demos_Fam_Child_Race_4.0,PreInt_Demos_Fam_Child_Ethnicity_1.0,Barratt_Barratt_P2_Occ_20.0,Basic_Demos_Enroll_Year_2017,Barratt_Barratt_P1_Edu_18.0,Barratt_Barratt_P2_Edu_15.0,...,195throw_196thcolumn,195throw_197thcolumn,195throw_198thcolumn,195throw_199thcolumn,196throw_197thcolumn,196throw_198thcolumn,196throw_199thcolumn,197throw_198thcolumn,197throw_199thcolumn,198throw_199thcolumn
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cfwaf5FX7jWK,0,0,0,0.8,0,0,0,0,0,0,...,0.109113,-0.080147,-0.112746,-0.035289,0.500307,0.404156,-0.015204,0.728375,0.623918,0.615181
vhGrzmvA3Hjq,0,0,0,0.9,0,0,0,0,0,0,...,0.268648,-0.001063,0.106298,0.040809,0.773208,0.353462,0.865709,0.50525,0.62288,0.38747
ULliyEXjy4OV,0,0,0,0.9,0,0,0,0,0,0,...,0.069627,0.034699,-0.072661,0.144986,0.647908,0.597701,0.635925,0.661952,0.67878,0.831323
LZfeAb1xMtql,0,0,0,0.6,0,0,0,0,0,0,...,0.062658,-0.350891,0.155848,-0.322303,0.727458,0.68666,0.733794,0.651472,0.7604,0.600308
EnFOUv0YK1RG,0,1,0,1.0,0,0,0,0,1,0,...,0.428369,0.00328,0.37187,0.42503,0.540083,0.623086,0.539733,0.320658,0.165316,0.59757


### Ridge

In [47]:
# Refit the ridge baseline and predict both targets for the rows in X_test.
# NOTE(review): the fit uses X_train/y_train (the 80% split), not the full X/y —
# confirm this is intended for the submission model.
model = MultiOutputClassifier(make_pipeline(RidgeClassifier())).fit(X_train, y_train)
y_pred = model.predict(X_test)

# One row per participant: the ids come from the X_test index and become the
# "participant_id" index of the prediction frame.
df = pd.DataFrame(
    y_pred,
    columns=["ADHD_Outcome", "Sex_F"],
    index=pd.Index(X_test.index.values, name="participant_id"),
)

df.to_csv("pred_ridge.csv")