In [27]:
import torch
import random
import numpy as np
import pandas as pd
import scipy
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import f1_score, accuracy_score, recall_score
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, recall_score, f1_score, confusion_matrix
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt


In [28]:
seed = 42 
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)  

In [29]:
def compute_leaderboard_f1(y_true, y_pred):

    true_adhd = np.array(y_true)[:, 0]
    true_sex_f = np.array(y_true)[:, 1]
    pred_adhd = np.array(y_pred)[:, 0]
    pred_sex_f = np.array(y_pred)[:, 1]

    weights = np.where((true_adhd == 1) & (true_sex_f == 1), 2, 1)
    f1_adhd = f1_score(true_adhd, pred_adhd, sample_weight=weights, average='binary')
    f1_sex_f = f1_score(true_sex_f, pred_sex_f)
    leaderboard_score = (f1_adhd + f1_sex_f) / 2

    return leaderboard_score

In [32]:
preprocessed_data = "../../../widsdatathon2025/Preprocessed/preprocessed_selected_features 2"
aux_file_name = "aux.csv"
connectome_matrices_file_name = "connectome_matrices.csv"

def get_feats(mode="train"):

    feats = pd.read_csv(f"{preprocessed_data}/{mode}/{aux_file_name}")
    conns = pd.read_csv(f"{preprocessed_data}/{mode}/{connectome_matrices_file_name}")
    feats = feats.merge(conns, on="participant_id", how="left")

    if mode == "train":
        labels = pd.read_csv(f"{preprocessed_data}/{mode}/labels.csv")
        feats = feats.merge(labels, on="participant_id", how="left")
        return feats, labels

    return feats


X, y = get_feats(mode="train")

In [41]:
X.set_index('participant_id',inplace=True)
y.set_index('participant_id',inplace=True)
y = y.reindex(X.index)
targets = ['ADHD_Outcome','Sex_F']
features = X.columns

In [16]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

## **Models**

### **Random Forest**

In [17]:
f1_scores = []
for fold, (train_index, test_index) in enumerate(kf.split(X)):
    X_train, X_test = X.iloc[train_index].drop(targets, axis=1), X.iloc[test_index].drop(targets, axis=1)
    y_train, y_test = y.iloc[train_index][targets], y.iloc[test_index][targets]
    rf_model = MultiOutputClassifier(make_pipeline(RandomForestClassifier(n_estimators=100, random_state=42)))
    rf_model.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    f1 = compute_leaderboard_f1(y_test, y_pred)
    f1_scores.append(f1)
    print(f"Fold {fold+1} F1 Score: {f1}")
print(f"Mean F1 Score: {np.mean(f1_scores)}")


Fold 1 F1 Score: 0.458464118845094
Fold 2 F1 Score: 0.40824742268041236
Fold 3 F1 Score: 0.43045112781954886
Fold 4 F1 Score: 0.42769829771821805
Fold 5 F1 Score: 0.47033196828439483
Mean F1 Score: 0.43903858706953364


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.multioutput import MultiOutputClassifier
import numpy as np
from sklearn.model_selection import KFold

param_grid = {
    'estimator__randomforestclassifier__n_estimators': [100, 150, 200],
    'estimator__randomforestclassifier__max_depth': [None, 10, 20, 30],
    'estimator__randomforestclassifier__min_samples_split': [2, 5, 10],
    'estimator__randomforestclassifier__min_samples_leaf': [1, 2, 4],
    'estimator__randomforestclassifier__bootstrap': [True, False]
}

f1_scores = []

kf = KFold(n_splits=5, shuffle=True, random_state=42)  

for fold, (train_index, test_index) in enumerate(kf.split(X)):
    X_train, X_test = X.iloc[train_index].drop(targets, axis=1), X.iloc[test_index].drop(targets, axis=1)
    y_train, y_test = y.iloc[train_index][targets], y.iloc[test_index][targets]
    
    rf_model = MultiOutputClassifier(make_pipeline(RandomForestClassifier(random_state=42)))
    
    grid_search = GridSearchCV(rf_model, param_grid, cv=3, scoring='f1_macro', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    best_rf_model = grid_search.best_estimator_
    
    y_pred = best_rf_model.predict(X_test)
    f1 = compute_leaderboard_f1(y_test, y_pred)  
    f1_scores.append(f1)
    
    print(f"Fold {fold+1} F1 Score: {f1}")
    print(f"Best Parameters: {grid_search.best_params_}")

print(f"Mean F1 Score: {np.mean(f1_scores)}")


Fold 1 F1 Score: 0.43937000098745926
Best Parameters: {'estimator__randomforestclassifier__bootstrap': True, 'estimator__randomforestclassifier__max_depth': 20, 'estimator__randomforestclassifier__min_samples_leaf': 1, 'estimator__randomforestclassifier__min_samples_split': 5, 'estimator__randomforestclassifier__n_estimators': 100}
Fold 2 F1 Score: 0.42193930041152267
Best Parameters: {'estimator__randomforestclassifier__bootstrap': False, 'estimator__randomforestclassifier__max_depth': 20, 'estimator__randomforestclassifier__min_samples_leaf': 1, 'estimator__randomforestclassifier__min_samples_split': 2, 'estimator__randomforestclassifier__n_estimators': 100}
Fold 3 F1 Score: 0.5055158987670344
Best Parameters: {'estimator__randomforestclassifier__bootstrap': False, 'estimator__randomforestclassifier__max_depth': None, 'estimator__randomforestclassifier__min_samples_leaf': 2, 'estimator__randomforestclassifier__min_samples_split': 2, 'estimator__randomforestclassifier__n_estimators': 

### **Kernel SVM**

In [None]:
f1_scores = []
for fold, (train_index, test_index) in enumerate(kf.split(X)):
    X_train, X_test = X.iloc[train_index].drop(targets, axis=1), X.iloc[test_index].drop(targets, axis=1)
    y_train, y_test = y.iloc[train_index][targets], y.iloc[test_index][targets]
    svm_model = MultiOutputClassifier(make_pipeline(SVC(kernel='rbf', probability=True, random_state=42)  ))

    svm_model.fit(X_train, y_train)

    y_pred = svm_model.predict(X_test)
    f1 = compute_leaderboard_f1(y_test, y_pred)
    f1_scores.append(f1)
    print(f"Fold {fold+1} F1 Score: {f1}")
print(f"Mean F1 Score: {np.mean(f1_scores)}")

Fold 1 F1 Score: 0.5058121857092738
Fold 2 F1 Score: 0.43209557636422347
Fold 3 F1 Score: 0.512315951525678


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.multioutput import MultiOutputClassifier
import numpy as np
from sklearn.model_selection import KFold

param_grid = {
    'estimator__svc__C': [0.1, 1, 10, 100],
    'estimator__svc__gamma': ['scale', 'auto', 0.1, 0.01],
    'estimator__svc__kernel': ['rbf', 'linear', 'poly'],
    'estimator__svc__degree': [3, 4, 5],  
    'estimator__svc__class_weight': [None, 'balanced']  
}

f1_scores = []

kf = KFold(n_splits=5, shuffle=True, random_state=42)  

for fold, (train_index, test_index) in enumerate(kf.split(X)):
    X_train, X_test = X.iloc[train_index].drop(targets, axis=1), X.iloc[test_index].drop(targets, axis=1)
    y_train, y_test = y.iloc[train_index][targets], y.iloc[test_index][targets]
    
    svm_model = MultiOutputClassifier(make_pipeline(SVC(random_state=42)))
    
    grid_search = GridSearchCV(svm_model, param_grid, cv=3, scoring='f1_macro', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    best_svm_model = grid_search.best_estimator_
    
    y_pred = best_svm_model.predict(X_test)
    f1 = compute_leaderboard_f1(y_test, y_pred)  
    f1_scores.append(f1)
    
    print(f"Fold {fold+1} F1 Score: {f1}")
    print(f"Best Parameters: {grid_search.best_params_}")

print(f"Mean F1 Score: {np.mean(f1_scores)}")




Fold 1 F1 Score: 0.6701798201798201
Best Parameters: {'estimator__svc__C': 0.1, 'estimator__svc__class_weight': None, 'estimator__svc__degree': 3, 'estimator__svc__gamma': 'scale', 'estimator__svc__kernel': 'linear'}




Fold 2 F1 Score: 0.6840336134453782
Best Parameters: {'estimator__svc__C': 10, 'estimator__svc__class_weight': 'balanced', 'estimator__svc__degree': 3, 'estimator__svc__gamma': 'auto', 'estimator__svc__kernel': 'rbf'}
Fold 3 F1 Score: 0.6800892030631995
Best Parameters: {'estimator__svc__C': 100, 'estimator__svc__class_weight': None, 'estimator__svc__degree': 3, 'estimator__svc__gamma': 'auto', 'estimator__svc__kernel': 'rbf'}




KeyboardInterrupt: 

In [23]:
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.multioutput import MultiOutputClassifier
import numpy as np

fixed_params = dict(
    C=100,
    gamma='auto',
    kernel='rbf',
    degree=3,
    class_weight=None,
    random_state=42
)

f1_scores = []
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
    X_train, X_test = (
        X.iloc[train_idx].drop(targets, axis=1),
        X.iloc[test_idx].drop(targets, axis=1)
    )
    y_train, y_test = y.iloc[train_idx][targets], y.iloc[test_idx][targets]

    base_svc = make_pipeline(SVC(**fixed_params))
    model = MultiOutputClassifier(base_svc)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    f1 = compute_leaderboard_f1(y_test, y_pred)
    f1_scores.append(f1)
    print(f"Fold {fold} F1 Score: {f1:.4f}")

print(f"Mean F1 Score: {np.mean(f1_scores):.4f}")


Fold 1 F1 Score: 0.6600
Fold 2 F1 Score: 0.6658
Fold 3 F1 Score: 0.6801
Fold 4 F1 Score: 0.6710
Fold 5 F1 Score: 0.6957
Mean F1 Score: 0.6745


In [34]:
X_test_data = get_feats(mode="test")

In [35]:
X_test_data

Unnamed: 0,participant_id,Basic_Demos_Enroll_Year_2016,PreInt_Demos_Fam_Child_Ethnicity_2.0,Barratt_Barratt_P2_Occ_25.0,SDQ_SDQ_Prosocial,PreInt_Demos_Fam_Child_Race_4.0,PreInt_Demos_Fam_Child_Ethnicity_1.0,Barratt_Barratt_P2_Occ_20.0,Basic_Demos_Enroll_Year_2017,Barratt_Barratt_P1_Edu_18.0,...,195throw_196thcolumn,195throw_197thcolumn,195throw_198thcolumn,195throw_199thcolumn,196throw_197thcolumn,196throw_198thcolumn,196throw_199thcolumn,197throw_198thcolumn,197throw_199thcolumn,198throw_199thcolumn
0,Cfwaf5FX7jWK,0,0,0,0.8,0,0,0,0,0,...,0.109113,-0.080147,-0.112746,-0.035289,0.500307,0.404156,-0.015204,0.728375,0.623918,0.615181
1,vhGrzmvA3Hjq,0,0,0,0.9,0,0,0,0,0,...,0.268648,-0.001063,0.106298,0.040809,0.773208,0.353462,0.865709,0.505250,0.622880,0.387470
2,ULliyEXjy4OV,0,0,0,0.9,0,0,0,0,0,...,0.069627,0.034699,-0.072661,0.144986,0.647908,0.597701,0.635925,0.661952,0.678780,0.831323
3,LZfeAb1xMtql,0,0,0,0.6,0,0,0,0,0,...,0.062658,-0.350891,0.155848,-0.322303,0.727458,0.686660,0.733794,0.651472,0.760400,0.600308
4,EnFOUv0YK1RG,0,1,0,1.0,0,0,0,0,1,...,0.428369,0.003280,0.371870,0.425030,0.540083,0.623086,0.539733,0.320658,0.165316,0.597570
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299,UadZfjdEg7eG,0,0,0,0.7,0,0,0,0,1,...,0.029203,-0.239101,0.134974,0.035784,0.648014,0.436724,0.481763,0.608427,0.546305,0.629582
300,IUEHiLmQAqCi,0,0,0,0.8,0,0,0,0,1,...,0.061018,-0.114604,-0.151247,0.044140,0.719873,0.594544,0.249114,0.586420,0.354516,0.338287
301,cRySmCadYFRO,0,0,0,1.0,0,0,0,0,0,...,0.197517,0.006541,0.172096,-0.188897,0.880549,0.851316,0.716680,0.803551,0.757941,0.751606
302,E3MvDUtJadc5,0,0,0,0.0,0,0,0,0,0,...,0.049072,-0.507423,-0.211364,-0.428595,0.738937,0.681606,0.114101,0.795757,0.478256,0.586959


In [None]:
X_test_data.set_index('participant_id',inplace=True)

In [43]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.multioutput import MultiOutputClassifier


train_df = pd.concat([X, y[targets]], axis=1).dropna()  
X_clean  = train_df.drop(columns=targets)
y_clean  = train_df[targets]

X_test_clean = X_test_data.dropna()

best_params = {
    "C": 100,
    "gamma": "auto",
    "kernel": "rbf",
    "degree": 3,
    "class_weight": None,
    "random_state": 42,
}


kf = KFold(n_splits=5, shuffle=True, random_state=42)
f1_scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_clean), 1):
    X_train = X_clean.iloc[train_idx]
    X_val   = X_clean.iloc[val_idx]
    y_train = y_clean.iloc[train_idx]
    y_val   = y_clean.iloc[val_idx]

    cv_model = MultiOutputClassifier(make_pipeline(SVC(**best_params)))
    cv_model.fit(X_train, y_train)

    y_pred = cv_model.predict(X_val)
    f1     = compute_leaderboard_f1(y_val, y_pred)
    f1_scores.append(f1)
    print(f"Fold {fold} F1 Score: {f1:.4f}")

print(f"Mean CV F1 Score: {np.mean(f1_scores):.4f}")
X_test_clean = X_test_data.dropna()


final_model = MultiOutputClassifier(make_pipeline(SVC(**best_params)))
final_model.fit(X_clean, y_clean)




Fold 1 F1 Score: 0.7828
Fold 2 F1 Score: 0.7823
Fold 3 F1 Score: 0.8048
Fold 4 F1 Score: 0.7577
Fold 5 F1 Score: 0.8073
Mean CV F1 Score: 0.7870


In [59]:
y_clean

Unnamed: 0_level_0,ADHD_Outcome,ADHD_Outcome,Sex_F,Sex_F
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
00aIpNTbG5uh,1,1,0,0
00fV0OyyoLfw,1,1,0,0
04X1eiS79T4B,0,0,1,1
05ocQutkURd6,0,0,1,1
06YUNBA9ZRLq,1,1,0,0
...,...,...,...,...
zwjJWCRzKhDz,0,0,1,1
zwXD5v17Rx01,1,1,0,0
zWzLCi3NTBTd,1,1,1,1
Zy9GTHDxUbXU,1,1,0,0


In [63]:
test_pred      = final_model.predict(X_test_clean)


In [64]:
test_pred

array([[1, 1, 0, 0],
       [1, 1, 0, 0],
       [1, 1, 0, 0],
       ...,
       [0, 0, 1, 1],
       [0, 0, 0, 0],
       [0, 0, 0, 0]])

In [67]:
import numpy as np
import pandas as pd

# ----------------------------------------------------------
# test_pred : numpy array returned by final_model.predict()
# X_test_clean : DataFrame you passed to predict(), keep its index
# ----------------------------------------------------------

# find the set of distinct prediction columns
# (works even if duplicates are not contiguous)
_, unique_idx = np.unique(test_pred.T, axis=0, return_index=True)
unique_idx = sorted(unique_idx)         # keep left‑to‑right order

# slice out the non‑duplicate columns
pred_unique = test_pred[:, unique_idx]   # shape = (n_samples, 2)

# build the final DataFrame
pred_df = pd.DataFrame(
    pred_unique,
    index=X_test_clean.index,
    columns=["ADHD_Outcome", "Sex_F"]    # desired column names
)

print(pred_df.head())


                ADHD_Outcome  Sex_F
participant_id                     
Cfwaf5FX7jWK               1      0
vhGrzmvA3Hjq               1      0
ULliyEXjy4OV               1      0
LZfeAb1xMtql               1      1
EnFOUv0YK1RG               1      0


In [68]:
pred_df

Unnamed: 0_level_0,ADHD_Outcome,Sex_F
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1
Cfwaf5FX7jWK,1,0
vhGrzmvA3Hjq,1,0
ULliyEXjy4OV,1,0
LZfeAb1xMtql,1,1
EnFOUv0YK1RG,1,0
...,...,...
UadZfjdEg7eG,1,0
IUEHiLmQAqCi,1,0
cRySmCadYFRO,0,1
E3MvDUtJadc5,0,0


In [None]:
pred_df.to_csv("predictions.csv", index=True)      # index=True keeps the IDs