In [147]:
import os
import numpy as np
import pandas as pd
import scipy
import umap.umap_ as umap
import seaborn as sns
import matplotlib.pyplot as plt
from gc import collect
from matplotlib.ticker import MaxNLocator
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import ExtraTreesRegressor, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import RFECV, SequentialFeatureSelector, SelectKBest
from sklearn.impute import SimpleImputer
from sklearn.kernel_approximation import Nystroem
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.base import clone
from sklearn.metrics import f1_score, roc_auc_score, roc_curve, make_scorer, r2_score, accuracy_score, recall_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, FunctionTransformer, PolynomialFeatures, MinMaxScaler
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler


In [148]:
preprocessed_data = "../widsdatathon2025/Preprocessed/preprocessed_selected_features"
aux_file_name = "aux.csv"
connectome_matrices_file_name = "connectome_matrices.csv"

def get_feats(mode="train"):

    feats = pd.read_csv(f"{preprocessed_data}/{mode}/{aux_file_name}")
    # conns = pd.read_csv(f"{preprocessed_data}/{mode}/{connectome_matrices_file_name}")
    # feats = feats.merge(conns, on="participant_id", how="left")

    if mode == "train":
        labels = pd.read_csv(f"{preprocessed_data}/{mode}/labels.csv")
        feats = feats.merge(labels, on="participant_id", how="left")
        return feats, labels

    return feats


train,y = get_feats(mode="train")
test = get_feats(mode="test")

In [149]:
train.set_index('participant_id',inplace=True)
test.set_index('participant_id',inplace=True)
targets = ['ADHD_Outcome','Sex_F']
features = test.columns

In [150]:
def check_for_nulls(df):
  if df.isnull().any().any():
    print("The DataFrame contains null values.")
  else:
    print("The DataFrame does not contain null values.")

In [151]:
check_for_nulls(train)
check_for_nulls(test)
print(f'Train: {train.shape}, Test: {test.shape}')

The DataFrame does not contain null values.
The DataFrame does not contain null values.
Train: (1213, 65), Test: (1213, 63)


In [152]:
X_train, X_test, y_train, y_test = train_test_split(train.drop(targets,axis=1), 
                                                    y[targets], 
                                                    test_size=0.2, 
                                                    random_state=42)
log_features = [f for f in features if (train[f] >= 0).all() and scipy.stats.skew(train[f]) > 0]


## **Models**

### **Ridge Classifier**

In [None]:
model = MultiOutputClassifier(make_pipeline(RidgeClassifier(alpha=100)))
model.fit(X_train,y_train)
y_pred = model.predict(X_test)



sensitivity = recall_score(y_test, y_pred, average=None)  

specificities = []
for i in range(y_test.shape[1]):
    cm = confusion_matrix(y_test.iloc[:, i], y_pred[:, i])  
    if cm.shape == (2, 2):  
        tn, fp, fn, tp = cm.ravel()
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0  
    else:
        specificity = 0 
    specificities.append(specificity)

# Print Results
print('\nRidge Classifier Results:')
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Recall (Sensitivity):', sensitivity.mean())  # Mean sensitivity across labels
print('Specificity:', np.mean(specificities))  # Mean specificity across labels
print('F1 Score:', f1_score(y_test, y_pred, average='micro'))


Ridge Classifier Results:
Accuracy: 0.6008230452674898
Recall (Sensitivity): 0.48447253433208487
Specificity: 0.7278452541610436
F1 Score: 0.7483588621444202


### **Logistic Regression**

In [None]:
model = MultiOutputClassifier(make_pipeline(LogisticRegression(max_iter=10000)))

model.fit(X_train, y_train)

y_pred = model.predict(X_test)


sensitivity = recall_score(y_test, y_pred, average=None)  

specificities = []
for i in range(y_test.shape[1]):  
    cm = confusion_matrix(y_test.iloc[:, i], y_pred[:, i])  
    if cm.shape == (2, 2): 
        tn, fp, fn, tp = cm.ravel()
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0  
    else:
        specificity = 0 
    specificities.append(specificity)

# Print Results
print('\nLogistic Regression Results:')
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Recall (Sensitivity):', sensitivity.mean()) 
print('Specificity:', np.mean(specificities)) 
print('F1 Score:', f1_score(y_test, y_pred, average='micro'))



Logistic Regression Results:
Accuracy: 0.5843621399176955
Recall (Sensitivity): 0.6271847690387016
Specificity: 0.7199730094466936
F1 Score: 0.7551020408163265


### **Random Forest**

In [None]:
rf_model = MultiOutputClassifier(make_pipeline(RandomForestClassifier(n_estimators=100, random_state=42))) 

rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)


sensitivity = recall_score(y_test, y_pred, average=None)  

specificities = []
for i in range(y_test.shape[1]):  
    cm = confusion_matrix(y_test.iloc[:, i], y_pred[:, i])  
    if cm.shape == (2, 2):  
        tn, fp, fn, tp = cm.ravel()
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0 
    else:
        specificity = 0  
    specificities.append(specificity)

# Print Results
print('\nRandom Forest Results:')
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Recall (Sensitivity):', sensitivity.mean())  
print('Specificity:', np.mean(specificities))  
print('F1 Score:', f1_score(y_test, y_pred, average='micro'))



Random Forest Results:
Accuracy: 0.5761316872427984
Recall (Sensitivity): 0.5021847690387016
Specificity: 0.7506072874493928
F1 Score: 0.734065934065934


### **Kernel SVM**

In [None]:
svm_model = MultiOutputClassifier(make_pipeline(SVC(kernel='rbf', probability=True, random_state=42)  ))

svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)

sensitivity = recall_score(y_test, y_pred, average=None) 

specificities = []
for i in range(y_test.shape[1]):  
    cm = confusion_matrix(y_test.iloc[:, i], y_pred[:, i])  
    if cm.shape == (2, 2):  
        tn, fp, fn, tp = cm.ravel()
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0 
    else:
        specificity = 0  
    specificities.append(specificity)

print('\nKernel SVM Results:')
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Recall (Sensitivity):', sensitivity.mean()) 
print('Specificity:', np.mean(specificities))  
print('F1 Score:', f1_score(y_test, y_pred, average='micro'))



Kernel SVM Results:
Accuracy: 0.6008230452674898
Recall (Sensitivity): 0.47042759051186017
Specificity: 0.7970760233918128
F1 Score: 0.7494356659142212


In [None]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.multioutput import MultiOutputClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score, confusion_matrix

xgb_model = MultiOutputClassifier(make_pipeline(

    XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
))

xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)

sensitivity = recall_score(y_test, y_pred, average=None)

specificities = []
for i in range(y_test.shape[1]):  
    cm = confusion_matrix(y_test.iloc[:, i], y_pred[:, i])  
    if cm.shape == (2, 2):  
        tn, fp, fn, tp = cm.ravel()
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0 
    else:
        specificity = 0  
    specificities.append(specificity)

# Print evaluation metrics
print('\nXGBoost Results:')
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Recall (Sensitivity):', sensitivity.mean()) 
print('Specificity:', np.mean(specificities)) 
print('F1 Score:', f1_score(y_test, y_pred, average='micro'))



XGBoost Results:
Accuracy: 0.5102880658436214
Recall (Sensitivity): 0.6090043695380774
Specificity: 0.6591992802519118
F1 Score: 0.7258064516129032


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



In [170]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, f1_score
import pandas as pd
import numpy as np

preprocessed_data = "../widsdatathon2025/Preprocessed/preprocessed_selected_features"
aux_file_name = "aux.csv"
labels_file_name = "labels.csv"

# Load data
def get_feats(mode="train"):
    feats = pd.read_csv(f"{preprocessed_data}/{mode}/{aux_file_name}")
    if mode == "train":
        labels = pd.read_csv(f"{preprocessed_data}/{mode}/{labels_file_name}")
        feats = feats.merge(labels, on="participant_id", how="left")
        return feats
    return feats

train = get_feats(mode="train")

# Define targets explicitly
targets = ['ADHD_Outcome', 'Sex_F']

# Prepare features and labels
X = train.drop(['participant_id'] + targets, axis=1).values
y = train[targets].values

# Define neural network
class NeuralNetwork(nn.Module):
    def __init__(self, input_dim, layer_dims, output_dim1, output_dim2):
        super(NeuralNetwork, self).__init__()
        layers = []
        prev_dim = input_dim
        for dim in layer_dims:
            layers.append(nn.Linear(prev_dim, dim))
            layers.append(nn.ReLU())
            prev_dim = dim
        self.shared_layers = nn.Sequential(*layers)
        self.output1 = nn.Linear(prev_dim, output_dim1)
        self.output2 = nn.Linear(prev_dim, output_dim2)
        
    def forward(self, x):
        shared_output = self.shared_layers(x)
        return self.output1(shared_output), self.output2(shared_output)

# Parameters
layer_dims = [128 ,64 , 32]
input_dim = X.shape[1]
output_dims = [len(np.unique(y[:, 0])), len(np.unique(y[:, 1]))]
kf = KFold(n_splits=5, shuffle=True, random_state=42)

fold_metrics = {'accuracy': [], 'recall': [], 'f1': []}

# K-Fold Cross-Validation
for fold, (train_index, test_index) in enumerate(kf.split(X)):
    print(f"Fold {fold+1}")
    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Data normalization
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Convert to PyTorch tensors
    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    y_train_tensor_1 = torch.tensor(y_train[:, 0], dtype=torch.long)
    y_train_tensor_2 = torch.tensor(y_train[:, 1], dtype=torch.long)
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
    y_test_tensor_1 = torch.tensor(y_test[:, 0], dtype=torch.long)
    y_test_tensor_2 = torch.tensor(y_test[:, 1], dtype=torch.long)

    # Initialize model, criterion, and optimizer
    model = NeuralNetwork(input_dim, layer_dims, output_dims[0], output_dims[1])
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Training loop
    epochs = 50
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        outputs1, outputs2 = model(X_train_tensor)
        loss = criterion(outputs1, y_train_tensor_1) + criterion(outputs2, y_train_tensor_2)
        loss.backward()
        optimizer.step()

    # Evaluate the model
    model.eval()
    with torch.no_grad():
        pred1, pred2 = model(X_test_tensor)
        pred_classes1 = torch.argmax(pred1, axis=1)
        pred_classes2 = torch.argmax(pred2, axis=1)

    accuracy = (accuracy_score(y_test_tensor_1, pred_classes1) + accuracy_score(y_test_tensor_2, pred_classes2)) / 2
    recall = (recall_score(y_test_tensor_1, pred_classes1, average='weighted') + recall_score(y_test_tensor_2, pred_classes2, average='weighted')) / 2
    f1 = (f1_score(y_test_tensor_1, pred_classes1, average='weighted') + f1_score(y_test_tensor_2, pred_classes2, average='weighted')) / 2

    fold_metrics['accuracy'].append(accuracy)
    fold_metrics['recall'].append(recall)
    fold_metrics['f1'].append(f1)

    print(f'Accuracy: {accuracy * 100:.2f}%')
    print(f'Recall: {recall * 100:.2f}%')
    print(f'F1 Score: {f1 * 100:.2f}%')

# Cross-validation results
print("\nCross-validation results:")
print(f'Mean Accuracy: {np.mean(fold_metrics["accuracy"]) * 100:.2f}%')
print(f'Mean Recall: {np.mean(fold_metrics["recall"]) * 100:.2f}%')
print(f'Mean F1 Score: {np.mean(fold_metrics["f1"]) * 100:.2f}%')


Fold 1
Accuracy: 74.69%
Recall: 74.69%
F1 Score: 69.39%
Fold 2
Accuracy: 67.90%
Recall: 67.90%
F1 Score: 60.66%
Fold 3
Accuracy: 73.87%
Recall: 73.87%
F1 Score: 72.50%
Fold 4
Accuracy: 70.25%
Recall: 70.25%
F1 Score: 63.09%
Fold 5
Accuracy: 72.73%
Recall: 72.73%
F1 Score: 66.83%

Cross-validation results:
Mean Accuracy: 71.89%
Mean Recall: 71.89%
Mean F1 Score: 66.49%
