In [58]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


# ---------------------------
# 1. LOAD DATA
# ---------------------------
# One workbook per source: the main per-subject table plus three task tables.
df_main, df_enum, df_sym, df_wm = (
    pd.read_excel(workbook)
    for workbook in (
        "dataset/Complete_Dataframe_def.xlsx",
        "dataset/Exact_enumeration_task.xlsx",
        "dataset/Symbolic_comparison_task.xlsx",
        "dataset/Visuo_spatial_WM_task.xlsx",
    )
)


# ---------------------------
# 2. CLEAN SUBJECT IDS
# ---------------------------
def clean_sub(df):
    """Normalise the ``Sub`` identifier column of *df* in place.

    Every non-missing identifier is converted to a string, stripped of
    surrounding whitespace and upper-cased. Missing identifiers are kept
    missing rather than being turned into a literal string such as "NAN",
    which would otherwise behave like a real subject ID in later
    group-bys and merges.

    Args:
        df: DataFrame containing a ``Sub`` column. It is modified in place.

    Returns:
        The same DataFrame object, to allow ``df = clean_sub(df)``.
    """
    ids = df["Sub"]
    normalised = ids.astype(str).str.strip().str.upper()
    # Mask back the positions that were missing before stringification.
    df["Sub"] = normalised.where(ids.notna())
    return df

# Apply the same ID normalisation to the main table and every task table.
df_main, df_enum, df_sym, df_wm = map(
    clean_sub, (df_main, df_enum, df_sym, df_wm)
)


# ---------------------------
# 3. SAFE NUMERIC CONVERSION
# ---------------------------
def convert_numeric_safe(df):
    """Convert text columns holding decimal-comma numbers to numeric, in place.

    For every object- or string-typed column, a candidate copy is built in
    which values are stringified and "," is replaced by ".". If the whole
    candidate parses with ``pd.to_numeric`` the column is replaced by the
    numeric result; otherwise the column is left exactly as it was, so
    genuine text columns are not altered.

    Missing values are kept missing while parsing, so a numeric column with
    gaps still converts (the gaps become NaN).

    NOTE: every text column is considered, including identifier columns;
    an ID column made only of digits will therefore become numeric.

    Args:
        df: DataFrame to convert. It is modified in place.

    Returns:
        The same DataFrame object, to allow ``df = convert_numeric_safe(df)``.
    """
    for col in df.columns:
        column = df[col]
        is_text = (
            pd.api.types.is_object_dtype(column)
            or pd.api.types.is_string_dtype(column)
        )
        if not is_text:
            continue

        # Build the candidate separately so a failed parse cannot leave the
        # original column stringified or with its commas rewritten.
        candidate = (
            column.astype(str)
            .str.replace(",", ".", regex=False)
            .astype(object)
            .where(column.notna())
        )
        try:
            converted = pd.to_numeric(candidate)
        except (ValueError, TypeError):
            # Not a numeric column: keep the original values untouched.
            continue
        df[col] = converted
    return df

# Only the task tables go through numeric conversion; df_main is left as read.
df_enum, df_sym, df_wm = map(convert_numeric_safe, (df_enum, df_sym, df_wm))

def fix_sub(df):
    """Re-stringify the ``Sub`` identifier column of *df* in place.

    Every non-missing identifier is converted to a string and stripped of
    surrounding whitespace. Missing identifiers are kept missing instead
    of becoming the literal string "nan", which would otherwise be treated
    as a real subject ID when grouping or merging on ``Sub``.

    Args:
        df: DataFrame containing a ``Sub`` column. It is modified in place.

    Returns:
        The same DataFrame object, to allow ``df = fix_sub(df)``.
    """
    ids = df["Sub"]
    # Mask back the positions that were missing before stringification.
    df["Sub"] = ids.astype(str).str.strip().where(ids.notna())
    return df

# Bring Sub back to a stripped string in every table so the merge keys agree.
df_main, df_enum, df_sym, df_wm = map(
    fix_sub, (df_main, df_enum, df_sym, df_wm)
)


# ---------------------------
# 4. AGGREGATE TASK DATA
# ---------------------------
def aggregate_task(df):
    """Collapse a task table to one row per subject.

    Each numeric column other than ``Sub`` is averaged within each ``Sub``
    value; non-numeric columns are dropped.

    Args:
        df: Task DataFrame containing a ``Sub`` column.

    Returns:
        A new DataFrame with ``Sub`` as a regular column followed by the
        per-subject means.
    """
    numeric_names = df.select_dtypes(include="number").columns
    # Sub is the grouping key, never a value to average.
    value_cols = numeric_names[numeric_names != "Sub"]
    per_subject_mean = df.groupby("Sub")[value_cols].mean()
    return per_subject_mean.reset_index()

# Reduce each task table to per-subject means before merging.
df_enum_agg, df_sym_agg, df_wm_agg = map(
    aggregate_task, (df_enum, df_sym, df_wm)
)



# ---------------------------
# 5. MERGE (SAFE)
# ---------------------------
# Left joins keep every row of df_main; subjects with no rows in a task table
# get NaN for that task's columns, which the median imputer below fills in.
df = (
    df_main
    .merge(df_enum_agg, on="Sub", how="left")
    .merge(df_sym_agg,  on="Sub", how="left")
    .merge(df_wm_agg,   on="Sub", how="left")
)

print(df.shape)


# ---------------------------
# 6. LABEL ENCODING
# ---------------------------
df["group"] = df["group"].astype(str).str.strip().str.lower()
df["label"] = df["group"].map({"contr": 0, "dd": 1})

# Series.map yields NaN for any group value missing from the mapping, and
# value_counts() hides NaN by default. Fail here with the offending values
# instead of letting NaN labels reach cross-validation.
_unmapped_groups = sorted(df.loc[df["label"].isna(), "group"].unique())
if _unmapped_groups:
    raise ValueError(
        f"Unrecognised group value(s) {_unmapped_groups}; "
        "expected 'contr' or 'dd'."
    )

print(df["label"].value_counts())


# ---------------------------
# 7. FEATURES & TARGET
# ---------------------------
# Every remaining column is used as a feature; only the ID, the raw group
# string and the encoded label are removed.
X = df.drop(columns=["Sub", "group", "label"])
y = df["label"]


# ---------------------------
# 8. PIPELINE + CV
# ---------------------------
# Imputer and scaler are inside the pipeline, so cross_val_score refits them
# on each training fold.
pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("model", RandomForestClassifier(
        n_estimators=300,
        max_depth=12,
        random_state=42,
        class_weight="balanced"
    ))
])

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_val_score(pipeline, X, y, cv=cv, scoring="accuracy")

print("Fold scores:", scores)
print("Mean accuracy:", scores.mean())



(64, 31)
label
1    32
0    32
Name: count, dtype: int64
Fold scores: [0.69230769 0.84615385 0.84615385 0.53846154 0.75      ]
Mean accuracy: 0.7346153846153846
