In [12]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score



In [13]:
# Load the main table and the three task workbooks, in that order.
df_main, df_enum, df_sym, df_wm = (
    pd.read_excel(workbook)
    for workbook in (
        "dataset/Complete_Dataframe_def.xlsx",
        "dataset/Exact_enumeration_task.xlsx",
        "dataset/Symbolic_comparison_task.xlsx",
        "dataset/Visuo_spatial_WM_task.xlsx",
    )
)


SAFE NUMERIC CONVERSION

In [3]:
def convert_numeric_safe(df):
    """Convert text columns that hold decimal-comma numbers to numeric dtype.

    Each object/string column is copied, its commas are replaced with dots,
    and the copy is parsed with ``pd.to_numeric``. The column is overwritten
    only when the whole column parses; a column that is genuinely text is left
    exactly as it was (commas and missing values untouched).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for col in df.columns:
        column = df[col]
        # Cover both classic object columns and dedicated string dtypes.
        is_text = (
            pd.api.types.is_object_dtype(column)
            or pd.api.types.is_string_dtype(column)
        )
        if not is_text:
            continue

        candidate = column.astype(str).str.replace(",", ".", regex=False)
        # astype(str) turns missing cells into the text "nan"/"None"; mask them
        # back to missing so they neither block parsing nor leak into the data.
        candidate = candidate.where(column.notna())

        try:
            converted = pd.to_numeric(candidate)
        except (ValueError, TypeError):
            # Not a numeric column: keep the original values unchanged.
            continue
        df[col] = converted
    return df

# Apply the decimal-comma conversion to each task table (the main table is not converted here).
df_enum, df_sym, df_wm = (
    convert_numeric_safe(task_df) for task_df in (df_enum, df_sym, df_wm)
)

In [4]:
# --- Force Sub to string and clean it in ALL datasets ---
def clean_sub(df):
    """Normalise the ``Sub`` identifier column so it can be used as a join key.

    Values are cast to text, stripped of surrounding whitespace and
    upper-cased. Missing identifiers stay missing instead of becoming the
    literal key "NAN"/"NONE", so they cannot form a group of their own or
    match each other across tables.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Sub`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    raw_ids = df["Sub"]
    normalised = raw_ids.astype(str).str.strip().str.upper()
    # Restore missing ids that astype(str) would otherwise turn into real text.
    df["Sub"] = normalised.where(raw_ids.notna())
    return df

# Normalise the join key the same way in the main table and every task table.
df_main, df_enum, df_sym, df_wm = (
    clean_sub(table) for table in (df_main, df_enum, df_sym, df_wm)
)


AGGREGATE TASK DATA

In [5]:
def aggregate_task(df):
    """Collapse a task table to one row per subject.

    Every numeric column other than ``Sub`` is averaged within each ``Sub``
    group; non-numeric columns are dropped. ``Sub`` is returned as a regular
    column rather than as the index.
    """
    feature_names = [
        name
        for name in df.select_dtypes(include="number").columns
        if name != "Sub"
    ]
    per_subject_means = df.groupby("Sub")[feature_names].mean()
    return per_subject_means.reset_index()

# One row of per-subject means for each task table.
df_enum_agg, df_sym_agg, df_wm_agg = map(
    aggregate_task, (df_enum, df_sym, df_wm)
)

MERGE ALL DATA

In [6]:
# Rebuild the merged dataframe from the main table on every run.
# Left joins keep every row of df_main. validate="one_to_one" makes pandas
# raise MergeError if "Sub" is duplicated on either side, instead of silently
# multiplying rows for a subject.
df = (
    df_main
    .merge(df_enum_agg, on="Sub", how="left", validate="one_to_one")
    .merge(df_sym_agg,  on="Sub", how="left", validate="one_to_one")
    .merge(df_wm_agg,   on="Sub", how="left", validate="one_to_one")
)

print(df.shape)
print(df["group"].value_counts(dropna=False))



(64, 31)
group
DD       32
contr    32
Name: count, dtype: int64


CLEAN LABEL

In [7]:
# Normalise the group spelling, then encode it as the binary target:
# control -> 0, dyscalculia ("dd") -> 1.
df["group"] = df["group"].astype(str).str.strip().str.lower()

df["label"] = df["group"].map({
    "contr": 0,
    "dd": 1
})

# .map() returns NaN for any spelling outside the dict and value_counts()
# would hide those rows, so fail loudly instead of training on a NaN target.
if df["label"].isna().any():
    raise ValueError(
        "Unmapped group values: "
        f"{sorted(df.loc[df['label'].isna(), 'group'].unique())}"
    )

print(df["label"].value_counts())


label
1    32
0    32
Name: count, dtype: int64


In [8]:
# Target is the 0/1 label; features are every remaining column once the
# subject id, the raw group text and the label itself are removed.
y = df["label"]
X = df.drop(["Sub", "group", "label"], axis=1)

print(X.shape, y.shape)


(64, 29) (64,)


In [9]:
# Sanity checks before modelling: sample size, rows per subject, class balance.
print("Students:", df.shape[0])
print("Max rows per student:", df["Sub"].value_counts().max())
print("Class distribution:\n", y.value_counts())

# The cross-validation in this notebook splits by row, so a subject appearing
# twice could land in both the training and the test fold. Enforce one row
# per subject rather than only printing it.
assert not df["Sub"].duplicated().any(), "Expected exactly one row per subject"

Students: 64
Max rows per student: 1
Class distribution:
 label
1    32
0    32
Name: count, dtype: int64


In [10]:
# Random-forest pipeline: median-impute missing features, standardise them,
# then fit a class-weighted forest with a fixed seed.
pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        (
            "model",
            RandomForestClassifier(
                class_weight="balanced",
                random_state=42,
                n_estimators=300,
            ),
        ),
    ]
)


CROSS VALIDATION RANDOM FOREST

In [11]:
# Shuffled, stratified 5-fold splitter with a fixed seed.
cv = StratifiedKFold(random_state=42, shuffle=True, n_splits=5)

# Accuracy of the random-forest pipeline on each held-out fold.
scores = cross_val_score(pipeline, X, y, scoring="accuracy", cv=cv)

print("Fold scores:", scores)
print("Mean accuracy:", np.mean(scores))

Fold scores: [0.69230769 0.84615385 0.84615385 0.53846154 0.75      ]
Mean accuracy: 0.7346153846153846


CROSS VALIDATION SVM 

In [17]:
from sklearn.svm import SVC

# RBF-kernel SVM behind the same preprocessing as the random forest:
# median imputation followed by standardisation.
svm_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        (
            "model",
            SVC(
                C=1.0,
                gamma=0.01,
                kernel="rbf",
                random_state=42,
                class_weight="balanced",
            ),
        ),
    ]
)

# Score with the same `cv` splitter object used for the random forest.
scores_svm = cross_val_score(svm_pipeline, X, y, scoring="accuracy", cv=cv)

print("SVM Fold scores:", scores_svm)
print("SVM Mean accuracy:", np.mean(scores_svm))


SVM Fold scores: [0.84615385 0.76923077 0.84615385 0.84615385 0.75      ]
SVM Mean accuracy: 0.8115384615384617


PRECISION, RECALL, F1

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# ---------------------------
# FINAL TRAIN–TEST SPLIT
# ---------------------------
# Stratified 80/20 hold-out with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# Fit the SVM pipeline on the training portion only. Its hyperparameters are
# the fixed values set where svm_pipeline is defined; no search is run here.
svm_pipeline.fit(X_train, y_train)

# Predictions
y_pred = svm_pipeline.predict(X_test)

# ---------------------------
# METRICS
# ---------------------------
# Pass the label order explicitly so report rows and confusion-matrix
# rows/columns are always [0 = Control, 1 = Dyscalculia], even if a class is
# absent from the predictions.
print("Classification Report:\n")
print(classification_report(
    y_test,
    y_pred,
    labels=[0, 1],
    target_names=["Control", "Dyscalculia"]
))

print("Confusion Matrix:\n")
print(confusion_matrix(y_test, y_pred, labels=[0, 1]))


Classification Report:

              precision    recall  f1-score   support

     Control       0.80      0.57      0.67         7
 Dyscalculia       0.62      0.83      0.71         6

    accuracy                           0.69        13
   macro avg       0.71      0.70      0.69        13
weighted avg       0.72      0.69      0.69        13

Confusion Matrix:

[[4 3]
 [1 5]]
