## Pipeline fMRI dataset for ML

In [None]:
# LOAD CORE LIBRARIES (FMRI + ML + DATA)
# Neuro / ML / data handling
import nilearn
import sklearn
import pandas as pd

# File-system utilities
import os
import glob

# Report the versions of the libraries imported above so that results can be
# tied to a specific environment.
print("Nilearn:", nilearn.__version__)
print("scikit-learn:", sklearn.__version__)
print("pandas:", pd.__version__)
# The accented character was mis-encoded in the original message.
print("OK, todo cargó")

In [None]:
# SET DATASET ROOT DIRECTORY
#
# The root can be overridden with the DS101_ROOT environment variable so the
# notebook also runs on machines where the dataset lives elsewhere. When the
# variable is not set, the original local path is used.
base_path = os.environ.get(
    "DS101_ROOT",
    "/Users/antoniaolgui/Library/Mobile Documents/com~apple~CloudDocs/Desktop/ds101_R2.0.0/",
)

# Fail early with a readable message if the dataset is not where we expect it.
if not os.path.isdir(base_path):
    raise FileNotFoundError(f"Dataset root not found: {base_path!r}")

# Sorted list of all subject folders in the dataset (sub-01, sub-02, ...).
# Only directories are kept, so a stray file whose name starts with "sub-"
# is not mistaken for a subject. Only the last expression of a cell is
# displayed, so the folder contents are shown through `subjects`.
subjects = sorted(
    entry
    for entry in os.listdir(base_path)
    if entry.startswith("sub-") and os.path.isdir(os.path.join(base_path, entry))
)
subjects

In [None]:
# COLLECT THE DISTINCT trial_type VALUES FOUND ACROSS ALL EVENT FILES
#    (the displayed set shows how congruency and correctness are encoded)

# Event files are searched under <base_path>/sub-*/func/
pattern = os.path.join(base_path, "sub-*", "func", "*_events.tsv")
files = sorted(glob.glob(pattern))

# Union of the unique trial_type values of every events file
trial_types = set().union(
    *(pd.read_csv(path, sep="\t")["trial_type"].unique() for path in files)
)

trial_types

In [None]:
# BUILD MASTER BEHAVIORAL DATAFRAME
#    - Reads all *_events.tsv files from all subjects
#    - Extracts congruency (congruent / incongruent)
#    - Extracts accuracy (correct / incorrect)
#    - Creates binary ML label (0 = congruent, 1 = incongruent)

pattern = os.path.join(base_path, "sub-*", "func", "*_events.tsv")
files = sorted(glob.glob(pattern))

print("Number of event files found:", len(files))

# pd.concat raises an unhelpful "No objects to concatenate" error on an empty
# list, so stop here with a message that points at the real cause.
if not files:
    raise FileNotFoundError(f"No event files matched {pattern!r}; check base_path.")

# Columns kept in the master table, spelled exactly as this notebook expects
# them (including "Rsponse").
columns_to_keep = [
    "subject",
    "run",
    "trial",
    "trial_type",
    "congruency",
    "accuracy_label",
    "correctness",
    "StimVar",
    "behav_unlabeled",
    "Rsponse",
    "Stimulus",
    "cond",
    "label",
]

all_rows = []

for f in files:
    df = pd.read_csv(f, sep="\t")

    # Files are matched as <base_path>/sub-XX/func/<events file>. os.path is
    # used instead of splitting on "/" so the parsing does not depend on the
    # platform's path separator.
    func_dir, run = os.path.split(f)
    subject = os.path.basename(os.path.dirname(func_dir))

    df["subject"] = subject
    df["run"] = run

    # Continuous trial number within each run (1-based)
    df["trial"] = range(1, len(df) + 1)

    # trial_type format: "congruent_correct", "incongruent_incorrect".
    # Split once and reuse both pieces.
    trial_parts = df["trial_type"].str.split("_")
    df["congruency"] = trial_parts.str[0]
    df["accuracy_label"] = trial_parts.str[1]

    # Binary label for ML; congruency values outside the mapping become NaN
    df["label"] = df["congruency"].map({"congruent": 0, "incongruent": 1})

    # Keep only relevant columns
    df_clean = df[columns_to_keep]

    all_rows.append(df_clean)

behaviour = pd.concat(all_rows, ignore_index=True)

behaviour.head(20)

In [None]:
# WRITE THE CLEAN TABLE TO DISK + PRINT BASIC DESCRIPTIVE STATS

behaviour.to_csv("simon_behaviour_clean.csv", index=False)

print("DataFrame shape:", behaviour.shape)

# Heading -> column whose value counts are printed under that heading
count_reports = {
    "\nTrials per congruency:": "congruency",
    "\nTrials per subject:": "subject",
}
for heading, column in count_reports.items():
    print(heading)
    print(behaviour[column].value_counts())

### Summarized tables

In [None]:
# Show every trial recorded for a single subject (sub-01)

is_sub01 = behaviour["subject"].eq("sub-01")
behaviour.loc[is_sub01]

In [None]:
# Per-subject summary: number of trials plus the congruency distribution

def _congruency_counts(values):
    """Return a {congruency value: number of trials} dict for one subject."""
    return values.value_counts().to_dict()


behaviour.groupby("subject").agg({
    "trial": "count",
    "congruency": _congruency_counts,
})

In [None]:
# Tidy master table with ALL subjects, ordered by subject, then run, then trial
sort_keys = ["subject", "run", "trial"]
behaviour.sort_values(by=sort_keys)

# Machine Learning Pipeline - Logistic Regression (scikit-learn)

In [None]:
# Reload the clean behavioural table from the CSV file so the ML section
# starts from what is stored on disk.
behaviour = pd.read_csv(filepath_or_buffer="simon_behaviour_clean.csv")

In [None]:
## SKLEARN WITH AN 80-20 SPLIT
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np

# Features and label (same as before)
# NOTE(review): some of these columns (e.g. cond, StimVar) may encode the
# congruency condition itself - confirm they do not leak the label.
feature_columns = ["correctness", "Rsponse", "Stimulus", "cond", "StimVar", "accuracy_label"]
X = behaviour.loc[:, feature_columns]
y = behaviour.loc[:, "label"]  # 0 = congruent, 1 = incongruent

# Groups = subject (used to keep each subject's trials on one side of a split)
groups = behaviour.loc[:, "subject"]

In [None]:
import numpy as np

# Every feature is treated as categorical in this case
categorical_features = list(X.columns)

one_hot = OneHotEncoder(handle_unknown="ignore")
preprocess = ColumnTransformer(
    transformers=[("cat", one_hot, categorical_features)]
)

# One-hot encoding followed by logistic regression
clf = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", LogisticRegression(max_iter=1000)),
])

# ==== GroupShuffleSplit: splits by subject ====
gss = GroupShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

accuracies = []

# Splits are numbered from 1 for the printed report
for split_id, (train_idx, test_idx) in enumerate(gss.split(X, y, groups), start=1):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # The pipeline is refitted from scratch on each split's training rows
    y_pred = clf.fit(X_train, y_train).predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    accuracies.append(acc)

    print(f"Split {split_id} - Accuracy: {acc:.3f}")

print("\nMean accuracy across splits:", np.mean(accuracies))

In [None]:
## SKLEARN WITH JUST THE 21 SUBJECTS, BASICALLY PERFECT
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Work on a copy of the dataframe for ML
df_ml = behaviour.copy()

# Features and label
feature_columns = ["correctness", "Rsponse", "Stimulus", "cond", "StimVar", "accuracy_label"]
X = df_ml.loc[:, feature_columns]
y = df_ml.loc[:, "label"]   # 0 = congruent, 1 = incongruent

# ALL features are one-hot encoded as categorical variables
categorical_features = list(X.columns)
preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

# Encoder + logistic regression chained in one estimator
clf = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", LogisticRegression(max_iter=1000)),
])

# Stratified 80/20 split over individual rows.
# NOTE(review): no grouping is passed here, so rows of the same subject may
# land in both the train and the test set - confirm this is intended.
split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split

# Fit on the training rows, then predict the held-out rows
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Machine Learning Pipeline - Random Forest

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Reuses X_train, X_test, y_train, y_test from the train/test split cell above.

rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    random_state=42,
    class_weight="balanced"
)

# The feature table contains string categories (accuracy_label is built from
# a string split of trial_type), which a random forest cannot fit directly.
# One-hot encode every column first, exactly as the logistic-regression
# pipeline does. `rf` is the pipeline's final step, so it is the object that
# gets fitted and remains available for inspection afterwards.
rf_clf = Pipeline(steps=[
    ("preprocess", ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), X_train.columns.tolist()),
        ]
    )),
    ("model", rf),
])

rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)

# The leading emoji was mis-encoded in the original header string.
print("🔵 RANDOM FOREST RESULTS")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))

# Visualization of brain images in fMRI

In [None]:
# nilearn modules used by the cells below: `image` to load and process
# images, `plotting` to draw them.
from nilearn import image, plotting
# IPython magic selecting matplotlib's Qt backend.
# NOTE(review): presumably this opens figures in separate windows and needs a
# Qt binding installed - confirm it works in the target environment.
%matplotlib qt

In [None]:
# Path to the run-1 BOLD file of sub-01 for the Simon task. os.path.join is
# used (as for the events files) so the result is well-formed whether or not
# base_path ends with a path separator.
fmri_file = os.path.join(
    base_path, "sub-01", "func", "sub-01_task-simon_run-1_bold.nii"
)
img = image.load_img(fmri_file)

In [None]:
# Display the dimensions of the loaded image
# (presumably x, y, z, time for a BOLD series - TODO confirm).
img.shape

In [None]:
# Load the run-1 BOLD file of sub-01 again so this cell can run on its own.
# os.path.join is used (as for the events files) so the path is well-formed
# whether or not base_path ends with a path separator.
fmri_file = os.path.join(
    base_path, "sub-01", "func", "sub-01_task-simon_run-1_bold.nii"
)
img = image.load_img(fmri_file)

# 1) Temporal mean -> 3D image
mean_img = image.mean_img(img)

# 2) Plot the mean image in the three orthogonal views
plotting.plot_epi(mean_img, display_mode="ortho",
                  title="sub-01 Run 1 - Mean BOLD")
plotting.show()

In [None]:
# Open nilearn's interactive viewer on the mean image computed in the
# previous cell.
# NOTE(review): threshold=None presumably disables thresholding so that all
# voxels are drawn - confirm against the nilearn documentation.
plotting.view_img(mean_img, threshold=None)