## Pipeline fMRI dataset for ML

In [None]:
# LOAD CORE LIBRARIES (FMRI + ML + GLM + DATA)
import os
import glob 

import numpy as np
import pandas as pd

import nilearn
import sklearn
from nilearn import image, plotting
from nilearn.glm.first_level import FirstLevelModel, make_first_level_design_matrix

# Report the installed nilearn version and confirm that all imports above succeeded.
print(f"Nilearn: {nilearn.__version__}")
print("OK, todo carg√≥, imports fMRI listos")

In [None]:
# SET DATASET ROOT DIRECTORY

base_path = "/Users/antoniaolgui/Library/Mobile Documents/com~apple~CloudDocs/Desktop/simon_ml/ds101_R2.0.0/"

# Show the folder contents (should include sub-01, sub-02, ...).
# A bare expression in the middle of a cell is evaluated and thrown away, so
# the listing has to be printed explicitly to be visible.
print(sorted(os.listdir(base_path)))

# Sorted list of all subject folders in the dataset. Only directories are
# kept, so a stray file whose name starts with "sub-" is not taken for a subject.
subjects = sorted(
    entry
    for entry in os.listdir(base_path)
    if entry.startswith("sub-") and os.path.isdir(os.path.join(base_path, entry))
)
subjects

In [None]:
# COLLECT THE DISTINCT trial_type VALUES FOUND IN ALL EVENT FILES
# (shows how congruency and correctness are encoded in the dataset)

pattern = os.path.join(base_path, "sub-*", "func", "*_events.tsv")
files = sorted(glob.glob(pattern))

# Union of the unique trial_type values of every events file.
trial_types = set().union(
    *(pd.read_csv(events_path, sep="\t")["trial_type"].unique() for events_path in files)
)

trial_types

In [None]:
# BUILD MASTER BEHAVIORAL DATAFRAME
#    - Reads every *_events.tsv file of every subject
#    - Splits trial_type into congruency (congruent / incongruent)
#      and accuracy (correct / incorrect)
#    - Adds a binary ML label (0 = congruent, 1 = incongruent)

# Columns kept in the master table, in output order.
columns_to_keep = [
    "subject",
    "run",
    "trial",
    "trial_type",
    "congruency",
    "accuracy_label",
    "correctness",
    "StimVar",
    "behav_unlabeled",
    "Rsponse",
    "Stimulus",
    "cond",
    "label",
]

pattern = os.path.join(base_path, "sub-*", "func", "*_events.tsv")
files = sorted(glob.glob(pattern))

print("Number of event files found:", len(files))

# Fail early with an actionable message: with no files, pd.concat below would
# only report "No objects to concatenate".
if not files:
    raise FileNotFoundError(
        f"No event files match {pattern!r}; check that base_path points at the dataset root."
    )

all_rows = []

for f in files:
    df = pd.read_csv(f, sep="\t")

    # Matched paths look like <base_path>/<subject>/func/<events file>.
    # os.path is used instead of splitting on "/" so that the path separator
    # is not hard-coded.
    func_dir = os.path.dirname(f)
    df["subject"] = os.path.basename(os.path.dirname(func_dir))
    df["run"] = os.path.basename(f)  # the events filename identifies the run

    # Running trial number within each run (1-based)
    df["trial"] = range(1, len(df) + 1)

    # trial_type is expected to look like "congruent_correct" or
    # "incongruent_incorrect": the part before "_" is the congruency and the
    # part after it the accuracy.
    trial_type_parts = df["trial_type"].str.split("_")
    df["congruency"] = trial_type_parts.str[0]
    df["accuracy_label"] = trial_type_parts.str[1]

    # Binary label for ML; any other congruency value maps to NaN
    df["label"] = df["congruency"].map({"congruent": 0, "incongruent": 1})

    # Keep only the relevant columns
    all_rows.append(df[columns_to_keep])

behaviour = pd.concat(all_rows, ignore_index=True)

behaviour.head(20)

In [None]:
# WRITE THE CLEAN TABLE TO CSV, THEN PRINT BASIC DESCRIPTIVE STATISTICS
behaviour.to_csv(
    "/Users/antoniaolgui/Desktop/simon_ml/simon_behaviour_clean.csv",
    index=False,
)

print("DataFrame shape:", behaviour.shape)

# Number of rows (trials) per level of each grouping column.
for heading, column in (
    ("\nTrials per congruency:", "congruency"),
    ("\nTrials per subject:", "subject"),
):
    print(heading)
    print(behaviour[column].value_counts())

### Summary tables

In [None]:
# Show every trial row that belongs to subject sub-01

behaviour.loc[behaviour["subject"].eq("sub-01")]

In [None]:
# Per-subject summary: number of trials and how they split by congruency


def _congruency_counts(levels):
    """Return a {congruency value: number of trials} dict for one subject."""
    return levels.value_counts().to_dict()


behaviour.groupby("subject").agg(
    {"trial": "count", "congruency": _congruency_counts}
)

In [None]:
# Tidy master table with ALL subjects, ordered by subject, then run, then trial
sort_keys = ["subject", "run", "trial"]
behaviour.sort_values(by=sort_keys)

# Machine Learning Pipeline - scikit-learn

In [None]:
## SKLEARN WITH AN 80/20 SPLIT
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np

# Predictors and target (same as before)
# NOTE(review): some of these columns (e.g. "cond", "StimVar") may encode the
# congruency condition itself — confirm that they do not leak the label.
feature_columns = ["correctness", "Rsponse", "Stimulus", "cond", "StimVar", "accuracy_label"]
X = behaviour.loc[:, feature_columns]
y = behaviour.loc[:, "label"]  # 0 = congruent, 1 = incongruent

# Grouping variable for the subject-wise splits: one subject id per row
groups = behaviour.loc[:, "subject"]

In [None]:
import numpy as np

# Every feature is treated as categorical here, so all columns are one-hot encoded
categorical_features = list(X.columns)

one_hot = OneHotEncoder(handle_unknown="ignore")
preprocess = ColumnTransformer(
    transformers=[("cat", one_hot, categorical_features)]
)

# Encoding followed by logistic regression, fitted as a single estimator
clf = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("model", LogisticRegression(max_iter=1000)),
    ]
)

# ==== GroupShuffleSplit: train/test partitions are drawn by subject ====
gss = GroupShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

accuracies = []

for split_id, (train_idx, test_idx) in enumerate(gss.split(X, y, groups), start=1):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # Refit on this split's training rows and score on its held-out rows
    y_pred = clf.fit(X_train, y_train).predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    accuracies.append(acc)

    print(f"Split {split_id} - Accuracy: {acc:.3f}")

print("\nMean accuracy across splits:", np.mean(accuracies))

In [None]:
## SKLEARN WITH JUST THE 21 SUBJECTS, BASICALLY PERFECT
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Work on a copy so the master table is left untouched
df_ml = behaviour.copy()

# Predictors and target
feature_columns = ["correctness", "Rsponse", "Stimulus", "cond", "StimVar", "accuracy_label"]
X = df_ml.loc[:, feature_columns]
y = df_ml.loc[:, "label"]   # 0 = congruent, 1 = incongruent

# Every predictor is one-hot encoded as a categorical variable
categorical_features = list(X.columns)

one_hot = OneHotEncoder(handle_unknown="ignore")
preprocess = ColumnTransformer(
    transformers=[("cat", one_hot, categorical_features)]
)

# Encoding + logistic regression chained into one estimator
clf = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("model", LogisticRegression(max_iter=1000)),
    ]
)

# Stratified 80/20 split over rows. No grouping is passed, so trials of the
# same subject can end up in both the training and the test set.
split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split

# Train, then predict the held-out rows
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Machine Learning Pipeline - Random Forest

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Train/test split (should already exist from the previous cell)
# X_train, X_test, y_train, y_test = ...

rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    random_state=42,
    class_weight="balanced"
)

# X_train is the raw feature table, which contains string-valued categorical
# columns (e.g. accuracy_label). RandomForestClassifier needs numeric input
# and would fail trying to convert those strings to float, so the features
# are one-hot encoded first, as the logistic-regression pipeline does.
rf_clf = Pipeline(steps=[
    ("preprocess", ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), X_train.columns.tolist()),
        ]
    )),
    ("model", rf),
])

rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)

print("üîµ RANDOM FOREST RESULTS")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))

# Visualization of brain images in fMRI

In [None]:
from nilearn import image, plotting
# IPython magic (only valid inside IPython/Jupyter): select matplotlib's Qt
# backend. Presumably so figures open in a separate interactive window —
# requires a Qt binding to be installed; TODO confirm.
%matplotlib qt

In [None]:
# Locate the sub-01 run-1 BOLD series. This cell originally asked for
# "_bold.nii" while the GLM cell loads "_bold.nii.gz", so either form is
# accepted here; an uncompressed file sorts first and is preferred when both exist.
bold_matches = sorted(
    glob.glob(
        os.path.join(base_path, "sub-01", "func", "sub-01_task-simon_run-1_bold.nii*")
    )
)
if not bold_matches:
    raise FileNotFoundError(
        "No sub-01 run-1 BOLD file (.nii or .nii.gz) found under " + base_path
    )

fmri_file = bold_matches[0]
img = image.load_img(fmri_file)

In [None]:
# Array dimensions of the loaded image; the GLM cell uses the last entry of a
# BOLD image's shape as its number of scans.
img.shape

In [None]:
# Locate the sub-01 run-1 BOLD series. This cell originally asked for
# "_bold.nii" while the GLM cell loads "_bold.nii.gz", so either form is
# accepted here; an uncompressed file sorts first and is preferred when both exist.
bold_matches = sorted(
    glob.glob(
        os.path.join(base_path, "sub-01", "func", "sub-01_task-simon_run-1_bold.nii*")
    )
)
if not bold_matches:
    raise FileNotFoundError(
        "No sub-01 run-1 BOLD file (.nii or .nii.gz) found under " + base_path
    )

fmri_file = bold_matches[0]
img = image.load_img(fmri_file)

# 1) Average over time -> a single 3D image
mean_img = image.mean_img(img)

# 2) Orthogonal-view plot of the mean image
plotting.plot_epi(mean_img, display_mode="ortho",
                  title="sub-01 Run 1 - Mean BOLD")
plotting.show()

In [None]:
# Interactive viewer for the mean BOLD image computed above; threshold=None
# presumably shows the image without thresholding — TODO confirm against the
# nilearn view_img documentation.
plotting.view_img(mean_img, threshold=None)

## GLM FIRST LEVEL : Congruent vs Incongruent

In [None]:
import os
import numpy as np
import pandas as pd
from nilearn import image, plotting
from nilearn.glm.first_level import FirstLevelModel, make_first_level_design_matrix

# CONFIGURATION
# Dataset root: the same folder that is set near the top of the notebook.
base_path = "/Users/antoniaolgui/Library/Mobile Documents/com~apple~CloudDocs/Desktop/simon_ml/ds101_R2.0.0/"
# Repetition time: spaces the frame_times below and is passed to FirstLevelModel
# as t_r (nilearn expects seconds — TODO confirm 2.0 against the acquisition metadata).
TR = 2.0
# Run numbers substituted into the "run-{run}" part of the BOLD and events filenames.
runs = [1, 2]

# Folder where the per-subject z-maps are saved (created if it does not exist)
output_dir = "first_level_results"
os.makedirs(output_dir, exist_ok=True)

def map_congruency(tt):
    """Collapse a raw trial_type value to "congruent" or "incongruent".

    The value is converted to a lower-cased string and searched for the
    condition names. Returns None when neither name occurs in it (this
    includes missing values such as NaN or None).
    """
    lowered = str(tt).lower()
    # "incongruent" contains "congruent", so the longer name is tested first.
    for condition in ("incongruent", "congruent"):
        if condition in lowered:
            return condition
    return None

# Subject folder names sub-01 ... sub-21
subjects = ["sub-%02d" % number for number in range(1, 22)]

for subject in subjects:
    print("\n==============================")
    print("Procesando:", subject)
    print("==============================")

    func_dir = os.path.join(base_path, subject, "func")

    # One BOLD image and one design matrix per run, kept in the same order
    fmri_imgs, design_matrices = [], []

    for run in runs:
        run_stem = f"{subject}_task-simon_run-{run}"

        # Load this run's fMRI data
        img = image.load_img(os.path.join(func_dir, run_stem + "_bold.nii.gz"))
        fmri_imgs.append(img)

        # One frame time per scan, spaced TR apart and starting at 0
        frame_times = TR * np.arange(img.shape[-1])

        # Load the events, recode trial_type to congruent / incongruent and
        # drop every row that is neither
        events = pd.read_csv(os.path.join(func_dir, run_stem + "_events.tsv"), sep="\t")
        events["trial_type"] = events["trial_type"].apply(map_congruency)
        events = events.dropna(subset=["trial_type"])

        # Design matrix: Glover HRF, cosine drift terms, high_pass=0.01
        design_matrices.append(
            make_first_level_design_matrix(
                frame_times=frame_times,
                events=events,
                hrf_model="glover",
                drift_model="cosine",
                high_pass=0.01,
            )
        )

    # Fit the first-level GLM on all runs of this subject
    glm = FirstLevelModel(t_r=TR, smoothing_fwhm=5.0, minimize_memory=True)
    glm.fit(fmri_imgs, design_matrices=design_matrices)

    # Contrast between the incongruent and congruent conditions, as a z-score map
    z_map = glm.compute_contrast("incongruent - congruent", output_type="z_score")

    # Save the z-map
    out_path = os.path.join(output_dir, subject + "_zmap_incongruent_vs_congruent.nii.gz")
    z_map.to_filename(out_path)
    print(f"Mapa guardado en: {out_path}")

In [None]:
# CENTRAL SLICE OF ONE SUBJECT, FOR VISUALISATION
# The Z map is the difference between the two conditions (incongruent minus congruent); a separate map per condition could also be computed for each subject

import numpy as np
import matplotlib.pyplot as plt
from nilearn import image

# Load the saved z-map of one subject
z_map = image.load_img("first_level_results/sub-01_zmap_incongruent_vs_congruent.nii.gz")

data = z_map.get_fdata()   # voxel values as an array (expected to be 3D)

# Pick the middle index along the third array axis
# (an axial cut if the image is stored in the usual x, y, z order — TODO confirm)
slice_idx = data.shape[2] // 2

# imshow draws array axis 0 vertically, so transpose to put the first voxel
# axis on the horizontal axis of the figure.
slice_2d = data[:, :, slice_idx].T

# Symmetric colour limits keep z = 0 on the white midpoint of the diverging
# "bwr" colormap; otherwise white lands wherever the data midpoint happens to be.
z_limit = float(np.nanmax(np.abs(slice_2d)))
if not np.isfinite(z_limit) or z_limit == 0:
    z_limit = 1.0  # flat or empty slice: fall back to a valid colour range

plt.figure(figsize=(6, 6))
plt.imshow(slice_2d, cmap='bwr', origin='lower', vmin=-z_limit, vmax=z_limit)
plt.colorbar(label="Z value")
plt.title("Mapa Z (slice central)")
plt.axis("off")
plt.show()