# __Cargar datos__

In [17]:
import joblib
# Sites to load, in order. The position of a site in this list later becomes
# its integer site code. "single_phenotype_file" is carried along as metadata;
# nothing in the visible notebook reads it.
dir_list = [
    {"dir_name": site, "single_phenotype_file": single_file}
    for site, single_file in (
        ("NYU", True),
        ("NeuroIMAGE", True),
        ("KKI", True),
        ("OHSU", True),
        ("Peking", False),
    )
]

# Number of ROIs; selects which "<n_rois>-rois-dataset" folder is read.
n_rois = 116

# Per-site raw data, and the site-name -> integer-code mapping.
all_data, site2idx = {}, {}

# Parallel lists, one entry per subject across all sites:
# time series, label, integer site code and subject ID.
train_ts_list, train_labels, train_sites, train_ids = [], [], [], []

import os

# Show the working directory: every data path below is relative to it.
# The call parentheses matter; `os.getcwd` alone prints the function object.
print(os.getcwd())

for idx, dataset in enumerate(dir_list):
    dir_name = dataset["dir_name"]
    site2idx[dir_name] = idx  # integer site code = position in dir_list

    # NOTE(review): joblib.load unpickles the file, which can execute
    # arbitrary code. Only load .pkl files from a trusted source.
    data = joblib.load(f"../../data/raw-bold-data/{n_rois}-rois-dataset/{dir_name}.pkl")
    ids, ts_list, labels = data["id"], data["data"], data["labels"]

    # The accumulators below are parallel lists, so a length mismatch here
    # would silently misalign subjects, labels and sites downstream.
    if not (len(ids) == len(ts_list) == len(labels)):
        raise ValueError(
            f"{dir_name}: mismatched lengths "
            f"(ids={len(ids)}, data={len(ts_list)}, labels={len(labels)})"
        )

    all_data[dir_name] = {"data": ts_list, "labels": labels, 'ids': ids}

    train_ts_list.extend(ts_list)
    train_labels.extend(labels)
    train_ids.extend(ids)
    train_sites.extend([idx] * len(ts_list))  # one site code per sample

    print("Dataset: {}, Count: {}".format(dir_name, len(ts_list)))
    print(10 * "*")

# Reverse lookup: integer site code -> site name.
idx2site = {v: k for k, v in site2idx.items()}

# Summary statistics. sum(train_labels) counts positives only if the labels
# are 0/1 -- NOTE(review): confirm the label encoding in the .pkl files.
print("\nTotal de muestras:", len(train_ts_list))
print("Total con TDHA:", sum(train_labels))
print("Número de sitios:", len(site2idx))



<built-in function getcwd>
Dataset: NYU, Count: 177
**********
Dataset: NeuroIMAGE, Count: 39
**********
Dataset: KKI, Count: 78
**********
Dataset: OHSU, Count: 66
**********
Dataset: Peking, Count: 183
**********

Total de muestras: 543
Total con TDHA: 229
Número de sitios: 5


# __DATA HARMONIZATION (ComBat)__

## __Armonización por Media de ROIs__

In [18]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from neuroHarmonize import harmonizationLearn

# Step 1: temporal mean per ROI for every subject, giving one row per subject.
# axis=1 averages over the second axis of each series, so this relies on each
# entry of train_ts_list being laid out as (n_rois, n_timepoints).
bold_matrix = np.array([ts.mean(axis=1) for ts in train_ts_list])
assert bold_matrix.shape == (len(train_ts_list), n_rois), (
    f"Expected a ({len(train_ts_list)}, {n_rois}) matrix, got {bold_matrix.shape}"
)

# Step 2: z-score every ROI column. The scaler is fitted on all subjects from
# all sites pooled together.
scaler = StandardScaler()
bold_matrix_scaled = scaler.fit_transform(bold_matrix)

# Step 3: covariates table for the harmonization; the integer site code is the
# only covariate supplied.
site_array = np.array(train_sites)
covars = pd.DataFrame({"SITE": site_array})

# Step 4: fit the harmonization model and get the site-adjusted matrix.
model, bold_matrix_harmonized = harmonizationLearn(bold_matrix_scaled, covars)

# Step 5: wrap the result in a DataFrame indexed by subject ID. Nothing is
# written to disk here. train_ids is built together with train_ts_list, so it
# is always defined at this point and needs no locals() guard.
df_harmonized = pd.DataFrame(
    bold_matrix_harmonized,
    index=train_ids,
    columns=[f"ROI_{i+1}" for i in range(bold_matrix.shape[1])]
)

df_harmonized.head()

Unnamed: 0,ROI_1,ROI_2,ROI_3,ROI_4,ROI_5,ROI_6,ROI_7,ROI_8,ROI_9,ROI_10,...,ROI_107,ROI_108,ROI_109,ROI_110,ROI_111,ROI_112,ROI_113,ROI_114,ROI_115,ROI_116
NYU-4060823,0.5203,-1.344624,0.42232,0.208077,-0.417965,-1.203824,-0.280039,-0.420606,0.244961,0.226322,...,-0.900505,-0.30734,-0.388064,-1.433988,0.501403,-0.689606,-0.995881,-0.834857,0.011662,-0.119374
NYU-9578663,-1.389437,-0.248102,0.183519,-0.143829,0.426404,-0.09551,-0.929605,-0.076769,-1.252695,-0.067519,...,-0.625995,-0.497612,0.960951,0.723905,-0.602373,-0.542031,1.728611,-1.128723,-0.140776,-0.404692
NYU-3518345,-1.866871,-0.369938,1.258124,2.671423,3.924503,1.160579,-0.063517,-0.420606,-0.503867,0.716059,...,-1.933956,-0.916209,-0.645019,-0.71469,1.605179,0.638575,0.052,-0.614458,2.831772,0.10254
NYU-3650634,0.759017,-0.613609,-1.249286,0.208077,-0.267185,-0.021622,-0.496561,-0.162728,0.03101,-0.067519,...,-0.060826,0.149311,0.511279,0.004607,1.053291,-0.320667,0.052,-0.320592,0.926292,0.609773
NYU-3243657,-0.016814,0.117405,1.019323,0.677286,0.9089,-2.386025,-0.063517,-0.420606,0.03101,-1.634676,...,-0.125417,-0.611775,-2.379466,0.364256,2.157067,2.999785,0.610871,1.809936,-1.970037,0.10254


In [19]:
import pandas as pd

import os
# Show the working directory so the relative path below can be checked.
print(os.getcwd())

# ROI lookup table; the first CSV column is used as the index.
roi_names_csv = f"../../data-extraction/output_rois/{n_rois}/mis{n_rois}_rois_names.csv"
labels = pd.read_csv(roi_names_csv, index_col=0)

print(labels.shape)
labels.head()

/Users/juan/Desktop/fmri/TDHA-fMRI/data/armonized-bold-data
(116, 1)


Unnamed: 0_level_0,name
id,Unnamed: 1_level_1
1,Precentral_L
2,Precentral_R
3,Frontal_Sup_L
4,Frontal_Sup_R
5,Frontal_Sup_Orb_L


In [20]:
import pandas as pd
import numpy as np

# Pull the region names out of the ROI lookup table.
roi_names = labels["name"].values

# There must be exactly one name per column before relabelling.
n_feature_cols = df_harmonized.shape[1]
assert len(roi_names) == n_feature_cols, " Número de ROIs no coincide"

# Replace the existing column headers with the region names (in place).
df_harmonized.columns = roi_names

# Quick look at the relabelled table.
print(df_harmonized.shape)
df_harmonized.head()


(543, 116)


Unnamed: 0,Precentral_L,Precentral_R,Frontal_Sup_L,Frontal_Sup_R,Frontal_Sup_Orb_L,Frontal_Sup_Orb_R,Frontal_Mid_L,Frontal_Mid_R,Frontal_Mid_Orb_L,Frontal_Mid_Orb_R,...,Cerebelum_10_L,Cerebelum_10_R,Vermis_1_2,Vermis_3,Vermis_4_5,Vermis_6,Vermis_7,Vermis_8,Vermis_9,Vermis_10
NYU-4060823,0.5203,-1.344624,0.42232,0.208077,-0.417965,-1.203824,-0.280039,-0.420606,0.244961,0.226322,...,-0.900505,-0.30734,-0.388064,-1.433988,0.501403,-0.689606,-0.995881,-0.834857,0.011662,-0.119374
NYU-9578663,-1.389437,-0.248102,0.183519,-0.143829,0.426404,-0.09551,-0.929605,-0.076769,-1.252695,-0.067519,...,-0.625995,-0.497612,0.960951,0.723905,-0.602373,-0.542031,1.728611,-1.128723,-0.140776,-0.404692
NYU-3518345,-1.866871,-0.369938,1.258124,2.671423,3.924503,1.160579,-0.063517,-0.420606,-0.503867,0.716059,...,-1.933956,-0.916209,-0.645019,-0.71469,1.605179,0.638575,0.052,-0.614458,2.831772,0.10254
NYU-3650634,0.759017,-0.613609,-1.249286,0.208077,-0.267185,-0.021622,-0.496561,-0.162728,0.03101,-0.067519,...,-0.060826,0.149311,0.511279,0.004607,1.053291,-0.320667,0.052,-0.320592,0.926292,0.609773
NYU-3243657,-0.016814,0.117405,1.019323,0.677286,0.9089,-2.386025,-0.063517,-0.420606,0.03101,-1.634676,...,-0.125417,-0.611775,-2.379466,0.364256,2.157067,2.999785,0.610871,1.809936,-1.970037,0.10254


In [21]:
# Translate the numeric labels into readable strings: a label equal to 1
# becomes "ADHD", every other value becomes "CONTROL".
diagnosis = list(
    map(lambda label: "ADHD" if label == 1 else "CONTROL", train_labels)
)

# Put the diagnosis in the first column. This raises if the column already
# exists, e.g. when the cell is run twice.
df_harmonized.insert(loc=0, column="diagnosis", value=diagnosis)

In [22]:
# Sanity check: table dimensions after adding the diagnosis column.
print(df_harmonized.shape)

# Display only the subjects labelled as ADHD.
is_adhd = df_harmonized["diagnosis"] == "ADHD"
df_harmonized.loc[is_adhd]

(543, 117)


Unnamed: 0,diagnosis,Precentral_L,Precentral_R,Frontal_Sup_L,Frontal_Sup_R,Frontal_Sup_Orb_L,Frontal_Sup_Orb_R,Frontal_Mid_L,Frontal_Mid_R,Frontal_Mid_Orb_L,...,Cerebelum_10_L,Cerebelum_10_R,Vermis_1_2,Vermis_3,Vermis_4_5,Vermis_6,Vermis_7,Vermis_8,Vermis_9,Vermis_10
NYU-4060823,ADHD,0.520300,-1.344624,0.422320,0.208077,-0.417965,-1.203824,-0.280039,-0.420606,0.244961,...,-0.900505,-0.307340,-0.388064,-1.433988,0.501403,-0.689606,-0.995881,-0.834857,0.011662,-0.119374
NYU-3174224,ADHD,0.520300,1.579434,-1.607488,0.677286,0.547028,-1.203824,-0.929605,1.298580,0.458911,...,-0.448370,0.605963,-0.002631,-0.355042,1.329235,-0.542031,0.331436,1.222204,-1.360283,-1.038733
NYU-3653737,ADHD,2.907470,0.482912,0.422320,-1.668758,0.547028,-0.612723,-0.604822,-0.764443,-0.824793,...,0.585081,0.605963,-2.957616,0.004607,-0.498894,1.228877,0.052000,0.193674,0.468977,-0.214480
NYU-5971050,ADHD,-0.912003,-1.100952,0.899922,0.208077,0.185156,-2.386025,2.967789,1.642417,1.742616,...,0.585081,-1.220643,1.153667,1.083554,-0.878317,-0.394455,-0.297293,-0.247125,-0.445653,-0.404692
NYU-3679455,ADHD,0.281583,-0.126266,-0.055282,-0.964945,0.788276,-0.280229,-2.228736,-0.506565,-2.322449,...,0.585081,-0.916209,-0.516541,-0.984427,1.605179,-0.542031,0.610871,0.046741,0.468977,-0.151076
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Peking-2601519,ADHD,-0.100866,-0.883361,0.013072,-2.069149,0.831059,-0.768009,1.864773,-0.681777,-1.268851,...,-1.115694,-0.054558,-0.295188,-3.229213,0.893478,0.452046,-1.404924,1.480868,-0.045220,-0.646201
Peking-9002207,ADHD,-0.100866,-0.018097,2.638342,-0.727875,0.831059,-0.054311,-0.891766,-0.025889,1.159286,...,-0.681398,1.447613,0.541104,0.785982,0.083914,0.248835,0.608149,-3.045748,0.199379,0.760545
Peking-2529026,ADHD,-2.201410,0.847166,-0.941572,0.076890,0.008003,2.086784,-0.891766,-0.681777,-0.206541,...,-0.594538,1.147179,-1.131480,-0.820096,-2.344776,-0.360798,2.046058,1.636958,0.443977,-0.411744
Peking-3827352,ADHD,-0.520975,-0.450729,0.132402,0.076890,-0.403525,-0.054311,0.716215,-1.665609,-0.661816,...,2.792977,-0.354993,0.541104,-1.623135,-0.725649,0.452046,-0.542179,-1.484846,-0.534416,-0.997888


In [23]:
# Decode every sample's integer site code back into its site name
# (a KeyError is raised for a code missing from idx2site).
site_names = list(map(idx2site.__getitem__, train_sites))

# Put the site name in the first column.
df_harmonized.insert(loc=0, column="site_name", value=site_names)

In [24]:
# Preview the first five rows with the new leading columns.
df_harmonized.head(n=5)

Unnamed: 0,site_name,diagnosis,Precentral_L,Precentral_R,Frontal_Sup_L,Frontal_Sup_R,Frontal_Sup_Orb_L,Frontal_Sup_Orb_R,Frontal_Mid_L,Frontal_Mid_R,...,Cerebelum_10_L,Cerebelum_10_R,Vermis_1_2,Vermis_3,Vermis_4_5,Vermis_6,Vermis_7,Vermis_8,Vermis_9,Vermis_10
NYU-4060823,NYU,ADHD,0.5203,-1.344624,0.42232,0.208077,-0.417965,-1.203824,-0.280039,-0.420606,...,-0.900505,-0.30734,-0.388064,-1.433988,0.501403,-0.689606,-0.995881,-0.834857,0.011662,-0.119374
NYU-9578663,NYU,CONTROL,-1.389437,-0.248102,0.183519,-0.143829,0.426404,-0.09551,-0.929605,-0.076769,...,-0.625995,-0.497612,0.960951,0.723905,-0.602373,-0.542031,1.728611,-1.128723,-0.140776,-0.404692
NYU-3518345,NYU,CONTROL,-1.866871,-0.369938,1.258124,2.671423,3.924503,1.160579,-0.063517,-0.420606,...,-1.933956,-0.916209,-0.645019,-0.71469,1.605179,0.638575,0.052,-0.614458,2.831772,0.10254
NYU-3650634,NYU,CONTROL,0.759017,-0.613609,-1.249286,0.208077,-0.267185,-0.021622,-0.496561,-0.162728,...,-0.060826,0.149311,0.511279,0.004607,1.053291,-0.320667,0.052,-0.320592,0.926292,0.609773
NYU-3243657,NYU,CONTROL,-0.016814,0.117405,1.019323,0.677286,0.9089,-2.386025,-0.063517,-0.420606,...,-0.125417,-0.611775,-2.379466,0.364256,2.157067,2.999785,0.610871,1.809936,-1.970037,0.10254


In [25]:
# Number of subjects contributed by each site, largest first.
site_column = df_harmonized["site_name"]
site_column.value_counts()

site_name
Peking        183
NYU           177
KKI            78
OHSU           66
NeuroIMAGE     39
Name: count, dtype: int64

In [26]:
# One ID per row is required before the IDs can be attached as a column.
n_rows = len(df_harmonized)
assert len(train_ids) == n_rows, "❌ Número de IDs no coincide con las filas del DataFrame"

# Put the subject ID in the first column so it can later be used as a
# merge key.
df_harmonized.insert(loc=0, column="subject_id", value=train_ids)

# Preview the result.
df_harmonized.head()

Unnamed: 0,subject_id,site_name,diagnosis,Precentral_L,Precentral_R,Frontal_Sup_L,Frontal_Sup_R,Frontal_Sup_Orb_L,Frontal_Sup_Orb_R,Frontal_Mid_L,...,Cerebelum_10_L,Cerebelum_10_R,Vermis_1_2,Vermis_3,Vermis_4_5,Vermis_6,Vermis_7,Vermis_8,Vermis_9,Vermis_10
NYU-4060823,NYU-4060823,NYU,ADHD,0.5203,-1.344624,0.42232,0.208077,-0.417965,-1.203824,-0.280039,...,-0.900505,-0.30734,-0.388064,-1.433988,0.501403,-0.689606,-0.995881,-0.834857,0.011662,-0.119374
NYU-9578663,NYU-9578663,NYU,CONTROL,-1.389437,-0.248102,0.183519,-0.143829,0.426404,-0.09551,-0.929605,...,-0.625995,-0.497612,0.960951,0.723905,-0.602373,-0.542031,1.728611,-1.128723,-0.140776,-0.404692
NYU-3518345,NYU-3518345,NYU,CONTROL,-1.866871,-0.369938,1.258124,2.671423,3.924503,1.160579,-0.063517,...,-1.933956,-0.916209,-0.645019,-0.71469,1.605179,0.638575,0.052,-0.614458,2.831772,0.10254
NYU-3650634,NYU-3650634,NYU,CONTROL,0.759017,-0.613609,-1.249286,0.208077,-0.267185,-0.021622,-0.496561,...,-0.060826,0.149311,0.511279,0.004607,1.053291,-0.320667,0.052,-0.320592,0.926292,0.609773
NYU-3243657,NYU-3243657,NYU,CONTROL,-0.016814,0.117405,1.019323,0.677286,0.9089,-2.386025,-0.063517,...,-0.125417,-0.611775,-2.379466,0.364256,2.157067,2.999785,0.610871,1.809936,-1.970037,0.10254


In [27]:
# Load the combined phenotypic table (path is relative to the working directory).
phenotypic_csv = "../../data/phenotypic/all_phenotypic_single_df/all_phenotypic_df.csv"
df_phenotypic = pd.read_csv(phenotypic_csv)
df_phenotypic.head()

Unnamed: 0,ScanDir ID,Site,Gender,Age,Handedness,DX,Secondary Dx,ADHD Measure,ADHD Index,Inattentive,...,Full4 IQ,Med Status,QC_Rest_1,QC_Rest_2,QC_Rest_3,QC_Rest_4,QC_Anatomical_1,QC_Anatomical_2,dataset,id
0,1084283,6,1.0,11.0,1.0,1,,3.0,,69.0,...,110.0,2.0,0.0,1.0,1.0,,1.0,,OHSU,OHSU-1084283
1,1084884,6,0.0,9.416667,1.0,0,enuresis,3.0,,48.0,...,106.0,1.0,1.0,1.0,1.0,,1.0,,OHSU,OHSU-1084884
2,1108916,6,1.0,8.5,1.0,1,ODD,3.0,,70.0,...,116.0,1.0,1.0,1.0,1.0,,1.0,,OHSU,OHSU-1108916
3,1206380,6,1.0,9.166667,1.0,3,,3.0,,63.0,...,90.0,1.0,1.0,1.0,1.0,,1.0,,OHSU,OHSU-1206380
4,1340333,6,0.0,7.416667,1.0,1,,3.0,,80.0,...,98.0,2.0,1.0,1.0,1.0,,1.0,,OHSU,OHSU-1340333


In [28]:
# Cast both join keys to str so the merge compares like with like.
df_harmonized["subject_id"] = df_harmonized["subject_id"].astype(str)
df_phenotypic["id"] = df_phenotypic["id"].astype(str)

# Left join: keep every row of df_harmonized and pull in Gender and Age from
# the phenotypic table wherever subject_id matches id.
merged_df = df_harmonized.merge(
    df_phenotypic[["id", "Gender", "Age"]],
    left_on="subject_id",
    right_on="id",
    how="left",
)

# A left join adds rows when a subject_id matches several phenotypic rows;
# fail loudly rather than silently duplicating subjects.
assert len(merged_df) == len(df_harmonized), (
    f"Merge changed the row count from {len(df_harmonized)} to {len(merged_df)}; "
    "check df_phenotypic['id'] for duplicates"
)

# The right-hand key duplicates subject_id, so drop it.
merged_df = merged_df.drop(columns=["id"])

# Move Gender and Age to positions 3 and 4, directly after the three leading
# columns (subject_id, site_name, diagnosis when the notebook is run in order).
cols = [col for col in merged_df.columns if col not in ("Gender", "Age")]
cols[3:3] = ["Gender", "Age"]

# .copy() makes df_mean_bold an independent frame, so assigning to its columns
# later does not trigger SettingWithCopyWarning.
df_mean_bold = merged_df[cols].copy()

# Preview the result.
df_mean_bold.head()



Unnamed: 0,subject_id,site_name,diagnosis,Gender,Age,Precentral_L,Precentral_R,Frontal_Sup_L,Frontal_Sup_R,Frontal_Sup_Orb_L,...,Cerebelum_10_L,Cerebelum_10_R,Vermis_1_2,Vermis_3,Vermis_4_5,Vermis_6,Vermis_7,Vermis_8,Vermis_9,Vermis_10
0,NYU-4060823,NYU,ADHD,1.0,11.8,0.5203,-1.344624,0.42232,0.208077,-0.417965,...,-0.900505,-0.30734,-0.388064,-1.433988,0.501403,-0.689606,-0.995881,-0.834857,0.011662,-0.119374
1,NYU-9578663,NYU,CONTROL,0.0,8.26,-1.389437,-0.248102,0.183519,-0.143829,0.426404,...,-0.625995,-0.497612,0.960951,0.723905,-0.602373,-0.542031,1.728611,-1.128723,-0.140776,-0.404692
2,NYU-3518345,NYU,CONTROL,0.0,7.67,-1.866871,-0.369938,1.258124,2.671423,3.924503,...,-1.933956,-0.916209,-0.645019,-0.71469,1.605179,0.638575,0.052,-0.614458,2.831772,0.10254
3,NYU-3650634,NYU,CONTROL,0.0,13.25,0.759017,-0.613609,-1.249286,0.208077,-0.267185,...,-0.060826,0.149311,0.511279,0.004607,1.053291,-0.320667,0.052,-0.320592,0.926292,0.609773
4,NYU-3243657,NYU,CONTROL,1.0,8.56,-0.016814,0.117405,1.019323,0.677286,0.9089,...,-0.125417,-0.611775,-2.379466,0.364256,2.157067,2.999785,0.610871,1.809936,-1.970037,0.10254


In [29]:
# Recode Gender from numeric codes to letters: 1 -> "M", 0 -> "F".
# Values outside the mapping (including missing ones) become NaN.
gender_codes = {1: "M", 0: "F"}

# Only recode while the column still holds the numeric codes. Mapping a column
# that already contains "M"/"F" would turn every value into NaN, so this guard
# makes the cell safe to run more than once.
already_recoded = (
    df_mean_bold["Gender"].dropna().isin(list(gender_codes.values())).all()
)
if not already_recoded:
    # assign() builds a new frame instead of writing into df_mean_bold, which
    # may be a slice of another DataFrame.
    df_mean_bold = df_mean_bold.assign(Gender=df_mean_bold["Gender"].map(gender_codes))

# Quick check of the resulting distribution.
df_mean_bold["Gender"].value_counts()

Gender
M    348
F    195
Name: count, dtype: int64

In [30]:
# Preview the first five rows after recoding Gender.
df_mean_bold.head(n=5)

Unnamed: 0,subject_id,site_name,diagnosis,Gender,Age,Precentral_L,Precentral_R,Frontal_Sup_L,Frontal_Sup_R,Frontal_Sup_Orb_L,...,Cerebelum_10_L,Cerebelum_10_R,Vermis_1_2,Vermis_3,Vermis_4_5,Vermis_6,Vermis_7,Vermis_8,Vermis_9,Vermis_10
0,NYU-4060823,NYU,ADHD,M,11.8,0.5203,-1.344624,0.42232,0.208077,-0.417965,...,-0.900505,-0.30734,-0.388064,-1.433988,0.501403,-0.689606,-0.995881,-0.834857,0.011662,-0.119374
1,NYU-9578663,NYU,CONTROL,F,8.26,-1.389437,-0.248102,0.183519,-0.143829,0.426404,...,-0.625995,-0.497612,0.960951,0.723905,-0.602373,-0.542031,1.728611,-1.128723,-0.140776,-0.404692
2,NYU-3518345,NYU,CONTROL,F,7.67,-1.866871,-0.369938,1.258124,2.671423,3.924503,...,-1.933956,-0.916209,-0.645019,-0.71469,1.605179,0.638575,0.052,-0.614458,2.831772,0.10254
3,NYU-3650634,NYU,CONTROL,F,13.25,0.759017,-0.613609,-1.249286,0.208077,-0.267185,...,-0.060826,0.149311,0.511279,0.004607,1.053291,-0.320667,0.052,-0.320592,0.926292,0.609773
4,NYU-3243657,NYU,CONTROL,M,8.56,-0.016814,0.117405,1.019323,0.677286,0.9089,...,-0.125417,-0.611775,-2.379466,0.364256,2.157067,2.999785,0.610871,1.809936,-1.970037,0.10254


In [31]:
# Final table dimensions as (rows, columns): one row per subject; the columns
# are the metadata fields followed by the ROI features.
df_mean_bold.shape

(543, 121)

In [32]:
from pathlib import Path

# Name the output folder after n_rois so it always matches the data that was
# loaded. With n_rois = 116 this is the same path as before.
output_dir = Path(f"arm-mean-bold-data-{n_rois}")

# to_csv does not create missing folders, so make sure the target exists.
output_dir.mkdir(parents=True, exist_ok=True)

# index=False: the default integer index carries no information.
df_mean_bold.to_csv(output_dir / "arm-mean_bold_matrix.csv", index=False)
print("✅ Guardado como 'arm-mean_bold_matrix.csv'")

✅ Guardado como 'arm-mean_bold_matrix.csv'
