# Neuroharmonize datasets

This notebook applies neuroHarmonize, a ComBat-GAM algorithm that allows for non-linear covariate effects, to our data in order to create neuroHarmonized datasets. It should be run in the `neurogamy` environment.

In [None]:
import os
from neuroHarmonize import harmonizationLearn
import pandas as pd
import numpy as np


In [None]:
# Folder that holds the cleaned per-cohort CSV files read by this notebook.
filepath = '../open_work/internal_results/cleaned_pvc2s/'
# Build one full path per cohort; the unpacking order matches the file-name tuple.
filename_mri, filename_top, filename_sabre, filename_insight46 = (
    os.path.join(filepath, csv_name)
    for csv_name in (
        'StrokeMRI_pvc2c.csv',
        'TOP_pvc2c.csv',
        'SABRE_pvc2_cleaned.csv',
        'Insight46_pvc2c.csv',
    )
)

In [None]:
# Load the four cohort tables from the CSV paths defined earlier in the notebook.
# NOTE(review): the earlier note here said SABRE still has to be cleaned down to
# match the other datasets (including dropping NaNs); nothing in this cell does
# that, so confirm it happens upstream or in a later step.
MRI, TOP, Insight46, SABRE = (
    pd.read_csv(csv_path)
    for csv_path in (filename_mri, filename_top, filename_insight46, filename_sabre)
)
SABRE.tail(3)

In [None]:
# Tag each cohort with a numeric site label: TOP -> 0, StrokeMRI -> 1.
for site_code, cohort in enumerate((TOP, MRI)):
    cohort['site'] = site_code
MRI.head(3)

# Here we will harmonize just TOP and StrokeMRI

In [None]:
# Stack the TOP rows on top of the StrokeMRI rows. No ignore_index is requested,
# so each frame keeps its own row labels and labels can repeat in the result.
TOPMRI = pd.concat(objs=[TOP, MRI])
TOPMRI.head(3)

In [None]:
# Encode sex numerically (F -> 0, M -> 1). Series.map turns any label that is
# missing from the mapping into NaN.
sex_mapping = {'F': 0, 'M': 1}
TOPMRI = TOPMRI.assign(sex=lambda frame: frame['sex'].map(sex_mapping))
TOPMRI.head(2)

In [None]:
# Display the column labels of the combined TOP + StrokeMRI frame.
TOPMRI.columns

In [None]:
# Columns that are passed to the harmonizer as features.
topmri_feature_names = [
    'gm_vol',
    'wm_vol',
    'csf_vol',
    'gm_ivc_ratio',
    'gmwm_ivc_ratio',
    'wmh_vol',
    'wmh_count',
    'deepwm_b_cov',
    'aca_b_cov',
    'mca_b_cov',
    'pca_b_cov',
    'totalgm_b_cov',
    'deepwm_b',
    'aca_b',
    'mca_b',
    'pca_b',
    'totalgm_b',
]
TOPMRI_features = TOPMRI[topmri_feature_names]
# Covariate table: age, sex and the site label, with 'site' renamed to 'SITE'
# (presumably the column name neuroHarmonize looks for; verify against its docs).
TOPMRI_covariates = TOPMRI[['age', 'sex', 'site']].rename(columns={'site': 'SITE'})
TOPMRI_covariates.head(3)

In [None]:
#TOPMRI_covariates.reset_index()

In [None]:
# Copy the feature table into a plain NumPy array for the harmonizer.
TOPMRI_features_array = TOPMRI_features.to_numpy(copy=True)
TOPMRI_features_array


In [None]:
# Run harmonization: the first returned value is stored in my_model and the
# adjusted data in the my_data_adj variable.
# NOTE(review): no smooth_terms argument is passed here; per the neuroHarmonize
# docs, non-linear (GAM) covariate effects are only fitted for columns listed in
# smooth_terms (e.g. smooth_terms=['age']). Confirm linear handling is intended.
my_model, my_data_adj = harmonizationLearn(TOPMRI_features_array, TOPMRI_covariates)

In [None]:
#my_data_adj

In [None]:
# Turn the adjusted array back into a labelled table: feature columns first,
# then the covariates, then the participant IDs.
# Column labels are taken from the feature table that produced the harmonizer
# input, so they cannot drift from the selection made earlier.
# reset_index(drop=True) gives every piece the same 0..n-1 row labels, so the
# column-wise concat pairs rows by position.
neuroharmonized_topmri = pd.concat(
    [
        pd.DataFrame(my_data_adj, columns=list(TOPMRI_features.columns)),
        TOPMRI_covariates.reset_index(drop=True),
        TOPMRI['participant_id'].reset_index(drop=True),
    ],
    axis=1,
)
neuroharmonized_topmri.head(3)

In [None]:
# Split the harmonized table back into the two cohorts that are exported as
# adjusted CSVs (SITE 0 was assigned to TOP and SITE 1 to StrokeMRI earlier).
site_labels = neuroharmonized_topmri['SITE']
top_neuroharm_to_stroke = neuroharmonized_topmri.loc[site_labels == 0]
stroke_neuroharm_to_top = neuroharmonized_topmri.loc[site_labels == 1]

In [None]:
# Show the first rows of the raw TOP table (presumably as a reference for the
# familiar column order; confirm).
TOP.head(3)

In [None]:
# Reorganize both cohort tables to the familiar pattern: participant_id, age and
# sex first, followed by the remaining columns in their existing order.
# Each column is popped and re-inserted in place at its target position.
for cohort_frame in (stroke_neuroharm_to_top, top_neuroharm_to_stroke):
    for position, column_name in enumerate(("participant_id", "age", "sex")):
        cohort_frame.insert(position, column_name, cohort_frame.pop(column_name))

In [None]:
# Remove the SITE label from both cohort tables, then show the resulting columns.
top_neuroharm_to_stroke = top_neuroharm_to_stroke.drop(columns='SITE')
stroke_neuroharm_to_top = stroke_neuroharm_to_top.drop(columns='SITE')
stroke_neuroharm_to_top.columns

In [None]:
# Write each harmonized cohort to its CSV file. to_csv is called with default
# settings, so the row index is written as well.
for harmonized_frame, csv_path in (
    (top_neuroharm_to_stroke, '../open_work/internal_results/neurocombat/top_neuroharm_to_stroke.csv'),
    (stroke_neuroharm_to_top, '../open_work/internal_results/neurocombat/stroke_neuroharm_to_top.csv'),
):
    harmonized_frame.to_csv(csv_path)

# Now we join TOP and StrokeMRI into one dataset and harmonize it with each of the other datasets individually

In [None]:
#TOPMRI

In [None]:
# Treat TOP and StrokeMRI as one pooled cohort: work on a deep copy, renumber the
# rows, give every row site label 0, and discard the helper columns 'index'
# (added by reset_index) and 'Unnamed: 0'.
unified_TOPMRI = (
    TOPMRI.copy(deep=True)
    .reset_index()
    .assign(site=0)
    .drop(columns=['index', 'Unnamed: 0'])
)

In [None]:
# Prepare SABRE for pooling: drop the 'Unnamed: 0' column, encode sex with
# sex_mapping and label every row as site 2.
SABRE = SABRE.drop(columns='Unnamed: 0').assign(
    sex=lambda frame: frame['sex'].map(sex_mapping),
    site=2,
)
SABRE.head(2)

In [None]:
# Stack the pooled TOP+StrokeMRI rows on the SABRE rows and renumber the rows.
# reset_index() is called without drop, so the previous row labels are kept in a
# new 'index' column.
TOPMRIvsSABRE = pd.concat([unified_TOPMRI, SABRE]).reset_index()

In [None]:
# Covariate table for the pooled data: age, sex and the site label, with 'site'
# renamed to 'SITE'.
TOPMRIvsSABRE_covariates = (
    TOPMRIvsSABRE[['age', 'sex', 'site']]
    .rename(columns={'site': 'SITE'})
)
TOPMRIvsSABRE_covariates

In [None]:
 TOPMRIvsSABRE_features = TOPMRIvsSABRE[[ 
     'gm_vol',
     'wm_vol',
     'csf_vol',
     'gm_ivc_ratio',
     'gmwm_ivc_ratio',
     'wmh_vol',
     'wmh_count',
     'deepwm_b_cov',
     'aca_b_cov',
     'mca_b_cov',
     'pca_b_cov',
     'totalgm_b_cov',
     'deepwm_b',
     'aca_b',
     'mca_b',
     'pca_b',
     'totalgm_b',
]]
TOPMRIvsSABRE_features_array = np.array(TOPMRIvsSABRE_features)

In [None]:
# Harmonize the pooled TOP+StrokeMRI rows against SABRE; the adjusted data is
# stored in my_data_adj2.
# NOTE(review): no smooth_terms argument is passed here; per the neuroHarmonize
# docs, non-linear (GAM) covariate effects are only fitted for columns listed in
# smooth_terms (e.g. smooth_terms=['age']). Confirm linear handling is intended.
my_model2, my_data_adj2 = harmonizationLearn(TOPMRIvsSABRE_features_array, TOPMRIvsSABRE_covariates)

In [None]:
# Display the adjusted data returned by harmonizationLearn.
my_data_adj2

In [None]:
# Turn the adjusted array back into a labelled table: feature columns first,
# then the covariates, then the participant IDs.
# Column labels are taken from the feature table that produced the harmonizer
# input, so they cannot drift from the selection made earlier.
# reset_index(drop=True) gives every piece the same 0..n-1 row labels, so the
# column-wise concat pairs rows by position.
neuroharmonized_topmrivsabre = pd.concat(
    [
        pd.DataFrame(my_data_adj2, columns=list(TOPMRIvsSABRE_features.columns)),
        TOPMRIvsSABRE_covariates.reset_index(drop=True),
        TOPMRIvsSABRE['participant_id'].reset_index(drop=True),
    ],
    axis=1,
)
neuroharmonized_topmrivsabre.head(3)

In [None]:
# Reorganize to the familiar pattern: participant_id, age and sex first, followed
# by the remaining columns in their existing order.
# Each column is popped and re-inserted in place at its target position.
for position, column_name in enumerate(("participant_id", "age", "sex")):
    neuroharmonized_topmrivsabre.insert(
        position, column_name, neuroharmonized_topmrivsabre.pop(column_name)
    )

In [None]:
# Split the harmonized table by site label for the adjusted CSVs.
# NOTE(review): earlier cells assign site 0 to the pooled TOP+StrokeMRI rows and
# site 2 to SABRE, so `sabre_vs_topmri_only` holds the TOP+StrokeMRI rows and
# `topmri_vs_sabre_only` holds the SABRE rows. Confirm the names (and the CSV
# files written from them) are not swapped.
pooled_site_labels = neuroharmonized_topmrivsabre['SITE']
sabre_vs_topmri_only = neuroharmonized_topmrivsabre.loc[pooled_site_labels == 0]
topmri_vs_sabre_only = neuroharmonized_topmrivsabre.loc[pooled_site_labels == 2]

In [None]:
# Remove the SITE label from both tables.
sabre_vs_topmri_only = sabre_vs_topmri_only.drop(columns='SITE')
topmri_vs_sabre_only = topmri_vs_sabre_only.drop(columns='SITE')
#topmri_vs_sabre_only.columns

In [None]:
# Write both harmonized tables to CSV. to_csv is called with default settings,
# so the row index is written as well.
for harmonized_frame, csv_path in (
    (sabre_vs_topmri_only, '../open_work/internal_results/neurocombat/sabre_vs_topmri_only.csv'),
    (topmri_vs_sabre_only, '../open_work/internal_results/neurocombat/topmri_vs_sabre_only.csv'),
):
    harmonized_frame.to_csv(csv_path)

In [None]:
# Prepare Insight46 for pooling: drop the 'Unnamed: 0' column, encode sex with
# sex_mapping and label every row as site 3.
Insight46 = Insight46.drop(columns='Unnamed: 0').assign(
    sex=lambda frame: frame['sex'].map(sex_mapping),
    site=3,
)
Insight46.head(2)