# Neuroharmonize datasets

This notebook applies neuroHarmonize, a ComBat-GAM algorithm that allows for non-linear covariate effects, to our data in order to create neuroHarmonized datasets. It should be run in the `neurogamy` environment.

In [None]:
import os
from neuroHarmonize import harmonizationLearn
import pandas as pd
import numpy as np


In [None]:
# Folder holding the cleaned per-cohort CSV inputs, followed by one path per cohort.
filepath = '../open_work/internal_results/cleaned_pvc2s/'
filename_mri, filename_top, filename_sabre, filename_insight46 = (
    os.path.join(filepath, csv_name)
    for csv_name in (
        'StrokeMRI_pvc2c.csv',
        'TOP_pvc2c.csv',
        'SABRE_pvc2_cleaned.csv',
        'Insight46_pvc2c.csv',
    )
)

In [None]:
# Load the four cohort tables from the CSV paths defined above.
# The SABRE table is brought in line with the other cohorts in a later cell.
MRI, TOP, Insight46, SABRE = (
    pd.read_csv(csv_path)
    for csv_path in (filename_mri, filename_top, filename_insight46, filename_sabre)
)
SABRE.tail(3)

In [None]:
# Tag every row with a numeric site code: TOP -> 0, StrokeMRI -> 1.
for cohort, site_code in ((TOP, 0), (MRI, 1)):
    cohort['site'] = site_code
MRI.head(3)

# Here we will harmonize just TOP and StrokeMRI

In [None]:
# Stack the TOP rows above the StrokeMRI rows; each keeps its original row labels.
TOPMRI = pd.concat((TOP, MRI), axis=0)
TOPMRI.head(3)

In [None]:
# Encode sex numerically (F -> 0, M -> 1); the mapping is reused for the other cohorts.
sex_mapping = dict(F=0, M=1)
TOPMRI = TOPMRI.assign(sex=lambda frame: frame['sex'].map(sex_mapping))
TOPMRI.head(2)

In [None]:
# Show the available column names before picking the features to harmonize.
TOPMRI.columns

In [None]:
# Imaging-derived measures that will be harmonized across the two sites.
TOPMRI_features = TOPMRI.loc[:, [
    'gm_vol', 'wm_vol', 'csf_vol',
    'gm_ivc_ratio', 'gmwm_ivc_ratio',
    'wmh_vol', 'wmh_count',
    'deepwm_b_cov', 'aca_b_cov', 'mca_b_cov', 'pca_b_cov', 'totalgm_b_cov',
    'deepwm_b', 'aca_b', 'mca_b', 'pca_b', 'totalgm_b',
]]
# Covariates passed alongside the features; the site column is renamed to 'SITE'.
TOPMRI_covariates = TOPMRI[['age', 'sex', 'site']].rename(columns={'site': 'SITE'})
TOPMRI_covariates.head(3)

In [None]:
#TOPMRI_covariates.reset_index()

In [None]:
# Copy the feature table into a plain NumPy array for harmonizationLearn.
TOPMRI_features_array = TOPMRI_features.to_numpy(copy=True)
TOPMRI_features_array


In [None]:
# Run the harmonization on the TOP + StrokeMRI features with age, sex and SITE
# as covariates; the adjusted data is stored in the my_data_adj variable.
my_model, my_data_adj = harmonizationLearn(TOPMRI_features_array, TOPMRI_covariates)

In [None]:
# Turn the adjusted array back into a DataFrame, then append the covariates and
# the participant IDs as extra columns.
# The column names are taken from TOPMRI_features (the frame the array was built
# from) instead of a second hand-typed list, so the two cannot drift apart.
neuroharmonized_topmri = pd.concat(
    [
        pd.DataFrame(my_data_adj, columns=TOPMRI_features.columns),
        # drop=True discards the old row labels so all parts line up by position
        TOPMRI_covariates.reset_index(drop=True),
        TOPMRI['participant_id'].reset_index(drop=True),
    ],
    axis=1,
)
neuroharmonized_topmri.head(3)

In [None]:
# Split the harmonized table back into its two cohorts using the site code
# (0 = TOP, 1 = StrokeMRI).
site_labels = neuroharmonized_topmri['SITE']
top_neuroharm_to_stroke = neuroharmonized_topmri.loc[site_labels == 0]
stroke_neuroharm_to_top = neuroharmonized_topmri.loc[site_labels == 1]

In [None]:
# Peek at the original TOP table to compare its column layout.
TOP.head(3)

In [None]:
# Reorganize to the familiar pattern: participant_id, age and sex become the
# first three columns of both per-cohort tables (moved in place).
for frame in (stroke_neuroharm_to_top, top_neuroharm_to_stroke):
    for position, name in enumerate(("participant_id", "age", "sex")):
        frame.insert(position, name, frame.pop(name))

In [None]:
# SITE was only needed for the split; remove it from both outputs.
stroke_neuroharm_to_top = stroke_neuroharm_to_top.drop(columns='SITE')
top_neuroharm_to_stroke = top_neuroharm_to_stroke.drop(columns='SITE')
stroke_neuroharm_to_top.columns

In [None]:
# Write the two harmonized cohorts to disk.
# NOTE(review): to_csv is called with its defaults, so the row index is written
# as the first column - confirm downstream readers expect that.
for frame, destination in (
    (top_neuroharm_to_stroke, '../open_work/internal_results/neurocharm/top_neuroharm_to_stroke.csv'),
    (stroke_neuroharm_to_top, '../open_work/internal_results/neurocharm/stroke_neuroharm_to_top.csv'),
):
    frame.to_csv(destination)

# Now we join TOP and StrokeMRI into one dataset and harmonize it against each of the other datasets individually

In [None]:
# Treat TOP + StrokeMRI as a single site (code 0): work on an independent copy
# with fresh row labels and without the leftover 'Unnamed: 0' column.
unified_TOPMRI = (
    TOPMRI.copy(deep=True)
    .reset_index(drop=True)
    .assign(site=0)
    .drop(columns='Unnamed: 0')
)

In [None]:
# Bring SABRE in line with the unified table: drop the leftover 'Unnamed: 0'
# column, encode sex with the shared mapping and label the site as 2.
SABRE = SABRE.drop(columns='Unnamed: 0').assign(
    sex=lambda frame: frame['sex'].map(sex_mapping),
    site=2,
)
SABRE.head(2)

In [None]:
# Stack the unified TOP/StrokeMRI rows above SABRE; reset_index() renumbers the
# rows and keeps the previous labels in an 'index' column.
TOPMRIvsSABRE = pd.concat((unified_TOPMRI, SABRE), axis=0).reset_index()

In [None]:
# Covariates for the TOPMRI-vs-SABRE run; the site column is renamed to 'SITE'.
TOPMRIvsSABRE_covariates = (
    TOPMRIvsSABRE.loc[:, ['age', 'sex', 'site']]
    .rename(columns={'site': 'SITE'})
)
TOPMRIvsSABRE_covariates.head(3)

In [None]:
 TOPMRIvsSABRE_features = TOPMRIvsSABRE[[ 
     'gm_vol',
     'wm_vol',
     'csf_vol',
     'gm_ivc_ratio',
     'gmwm_ivc_ratio',
     'wmh_vol',
     'wmh_count',
     'deepwm_b_cov',
     'aca_b_cov',
     'mca_b_cov',
     'pca_b_cov',
     'totalgm_b_cov',
     'deepwm_b',
     'aca_b',
     'mca_b',
     'pca_b',
     'totalgm_b',
]]
TOPMRIvsSABRE_features_array = np.array(TOPMRIvsSABRE_features)

In [None]:
# Run the harmonization for unified TOPMRI vs SABRE; the adjusted data is stored
# in my_data_adj2.
my_model2, my_data_adj2 = harmonizationLearn(TOPMRIvsSABRE_features_array, TOPMRIvsSABRE_covariates)

In [None]:
# Display the adjusted TOPMRI-vs-SABRE data as a quick visual check.
my_data_adj2

In [None]:
# Turn the adjusted array back into a DataFrame, then append the covariates and
# the participant IDs as extra columns.
# The column names are taken from TOPMRIvsSABRE_features (the frame the array was
# built from) instead of a second hand-typed list, so the two cannot drift apart.
neuroharmonized_topmrivsabre = pd.concat(
    [
        pd.DataFrame(my_data_adj2, columns=TOPMRIvsSABRE_features.columns),
        # drop=True discards the old row labels so all parts line up by position
        TOPMRIvsSABRE_covariates.reset_index(drop=True),
        TOPMRIvsSABRE['participant_id'].reset_index(drop=True),
    ],
    axis=1,
)
neuroharmonized_topmrivsabre.head(3)

In [None]:
# Reorganize to the familiar pattern: participant_id, age and sex become the
# first three columns (moved in place).
for position, name in enumerate(("participant_id", "age", "sex")):
    neuroharmonized_topmrivsabre.insert(
        position, name, neuroharmonized_topmrivsabre.pop(name)
    )

In [None]:
# Split the harmonized table by site code (2 = SABRE, 0 = unified TOP/StrokeMRI).
site_labels = neuroharmonized_topmrivsabre['SITE']
sabre_vs_topmri_only = neuroharmonized_topmrivsabre.loc[site_labels == 2]
topmri_vs_sabre_only = neuroharmonized_topmrivsabre.loc[site_labels == 0]

In [None]:
#topmri_vs_sabre_only

In [None]:
# SITE was only needed for the split; remove it from both outputs.
topmri_vs_sabre_only = topmri_vs_sabre_only.drop(columns='SITE')
sabre_vs_topmri_only = sabre_vs_topmri_only.drop(columns='SITE')

In [None]:
# Write the SABRE and TOPMRI halves of the pairwise harmonization to disk.
for frame, destination in (
    (sabre_vs_topmri_only, '../open_work/internal_results/neurocharm/sabre_vs_topmri_only.csv'),
    (topmri_vs_sabre_only, '../open_work/internal_results/neurocharm/topmri_vs_sabre_only.csv'),
):
    frame.to_csv(destination)

In [None]:
# Bring Insight46 in line with the unified table: drop the leftover 'Unnamed: 0'
# column, encode sex with the shared mapping and label the site as 3.
Insight46 = Insight46.drop(columns='Unnamed: 0').assign(
    sex=lambda frame: frame['sex'].map(sex_mapping),
    site=3,
)
Insight46.head(2)

In [None]:
# Stack the unified TOP/StrokeMRI rows above Insight46; reset_index() renumbers
# the rows and keeps the previous labels in an 'index' column.
TOPMRIvsInsight46 = pd.concat((unified_TOPMRI, Insight46), axis=0).reset_index()

In [None]:
# Covariates for the TOPMRI-vs-Insight46 run; the site column is renamed to 'SITE'.
TOPMRIvsInsight_covariates = (
    TOPMRIvsInsight46.loc[:, ['age', 'sex', 'site']]
    .rename(columns={'site': 'SITE'})
)
TOPMRIvsInsight_covariates.head(3)

In [None]:
 TOPMRIvsInsight_features = TOPMRIvsInsight46[[ 
     'gm_vol',
     'wm_vol',
     'csf_vol',
     'gm_ivc_ratio',
     'gmwm_ivc_ratio',
     'wmh_vol',
     'wmh_count',
     'deepwm_b_cov',
     'aca_b_cov',
     'mca_b_cov',
     'pca_b_cov',
     'totalgm_b_cov',
     'deepwm_b',
     'aca_b',
     'mca_b',
     'pca_b',
     'totalgm_b',
]]
TOPMRIvsInsight_features_array = np.array(TOPMRIvsInsight_features)

In [None]:
# Run the harmonization for unified TOPMRI vs Insight46; the adjusted data is
# stored in my_data_adj3.
my_model3, my_data_adj3 = harmonizationLearn(TOPMRIvsInsight_features_array, TOPMRIvsInsight_covariates)

In [None]:
# Display the adjusted TOPMRI-vs-Insight46 data as a quick visual check.
my_data_adj3

In [None]:
# Turn the adjusted array back into a DataFrame, then append the covariates and
# the participant IDs as extra columns.
# The column names are taken from TOPMRIvsInsight_features (the frame the array
# was built from) instead of a second hand-typed list, so the two cannot drift
# apart.
neuroharmonized_topmrivinsight = pd.concat(
    [
        pd.DataFrame(my_data_adj3, columns=TOPMRIvsInsight_features.columns),
        # drop=True discards the old row labels so all parts line up by position
        TOPMRIvsInsight_covariates.reset_index(drop=True),
        TOPMRIvsInsight46['participant_id'].reset_index(drop=True),
    ],
    axis=1,
)
neuroharmonized_topmrivinsight.head(3)

In [None]:
#neuroharmonized_topmrivinsight

In [None]:
# Reorganize to the familiar pattern: participant_id, age and sex become the
# first three columns (moved in place).
for position, name in enumerate(("participant_id", "age", "sex")):
    neuroharmonized_topmrivinsight.insert(
        position, name, neuroharmonized_topmrivinsight.pop(name)
    )

In [None]:
# Split the harmonized table by site code (3 = Insight46, 0 = unified TOP/StrokeMRI).
site_labels = neuroharmonized_topmrivinsight['SITE']
insight_vs_topmri_only = neuroharmonized_topmrivinsight.loc[site_labels == 3]
topmri_vs_insight_only = neuroharmonized_topmrivinsight.loc[site_labels == 0]

In [None]:
# SITE was only needed for the split; remove it from both outputs.
topmri_vs_insight_only = topmri_vs_insight_only.drop(columns='SITE')
insight_vs_topmri_only = insight_vs_topmri_only.drop(columns='SITE')


In [None]:
#insight_vs_topmri_only

In [None]:
# Write the Insight46 and TOPMRI halves of the pairwise harmonization to disk.
for frame, destination in (
    (insight_vs_topmri_only, '../open_work/internal_results/neurocharm/insight_vs_topmri_only.csv'),
    (topmri_vs_insight_only, '../open_work/internal_results/neurocharm/topmri_vs_insight_only.csv'),
):
    frame.to_csv(destination)

# Now we will do three-way harmonization: TOPMRI vs. SABRE vs. Insight46

In [None]:
# Stack all three groups (unified TOP/StrokeMRI, SABRE, Insight46); reset_index()
# renumbers the rows and keeps the previous labels in an 'index' column.
TOPMRIvsSABRvsInisght = pd.concat(
    (unified_TOPMRI, SABRE, Insight46),
    axis=0,
).reset_index()

In [None]:
# Covariates for the three-way run; the site column is renamed to 'SITE'.
TOPMRIvsSABRvsInisght_covariates = (
    TOPMRIvsSABRvsInisght.loc[:, ['age', 'sex', 'site']]
    .rename(columns={'site': 'SITE'})
)
TOPMRIvsSABRvsInisght_covariates.head(3)

In [None]:
# Imaging-derived measures to harmonize for the three-way run.
TOPMRIvsSABRvsInisght_features = TOPMRIvsSABRvsInisght.loc[:, [
    'gm_vol', 'wm_vol', 'csf_vol',
    'gm_ivc_ratio', 'gmwm_ivc_ratio',
    'wmh_vol', 'wmh_count',
    'deepwm_b_cov', 'aca_b_cov', 'mca_b_cov', 'pca_b_cov', 'totalgm_b_cov',
    'deepwm_b', 'aca_b', 'mca_b', 'pca_b', 'totalgm_b',
]]
# Plain NumPy copy of the feature table, as passed to harmonizationLearn.
TOPMRIvsSABRvsInisght_features_array = TOPMRIvsSABRvsInisght_features.to_numpy(copy=True)

In [None]:
# Run the three-way harmonization (unified TOPMRI, SABRE, Insight46); the
# adjusted data is stored in my_data_adj4.
my_model4, my_data_adj4 = harmonizationLearn(TOPMRIvsSABRvsInisght_features_array, TOPMRIvsSABRvsInisght_covariates)

In [None]:
# Display the adjusted three-way data as a quick visual check.
my_data_adj4

In [None]:
# Turn the adjusted array back into a DataFrame, then append the covariates and
# the participant IDs as extra columns.
# The column names are taken from TOPMRIvsSABRvsInisght_features (the frame the
# array was built from) instead of a second hand-typed list, so the two cannot
# drift apart.
neuroharmonized_topmrivsabrevinsight = pd.concat(
    [
        pd.DataFrame(my_data_adj4, columns=TOPMRIvsSABRvsInisght_features.columns),
        # drop=True discards the old row labels so all parts line up by position
        TOPMRIvsSABRvsInisght_covariates.reset_index(drop=True),
        TOPMRIvsSABRvsInisght['participant_id'].reset_index(drop=True),
    ],
    axis=1,
)
neuroharmonized_topmrivsabrevinsight.head(3)

In [None]:
# Reorganize to the familiar pattern: participant_id, age and sex become the
# first three columns (moved in place).
for position, name in enumerate(("participant_id", "age", "sex")):
    neuroharmonized_topmrivsabrevinsight.insert(
        position, name, neuroharmonized_topmrivsabrevinsight.pop(name)
    )

In [None]:
# Split the three-way harmonized table by site code
# (3 = Insight46, 2 = SABRE, 0 = unified TOP/StrokeMRI).
site_labels = neuroharmonized_topmrivsabrevinsight['SITE']
insight_vs_topmri_3way = neuroharmonized_topmrivsabrevinsight.loc[site_labels == 3]
sabre_vs_topmri_3way = neuroharmonized_topmrivsabrevinsight.loc[site_labels == 2]
topmri_vs_sabre_3way = neuroharmonized_topmrivsabrevinsight.loc[site_labels == 0]

In [None]:
# SITE was only needed for the split; remove it from all three outputs.
topmri_vs_sabre_3way = topmri_vs_sabre_3way.drop(columns='SITE')
sabre_vs_topmri_3way = sabre_vs_topmri_3way.drop(columns='SITE')
insight_vs_topmri_3way = insight_vs_topmri_3way.drop(columns='SITE')

In [None]:
#insight_vs_topmri_3way 

In [None]:
# Write the three per-cohort tables from the three-way harmonization to disk.
for frame, destination in (
    (insight_vs_topmri_3way, '../open_work/internal_results/neurocharm/insight_vs_topmri_3way.csv'),
    (sabre_vs_topmri_3way, '../open_work/internal_results/neurocharm/sabre_vs_topmri_3way.csv'),
    (topmri_vs_sabre_3way, '../open_work/internal_results/neurocharm/topmri_vs_sabre_3way.csv'),
):
    frame.to_csv(destination)