# Neuroharmonize datasets

This notebook applies neuroHarmonize, a ComBat-GAM algorithm that allows non-linear covariate effects, to our data in order to create neuroHarmonized datasets. It should be run in the `neurogamy` environment.

In [1]:
import os
from neuroHarmonize import harmonizationLearn
import pandas as pd
import numpy as np


In [2]:
# Directory holding the cleaned per-cohort CSV exports, and one full path per cohort.
filepath = '../open_work/internal_results/cleaned_pvc2s/'
filename_mri, filename_top, filename_sabre, filename_insight46 = (
    os.path.join(filepath, csv_name)
    for csv_name in (
        'StrokeMRI_pvc2c.csv',
        'TOP_pvc2c.csv',
        'SABRE_pvc2_cleaned.csv',
        'Insight46_pvc2c.csv',
    )
)

In [3]:
# Load the four cohort tables from disk and show the last SABRE rows as a sanity check.
# NOTE(review): nothing is cleaned and no NaNs are dropped in this cell —
# presumably that happened upstream when the CSVs were produced; confirm.
TOP = pd.read_csv(filename_top)
MRI = pd.read_csv(filename_mri)
SABRE = pd.read_csv(filename_sabre)
Insight46 = pd.read_csv(filename_insight46)
SABRE.tail(n=3)

Unnamed: 0.1,Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,...,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
686,725,sub-97154_1_ses-1_run-1,71,M,0.4942,0.43,0.26314,0.41622,0.41622,2.168,...,28.5449,4.0986,3.8984,2.774,3.4137,13.4864,53.8449,46.8761,39.5245,45.0268
687,726,sub-97819_1_ses-1_run-1,72,M,0.582,0.51335,0.3998,0.38926,0.38926,14.889,...,7.9896,3.441,3.4441,3.6305,3.1638,18.682,68.1887,55.7775,41.6975,50.485
688,727,sub-97905_1_ses-1_run-1,71,M,0.64023,0.57521,0.48202,0.37717,0.37717,3.042,...,21.2033,4.5504,4.9379,4.381,4.2747,10.5935,42.1268,36.5742,35.7132,35.3186


In [4]:
# Tag each cohort with a numeric site label: TOP -> 0, StrokeMRI -> 1.
TOP = TOP.assign(site=0)
MRI = MRI.assign(site=1)
MRI.head(n=3)

Unnamed: 0.1,Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,...,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b,site
0,1,sub-59082_1_ses-1_run-1,43.172603,F,0.64754,0.49441,0.3132,0.445,0.78476,1.249,...,1.7273,1.9776,1.7141,1.914,26.8262,104.3155,90.5047,73.8758,84.4527,1
1,2,sub-59083_1_ses-1_run-1,66.367123,F,0.60517,0.48594,0.42304,0.39968,0.72061,14.597,...,1.9669,1.7154,2.6394,1.8815,24.9597,75.7051,65.3032,65.6771,67.1158,1
2,4,sub-59085_1_ses-1_run-1,55.838356,F,0.61724,0.53779,0.33692,0.41371,0.77417,6.341,...,1.8146,1.9845,2.0827,1.9501,27.7117,81.0966,72.998,60.8317,64.3183,1


# Here we will harmonize just TOP and StrokeMRI

In [5]:
# Stack the TOP rows above the StrokeMRI rows; each cohort keeps its own row labels.
TOPMRI = pd.concat(objs=[TOP, MRI], axis=0)
TOPMRI.head(n=3)

Unnamed: 0.1,Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,...,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b,site
0,0,sub-0001_1_ses-1_run-1,43.49,M,0.71736,0.52803,0.31812,0.45881,0.79653,1.743,...,2.2748,2.3205,2.6858,2.4316,22.1022,80.4744,68.3224,52.4614,60.6981,0
1,1,sub-0002_1_ses-1_run-1,38.3,F,0.72383,0.62394,0.25673,0.45112,0.83999,1.629,...,1.7564,2.3989,1.5982,1.9738,23.5401,87.3972,78.0359,63.9932,71.6047,0
2,2,sub-0019_1_ses-1_run-1,32.3,M,0.71224,0.53295,0.33594,0.45046,0.78753,0.621,...,1.985,1.8702,2.1648,2.1723,27.5573,94.0855,86.3816,62.6012,74.0588,0


In [6]:
# Encode sex numerically ('F' -> 0, 'M' -> 1); the mapping is reused for the other cohorts.
sex_mapping = dict(F=0, M=1)
TOPMRI = TOPMRI.assign(sex=lambda frame: frame['sex'].map(sex_mapping))
TOPMRI.head(n=2)

Unnamed: 0.1,Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,...,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b,site
0,0,sub-0001_1_ses-1_run-1,43.49,1,0.71736,0.52803,0.31812,0.45881,0.79653,1.743,...,2.2748,2.3205,2.6858,2.4316,22.1022,80.4744,68.3224,52.4614,60.6981,0
1,1,sub-0002_1_ses-1_run-1,38.3,0,0.72383,0.62394,0.25673,0.45112,0.83999,1.629,...,1.7564,2.3989,1.5982,1.9738,23.5401,87.3972,78.0359,63.9932,71.6047,0


In [7]:
# List the column names of the combined TOP + StrokeMRI table.
TOPMRI.columns

Index(['Unnamed: 0', 'participant_id', 'age', 'sex', 'gm_vol', 'wm_vol',
       'csf_vol', 'gm_ivc_ratio', 'gmwm_ivc_ratio', 'wmh_vol', 'wmh_count',
       'deepwm_b_cov', 'aca_b_cov', 'mca_b_cov', 'pca_b_cov', 'totalgm_b_cov',
       'deepwm_b', 'aca_b', 'mca_b', 'pca_b', 'totalgm_b', 'site'],
      dtype='object')

In [8]:
# Measurements to be harmonized (volumes, WMH metrics, and the *_b / *_b_cov columns).
TOPMRI_features = TOPMRI.loc[:, [
    'gm_vol', 'wm_vol', 'csf_vol',
    'gm_ivc_ratio', 'gmwm_ivc_ratio',
    'wmh_vol', 'wmh_count',
    'deepwm_b_cov', 'aca_b_cov', 'mca_b_cov', 'pca_b_cov', 'totalgm_b_cov',
    'deepwm_b', 'aca_b', 'mca_b', 'pca_b', 'totalgm_b',
]]
# Covariates passed alongside the features; the site column is renamed to 'SITE',
# which is the column name neuroHarmonize looks for (see its README).
TOPMRI_covariates = (
    TOPMRI.loc[:, ['age', 'sex', 'site']]
    .rename(columns={'site': 'SITE'})
)
TOPMRI_covariates.head(n=3)

Unnamed: 0,age,sex,SITE
0,43.49,1,0
1,38.3,0,0
2,32.3,1,0


In [9]:
#TOPMRI_covariates.reset_index()

In [10]:
# Convert the feature table to a standalone NumPy matrix (one row per scan, one column per feature).
TOPMRI_features_array = TOPMRI_features.to_numpy(copy=True)
TOPMRI_features_array


array([[ 0.71736,  0.52803,  0.31812, ..., 68.3224 , 52.4614 , 60.6981 ],
       [ 0.72383,  0.62394,  0.25673, ..., 78.0359 , 63.9932 , 71.6047 ],
       [ 0.71224,  0.53295,  0.33594, ..., 86.3816 , 62.6012 , 74.0588 ],
       ...,
       [ 0.58675,  0.47585,  0.42899, ..., 63.8049 , 52.7282 , 54.3541 ],
       [ 0.63929,  0.55235,  0.39661, ..., 65.1286 , 56.4845 , 59.2195 ],
       [ 0.62909,  0.53857,  0.4307 , ..., 70.4016 , 58.7551 , 64.2534 ]])

In [11]:
# Run harmonization on TOP + StrokeMRI: the first return value is stored in my_model
# and the adjusted feature matrix in my_data_adj.
# NOTE(review): no smooth_terms argument is passed, so presumably no covariate is
# modelled non-linearly here — confirm this is intended.
my_model, my_data_adj = harmonizationLearn(TOPMRI_features_array, TOPMRI_covariates)

In [12]:
# Rebuild a labelled table from the adjusted matrix: harmonized features first,
# then the covariates, then the participant IDs. Every piece is put on a fresh
# 0..n-1 index so the column-wise concatenation lines up row by row.
neuroharmonized_topmri = pd.concat(
    [
        pd.DataFrame(
            my_data_adj,
            columns=[
                'gm_vol', 'wm_vol', 'csf_vol',
                'gm_ivc_ratio', 'gmwm_ivc_ratio',
                'wmh_vol', 'wmh_count',
                'deepwm_b_cov', 'aca_b_cov', 'mca_b_cov', 'pca_b_cov', 'totalgm_b_cov',
                'deepwm_b', 'aca_b', 'mca_b', 'pca_b', 'totalgm_b',
            ],
        ),
        TOPMRI_covariates.reset_index(drop=True),
        TOPMRI.participant_id.reset_index(drop=True),
    ],
    axis=1,
)
neuroharmonized_topmri.head(n=3)

Unnamed: 0,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,...,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b,age,sex,SITE,participant_id
0,0.71952,0.528887,0.316717,0.459496,0.797135,-2.125587,23.986463,8.545204,2.35546,2.377497,...,2.505583,22.781311,80.843615,69.311601,53.375493,61.8034,43.49,1,0,sub-0001_1_ses-1_run-1
1,0.724281,0.623166,0.257423,0.451066,0.841799,2.764311,25.715519,8.857782,1.767552,2.471752,...,1.985459,23.914179,87.806105,78.508679,64.398092,72.100604,38.3,0,0,sub-0002_1_ses-1_run-1
2,0.715623,0.533841,0.340081,0.450446,0.785656,-0.814774,7.373833,8.697634,2.01692,1.868859,...,2.204221,26.722237,93.302945,85.233112,62.55395,73.73482,32.3,1,0,sub-0019_1_ses-1_run-1


In [13]:
# Separate the harmonized rows by site label: 0 = TOP, 1 = StrokeMRI.
top_neuroharm_to_stroke = neuroharmonized_topmri.loc[neuroharmonized_topmri['SITE'] == 0]
stroke_neuroharm_to_top = neuroharmonized_topmri.loc[neuroharmonized_topmri['SITE'] == 1]

In [14]:
# Preview the first three rows of the TOP table.
TOP.head(3)

Unnamed: 0.1,Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,...,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b,site
0,0,sub-0001_1_ses-1_run-1,43.49,M,0.71736,0.52803,0.31812,0.45881,0.79653,1.743,...,2.2748,2.3205,2.6858,2.4316,22.1022,80.4744,68.3224,52.4614,60.6981,0
1,1,sub-0002_1_ses-1_run-1,38.3,F,0.72383,0.62394,0.25673,0.45112,0.83999,1.629,...,1.7564,2.3989,1.5982,1.9738,23.5401,87.3972,78.0359,63.9932,71.6047,0
2,2,sub-0019_1_ses-1_run-1,32.3,M,0.71224,0.53295,0.33594,0.45046,0.78753,0.621,...,1.985,1.8702,2.1648,2.1723,27.5573,94.0855,86.3816,62.6012,74.0588,0


In [15]:
# Move the identifying columns to the front of both per-site tables (in place),
# giving the familiar layout: participant_id, age, sex, then the features.
for site_table in (stroke_neuroharm_to_top, top_neuroharm_to_stroke):
    for target_position, column_name in enumerate(("participant_id", "age", "sex")):
        # pop() runs first (argument evaluation), so the column is gone before insert() re-adds it.
        site_table.insert(target_position, column_name, site_table.pop(column_name))

In [16]:
# The site label is no longer needed once the rows are split; remove it from both tables.
stroke_neuroharm_to_top = stroke_neuroharm_to_top.drop(columns='SITE')
top_neuroharm_to_stroke = top_neuroharm_to_stroke.drop(columns='SITE')
stroke_neuroharm_to_top.columns

Index(['participant_id', 'age', 'sex', 'gm_vol', 'wm_vol', 'csf_vol',
       'gm_ivc_ratio', 'gmwm_ivc_ratio', 'wmh_vol', 'wmh_count',
       'deepwm_b_cov', 'aca_b_cov', 'mca_b_cov', 'pca_b_cov', 'totalgm_b_cov',
       'deepwm_b', 'aca_b', 'mca_b', 'pca_b', 'totalgm_b'],
      dtype='object')

In [17]:
# Persist the two harmonized per-site tables as CSV files.
for harmonized_table, csv_path in (
    (top_neuroharm_to_stroke, '../open_work/internal_results/neurocombat/top_neuroharm_to_stroke.csv'),
    (stroke_neuroharm_to_top, '../open_work/internal_results/neurocombat/stroke_neuroharm_to_top.csv'),
):
    harmonized_table.to_csv(csv_path)

# Now we join TOP and StrokeMRI into one dataset and harmonize it against each of the other datasets individually

In [18]:
# Treat TOP and StrokeMRI as a single cohort: take an independent copy, give it a
# fresh 0..n-1 index, relabel every row as site 0, and drop the leftover
# 'Unnamed: 0' column.
unified_TOPMRI = (
    TOPMRI.copy(deep=True)
    .reset_index(drop=True)
    .assign(site=0)
    .drop(columns='Unnamed: 0')
)

In [19]:
# Prepare SABRE like the other cohorts: drop the leftover 'Unnamed: 0' column,
# encode sex with the shared mapping, and label the site as 2.
SABRE = (
    SABRE.drop(columns='Unnamed: 0')
    .assign(sex=lambda frame: frame['sex'].map(sex_mapping))
    .assign(site=2)
)
SABRE.head(n=2)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,...,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b,site
0,sub-100096_1_ses-1_run-1,78,1,0.56201,0.44945,0.42419,0.39147,0.39147,4.621,19.0,...,3.5865,3.8956,4.3534,3.669,15.3022,71.6047,62.0948,43.2926,54.7345,2
1,sub-100331_1_ses-1_run-1,71,1,0.53812,0.42593,0.49319,0.36927,0.36927,1.085,22.0,...,4.3543,4.1134,3.8613,3.7593,11.3323,37.0396,30.0755,25.5827,29.7665,2


In [20]:
# Stack the unified TOP+StrokeMRI cohort and SABRE. reset_index() (without drop)
# keeps the old row labels in an 'index' column and renumbers the rows 0..n-1.
TOPMRIvsSABRE = pd.concat(objs=[unified_TOPMRI, SABRE], axis=0).reset_index()

In [21]:
# Covariates for the TOPMRI-vs-SABRE run, with the site column renamed to 'SITE'.
TOPMRIvsSABRE_covariates = (
    TOPMRIvsSABRE.loc[:, ['age', 'sex', 'site']]
    .rename(columns={'site': 'SITE'})
)
TOPMRIvsSABRE_covariates.head(n=3)

Unnamed: 0,age,sex,SITE
0,43.49,1,0
1,38.3,0,0
2,32.3,1,0


In [22]:
 TOPMRIvsSABRE_features = TOPMRIvsSABRE[[ 
     'gm_vol',
     'wm_vol',
     'csf_vol',
     'gm_ivc_ratio',
     'gmwm_ivc_ratio',
     'wmh_vol',
     'wmh_count',
     'deepwm_b_cov',
     'aca_b_cov',
     'mca_b_cov',
     'pca_b_cov',
     'totalgm_b_cov',
     'deepwm_b',
     'aca_b',
     'mca_b',
     'pca_b',
     'totalgm_b',
]]
TOPMRIvsSABRE_features_array = np.array(TOPMRIvsSABRE_features)

In [23]:
# Run harmonization on unified TOPMRI (site 0) vs SABRE (site 2); the adjusted
# feature matrix is stored in my_data_adj2.
my_model2, my_data_adj2 = harmonizationLearn(TOPMRIvsSABRE_features_array, TOPMRIvsSABRE_covariates)

In [24]:
# Display the adjusted TOPMRI-vs-SABRE feature matrix.
my_data_adj2

array([[ 0.68699365,  0.50694735,  0.27941152, ..., 62.72723391,
        48.85471417, 56.80895511],
       [ 0.69419083,  0.60424212,  0.22249092, ..., 71.89159034,
        59.81867949, 67.06576195],
       [ 0.68133834,  0.51182103,  0.30728855, ..., 79.15124221,
        58.29769752, 69.1114069 ],
       ...,
       [ 0.54164535,  0.46218364,  0.33023661, ..., 55.91669092,
        45.44952533, 51.73366022],
       [ 0.62743143,  0.54428816,  0.44651563, ..., 67.74409769,
        48.14704131, 58.87703046],
       [ 0.68442839,  0.60524651,  0.5155167 , ..., 42.36226434,
        40.81083848, 39.20783802]])

In [25]:
# Rebuild a labelled table from the adjusted matrix: harmonized features first,
# then the covariates, then the participant IDs, each on a fresh 0..n-1 index so
# the column-wise concatenation lines up row by row.
neuroharmonized_topmrivsabre = pd.concat(
    [
        pd.DataFrame(
            my_data_adj2,
            columns=[
                'gm_vol', 'wm_vol', 'csf_vol',
                'gm_ivc_ratio', 'gmwm_ivc_ratio',
                'wmh_vol', 'wmh_count',
                'deepwm_b_cov', 'aca_b_cov', 'mca_b_cov', 'pca_b_cov', 'totalgm_b_cov',
                'deepwm_b', 'aca_b', 'mca_b', 'pca_b', 'totalgm_b',
            ],
        ),
        TOPMRIvsSABRE_covariates.reset_index(drop=True),
        TOPMRIvsSABRE.participant_id.reset_index(drop=True),
    ],
    axis=1,
)
neuroharmonized_topmrivsabre.head(n=3)

Unnamed: 0,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,...,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b,age,sex,SITE,participant_id
0,0.686994,0.506947,0.279412,0.463444,0.667547,-1.946746,21.37656,21.718511,5.537488,5.250276,...,6.158162,17.739665,74.13819,62.727234,48.854714,56.808955,43.49,1,0,sub-0001_1_ses-1_run-1
1,0.694191,0.604242,0.222491,0.451907,0.710344,-0.913792,21.441449,19.716819,2.279587,8.26623,...,3.651072,19.022197,80.901365,71.89159,59.818679,67.065762,38.3,0,0,sub-0002_1_ses-1_run-1
2,0.681338,0.511821,0.307289,0.451491,0.659829,-2.383069,8.649002,16.077552,3.90532,0.457469,...,4.114457,22.132086,86.857103,79.151242,58.297698,69.111407,32.3,1,0,sub-0019_1_ses-1_run-1


In [26]:
# Move the identifying columns to the front (in place), giving the familiar
# layout: participant_id, age, sex, then the features.
for target_position, column_name in enumerate(("participant_id", "age", "sex")):
    # pop() runs first (argument evaluation), so the column is gone before insert() re-adds it.
    neuroharmonized_topmrivsabre.insert(
        target_position, column_name, neuroharmonized_topmrivsabre.pop(column_name)
    )

In [27]:
# Separate the harmonized rows by site label: 2 = SABRE, 0 = unified TOP+StrokeMRI.
sabre_vs_topmri_only = neuroharmonized_topmrivsabre.loc[neuroharmonized_topmrivsabre['SITE'] == 2]
topmri_vs_sabre_only = neuroharmonized_topmrivsabre.loc[neuroharmonized_topmrivsabre['SITE'] == 0]

In [28]:
#topmri_vs_sabre_only

In [29]:
# The site label is no longer needed once the rows are split; remove it from both tables.
topmri_vs_sabre_only = topmri_vs_sabre_only.drop(columns='SITE')
sabre_vs_topmri_only = sabre_vs_topmri_only.drop(columns='SITE')

In [30]:
# Persist the two tables from the TOPMRI-vs-SABRE harmonization as CSV files.
for harmonized_table, csv_path in (
    (sabre_vs_topmri_only, '../open_work/internal_results/neurocombat/sabre_vs_topmri_only.csv'),
    (topmri_vs_sabre_only, '../open_work/internal_results/neurocombat/topmri_vs_sabre_only.csv'),
):
    harmonized_table.to_csv(csv_path)

In [31]:
# Prepare Insight46 like the other cohorts: drop the leftover 'Unnamed: 0' column,
# encode sex with the shared mapping, and label the site as 3.
Insight46 = (
    Insight46.drop(columns='Unnamed: 0')
    .assign(sex=lambda frame: frame['sex'].map(sex_mapping))
    .assign(site=3)
)
Insight46.head(n=2)

Unnamed: 0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,...,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b,site
0,sub-10015124_1_ses-1_run-1,69.733333,0,0.56794,0.47687,0.34052,0.40997,0.7542,4.889,16,...,2.9146,2.6976,2.9089,2.7193,35.4184,98.9688,95.3166,78.4416,82.1259,3
1,sub-10024822_1_ses-1_run-1,70.288889,0,0.54998,0.41823,0.39773,0.40264,0.70882,24.116,29,...,3.1958,2.6274,3.5953,3.0085,39.2905,143.9933,150.7895,105.6454,115.6007,3


In [32]:
# Stack the unified TOP+StrokeMRI cohort and Insight46. reset_index() (without drop)
# keeps the old row labels in an 'index' column and renumbers the rows 0..n-1.
TOPMRIvsInsight46 = pd.concat(objs=[unified_TOPMRI, Insight46], axis=0).reset_index()

In [33]:
# Covariates for the TOPMRI-vs-Insight46 run, with the site column renamed to 'SITE'.
TOPMRIvsInsight_covariates = (
    TOPMRIvsInsight46.loc[:, ['age', 'sex', 'site']]
    .rename(columns={'site': 'SITE'})
)
TOPMRIvsInsight_covariates.head(n=3)

Unnamed: 0,age,sex,SITE
0,43.49,1,0
1,38.3,0,0
2,32.3,1,0


In [34]:
 TOPMRIvsInsight_features = TOPMRIvsInsight46[[ 
     'gm_vol',
     'wm_vol',
     'csf_vol',
     'gm_ivc_ratio',
     'gmwm_ivc_ratio',
     'wmh_vol',
     'wmh_count',
     'deepwm_b_cov',
     'aca_b_cov',
     'mca_b_cov',
     'pca_b_cov',
     'totalgm_b_cov',
     'deepwm_b',
     'aca_b',
     'mca_b',
     'pca_b',
     'totalgm_b',
]]
TOPMRIvsInsight_features_array = np.array(TOPMRIvsInsight_features)

In [35]:
# Run harmonization on unified TOPMRI (site 0) vs Insight46 (site 3); the adjusted
# feature matrix is stored in my_data_adj3.
my_model3, my_data_adj3 = harmonizationLearn(TOPMRIvsInsight_features_array, TOPMRIvsInsight_covariates)

In [36]:
# Display the adjusted TOPMRI-vs-Insight46 feature matrix.
my_data_adj3

array([[ 0.71539479,  0.52538533,  0.30758639, ..., 69.29867825,
        53.04298325, 61.13849875],
       [ 0.72125239,  0.6199576 ,  0.24644146, ..., 79.02322586,
        64.68324365, 72.09321956],
       [ 0.710719  ,  0.53033256,  0.32597292, ..., 88.04552701,
        63.54041482, 74.75626841],
       ...,
       [ 0.70042487,  0.59930028,  0.49799121, ..., 55.67522739,
        30.21672751, 45.28347242],
       [ 0.60026389,  0.48935446,  0.54039561, ..., 27.60235627,
        24.04997106, 22.0773378 ],
       [ 0.618048  ,  0.52442327,  0.4688157 , ..., 49.25025668,
        28.73974051, 38.56416721]])

In [37]:
# Rebuild a labelled table from the adjusted matrix: harmonized features first,
# then the covariates, then the participant IDs, each on a fresh 0..n-1 index so
# the column-wise concatenation lines up row by row.
neuroharmonized_topmrivinsight = pd.concat(
    [
        pd.DataFrame(
            my_data_adj3,
            columns=[
                'gm_vol', 'wm_vol', 'csf_vol',
                'gm_ivc_ratio', 'gmwm_ivc_ratio',
                'wmh_vol', 'wmh_count',
                'deepwm_b_cov', 'aca_b_cov', 'mca_b_cov', 'pca_b_cov', 'totalgm_b_cov',
                'deepwm_b', 'aca_b', 'mca_b', 'pca_b', 'totalgm_b',
            ],
        ),
        TOPMRIvsInsight_covariates.reset_index(drop=True),
        TOPMRIvsInsight46.participant_id.reset_index(drop=True),
    ],
    axis=1,
)
neuroharmonized_topmrivinsight.head(n=3)

Unnamed: 0,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,...,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b,age,sex,SITE,participant_id
0,0.715395,0.525385,0.307586,0.461445,0.801134,-0.236371,22.150751,15.176637,3.214088,3.143109,...,3.732466,21.40989,81.005047,69.298678,53.042983,61.138499,43.49,1,0,sub-0001_1_ses-1_run-1
1,0.721252,0.619958,0.246441,0.454019,0.844941,-0.236345,21.275381,19.784068,2.613221,4.213335,...,2.820666,22.866645,87.759503,79.023226,64.683244,72.09322,38.3,0,0,sub-0002_1_ses-1_run-1
2,0.710719,0.530333,0.325973,0.453337,0.791318,-1.293781,10.885923,12.236741,2.308807,1.352756,...,2.790751,26.600794,95.090281,88.045527,63.540415,74.756268,32.3,1,0,sub-0019_1_ses-1_run-1


In [38]:
# Move the identifying columns to the front (in place), giving the familiar
# layout: participant_id, age, sex, then the features.
for target_position, column_name in enumerate(("participant_id", "age", "sex")):
    # pop() runs first (argument evaluation), so the column is gone before insert() re-adds it.
    neuroharmonized_topmrivinsight.insert(
        target_position, column_name, neuroharmonized_topmrivinsight.pop(column_name)
    )

In [39]:
# Separate the TOPMRI-vs-Insight46 harmonization by site label:
# 3 = Insight46, 0 = unified TOP+StrokeMRI.
# The rows must come from neuroharmonized_topmrivinsight. Filtering the SABRE
# result (neuroharmonized_topmrivsabre) instead yields an empty Insight46 table,
# because that frame has no site 3, and TOPMRI values harmonized against the
# wrong cohort.
insight_vs_topmri_only = neuroharmonized_topmrivinsight[neuroharmonized_topmrivinsight.SITE == 3]
topmri_vs_insight_only = neuroharmonized_topmrivinsight[neuroharmonized_topmrivinsight.SITE == 0]

In [40]:
# The site label is no longer needed once the rows are split; remove it from both tables.
topmri_vs_insight_only = topmri_vs_insight_only.drop(columns='SITE')
insight_vs_topmri_only = insight_vs_topmri_only.drop(columns='SITE')


In [41]:
# Persist the two tables from the TOPMRI-vs-Insight46 harmonization as CSV files.
for harmonized_table, csv_path in (
    (insight_vs_topmri_only, '../open_work/internal_results/neurocombat/insight_vs_topmri_only.csv'),
    (topmri_vs_insight_only, '../open_work/internal_results/neurocombat/topmri_vs_insight_only.csv'),
):
    harmonized_table.to_csv(csv_path)

# Now we will do three-way harmonization: TOPMRI vs. SABRE vs. Insight46

In [42]:
# Stack all three cohorts (unified TOP+StrokeMRI, SABRE, Insight46). reset_index()
# (without drop) keeps the old row labels in an 'index' column and renumbers the rows 0..n-1.
TOPMRIvsSABRvsInisght = pd.concat(objs=[unified_TOPMRI, SABRE, Insight46], axis=0).reset_index()

In [43]:
# Covariates for the three-way run, with the site column renamed to 'SITE'.
TOPMRIvsSABRvsInisght_covariates = (
    TOPMRIvsSABRvsInisght.loc[:, ['age', 'sex', 'site']]
    .rename(columns={'site': 'SITE'})
)
TOPMRIvsSABRvsInisght_covariates.head(n=3)

Unnamed: 0,age,sex,SITE
0,43.49,1,0
1,38.3,0,0
2,32.3,1,0


In [44]:
# Measurements to harmonize for the three-way run, as a table and as a standalone NumPy matrix.
TOPMRIvsSABRvsInisght_features = TOPMRIvsSABRvsInisght.loc[:, [
    'gm_vol', 'wm_vol', 'csf_vol',
    'gm_ivc_ratio', 'gmwm_ivc_ratio',
    'wmh_vol', 'wmh_count',
    'deepwm_b_cov', 'aca_b_cov', 'mca_b_cov', 'pca_b_cov', 'totalgm_b_cov',
    'deepwm_b', 'aca_b', 'mca_b', 'pca_b', 'totalgm_b',
]]
TOPMRIvsSABRvsInisght_features_array = TOPMRIvsSABRvsInisght_features.to_numpy(copy=True)

In [45]:
# Run the three-way harmonization (unified TOPMRI = site 0, SABRE = site 2,
# Insight46 = site 3); the adjusted feature matrix is stored in my_data_adj4.
my_model4, my_data_adj4 = harmonizationLearn(TOPMRIvsSABRvsInisght_features_array, TOPMRIvsSABRvsInisght_covariates)

In [46]:
# Display the adjusted three-way feature matrix.
my_data_adj4

array([[ 0.68969839,  0.50862087,  0.27728126, ..., 63.94321353,
        49.53495206, 57.43364419],
       [ 0.69632703,  0.60467998,  0.22014878, ..., 73.5198354 ,
        60.89075541, 68.06052495],
       [ 0.68445914,  0.51353588,  0.30406524, ..., 81.2375548 ,
        59.39937106, 70.16026523],
       ...,
       [ 0.67844272,  0.58116891,  0.48185612, ..., 51.42277988,
        29.08355931, 43.08885203],
       [ 0.57538679,  0.46844211,  0.53069476, ..., 26.52968691,
        23.47728822, 22.32731612],
       [ 0.59374851,  0.50441787,  0.44829172, ..., 45.7068618 ,
        27.73207664, 37.05976004]])

In [47]:
# Rebuild a labelled table from the adjusted matrix: harmonized features first,
# then the covariates, then the participant IDs, each on a fresh 0..n-1 index so
# the column-wise concatenation lines up row by row.
neuroharmonized_topmrivsabrevinsight = pd.concat(
    [
        pd.DataFrame(
            my_data_adj4,
            columns=[
                'gm_vol', 'wm_vol', 'csf_vol',
                'gm_ivc_ratio', 'gmwm_ivc_ratio',
                'wmh_vol', 'wmh_count',
                'deepwm_b_cov', 'aca_b_cov', 'mca_b_cov', 'pca_b_cov', 'totalgm_b_cov',
                'deepwm_b', 'aca_b', 'mca_b', 'pca_b', 'totalgm_b',
            ],
        ),
        TOPMRIvsSABRvsInisght_covariates.reset_index(drop=True),
        TOPMRIvsSABRvsInisght.participant_id.reset_index(drop=True),
    ],
    axis=1,
)
neuroharmonized_topmrivsabrevinsight.head(n=3)

Unnamed: 0,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,...,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b,age,sex,SITE,participant_id
0,0.689698,0.508621,0.277281,0.464523,0.68849,-2.798339,20.408456,21.745313,4.981418,4.581377,...,5.600121,17.855192,75.199365,63.943214,49.534952,57.433644,43.49,1,0,sub-0001_1_ses-1_run-1
1,0.696327,0.60468,0.220149,0.453603,0.731685,-1.904613,20.374403,23.224643,3.145324,8.410883,...,4.125657,19.322143,82.138176,73.519835,60.890755,68.060525,38.3,0,0,sub-0002_1_ses-1_run-1
2,0.684459,0.513536,0.304065,0.453203,0.680045,-3.286221,7.768786,15.262066,3.351787,0.163685,...,3.697,22.208267,88.493861,81.237555,59.399371,70.160265,32.3,1,0,sub-0019_1_ses-1_run-1


In [48]:
# Move the identifying columns to the front (in place), giving the familiar
# layout: participant_id, age, sex, then the features.
for target_position, column_name in enumerate(("participant_id", "age", "sex")):
    # pop() runs first (argument evaluation), so the column is gone before insert() re-adds it.
    neuroharmonized_topmrivsabrevinsight.insert(
        target_position, column_name, neuroharmonized_topmrivsabrevinsight.pop(column_name)
    )

In [49]:
# Separate the three-way harmonization by site label:
# 3 = Insight46, 2 = SABRE, 0 = unified TOP+StrokeMRI.
# The rows must come from the three-way result. Filtering the two-way SABRE frame
# (neuroharmonized_topmrivsabre) instead yields an empty Insight46 table and
# values from the wrong model for the other two cohorts.
insight_vs_topmri_3way = neuroharmonized_topmrivsabrevinsight[neuroharmonized_topmrivsabrevinsight.SITE == 3]
sabre_vs_topmri_3way = neuroharmonized_topmrivsabrevinsight[neuroharmonized_topmrivsabrevinsight.SITE == 2]
topmri_vs_sabre_3way = neuroharmonized_topmrivsabrevinsight[neuroharmonized_topmrivsabrevinsight.SITE == 0]

In [50]:
# The site label is no longer needed once the rows are split; remove it from all three tables.
# Each table drops the column from ITSELF. Building sabre_vs_topmri_3way from
# topmri_vs_sabre_3way would replace the SABRE rows with the TOPMRI rows.
insight_vs_topmri_3way = insight_vs_topmri_3way.drop('SITE', axis=1)
sabre_vs_topmri_3way = sabre_vs_topmri_3way.drop('SITE', axis=1)
topmri_vs_sabre_3way = topmri_vs_sabre_3way.drop('SITE', axis=1)

In [51]:
# Persist the three tables from the three-way harmonization as CSV files.
# NOTE(review): these go to the current working directory, unlike the other
# outputs, which are written under '../open_work/internal_results/neurocombat/' —
# confirm this is intended.
for harmonized_table, csv_path in (
    (insight_vs_topmri_3way, 'insight_vs_topmri_3way.csv'),
    (sabre_vs_topmri_3way, 'sabre_vs_topmri_3way.csv'),
    (topmri_vs_sabre_3way, 'topmri_vs_sabre_3way.csv'),
):
    harmonized_table.to_csv(csv_path)