# Neurocombat TOP and StrokeMRI harmonized datasets

Note this must be run in the `neuron` environment

## import libraries

In [1]:
import os

import pandas as pd
import numpy as np

from neuroCombat import neuroCombat

## Let's see how we would apply this to our data
We will have to flip it on it's side to make it work

In [2]:
filepath = '../open_work/internal_results/cleaned_pvc2s/' 
filename_mri = os.path.join(filepath,'StrokeMRI_pvc2c.csv') 
filename_top = os.path.join(filepath,'TOP_pvc2c.csv') 

In [3]:
#pd.read_csv(filename_mri)

In [4]:
def prep_for_neurocombat(dataframename1, dataframename2):
    """
    This function takes two dataframes in the cvasl format,
    then turns them into three items needed for the 
    neurocombat algorithm with re-identification.
    
    """
    our_1_frame = pd.read_csv(dataframename1)
    our_2_frame = pd.read_csv(dataframename2)
    two_selection = our_2_frame.drop(['Unnamed: 0'],axis=1)
    one_selection = our_1_frame.drop(['Unnamed: 0'],axis=1)
    one_selection = one_selection.set_index('participant_id')
    two_selection = two_selection.set_index('participant_id')
    one_selection = one_selection.T
    two_selection = two_selection.T
    both_togetherF = pd.concat([one_selection, two_selection], axis=1, join="inner")
    print("Nan count",both_togetherF.isna().sum().sum())
    features_only = both_togetherF[2:]
    dictionary_features_len = len(features_only.T.columns)
    number = 0
    made_keys = [] 
    made_vals = [] 
    for n in features_only.T.columns:

        made_keys.append(number)
        made_vals.append(n)
        number +=1
    feature_dictF = dict(map(lambda i,j : (i,j) , made_keys,made_vals))
    ftF = features_only.reset_index()
    ftF = ftF.rename(columns={"index": "A"})
    ftF = ftF.drop(['A'], axis=1)
    ftF = ftF.dropna()
    btF = both_togetherF.reset_index()
    btF = btF.rename(columns={"index": "A"})
    btF = btF.drop(['A'], axis=1)
    btF = btF.dropna()
    len1 = len(one_selection.columns)
    len2 = len(two_selection.columns)
    return both_togetherF, ftF, btF, feature_dictF, len1, len2

In [5]:
both_togetherF, ftF, btF, feature_dictF, len1, len2 = prep_for_neurocombat(filename_mri, filename_top)

Nan count 0


In [6]:
our_mri_data = pd.read_csv(filename_mri)
our_top_data = pd.read_csv(filename_top)

In [7]:
# # save off csv
# both_together.to_csv('both_top_mri_together.csv')

In [8]:
# # make and save of csv of features only
# features_only = both_together[2:]
# #features_only.to_csv('features_only_top_mri.csv')

In [9]:
ftF.to_csv('ftF_top_mri.csv')

In [10]:
data = np.genfromtxt('ftF_top_mri.csv', delimiter=",", skip_header=1)
data = data[:, 1:]
data

array([[ 0.64754,  0.60517,  0.61724, ...,  0.73473,  0.71094,  0.67941],
       [ 0.49441,  0.48594,  0.53779, ...,  0.53549,  0.60974,  0.50902],
       [ 0.3132 ,  0.42304,  0.33692, ...,  0.35985,  0.3748 ,  0.30031],
       ...,
       [90.5047 , 65.3032 , 72.998  , ..., 63.441  , 56.621  , 70.7366 ],
       [73.8758 , 65.6771 , 60.8317 , ..., 43.3879 , 43.5347 , 46.0536 ],
       [84.4527 , 67.1158 , 64.3183 , ..., 54.4204 , 51.4417 , 57.9078 ]])

In [11]:
# Specifying the batch (scanner variable) as well as a sex covariate to preserve:
first_columns_as_one = [1] * len1
last_columns_as_two = [2] * len2
covars = {'batch':first_columns_as_one + last_columns_as_two,
          'sex':both_togetherF.loc['sex',:].values.tolist(),
           'age':both_togetherF.loc['age',:].values.tolist(),} 
covars = pd.DataFrame(covars) 

In [12]:
# specifify sex as categorical
categorical_cols = ['sex']
# To specify the name of the variable that encodes for the scanner/batch covariate:
batch_col_mine = 'batch'
our_continuous_col=['age']
#Harmonization step:
data_combat = neuroCombat(dat=data,
    covars=covars,
    batch_col=batch_col_mine,
    continuous_cols=our_continuous_col,
    categorical_cols=categorical_cols)["data"]

# (n_dat, covars, batch_col, continuous_cols=continuous_cols,
#                                     categorical_cols=categorical_cols)

[neuroCombat] Creating design matrix
[neuroCombat] Standardizing data across features
[neuroCombat] Fitting L/S model and finding priors
[neuroCombat] Finding parametric adjustments
[neuroCombat] Final adjustment of data


In [13]:
neurocombat = pd.DataFrame(data_combat)
neurocombat

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1031,1032,1033,1034,1035,1036,1037,1038,1039,1040
0,0.6448,0.60338,0.614763,0.631335,0.569735,0.644628,0.592407,0.578482,0.540655,0.525945,...,0.652212,0.669463,0.743761,0.774069,0.763212,0.78243,0.716511,0.737243,0.713133,0.681678
1,0.49387,0.485493,0.537819,0.484823,0.455592,0.512572,0.425742,0.419686,0.43238,0.433754,...,0.537577,0.563887,0.522942,0.574,0.514834,0.495592,0.486692,0.536342,0.609741,0.509463
2,0.309698,0.417703,0.335135,0.298599,0.349758,0.353158,0.308559,0.311604,0.42305,0.437081,...,0.351663,0.371802,0.307763,0.24021,0.313673,0.252425,0.27482,0.365677,0.378094,0.306567
3,0.444971,0.399662,0.413834,0.446481,0.414994,0.42791,0.447008,0.442423,0.38736,0.37684,...,0.421932,0.416367,0.474555,0.486387,0.480274,0.514005,0.486336,0.450805,0.419155,0.456247
4,0.785736,0.72208,0.77398,0.788842,0.746442,0.767408,0.768561,0.763541,0.698003,0.688265,...,0.770836,0.768135,0.806617,0.848377,0.802831,0.836928,0.815223,0.777062,0.778632,0.796503
5,1.411956,13.269906,6.235503,3.179088,3.010439,2.787644,9.194492,13.698144,5.685733,7.894817,...,3.048098,-3.278958,2.281926,-0.914983,1.961427,8.488854,3.767973,5.313927,2.161163,4.906433
6,23.166862,25.398351,28.600948,22.712478,23.365088,19.5314,54.270141,55.97262,24.986004,28.235562,...,4.664848,25.045652,17.466645,23.629604,17.666328,20.948782,18.037164,19.027706,27.049149,28.261994
7,8.267411,3.915929,5.277166,5.222266,6.881637,6.878625,2.581284,1.97987,4.279628,3.964802,...,6.077845,6.815854,7.475083,9.581172,6.95781,8.095411,6.577485,4.547092,9.273657,6.882528
8,1.727019,1.951211,1.81164,1.6735,1.990617,1.976653,1.871959,1.831581,1.856132,1.786941,...,1.959383,1.718242,1.679998,1.963774,1.80194,2.035428,1.443881,1.843346,1.753824,1.947699
9,1.96123,1.719263,1.965192,1.919052,1.86494,1.933425,1.808572,1.674458,1.99362,1.845084,...,2.016548,2.049981,1.96195,1.979332,1.908776,2.151065,1.963725,2.034864,1.955355,2.190959


In [14]:
def make_topper(btF, row0,row1):
    """
    This function makes top rows for something harmonized
    out of the btF part produced with prep_for_neurocombat(dataframename1, dataframename2)
    """
    topperF = btF.head(2)
    topperF = topperF.rename_axis(None, axis="columns")
    topperF = topperF.reset_index(drop=False)
    topperF = topperF.rename(columns={"index": "char"})
    topperF['char'][0] = row0#'age'
    topperF['char'][1] = row1#'sex'
    return topperF

In [15]:
topperF = make_topper(btF,'age', 'sex')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [16]:
bottom = neurocombat.reset_index(drop=False)
bottom = bottom.rename(columns={"index": "char"})
bottom.columns = topperF.columns

In [17]:
back_together = pd.concat([topperF, bottom])
back_together = back_together.T
#back_together

In [18]:
new_header = back_together.iloc[0] #grab the first row for the header
back_together.columns = new_header #set the header row as the df header
back_together = back_together[1:]
#back_together

In [19]:
back_together.head(3)

char,age,sex,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
sub-59082_1_ses-1_run-1,43.1726,F,0.6448,0.49387,0.309698,0.444971,0.785736,1.41196,23.1669,8.26741,1.72702,1.96123,1.73283,1.91241,29.6661,107.419,94.0369,75.9692,88.1936
sub-59083_1_ses-1_run-1,66.3671,F,0.60338,0.485493,0.417703,0.399662,0.72208,13.2699,25.3984,3.91593,1.95121,1.71926,2.54815,1.88459,23.7842,74.8468,63.8989,67.3784,68.2325
sub-59085_1_ses-1_run-1,55.8384,F,0.614763,0.537819,0.335135,0.413834,0.77398,6.2355,28.6009,5.27717,1.81164,1.96519,2.06088,1.94501,31.2081,80.5325,72.6961,60.6169,63.2598


In [20]:
neuro_harm_top =back_together.tail(len2)
neuro_harm_mri =back_together.head(len1)

In [21]:
neuro_harm_top = neuro_harm_top.rename(feature_dictF, axis='columns')
neuro_harm_mri = neuro_harm_mri.rename(feature_dictF, axis='columns')

In [22]:
neuro_harm_mri = neuro_harm_mri.reset_index()
neuro_harm_mri = neuro_harm_mri.rename(columns={"index": "participant_id"})
neuro_harm_mri

char,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,sub-59082_1_ses-1_run-1,43.1726,F,0.6448,0.49387,0.309698,0.444971,0.785736,1.41196,23.1669,8.26741,1.72702,1.96123,1.73283,1.91241,29.6661,107.419,94.0369,75.9692,88.1936
1,sub-59083_1_ses-1_run-1,66.3671,F,0.60338,0.485493,0.417703,0.399662,0.72208,13.2699,25.3984,3.91593,1.95121,1.71926,2.54815,1.88459,23.7842,74.8468,63.8989,67.3784,68.2325
2,sub-59085_1_ses-1_run-1,55.8384,F,0.614763,0.537819,0.335135,0.413834,0.77398,6.2355,28.6009,5.27717,1.81164,1.96519,2.06088,1.94501,31.2081,80.5325,72.6961,60.6169,63.2598
3,sub-59086_1_ses-1_run-1,48.2384,F,0.631335,0.484823,0.298599,0.446481,0.788842,3.17909,22.7125,5.22227,1.6735,1.91905,2.01009,1.77987,24.5987,73.473,68.7029,53.0358,59.135
4,sub-59087_1_ses-1_run-1,58.6164,F,0.569735,0.455592,0.349758,0.414994,0.746442,3.01044,23.3651,6.88164,1.99062,1.86494,1.994,1.91309,8.13663,59.4175,47.9943,40.1043,43.6036
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
509,sub-59440_1_ses-1_run-1,73.9288,M,0.568202,0.525958,0.514156,0.354034,0.680986,24.0069,56.7914,2.35391,2.03304,1.99906,1.93616,2.06748,31.4331,84.878,75.6891,55.4002,60.577
510,sub-59440_2_ses-2_run-1,74.7699,M,0.571078,0.508225,0.522324,0.356979,0.674683,21.0402,32.1103,2.43677,2.51283,2.56321,2.23657,2.55066,33.3507,84.9978,81.1471,69.2341,63.9709
511,sub-59441_2_ses-2_run-1,74.5123,M,0.58293,0.474644,0.430297,0.392828,0.712397,7.73817,30.5483,4.00171,1.86083,1.86375,1.72848,1.77118,29.4471,64.9848,64.2366,53.3877,54.036
512,sub-59442_1_ses-1_run-1,67.526,M,0.636604,0.551915,0.398417,0.402065,0.749047,9.71759,13.8773,5.1114,1.71496,1.96658,2.15072,1.87737,22.1107,71.3976,65.1419,57.5766,59.5793


Save off to harmonized csv files

In [23]:
neuro_harm_top = neuro_harm_top.reset_index()
neuro_harm_top = neuro_harm_top.rename(columns={"index": "participant_id"})
neuro_harm_top

char,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,sub-0001_1_ses-1_run-1,43.49,M,0.71952,0.528887,0.316717,0.459496,0.797135,-2.12559,23.9865,8.5452,2.35546,2.3775,2.89592,2.50558,22.7813,80.8436,69.3116,53.3755,61.8034
1,sub-0002_1_ses-1_run-1,38.3,F,0.724281,0.623166,0.257423,0.451066,0.841799,2.76431,25.7155,8.85778,1.76755,2.47175,1.54464,1.98546,23.9142,87.8061,78.5087,64.3981,72.1006
2,sub-0019_1_ses-1_run-1,32.3,M,0.715623,0.533841,0.340081,0.450446,0.785656,-0.814774,7.37383,8.69763,2.01692,1.86886,2.26666,2.20422,26.7222,93.3029,85.2331,62.554,73.7348
3,sub-0020_1_ses-1_run-1,21.97,F,0.674118,0.435882,0.266362,0.491386,0.806686,6.16961,28.9281,6.16066,1.91819,1.91761,1.62326,1.86955,26.4638,99.8983,90.2131,68.5628,79.4252
4,sub-0022_1_ses-1_run-1,37.52,F,0.639961,0.470205,0.219086,0.480177,0.8344,3.77433,24.1586,7.30476,1.58491,2.0664,1.78956,1.77557,25.3379,87.3138,78.1519,68.7799,73.6165
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
522,sub-1163_1_ses-1_run-1,19.06,F,0.78243,0.495592,0.252425,0.514005,0.836928,8.48885,20.9488,8.09541,2.03543,2.15107,2.42444,2.18453,27.8072,115.093,104.526,82.5102,94.1884
523,sub-1165_1_ses-1_run-1,33.86,F,0.716511,0.486692,0.27482,0.486336,0.815223,3.76797,18.0372,6.57748,1.44388,1.96373,1.40918,1.83534,25.4955,87.705,85.2498,66.6758,77.0005
524,sub-1167_1_ses-1_run-1,33.92,M,0.737243,0.536342,0.365677,0.450805,0.777062,5.31393,19.0277,4.54709,1.84335,2.03486,2.09195,1.9428,22.2298,70.6175,65.7655,45.9315,57.1123
525,sub-1168_1_ses-1_run-1,45.31,M,0.713133,0.609741,0.378094,0.419155,0.778632,2.16116,27.0491,9.27366,1.75382,1.95535,1.75132,1.77849,22.029,64.0643,59.3219,45.6106,53.9033


In [24]:
neuro_harm_mri.to_csv('newly/neuro_harm_mri.csv')
neuro_harm_top.to_csv('newly/neuro_harm_top.csv')

In [25]:
#neuro_harm_top.head(3)

In [26]:
mri_selection = our_mri_data
rename_dict = {
    'age': 'age_unharm', # will be the key on merge
    'sex':'sex_unharm',
    'gm_vol' :'gm_vol_unharm',
    'wm_vol' :'wm_vol_unharm',
    'csf_vol':'csf_vol_unharm',
    'gm_ivc_ratio': 'gm_ivc_ratio_unharm',
    'gmwm_ivc_ratio': 'gmwm_ivc_ratio_unharm',
    'wmh_vol' :'wmh_vol_unharm',
    'wmh_count' : 'wmh_count_unharm',
    'deepwm_b_cov':'deepwm_b_cov_unharm',
    'aca_b_cov': 'aca_b_cov_unharm',
    'mca_b_cov': 'mca_b_cov_unharm',
    'pca_b_cov': 'pca_b_cov_unharm',
    'totalgm_b_cov':'totalgm_b_cov_unharm',
    'deepwm_b':'deepwm_b_unharm',
    'aca_b':'aca_b_unharm',
    'mca_b':'mca_b_unharm',
    'pca_b':'pca_b_unharm',
    'totalgm_b':'totalgm_b_unharm', 

}
mri_selection = mri_selection.rename(columns = rename_dict)
mri_selection = mri_selection.merge(neuro_harm_mri, how= 'inner',)
mri_selection

Unnamed: 0.1,Unnamed: 0,participant_id,age_unharm,sex_unharm,gm_vol_unharm,wm_vol_unharm,csf_vol_unharm,gm_ivc_ratio_unharm,gmwm_ivc_ratio_unharm,wmh_vol_unharm,...,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,1,sub-59082_1_ses-1_run-1,43.172603,F,0.64754,0.49441,0.31320,0.44500,0.78476,1.249,...,8.26741,1.72702,1.96123,1.73283,1.91241,29.6661,107.419,94.0369,75.9692,88.1936
1,2,sub-59083_1_ses-1_run-1,66.367123,F,0.60517,0.48594,0.42304,0.39968,0.72061,14.597,...,3.91593,1.95121,1.71926,2.54815,1.88459,23.7842,74.8468,63.8989,67.3784,68.2325
2,4,sub-59085_1_ses-1_run-1,55.838356,F,0.61724,0.53779,0.33692,0.41371,0.77417,6.341,...,5.27717,1.81164,1.96519,2.06088,1.94501,31.2081,80.5325,72.6961,60.6169,63.2598
3,5,sub-59086_1_ses-1_run-1,48.238356,F,0.63402,0.48542,0.29969,0.44677,0.78882,3.070,...,5.22227,1.6735,1.91905,2.01009,1.77987,24.5987,73.473,68.7029,53.0358,59.135
4,6,sub-59087_1_ses-1_run-1,58.616438,F,0.57348,0.45642,0.35190,0.41502,0.74533,1.761,...,6.88164,1.99062,1.86494,1.994,1.91309,8.13663,59.4175,47.9943,40.1043,43.6036
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
509,583,sub-59440_1_ses-1_run-1,73.928767,M,0.57259,0.52662,0.51982,0.35366,0.67893,26.949,...,2.35391,2.03304,1.99906,1.93616,2.06748,31.4331,84.878,75.6891,55.4002,60.577
510,584,sub-59440_2_ses-2_run-1,74.769863,M,0.57528,0.50907,0.52840,0.35671,0.67236,22.915,...,2.43677,2.51283,2.56321,2.23657,2.55066,33.3507,84.9978,81.1471,69.2341,63.9709
511,586,sub-59441_2_ses-2_run-1,74.512329,M,0.58675,0.47585,0.42899,0.39337,0.71239,5.249,...,4.00171,1.86083,1.86375,1.72848,1.77118,29.4471,64.9848,64.2366,53.3877,54.036
512,587,sub-59442_1_ses-1_run-1,67.526027,M,0.63929,0.55235,0.39661,0.40251,0.75028,8.612,...,5.1114,1.71496,1.96658,2.15072,1.87737,22.1107,71.3976,65.1419,57.5766,59.5793


In [27]:
top_selection = our_top_data

top_selection = top_selection.rename(columns = rename_dict)
top_selection = top_selection.merge(neuro_harm_top, how= 'inner',)
top_selection.columns

Index(['Unnamed: 0', 'participant_id', 'age_unharm', 'sex_unharm',
       'gm_vol_unharm', 'wm_vol_unharm', 'csf_vol_unharm',
       'gm_ivc_ratio_unharm', 'gmwm_ivc_ratio_unharm', 'wmh_vol_unharm',
       'wmh_count_unharm', 'deepwm_b_cov_unharm', 'aca_b_cov_unharm',
       'mca_b_cov_unharm', 'pca_b_cov_unharm', 'totalgm_b_cov_unharm',
       'deepwm_b_unharm', 'aca_b_unharm', 'mca_b_unharm', 'pca_b_unharm',
       'totalgm_b_unharm', 'age', 'sex', 'gm_vol', 'wm_vol', 'csf_vol',
       'gm_ivc_ratio', 'gmwm_ivc_ratio', 'wmh_vol', 'wmh_count',
       'deepwm_b_cov', 'aca_b_cov', 'mca_b_cov', 'pca_b_cov', 'totalgm_b_cov',
       'deepwm_b', 'aca_b', 'mca_b', 'pca_b', 'totalgm_b'],
      dtype='object')

In [28]:
top_selection.to_csv('top_before_after_neurocombat2way.csv')
mri_selection.to_csv('mri_before_after_neurocombat2way.csv')

Here we will also save off a version without white matter columns...optional

In [29]:
# neuro_harm_mri_less = neuro_harm_mri.drop(columns=['wmh_vol', 'wmh_count'])
# neuro_harm_top_less = neuro_harm_top.drop(columns=['wmh_vol', 'wmh_count'])
# neuro_harm_mri_less.to_csv('less_neuro_harm_mri_mon.csv')
# neuro_harm_top_less.to_csv('less_neuro_harm_top_mon.csv')

# Investigate neurocombat results

In [30]:
neuro_harm_mri.head(3) 

char,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,sub-59082_1_ses-1_run-1,43.1726,F,0.6448,0.49387,0.309698,0.444971,0.785736,1.41196,23.1669,8.26741,1.72702,1.96123,1.73283,1.91241,29.6661,107.419,94.0369,75.9692,88.1936
1,sub-59083_1_ses-1_run-1,66.3671,F,0.60338,0.485493,0.417703,0.399662,0.72208,13.2699,25.3984,3.91593,1.95121,1.71926,2.54815,1.88459,23.7842,74.8468,63.8989,67.3784,68.2325
2,sub-59085_1_ses-1_run-1,55.8384,F,0.614763,0.537819,0.335135,0.413834,0.77398,6.2355,28.6009,5.27717,1.81164,1.96519,2.06088,1.94501,31.2081,80.5325,72.6961,60.6169,63.2598


## We see neuroharmoney added negative numbers 


## let's see how many negative numbers we have

In [31]:
number_columns = ['gm_vol',
                  'wm_vol',
                  'csf_vol',
                  'gm_ivc_ratio', 
                  'gmwm_ivc_ratio',
                  'wmh_vol',
                  'wmh_count',
                  'deepwm_b_cov',
                  'aca_b_cov',
                  'mca_b_cov',
                  'pca_b_cov',
                  'totalgm_b_cov',
                  'deepwm_b',
                  'aca_b',
                  'mca_b',
                  'pca_b',
                  'totalgm_b',]

In [32]:
(neuro_harm_mri[number_columns] < 0).sum()

char
gm_vol            0
wm_vol            0
csf_vol           0
gm_ivc_ratio      0
gmwm_ivc_ratio    0
wmh_vol           5
wmh_count         0
deepwm_b_cov      0
aca_b_cov         0
mca_b_cov         0
pca_b_cov         0
totalgm_b_cov     0
deepwm_b          0
aca_b             0
mca_b             0
pca_b             0
totalgm_b         0
dtype: int64

In [33]:
#our_top_data.head(3)
big_top = our_top_data.rename(columns={"Unnamed: 0": "level_0"})
big_top.head(3)

Unnamed: 0,level_0,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,...,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,0,sub-0001_1_ses-1_run-1,43.49,M,0.71736,0.52803,0.31812,0.45881,0.79653,1.743,...,8.762,2.2748,2.3205,2.6858,2.4316,22.1022,80.4744,68.3224,52.4614,60.6981
1,1,sub-0002_1_ses-1_run-1,38.3,F,0.72383,0.62394,0.25673,0.45112,0.83999,1.629,...,9.0749,1.7564,2.3989,1.5982,1.9738,23.5401,87.3972,78.0359,63.9932,71.6047
2,2,sub-0019_1_ses-1_run-1,32.3,M,0.71224,0.53295,0.33594,0.45046,0.78753,0.621,...,8.8791,1.985,1.8702,2.1648,2.1723,27.5573,94.0855,86.3816,62.6012,74.0588


In [34]:
new_harm_top = neuro_harm_top.reset_index()
new_harm_top = new_harm_top.reset_index()
new_harm_top = new_harm_top.rename(columns={"index": "participant_id"})
new_harm_top.head(3) 

char,level_0,participant_id,participant_id.1,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,...,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,0,0,sub-0001_1_ses-1_run-1,43.49,M,0.71952,0.528887,0.316717,0.459496,0.797135,...,8.5452,2.35546,2.3775,2.89592,2.50558,22.7813,80.8436,69.3116,53.3755,61.8034
1,1,1,sub-0002_1_ses-1_run-1,38.3,F,0.724281,0.623166,0.257423,0.451066,0.841799,...,8.85778,1.76755,2.47175,1.54464,1.98546,23.9142,87.8061,78.5087,64.3981,72.1006
2,2,2,sub-0019_1_ses-1_run-1,32.3,M,0.715623,0.533841,0.340081,0.450446,0.785656,...,8.69763,2.01692,1.86886,2.26666,2.20422,26.7222,93.3029,85.2331,62.554,73.7348


In [35]:
# difference in harmonized and original - raw
raw_differences = big_top[number_columns] - new_harm_top[number_columns]

In [36]:
# difference in harmonized and original - raw
differences = (big_top[number_columns] - new_harm_top[number_columns])/big_top[number_columns]
differences

Unnamed: 0,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,-0.00301165,-0.00162292,0.00441086,-0.00149622,-0.000759281,2.2195,0.000564052,0.0247427,-0.035458,-0.0245622,-0.078234,-0.0304258,-0.030726,-0.00458798,-0.0144784,-0.0174241,-0.0182098
1,-0.000623199,0.00124082,-0.00269746,0.000119967,-0.00215402,-0.696938,-0.118066,0.0239251,-0.00634918,-0.0303688,0.0335139,-0.00590664,-0.0158912,-0.0046787,-0.00605848,-0.0063271,-0.00692557
2,-0.0047496,-0.00167174,-0.012326,3.15885e-05,0.00237946,2.31204,0.432782,0.0204375,-0.0160808,0.000716967,-0.0470551,-0.0146946,0.0303028,0.00831748,0.0132955,0.000754774,0.00437463
3,-0.00551572,-0.00299617,-0.0254955,-0.000298177,0.00369795,-7.38262,-0.257745,0.00762548,-0.0281351,-0.0044051,0.0103302,0.00205405,0.0263965,0.00426204,0.00574681,-0.00707352,-0.00394881
4,-0.00518445,-0.00175865,0.01063,-0.00120237,-0.00145256,-0.878711,-0.0981182,0.0202974,0.0124567,-0.0137864,0.00263926,0.0100509,0.00521163,-0.00595967,-0.00766015,0.00379473,-0.00364357
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
522,-0.000536995,-0.00141893,-0.0257838,-0.00104171,0.00177971,-4.93211,-0.163821,0.0139933,-0.0385895,-0.0158034,-0.0627915,-0.0190938,0.0421122,0.0202971,0.0274703,0.0192754,0.0234143
523,-0.00159474,-0.00140266,-0.0132375,-0.00114426,0.00073157,-1.64419,-0.00206467,0.0162008,0.0277551,-0.00822766,0.0500337,0.00507235,0.00905156,-0.0074815,0.00529177,-0.00364463,0.00132897
524,-0.00342075,-0.00159199,-0.016194,-0.0001453,0.00279452,-0.350426,0.0486147,0.00425009,-7.89542e-05,-0.00855684,-0.0323989,0.0026685,-0.0351018,-0.0333512,-0.036641,-0.058625,-0.049465
525,-0.00308404,-9.8957e-07,-0.00878766,0.000369709,0.000395451,0.481735,-0.0403519,0.0265615,0.0145951,-0.00563408,0.00887288,0.0161028,-0.0457769,-0.042332,-0.047702,-0.047684,-0.0478526


In [37]:
differences_n = differences.apply(pd.to_numeric) #

In [38]:
differences_n.describe()

Unnamed: 0,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
count,527.0,527.0,527.0,527.0,527.0,527.0,527.0,527.0,527.0,527.0,527.0,527.0,527.0,527.0,527.0,527.0,527.0
mean,-0.003688,-0.001217,-0.005948,-0.000413,0.000694,-0.114124,0.029381,0.01981,-0.005445,-0.009517,0.001111,0.000983,-0.011692,-0.006421,-0.006875,-0.00965,-0.009185
std,0.002479,0.001143,0.014338,0.000919,0.0022,2.152767,0.229011,0.005084,0.020672,0.008924,0.030527,0.015002,0.030732,0.01852,0.022966,0.02339,0.023877
min,-0.018418,-0.005539,-0.048641,-0.002866,-0.004381,-10.304156,-0.406924,-0.00719,-0.13634,-0.062733,-0.102817,-0.074601,-0.14549,-0.076357,-0.095619,-0.092636,-0.100245
25%,-0.00525,-0.001992,-0.016196,-0.001052,-0.00081,-1.165285,-0.115314,0.016811,-0.019623,-0.015485,-0.018708,-0.007271,-0.030908,-0.018007,-0.018784,-0.02014,-0.02281
50%,-0.003522,-0.001109,-0.00653,-0.000454,0.000593,0.017174,-0.014027,0.020368,-0.004971,-0.00939,0.000484,0.000616,-0.011679,-0.004679,-0.005327,-0.007397,-0.006956
75%,-0.00207,-0.000467,0.003579,0.000182,0.002093,0.884334,0.129524,0.023441,0.00747,-0.004347,0.019804,0.010239,0.007675,0.005928,0.008483,0.00557,0.006175
max,0.002991,0.001568,0.040428,0.002582,0.008176,9.156304,1.564713,0.031013,0.050122,0.01757,0.098147,0.045802,0.23959,0.09294,0.123383,0.111142,0.125796


## Now we can use the versions of TOP and StrokeMRI with log base 10 revision of white matter hyperintensity count, as well as white matter hyperintensity  volume

In [39]:
log_filepath = '../open_work/internal_results/loggy/' 
# this needs to be redone, due to bad column
log_filename_mri = os.path.join(log_filepath,'stroke_loged_mon.csv') 
log_filename_top = os.path.join(log_filepath,'top_loged_mon.csv') 

In [40]:
#log_mri_data = pd.read_csv(log_filename_mri)
#log_top_data = pd.read_csv(log_filename_top)

In [41]:
log_both_togetherF, log_ftF, log_btF, log_feature_dictF, log_len1, log_len2 = prep_for_neurocombat(
    log_filename_mri,
    log_filename_top)

Nan count 0


In [42]:
# # make and save of csv of features only
# log_features_only = log_both_together[2:]
# #log_features_only.to_csv('log_features_only_top_mri.csv')

In [43]:
log_ftF.to_csv('log_ftF_top_mri.csv')

In [44]:
log_data = np.genfromtxt('log_ftF_top_mri.csv', delimiter=",", skip_header=1)
log_data = log_data[:, 1:]

In [45]:
# Specifying the batch (scanner variable) as well as a sex covariate to preserve:
first_columns_as_one = [1] * log_len1
last_columns_as_two = [2] * log_len2
covars = {'batch':first_columns_as_one + last_columns_as_two,
          'sex':log_both_togetherF.loc['sex',:].values.tolist(),
           'age':log_both_togetherF.loc['age',:].values.tolist(),} 
covars = pd.DataFrame(covars) 

In [46]:
# specifify sex as categorical
categorical_cols = ['sex']
# To specify the name of the variable that encodes for the scanner/batch covariate:
batch_col_mine = 'batch'

#Harmonization step:
log_data_combat = neuroCombat(dat=log_data,
    covars=covars,
    batch_col=batch_col_mine,
    continuous_cols=our_continuous_col,
    categorical_cols=categorical_cols)["data"]

[neuroCombat] Creating design matrix
[neuroCombat] Standardizing data across features
[neuroCombat] Fitting L/S model and finding priors
[neuroCombat] Finding parametric adjustments
[neuroCombat] Final adjustment of data


In [47]:
log_data_combat

array([[ 0.64449499,  0.60309953,  0.61446461, ...,  0.73754416,
         0.71342868,  0.68197516],
       [ 0.49376132,  0.48539222,  0.53776202, ...,  0.53648246,
         0.60980237,  0.50956537],
       [ 0.30942313,  0.41746153,  0.33482719, ...,  0.36584347,
         0.37834378,  0.30671911],
       ...,
       [93.98795907, 63.8713036 , 72.6636666 , ..., 65.79148006,
        59.3449098 , 72.63062126],
       [75.91918176, 67.32953866, 60.5734431 , ..., 45.96483281,
        45.647413  , 49.13350886],
       [88.130303  , 68.18100275, 63.21808284, ..., 57.14692329,
        53.9397854 , 60.87184279]])

In [48]:
log_neurocombat = pd.DataFrame(log_data_combat)
#log_neurocombat

In [49]:
log_topperF = make_topper(log_btF,'age', 'sex')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [50]:
log_bottom = log_neurocombat.reset_index(drop=False)
log_bottom = log_bottom.rename(columns={"index": "char"})
log_bottom.columns = log_topperF.columns
#log_bottom

In [51]:
log_back_together = pd.concat([log_topperF, log_bottom])
log_back_together = log_back_together.T
#log_back_together

In [52]:
log_new_header = log_back_together.iloc[0] #grab the first row for the header
log_back_together.columns = new_header #set the header row as the df header
log_back_together = log_back_together[1:]
#log_back_together

In [53]:
log_back_together.head(514).tail(5)

char,age,sex,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
sub-59440_1_ses-1_run-1,73.9288,M,0.567853,0.525838,0.513921,0.353962,0.353962,1.37438,1.75962,2.36636,2.03109,1.99676,1.93506,2.06721,31.3437,84.8332,75.6423,55.3532,60.5251
sub-59440_2_ses-2_run-1,74.7699,M,0.570734,0.508089,0.522097,0.356913,0.356913,1.30746,1.49619,2.44944,2.51175,2.56203,2.23601,2.55119,33.2351,84.9529,81.0945,69.1793,63.915
sub-59441_2_ses-2_run-1,74.5123,M,0.582596,0.474475,0.429932,0.392803,0.392803,0.693759,1.47151,4.01651,1.85856,1.86118,1.727,1.77042,29.3856,64.9445,64.201,53.3417,53.9909
sub-59442_1_ses-1_run-1,67.526,M,0.6363,0.551816,0.398042,0.402036,0.402036,0.895062,1.01007,5.12685,1.71249,1.96419,2.15009,1.87679,22.1478,71.3565,65.1082,57.5294,59.5312
sub-59442_2_ses-2_run-1,68.411,M,0.625823,0.537881,0.429864,0.393259,0.393259,0.911419,1.2049,4.40793,1.73859,2.08405,2.11425,1.92714,28.4182,84.3031,72.0808,60.4148,66.2547


In [54]:
log_back_together.tail(527).head(5)

char,age,sex,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
sub-0001_1_ses-1_run-1,43.49,M,0.719816,0.529027,0.317123,0.459504,0.459504,0.272223,1.39129,8.53128,2.35505,2.3787,2.89034,2.50363,22.7985,80.8915,69.348,53.421,61.8507
sub-0002_1_ses-1_run-1,38.3,F,0.724552,0.623156,0.257759,0.451124,0.451124,0.252181,1.38316,8.84386,1.76924,2.4725,1.54715,1.98541,23.9339,87.8538,78.5492,64.4474,72.1528
sub-0019_1_ses-1_run-1,32.3,M,0.715936,0.533984,0.340303,0.450501,0.450501,-0.187427,1.08258,8.68361,2.01798,1.87172,2.26441,2.20356,26.7521,93.3586,85.2864,62.6066,73.7936
sub-0020_1_ses-1_run-1,21.97,F,0.674436,0.436063,0.266502,0.491431,0.491431,-0.096069,1.39454,6.14624,1.91863,1.92019,1.62465,1.86998,26.4927,99.9516,90.2614,68.6115,79.4789
sub-0022_1_ses-1_run-1,37.52,F,0.640273,0.470343,0.219524,0.480192,0.480192,0.347782,1.36076,7.29064,1.58753,2.06842,1.79058,1.77644,25.3618,87.3607,78.1915,68.8341,73.6705


In [55]:
log_neuro_harm_top =log_back_together.tail(log_len2)
log_neuro_harm_mri =log_back_together.head(log_len1)

In [56]:
log_neuro_harm_top = log_neuro_harm_top.rename(log_feature_dictF, axis='columns')
log_neuro_harm_mri = log_neuro_harm_mri.rename(log_feature_dictF, axis='columns')

In [57]:
(log_neuro_harm_mri[number_columns] < 0).sum()

char
gm_vol            0
wm_vol            0
csf_vol           0
gm_ivc_ratio      0
gmwm_ivc_ratio    0
wmh_vol           7
wmh_count         0
deepwm_b_cov      0
aca_b_cov         0
mca_b_cov         0
pca_b_cov         0
totalgm_b_cov     0
deepwm_b          0
aca_b             0
mca_b             0
pca_b             0
totalgm_b         0
dtype: int64

In [58]:
## There are still negative numbers in the outcome...but there is a difference

In [59]:
# log_neuro_harm_mri.to_csv('log_neuro_harm_mri_mon.csv')
# log_neuro_harm_top.to_csv('log_neuro_harm_top_mon.csv')

In [60]:
log_neuro_harm_mri.head(3)

char,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
sub-59082_1_ses-1_run-1,43.1726,F,0.644495,0.493761,0.309423,0.44492,0.44492,0.0679838,1.36557,8.28438,1.7251,1.95885,1.73167,1.91203,29.5924,107.373,93.988,75.9192,88.1303
sub-59083_1_ses-1_run-1,66.3671,F,0.6031,0.485392,0.417462,0.399611,0.399611,1.10743,1.3917,3.92982,1.94948,1.71649,2.54819,1.88415,23.8036,74.8062,63.8713,67.3295,68.181
sub-59085_1_ses-1_run-1,55.8384,F,0.614465,0.537762,0.334827,0.413775,0.413775,0.75307,1.45628,5.2916,1.80976,1.96287,2.06017,1.94468,31.1191,80.4915,72.6637,60.5734,63.2181


In [61]:
neuro_harm_mri.head(3)

char,participant_id,age,sex,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
0,sub-59082_1_ses-1_run-1,43.1726,F,0.6448,0.49387,0.309698,0.444971,0.785736,1.41196,23.1669,8.26741,1.72702,1.96123,1.73283,1.91241,29.6661,107.419,94.0369,75.9692,88.1936
1,sub-59083_1_ses-1_run-1,66.3671,F,0.60338,0.485493,0.417703,0.399662,0.72208,13.2699,25.3984,3.91593,1.95121,1.71926,2.54815,1.88459,23.7842,74.8468,63.8989,67.3784,68.2325
2,sub-59085_1_ses-1_run-1,55.8384,F,0.614763,0.537819,0.335135,0.413834,0.77398,6.2355,28.6009,5.27717,1.81164,1.96519,2.06088,1.94501,31.2081,80.5325,72.6961,60.6169,63.2598


In [62]:
dif_log_to_reg = log_neuro_harm_mri[number_columns] - neuro_harm_mri[number_columns]
dif_log_to_reg 

char,gm_vol,wm_vol,csf_vol,gm_ivc_ratio,gmwm_ivc_ratio,wmh_vol,wmh_count,deepwm_b_cov,aca_b_cov,mca_b_cov,pca_b_cov,totalgm_b_cov,deepwm_b,aca_b,mca_b,pca_b,totalgm_b
sub-59082_1_ses-1_run-1,,,,,,,,,,,,,,,,,
sub-59083_1_ses-1_run-1,,,,,,,,,,,,,,,,,
sub-59085_1_ses-1_run-1,,,,,,,,,,,,,,,,,
sub-59086_1_ses-1_run-1,,,,,,,,,,,,,,,,,
sub-59087_1_ses-1_run-1,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
509,,,,,,,,,,,,,,,,,
510,,,,,,,,,,,,,,,,,
511,,,,,,,,,,,,,,,,,
512,,,,,,,,,,,,,,,,,


In [63]:
dif_log_to_reg.sum().sum()

0.0

# So changing the two columns to theirlog made la difference in the harmarmonized outcomes...now let's look at how this plays out when we make models...