# Neurocombat TOP and StrokeMRI harmonized datasets

Note this must be run in the `neuron` environment

## import libraries

In [1]:
import os

import pandas as pd
import numpy as np

from neuroCombat import neuroCombat

## Let's see how we would apply this to our data
We will have to flip it on it's side to make it work

In [2]:
filepath = '../open_work/internal_results/cleaned_pvc2s/' 
filename_mri = os.path.join(filepath,'StrokeMRI_pvc2c.csv') 
filename_top = os.path.join(filepath,'TOP_pvc2c.csv') 

In [3]:
def prep_for_neurocombat(dataframename1, dataframename2):
    """
    This function takes two dataframes in the cvasl format,
    then turns them into three items needed for the 
    neurocombat algorithm with re-identification.
    
    """
    our_1_frame = pd.read_csv(dataframename1)
    our_2_frame = pd.read_csv(dataframename2)
    two_selection = our_2_frame.drop(['Unnamed: 0'],axis=1)
    one_selection = our_1_frame.drop(['Unnamed: 0'],axis=1)
    one_selection = one_selection.set_index('participant_id')
    two_selection = two_selection.set_index('participant_id')
    one_selection = one_selection.T
    two_selection = two_selection.T
    both_togetherF = pd.concat([one_selection, two_selection], axis=1, join="inner")
    print("Nan count",both_togetherF.isna().sum().sum())
    features_only = both_togetherF[2:]
    dictionary_features_len = len(features_only.T.columns)
    number = 0
    made_keys = [] 
    made_vals = [] 
    for n in features_only.T.columns:

        made_keys.append(number)
        made_vals.append(n)
        number +=1
    feature_dictF = dict(map(lambda i,j : (i,j) , made_keys,made_vals))
    ftF = features_only.reset_index()
    ftF = ftF.rename(columns={"index": "A"})
    ftF = ftF.drop(['A'], axis=1)
    ftF = ftF.dropna()
    btF = both_togetherF.reset_index()
    btF = btF.rename(columns={"index": "A"})
    btF = btF.drop(['A'], axis=1)
    btF = btF.dropna()
    len1 = len(one_selection.columns)
    len2 = len(two_selection.columns)
    return both_togetherF, ftF, btF, feature_dictF, len1, len2

In [4]:
both_togetherF, ftF, btF, feature_dictF, len1, len2 = prep_for_neurocombat(filename_mri, filename_top)

Nan count 0


In [5]:
our_mri_data = pd.read_csv(filename_mri)
our_top_data = pd.read_csv(filename_top)

In [6]:
# # save off csv
# both_together.to_csv('both_top_mri_together.csv')

In [7]:
# # make and save of csv of features only
# features_only = both_together[2:]
# #features_only.to_csv('features_only_top_mri.csv')

In [8]:
ftF.to_csv('ftF_top_mri.csv')

In [9]:
data = np.genfromtxt('ftF_top_mri.csv', delimiter=",", skip_header=1)
data = data[:, 1:]
data

array([[ 0.64754,  0.60517,  0.61724, ...,  0.73473,  0.71094,  0.67941],
       [ 0.49441,  0.48594,  0.53779, ...,  0.53549,  0.60974,  0.50902],
       [ 0.3132 ,  0.42304,  0.33692, ...,  0.35985,  0.3748 ,  0.30031],
       ...,
       [90.5047 , 65.3032 , 72.998  , ..., 63.441  , 56.621  , 70.7366 ],
       [73.8758 , 65.6771 , 60.8317 , ..., 43.3879 , 43.5347 , 46.0536 ],
       [84.4527 , 67.1158 , 64.3183 , ..., 54.4204 , 51.4417 , 57.9078 ]])

In [10]:
# Specifying the batch (scanner variable) as well as a sex covariate to preserve:
first_columns_as_one = [1] * len1
last_columns_as_two = [2] * len2
covars = {'batch':first_columns_as_one + last_columns_as_two,
          'sex':both_togetherF.loc['sex',:].values.tolist(),
           'age':both_togetherF.loc['age',:].values.tolist(),} 
covars = pd.DataFrame(covars) 

In [11]:
# specifify sex as categorical
categorical_cols = ['sex']
# To specify the name of the variable that encodes for the scanner/batch covariate:
batch_col_mine = 'batch'

#Harmonization step:
data_combat = neuroCombat(dat=data,
    covars=covars,
    batch_col=batch_col_mine,
    categorical_cols=categorical_cols)["data"]

[neuroCombat] Creating design matrix
[neuroCombat] Standardizing data across features
[neuroCombat] Fitting L/S model and finding priors
[neuroCombat] Finding parametric adjustments
[neuroCombat] Final adjustment of data


In [12]:
neurocombat = pd.DataFrame(data_combat)
neurocombat

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1031,1032,1033,1034,1035,1036,1037,1038,1039,1040
0,0.67657,0.636007,0.647562,0.663626,0.605668,0.680628,0.626644,0.613528,0.577465,0.563621,...,0.614335,0.633073,0.717369,0.745197,0.733885,0.75739,0.687591,0.705082,0.680164,0.649927
1,0.502153,0.493854,0.544655,0.493344,0.464931,0.522262,0.435998,0.43011,0.442327,0.44364,...,0.528495,0.555632,0.515374,0.565981,0.505001,0.487033,0.47797,0.527127,0.602906,0.501453
2,0.274844,0.368881,0.295151,0.263278,0.307976,0.31963,0.269895,0.272079,0.372366,0.384711,...,0.398009,0.421258,0.354513,0.26721,0.352634,0.284442,0.315811,0.409197,0.427843,0.352081
3,0.464791,0.423664,0.436396,0.466397,0.437585,0.447869,0.467296,0.46314,0.412475,0.40281,...,0.397182,0.391029,0.453789,0.468496,0.461567,0.498028,0.466881,0.429521,0.394168,0.4338
4,0.464791,0.423664,0.436396,0.466397,0.437585,0.447869,0.467296,0.46314,0.412475,0.40281,...,0.397182,0.391029,0.453789,0.468496,0.461567,0.498028,0.466881,0.429521,0.394168,0.4338
5,-0.741153,8.89553,2.935053,0.573531,-0.371511,0.555642,5.581747,9.718561,1.179975,3.111932,...,9.757454,0.02638,6.762887,-1.375663,5.365667,7.681578,7.662372,7.203047,7.955285,9.576576
6,21.019972,21.780841,25.585188,20.259102,20.259102,17.354338,50.693879,52.215618,21.019972,24.063449,...,7.743664,30.197013,20.376256,26.454788,20.841451,20.376256,20.376256,20.841451,32.068126,31.602931
7,8.94017,5.2307,6.401311,6.379128,7.73372,7.780655,4.139946,3.629495,5.516807,5.245837,...,4.642914,5.574062,6.377191,9.005146,5.725829,7.077788,5.235671,2.663893,8.657506,5.620045
8,1.67901,1.881776,1.75289,1.62341,1.925444,1.925,1.809928,1.768884,1.785217,1.716077,...,2.031684,1.777919,1.736658,2.02516,1.859267,2.091085,1.483916,1.894825,1.813732,2.016132
9,1.972876,1.733382,1.979178,1.931316,1.878704,1.94488,1.82189,1.687255,2.010782,1.861623,...,2.000205,2.033181,1.947493,1.965557,1.894592,2.139222,1.949944,2.02126,1.939711,2.175206


In [13]:
def make_topper(btF, row0,row1):
    """
    This function makes top rows for something harmonized
    out of the btF part produced with prep_for_neurocombat(dataframename1, dataframename2)
    """
    topperF = btF.head(2)
    topperF = topperF.rename_axis(None, axis="columns")
    topperF = topperF.reset_index(drop=False)
    topperF = topperF.rename(columns={"index": "char"})
    topperF['char'][0] = row0#'age'
    topperF['char'][1] = row1#'sex'
    return topperF

In [14]:
topperF = make_topper(btF,'age', 'sex')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [15]:
bottom = neurocombat.reset_index(drop=False)
bottom = bottom.rename(columns={"index": "char"})
bottom.columns = topperF.columns

In [16]:
back_together = pd.concat([topperF, bottom])
back_together = back_together.T
#back_together

In [17]:
new_header = back_together.iloc[0] #grab the first row for the header
back_together.columns = new_header #set the header row as the df header
back_together = back_together[1:]
#back_together

In [18]:
neuro_harm_top =back_together.tail(len2)
neuro_harm_mri =back_together.head(len1)

In [19]:
neuro_harm_top = neuro_harm_top.rename(feature_dictF, axis='columns')
neuro_harm_mri = neuro_harm_mri.rename(feature_dictF, axis='columns')

In [None]:
Save off to harmonized csv files

In [20]:
# neuro_harm_mri.to_csv('neuro_harm_mri_mon.csv')
# neuro_harm_top.to_csv('neuro_harm_top_mon.csv')

Here we will also save off a version without white matter columns...optional

In [24]:
# neuro_harm_mri_less = neuro_harm_mri.drop(columns=['wmh_vol', 'wmh_count'])
# neuro_harm_top_less = neuro_harm_top.drop(columns=['wmh_vol', 'wmh_count'])
# neuro_harm_mri_less.to_csv('less_neuro_harm_mri_mon.csv')
# neuro_harm_top_less.to_csv('less_neuro_harm_top_mon.csv')

# Investigate neurocombat results

In [None]:
neuro_harm_mri.head(3) 

## We see neuroharmoney added negative numbers 


## let's see how many negative numbers we have

In [None]:
number_columns = ['gm_vol',
                  'wm_vol',
                  'csf_vol',
                  'gm_ivc_ratio', 
                  'gmwm_ivc_ratio',
                  'wmh_vol',
                  'wmh_count',
                  'deepwm_b_cov',
                  'aca_b_cov',
                  'mca_b_cov',
                  'pca_b_cov',
                  'totalgm_b_cov',
                  'deepwm_b',
                  'aca_b',
                  'mca_b',
                  'pca_b',
                  'totalgm_b',]

In [None]:
(neuro_harm_mri[number_columns] < 0).sum()

In [None]:
#our_top_data.head(3)
big_top = our_top_data.rename(columns={"Unnamed: 0": "level_0"})
big_top.head(3)

In [None]:
new_harm_top = neuro_harm_top.reset_index()
new_harm_top = new_harm_top.reset_index()
new_harm_top = new_harm_top.rename(columns={"index": "participant_id"})
new_harm_top.head(3) 

In [None]:
# difference in harmonized and original - raw
raw_differences = big_top[number_columns] - new_harm_top[number_columns]

In [None]:
# difference in harmonized and original - raw
differences = (big_top[number_columns] - new_harm_top[number_columns])/big_top[number_columns]
differences

In [None]:
differences_n = differences.apply(pd.to_numeric) #

In [None]:
differences_n.describe()

## Now we can use the versions of TOP and StrokeMRI with log base 10 revision of white matter hyperintensity count, as well as white matter hyperintensity  volume

In [None]:
log_filepath = '../open_work/internal_results/loggy/' 
log_filename_mri = os.path.join(log_filepath,'stroke_loged_mon.csv') 
log_filename_top = os.path.join(log_filepath,'top_loged_mon.csv') 

In [None]:
log_mri_data = pd.read_csv(log_filename_mri)
log_top_data = pd.read_csv(log_filename_top)

In [None]:
log_both_togetherF, log_ftF, log_btF, log_feature_dictF, log_len1, log_len2 = prep_for_neurocombat(
    log_filename_mri,
    log_filename_top)

In [None]:
# # make and save of csv of features only
# log_features_only = log_both_together[2:]
# #log_features_only.to_csv('log_features_only_top_mri.csv')

In [None]:

log_ftF.to_csv('log_ftF_top_mri.csv')

In [None]:
log_data = np.genfromtxt('log_ftF_top_mri.csv', delimiter=",", skip_header=1)
log_data = log_data[:, 1:]

In [None]:
# Specifying the batch (scanner variable) as well as a sex covariate to preserve:
first_columns_as_one = [1] * log_len1
last_columns_as_two = [2] * log_len2
covars = {'batch':first_columns_as_one + last_columns_as_two,
          'sex':log_both_togetherF.loc['sex',:].values.tolist(),
           'age':log_both_togetherF.loc['age',:].values.tolist(),} 
covars = pd.DataFrame(covars) 

In [None]:
# specifify sex as categorical
categorical_cols = ['sex']
# To specify the name of the variable that encodes for the scanner/batch covariate:
batch_col_mine = 'batch'

#Harmonization step:
log_data_combat = neuroCombat(dat=log_data,
    covars=covars,
    batch_col=batch_col_mine,
    categorical_cols=categorical_cols)["data"]

In [None]:
log_data_combat

In [None]:
log_neurocombat = pd.DataFrame(log_data_combat)
#log_neurocombat

In [None]:
log_topperF = make_topper(log_btF,'age', 'sex')

In [None]:
log_bottom = log_neurocombat.reset_index(drop=False)
log_bottom = log_bottom.rename(columns={"index": "char"})
log_bottom.columns = log_topperF.columns
#log_bottom

In [None]:
log_back_together = pd.concat([log_topperF, log_bottom])
log_back_together = log_back_together.T
#log_back_together

In [None]:
log_new_header = log_back_together.iloc[0] #grab the first row for the header
log_back_together.columns = new_header #set the header row as the df header
log_back_together = log_back_together[1:]
#log_back_together

In [None]:
log_back_together.head(514).tail(5)

In [None]:
log_back_together.tail(527).head(5)

In [None]:
log_neuro_harm_top =log_back_together.tail(log_len2)
log_neuro_harm_mri =log_back_together.head(log_len1)

In [None]:
log_neuro_harm_top = log_neuro_harm_top.rename(log_feature_dictF, axis='columns')
log_neuro_harm_mri = log_neuro_harm_mri.rename(log_feature_dictF, axis='columns')

In [None]:
(log_neuro_harm_mri[number_columns] < 0).sum()

In [None]:
## There are still negative numbers in the outcome...but there is a difference

In [None]:
# log_neuro_harm_mri.to_csv('log_neuro_harm_mri_mon.csv')
# log_neuro_harm_top.to_csv('log_neuro_harm_top_mon.csv')

In [None]:
log_neuro_harm_mri.head(3)

In [None]:
neuro_harm_mri.head(3)

In [None]:
dif_log_to_reg = log_neuro_harm_mri[number_columns] - neuro_harm_mri[number_columns]
dif_log_to_reg 

In [None]:
dif_log_to_reg.sum().sum()

# So changing the two columns to theirlog made la difference in the harmarmonized outcomes...now let's look at how this plays out when we make models...