# Neurocombat TOP and StrokeMRI harmonized datasets

Note this must be run in the `neuron` environment

## import libraries

In [None]:
import os

import pandas as pd
import numpy as np

from neuroCombat import neuroCombat

## Let's see how we would apply this to our data
We will have to flip it on its side (transpose it) to make it work

In [None]:
# Folder holding the cleaned input tables, and the two CSV files read from it.
filepath = '../open_work/internal_results/cleaned_pvc2s/'
filename_mri, filename_top = (
    os.path.join(filepath, csv_name)
    for csv_name in ('StrokeMRI_pvc2c.csv', 'TOP_pvc2c.csv')
)

In [None]:
#pd.read_csv(filename_mri)

In [None]:
def prep_for_neurocombat(dataframename1, dataframename2):
    """
    Load two cvasl-format CSV files and build the inputs used for neuroCombat.

    Each file is read, its saved index column ('Unnamed: 0') is dropped,
    'participant_id' becomes the index and the table is transposed, so that
    participants are columns and the original columns are rows. The two
    transposed tables are then joined side by side, keeping only the rows
    present in both.

    The first two rows of the combined table are treated as covariates and
    every later row as a feature.
    NOTE(review): the notebook labels those two rows 'age' and 'sex'; this
    relies on the column order of the input files -- confirm.

    Parameters
    ----------
    dataframename1, dataframename2 : str or path-like
        Paths of the two CSV files. Each must contain the columns
        'Unnamed: 0' and 'participant_id'.

    Returns
    -------
    both_togetherF : pandas.DataFrame
        Combined table with labelled rows and one column per participant.
    ftF : pandas.DataFrame
        Feature rows only (third row onward), renumbered from 0, with rows
        containing NaN dropped afterwards.
    btF : pandas.DataFrame
        All rows, renumbered from 0, with rows containing NaN dropped.
    feature_dictF : dict
        Maps the position of each row of ``ftF`` to its feature name.
    len1, len2 : int
        Number of participants (columns) from the first and second file.
    """
    transposed = []
    for csv_path in (dataframename1, dataframename2):
        frame = pd.read_csv(csv_path).drop(['Unnamed: 0'], axis=1)
        transposed.append(frame.set_index('participant_id').T)
    one_selection, two_selection = transposed

    both_togetherF = pd.concat(transposed, axis=1, join="inner")
    print("Nan count",both_togetherF.isna().sum().sum())

    features_only = both_togetherF.iloc[2:]
    # Number only the feature rows that survive dropna(): the harmonized
    # output has one row per row of ftF, so numbering the dropped rows as
    # well would shift every later feature name.
    feature_dictF = dict(enumerate(features_only.dropna().index))

    # reset_index(drop=True) discards the row labels directly; going through
    # a temporary column named "A" would also delete a participant whose id
    # happens to be "A".
    ftF = features_only.reset_index(drop=True).dropna()
    btF = both_togetherF.reset_index(drop=True).dropna()

    len1 = len(one_selection.columns)
    len2 = len(two_selection.columns)
    return both_togetherF, ftF, btF, feature_dictF, len1, len2

In [None]:
both_togetherF, ftF, btF, feature_dictF, len1, len2 = prep_for_neurocombat(filename_mri, filename_top)

In [None]:
# Load the untouched input tables, kept for comparison after harmonization.
our_mri_data, our_top_data = (
    pd.read_csv(csv_path) for csv_path in (filename_mri, filename_top)
)

In [None]:
# # save off csv
# both_together.to_csv('both_top_mri_together.csv')

In [None]:
# # make and save of csv of features only
# features_only = both_together[2:]
# #features_only.to_csv('features_only_top_mri.csv')

In [None]:
ftF.to_csv('ftF_top_mri.csv')

In [None]:
# Read the exported feature table back as a plain array: the header line is
# skipped and the leading column (the saved row index) is sliced off.
data = np.genfromtxt('ftF_top_mri.csv', delimiter=",", skip_header=1)[:, 1:]
data

In [None]:
# Specifying the batch (scanner variable) as well as a sex covariate to preserve:
first_columns_as_one = [1] * len1
last_columns_as_two = [2] * len2
covars = {'batch':first_columns_as_one + last_columns_as_two,
          'sex':both_togetherF.loc['sex',:].values.tolist(),
           'age':both_togetherF.loc['age',:].values.tolist(),} 
covars = pd.DataFrame(covars) 

In [None]:
# Column roles inside `covars`: 'batch' identifies the scanner/batch,
# 'sex' is passed as a categorical covariate and 'age' as a continuous one.
batch_col_mine = 'batch'
categorical_cols = ['sex']
our_continuous_col = ['age']

# Harmonization step: only the "data" entry of the neuroCombat result is kept.
data_combat = neuroCombat(
    dat=data,
    covars=covars,
    batch_col=batch_col_mine,
    categorical_cols=categorical_cols,
    continuous_cols=our_continuous_col,
)["data"]

In [None]:
neurocombat = pd.DataFrame(data_combat)
neurocombat

In [None]:
def make_topper(btF, row0, row1):
    """
    Build the two labelled top rows that go above a harmonized table.

    Takes the first two rows of ``btF`` (the third item returned by
    ``prep_for_neurocombat``), removes the name of the column axis,
    renumbers the rows 0 and 1, and puts a leading 'char' column in front
    that holds the label of each row.

    Parameters
    ----------
    btF : pandas.DataFrame
        Table with one column per participant. It must have at least two
        rows and is not modified.
    row0, row1 : str
        Labels for the first and second row (the notebook passes 'age' and
        'sex').

    Returns
    -------
    pandas.DataFrame
        Two rows: column 'char' with ``row0`` and ``row1``, followed by the
        columns of ``btF``.
    """
    topperF = (
        btF.head(2)
        .rename_axis(None, axis="columns")
        .reset_index(drop=True)
    )
    # Insert the labels as a new column. The previous chained assignment
    # (topperF['char'][0] = ...) has no effect under pandas Copy-on-Write
    # and wrote strings into an integer column.
    topperF.insert(0, 'char', [row0, row1])
    return topperF

In [None]:
topperF = make_topper(btF,'age', 'sex')

In [None]:
# Turn the row numbers of the harmonized table into a leading column, then
# reuse topperF's column labels so the two tables can be stacked.
bottom = neurocombat.reset_index(drop=False).rename(columns={"index": "char"})
bottom.columns = topperF.columns

In [None]:
# Stack the labelled top rows on the harmonized rows, then transpose so that
# the former columns become rows again.
back_together = pd.concat([topperF, bottom]).T
#back_together

In [None]:
# Promote the first row (the 'char' labels) to column header and drop it
# from the body of the table.
new_header = back_together.iloc[0]
back_together.columns = new_header
back_together = back_together.iloc[1:]
#back_together

In [None]:
back_together.head(3)

In [None]:
# Split the harmonized rows back per dataset: the first len1 rows come from
# the first input file, the last len2 rows from the second.
neuro_harm_mri, neuro_harm_top = back_together.head(len1), back_together.tail(len2)

In [None]:
# Replace the numeric column labels with the feature names recorded in
# feature_dictF.
neuro_harm_top, neuro_harm_mri = (
    harmonized.rename(columns=feature_dictF)
    for harmonized in (neuro_harm_top, neuro_harm_mri)
)

In [None]:
# Move the row labels into an ordinary column called 'participant_id'.
neuro_harm_mri = neuro_harm_mri.reset_index().rename(
    columns={"index": "participant_id"}
)
neuro_harm_mri

Save off to harmonized csv files

In [None]:
# Move the row labels into an ordinary column called 'participant_id'.
neuro_harm_top = neuro_harm_top.reset_index().rename(
    columns={"index": "participant_id"}
)
neuro_harm_top

In [None]:
# Write both harmonized tables to the 'newly' folder. The folder is created
# first because to_csv does not create missing directories.
os.makedirs('newly', exist_ok=True)
neuro_harm_mri.to_csv('newly/neuro_harm_mri.csv')
neuro_harm_top.to_csv('newly/neuro_harm_top.csv')

In [None]:
#neuro_harm_top.head(3)

Here we will also save off a version without the white matter hyperintensity columns (`wmh_vol`, `wmh_count`). This step is optional.

In [None]:
# neuro_harm_mri_less = neuro_harm_mri.drop(columns=['wmh_vol', 'wmh_count'])
# neuro_harm_top_less = neuro_harm_top.drop(columns=['wmh_vol', 'wmh_count'])
# neuro_harm_mri_less.to_csv('less_neuro_harm_mri_mon.csv')
# neuro_harm_top_less.to_csv('less_neuro_harm_top_mon.csv')

# Investigate neurocombat results

In [None]:
neuro_harm_mri.head(3) 

## We see that neuroCombat harmonization introduced negative numbers


## let's see how many negative numbers we have

In [None]:
# Numeric measurement columns inspected below: the volume/ratio/WMH columns
# first, then every region's '<region>_b_cov' column, then its '<region>_b'.
number_columns = [
    'gm_vol', 'wm_vol', 'csf_vol',
    'gm_ivc_ratio', 'gmwm_ivc_ratio',
    'wmh_vol', 'wmh_count',
    *(
        region + suffix
        for suffix in ('_b_cov', '_b')
        for region in ('deepwm', 'aca', 'mca', 'pca', 'totalgm')
    ),
]

In [None]:
(neuro_harm_mri[number_columns] < 0).sum()

In [None]:
# Original TOP table with the saved index column renamed to 'level_0'.
big_top = our_top_data.rename({"Unnamed: 0": "level_0"}, axis="columns")
big_top.head(3)

In [None]:
# Harmonized TOP table laid out like big_top: a positional 'level_0' column,
# then 'participant_id', then the values. Resetting the index twice and
# renaming 'index' produced two 'participant_id' columns whenever
# neuro_harm_top already carried one.
new_harm_top = neuro_harm_top.copy()
if "participant_id" not in new_harm_top.columns:
    # The ids are still the row labels: move them into a column first.
    new_harm_top = new_harm_top.reset_index().rename(
        columns={"index": "participant_id"}
    )
new_harm_top = new_harm_top.reset_index(drop=True)
new_harm_top.insert(0, "level_0", new_harm_top.index)
new_harm_top.head(3)

In [None]:
# Raw (absolute) difference: original TOP values minus harmonized values,
# matched on the row index.
raw_differences = big_top[number_columns].sub(new_harm_top[number_columns])

In [None]:
# Relative difference: (original - harmonized) divided by the original value.
differences = (
    big_top[number_columns]
    .sub(new_harm_top[number_columns])
    .div(big_top[number_columns])
)
differences

In [None]:
differences_n = differences.apply(pd.to_numeric) #

In [None]:
differences_n.describe()

## Now we can use the versions of TOP and StrokeMRI with log base 10 revision of white matter hyperintensity count, as well as white matter hyperintensity  volume

In [None]:
# Folder holding the log-transformed input tables, and its two CSV files.
log_filepath = '../open_work/internal_results/loggy/'
log_filename_mri, log_filename_top = (
    os.path.join(log_filepath, csv_name)
    for csv_name in ('stroke_loged_mon.csv', 'top_loged_mon.csv')
)

In [None]:
# Load the log-transformed input tables as they are stored on disk.
log_mri_data, log_top_data = (
    pd.read_csv(csv_path) for csv_path in (log_filename_mri, log_filename_top)
)

In [None]:
log_both_togetherF, log_ftF, log_btF, log_feature_dictF, log_len1, log_len2 = prep_for_neurocombat(
    log_filename_mri,
    log_filename_top)

In [None]:
# # make and save of csv of features only
# log_features_only = log_both_together[2:]
# #log_features_only.to_csv('log_features_only_top_mri.csv')

In [None]:
log_ftF.to_csv('log_ftF_top_mri.csv')

In [None]:
# Read the exported log feature table back as a plain array: the header line
# is skipped and the leading column (the saved row index) is sliced off.
log_data = np.genfromtxt('log_ftF_top_mri.csv', delimiter=",", skip_header=1)[:, 1:]

In [None]:
# Specifying the batch (scanner variable) as well as a sex covariate to preserve:
first_columns_as_one = [1] * log_len1
last_columns_as_two = [2] * log_len2
covars = {'batch':first_columns_as_one + last_columns_as_two,
          'sex':log_both_togetherF.loc['sex',:].values.tolist(),
           'age':log_both_togetherF.loc['age',:].values.tolist(),} 
covars = pd.DataFrame(covars) 

In [None]:
# Column roles inside `covars`: 'sex' is categorical, 'age' is continuous
# and 'batch' identifies the scanner/batch.
categorical_cols = ['sex']
# Declared here alongside its sibling settings so that this cell does not
# rely on a variable set by a much earlier cell.
our_continuous_col = ['age']
# To specify the name of the variable that encodes for the scanner/batch covariate:
batch_col_mine = 'batch'

# Harmonization step: only the "data" entry of the neuroCombat result is kept.
log_data_combat = neuroCombat(dat=log_data,
    covars=covars,
    batch_col=batch_col_mine,
    continuous_cols=our_continuous_col,
    categorical_cols=categorical_cols)["data"]

In [None]:
log_data_combat

In [None]:
log_neurocombat = pd.DataFrame(log_data_combat)
#log_neurocombat

In [None]:
log_topperF = make_topper(log_btF,'age', 'sex')

In [None]:
# Turn the row numbers of the harmonized log table into a leading column,
# then reuse log_topperF's column labels so the two tables can be stacked.
log_bottom = log_neurocombat.reset_index(drop=False).rename(columns={"index": "char"})
log_bottom.columns = log_topperF.columns
#log_bottom

In [None]:
# Stack the labelled top rows on the harmonized log rows, then transpose so
# that the former columns become rows again.
log_back_together = pd.concat([log_topperF, log_bottom]).T
#log_back_together

In [None]:
# Promote the first row (the 'char' labels) of the log table to column
# header and drop it from the body.
log_new_header = log_back_together.iloc[0] #grab the first row for the header
# Use the header taken from this table; `new_header` belongs to the non-log
# run and only matches by coincidence.
log_back_together.columns = log_new_header #set the header row as the df header
log_back_together = log_back_together[1:]
#log_back_together

In [None]:
# Show the last 5 of the first 514 rows.
# NOTE(review): 514 is presumably log_len1 (rows from the first input file);
# confirm, and prefer the variable over the literal if so.
log_back_together.head(514).tail(5)

In [None]:
# Show the first 5 of the last 527 rows.
# NOTE(review): 527 is presumably log_len2 (rows from the second input file);
# confirm, and prefer the variable over the literal if so.
log_back_together.tail(527).head(5)

In [None]:
# Split the harmonized log rows back per dataset: the first log_len1 rows
# come from the first input file, the last log_len2 rows from the second.
log_neuro_harm_mri, log_neuro_harm_top = (
    log_back_together.head(log_len1),
    log_back_together.tail(log_len2),
)

In [None]:
# Replace the numeric column labels with the feature names recorded in
# log_feature_dictF.
log_neuro_harm_top, log_neuro_harm_mri = (
    harmonized.rename(columns=log_feature_dictF)
    for harmonized in (log_neuro_harm_top, log_neuro_harm_mri)
)

In [None]:
(log_neuro_harm_mri[number_columns] < 0).sum()

In [None]:
## There are still negative numbers in the outcome...but there is a difference

In [None]:
# log_neuro_harm_mri.to_csv('log_neuro_harm_mri_mon.csv')
# log_neuro_harm_top.to_csv('log_neuro_harm_top_mon.csv')

In [None]:
log_neuro_harm_mri.head(3)

In [None]:
neuro_harm_mri.head(3)

In [None]:
# Difference between the log-based and the regular harmonized StrokeMRI
# values, matched per participant. log_neuro_harm_mri is labelled by
# participant id while neuro_harm_mri may already have had its ids moved
# into a 'participant_id' column; subtracting them as-is would align on
# unrelated labels and give only NaN.
regular_harm_mri = neuro_harm_mri
if 'participant_id' in regular_harm_mri.columns:
    regular_harm_mri = regular_harm_mri.set_index('participant_id')
# NOTE(review): assumes participant ids are unique within each table -- confirm.
dif_log_to_reg = log_neuro_harm_mri[number_columns] - regular_harm_mri[number_columns]
dif_log_to_reg

In [None]:
dif_log_to_reg.sum().sum()

# So changing the two columns to their log made a difference in the harmonized outcomes... now let's look at how this plays out when we make models...