# Neurocombat TOP and StrokeMRI harmonized datasets

Note this must be run in the `neuron` environment

## import libraries

In [None]:
import os

import pandas as pd
import numpy as np

from neuroCombat import neuroCombat

## Let's see how we would apply this to our data
We will have to flip it on its side (transpose it, so that features are rows and participants are columns) to make it work

In [None]:
# Folder holding the cleaned tables, and the full path of each cohort's CSV.
filepath = '../open_work/internal_results/cleaned_pvc2s/'
filename_mri, filename_top = (
    os.path.join(filepath, csv_name)
    for csv_name in ('StrokeMRI_pvc2c.csv', 'TOP_pvc2c.csv')
)

In [None]:
# Load both cohort tables into DataFrames.
our_mri_data, our_top_data = (
    pd.read_csv(csv_path) for csv_path in (filename_mri, filename_top)
)

In [None]:
# Report how many rows each loaded table has.
n_mri_rows = len(our_mri_data)
n_top_rows = len(our_top_data)
print("Our StrokeMRI data is", n_mri_rows, "patients")
print("Our TOP data is", n_top_rows, "patients")

In [None]:
# Preview the first three rows of the TOP table.
our_top_data.head(3)

In [None]:
# Preview the first three rows of the StrokeMRI table.
our_mri_data.head(3)

In [None]:
# Discard the 'Unnamed: 0' column from each table.
mri_selection = our_mri_data.drop(columns=['Unnamed: 0'])
top_selection = our_top_data.drop(columns=['Unnamed: 0'])

In [None]:
#mri_selection.isna().sum()
#top_selection.isna().sum()

In [None]:
# Use the participant identifier as the row label of each table.
top_selection = top_selection.set_index(keys='participant_id')
mri_selection = mri_selection.set_index(keys='participant_id')

In [None]:
# Transpose so that variables are rows and participants are columns.
mri_selection = mri_selection.transpose()
top_selection = top_selection.transpose()

In [None]:
# Put the two cohorts side by side (StrokeMRI columns first, then TOP),
# keeping only the row labels that are present in both.
cohort_frames = [mri_selection, top_selection]
both_together = pd.concat(cohort_frames, axis=1, join="inner")
both_together

In [None]:
# Count NaN cells over the whole combined table; the result should be 0.
both_together.isna().sum().sum()

In [None]:
# # save off csv
# both_together.to_csv('both_top_mri_together.csv')

In [None]:
# Keep only the feature rows, i.e. everything after the first two rows
# (presumably age and sex -- confirm against the row order of both_together).
features_only = both_together.iloc[2:]
# Optional: save the features-only table to CSV.
#features_only.to_csv('features_only_top_mri.csv')

In [None]:
# List the feature names (the row labels of features_only, exposed as
# columns by the transpose).
features_only.T.columns

In [None]:
# Enumerate the feature names: made_vals holds the names in row order and
# made_keys the matching positions 0..n-1.
made_vals = list(features_only.T.columns)
dictionary_features_len = len(made_vals)
made_keys = list(range(dictionary_features_len))
# `number` ends up equal to the count of features, as a running counter would.
number = dictionary_features_len

In [None]:
# Map each integer position to its feature name.
feature_dict = dict(zip(made_keys, made_vals))

In [None]:
# Feature table with the row labels moved out and discarded, and any row
# containing a NaN removed.
ft = (
    features_only
    .reset_index()
    .rename(columns={"index": "A"})
    .drop(['A'], axis=1)
    .dropna()
)

In [None]:
# Same treatment for the full combined table: discard the row labels and
# remove any row containing a NaN.
bt = (
    both_together
    .reset_index()
    .rename(columns={"index": "A"})
    .drop(['A'], axis=1)
    .dropna()
)

In [None]:
# Preview the first three rows of the combined table.
bt.head(3)

In [None]:
#ft.to_csv('ft_top_mri.csv')

In [None]:
# Write the current feature table to disk before reading it back, so the
# array below always reflects `ft` rather than a stale or missing file.
ft.to_csv('ft_top_mri.csv')
# Parse the CSV into a float array, skipping the header row.
data = np.genfromtxt('ft_top_mri.csv', delimiter=",", skip_header=1)
data

In [None]:
# Drop the first column of the parsed CSV
# (presumably the row-index column that to_csv writes -- confirm).
data = data[:, 1:]
#data

In [None]:
# Specifying the batch (scanner variable) as well as a sex covariate to preserve:
first_columns_as_one = [1] * len(mri_selection.columns)
last_columns_as_two = [2] * len(top_selection.columns)
covars = {'batch':first_columns_as_one + last_columns_as_two,
          'sex':both_together.loc['sex',:].values.tolist(),
           'age':both_together.loc['age',:].values.tolist(),} 
covars = pd.DataFrame(covars) 

In [None]:
#covars

In [None]:
# Shape of the covariate table.
covars.shape

In [None]:
# Shape of the feature array that is passed to neuroCombat.
data.shape

In [None]:
# sex is categorical and age is continuous. neuroCombat only adjusts for the
# covariates named in categorical_cols / continuous_cols, so age must be
# listed explicitly; being a column of covars is not enough.
categorical_cols = ['sex']
continuous_cols = ['age']
# Name of the covars column that encodes the scanner/batch:
batch_col_mine = 'batch'

# Harmonization step (keep only the harmonized array from the result dict):
data_combat = neuroCombat(dat=data,
    covars=covars,
    batch_col=batch_col_mine,
    categorical_cols=categorical_cols,
    continuous_cols=continuous_cols)["data"]

In [None]:
# Show the harmonized array.
data_combat

In [None]:
# Shape of the harmonized array.
data_combat.shape

In [None]:
# Wrap the harmonized array in a DataFrame (default integer labels).
neurocombat = pd.DataFrame(data_combat)
neurocombat

In [None]:
# Keep the first two rows of the combined table; a later cell labels them
# 'age' and 'sex'.
topper = bt.head(2)

In [None]:
# Clear the name attached to the columns axis.
topper = topper.rename_axis(None, axis="columns")

In [None]:
# Display topper.
topper

In [None]:
# Move the row numbers into a regular column and call that column 'char'.
topper = topper.reset_index(drop=False).rename(columns={"index": "char"})

In [None]:
# Label the two covariate rows. Assign the whole column at once: chained
# indexing (topper['char'][0] = ...) is not guaranteed to write back to
# topper, and writing strings one element at a time into the integer column
# relies on implicit upcasting.
topper['char'] = ['age', 'sex']

In [None]:
# Display topper again after relabelling.
topper

In [None]:
# Harmonized values laid out like topper: the row number becomes the first
# ('char') column, then all columns take topper's labels.
bottom = neurocombat.reset_index(drop=False).rename(columns={"index": "char"})
bottom.columns = topper.columns
bottom

In [None]:
# Stack the covariate rows on top of the harmonized rows, then transpose so
# that each participant is a row again.
back_together = pd.concat([topper, bottom]).T
back_together

In [None]:
# Promote the first row to column headers, then drop that row from the data.
new_header = back_together.iloc[0]
back_together.columns = new_header
back_together = back_together.iloc[1:]
back_together

In [None]:
# Look at the last 10 of the first 514 rows.
back_together.head(514).tail(10)

In [None]:
# Look at the first 10 of the last 527 rows.
back_together.tail(527).head(10)

In [None]:
# Split the combined table back into the two cohorts. The StrokeMRI columns
# were concatenated first and the TOP columns last, so take the cohort sizes
# from the transposed selections (the same counts used to build the batch
# vector) instead of hard-coding 514 and 527.
n_mri_participants = len(mri_selection.columns)
n_top_participants = len(top_selection.columns)
neuro_harm_mri = back_together.head(n_mri_participants)
neuro_harm_top = back_together.tail(n_top_participants)

In [None]:
# Replace the integer column labels with the feature names.
neuro_harm_mri, neuro_harm_top = (
    frame.rename(feature_dict, axis='columns')
    for frame in (neuro_harm_mri, neuro_harm_top)
)

In [None]:
# neuro_harm_mri.to_csv('neuro_harm_mri.csv')
# neuro_harm_top.to_csv('neuro_harm_top.csv')

In [None]:
# Investigate the neuroCombat harmonization results

In [None]:
# Last three rows of the harmonized StrokeMRI table.
neuro_harm_mri.tail(3) 

## We see that the neuroCombat harmonization introduced negative numbers
Let's compare to the original

In [None]:
# Last three rows of the original StrokeMRI table, transposed back so that
# each participant is a row.
mri_selection.T.tail(3)

## let's see how many negative numbers we have

In [None]:
# Numeric feature columns used in the checks below, grouped by kind.
number_columns = (
    # volumes, ratios and white-matter-hyperintensity measures
    ['gm_vol', 'wm_vol', 'csf_vol', 'gm_ivc_ratio', 'gmwm_ivc_ratio',
     'wmh_vol', 'wmh_count']
    # *_b_cov columns
    + ['deepwm_b_cov', 'aca_b_cov', 'mca_b_cov', 'pca_b_cov', 'totalgm_b_cov']
    # *_b columns
    + ['deepwm_b', 'aca_b', 'mca_b', 'pca_b', 'totalgm_b']
)

In [None]:
# Count, per column, how many harmonized StrokeMRI values are negative.
(neuro_harm_mri[number_columns] < 0).sum()

In [None]:
#our_top_data.head(3)
# Copy of the original TOP table with 'Unnamed: 0' renamed to 'level_0'.
big_top = our_top_data.rename(columns={"Unnamed: 0": "level_0"})
big_top.head(3)

In [None]:
# Reset the index twice (the second reset adds a fresh 0..n-1 counter column)
# and rename the column named 'index' to 'participant_id'.
new_harm_top = (
    neuro_harm_top
    .reset_index()
    .reset_index()
    .rename(columns={"index": "participant_id"})
)
new_harm_top.head(3)

In [None]:
# difference in harmonized and original - raw
# Raw difference, original minus harmonized, for each numeric column.
raw_differences = big_top[number_columns] - new_harm_top[number_columns]

In [None]:
# Relative difference: (original - harmonized) / original.
differences = (big_top[number_columns] - new_harm_top[number_columns])/big_top[number_columns]
differences

In [None]:
# Coerce every column of the relative differences to a numeric dtype.
differences_n = differences.apply(pd.to_numeric) #

In [None]:
# Summary statistics of the relative differences.
differences_n.describe()

## Now we can use the versions of TOP and StrokeMRI in which white matter hyperintensity count and white matter hyperintensity volume have been log-transformed (base 10)

In [None]:
# Folder holding the log-transformed tables, and the path of each cohort's CSV.
log_filepath = '../open_work/internal_results/loggy/'
log_filename_mri, log_filename_top = (
    os.path.join(log_filepath, csv_name)
    for csv_name in ('stroke_loged.csv', 'top_loged.csv')
)

In [None]:
# Load both log-transformed cohort tables into DataFrames.
log_mri_data, log_top_data = (
    pd.read_csv(csv_path) for csv_path in (log_filename_mri, log_filename_top)
)

In [None]:
#log_top_data.head(3)

In [None]:
#log_mri_data.head(3)

In [None]:
# Discard the 'Unnamed: 0' column from each log-transformed table.
log_mri_selection = log_mri_data.drop(columns=['Unnamed: 0'])
log_top_selection = log_top_data.drop(columns=['Unnamed: 0'])

In [None]:
# Use the participant identifier as the row label of each table.
log_top_selection = log_top_selection.set_index(keys='participant_id')
log_mri_selection = log_mri_selection.set_index(keys='participant_id')

In [None]:
# Transpose so that variables are rows and participants are columns.
log_mri_selection = log_mri_selection.transpose()
log_top_selection = log_top_selection.transpose()

In [None]:
# Put the two log-transformed cohorts side by side (StrokeMRI columns first,
# then TOP), keeping only the row labels that are present in both.
log_cohort_frames = [log_mri_selection, log_top_selection]
log_both_together = pd.concat(log_cohort_frames, axis=1, join="inner")
log_both_together

In [None]:
# Count NaN cells over the whole combined log table; the result should be 0.
log_both_together.isna().sum().sum()

In [None]:
# make and save of csv of features only
# Keep only the feature rows, i.e. everything after the first two rows
# (presumably age and sex -- confirm against the row order of
# log_both_together).
log_features_only = log_both_together.iloc[2:]
# Optional: save the features-only table to CSV.
#log_features_only.to_csv('log_features_only_top_mri.csv')

In [None]:
#log_features_only.T.columns

In [None]:
#features_only.T.columns
## so we can use same features dictionary

In [None]:
# Log feature table with the row labels moved out and discarded, and any row
# containing a NaN removed.
log_ft = (
    log_features_only
    .reset_index()
    .rename(columns={"index": "A"})
    .drop(['A'], axis=1)
    .dropna()
)

In [None]:
# Same treatment for the full combined log table: discard the row labels and
# remove any row containing a NaN.
log_bt = (
    log_both_together
    .reset_index()
    .rename(columns={"index": "A"})
    .drop(['A'], axis=1)
    .dropna()
)

In [None]:
# Write the log feature table to CSV; the next cell reads this file back.
log_ft.to_csv('log_ft_top_mri.csv')

In [None]:
# Parse the CSV into a NumPy array, skipping the header row.
log_data = np.genfromtxt('log_ft_top_mri.csv', delimiter=",", skip_header=1)
#log_data

In [None]:
# Drop the first column of the parsed CSV
# (presumably the row-index column that to_csv writes -- confirm).
log_data = log_data[:, 1:]
#log_data

In [None]:
# Covariate table for neuroCombat, one row per column of log_both_together:
# batch is 1 for the StrokeMRI columns and 2 for the TOP columns; sex and age
# are read from the matching rows of log_both_together.
first_columns_as_one = [1] * len(log_mri_selection.columns)
last_columns_as_two = [2] * len(log_top_selection.columns)
covars = pd.DataFrame({
    'batch': first_columns_as_one + last_columns_as_two,
    'sex': log_both_together.loc['sex'].tolist(),
    'age': log_both_together.loc['age'].tolist(),
})

In [None]:
# sex is categorical and age is continuous. neuroCombat only adjusts for the
# covariates named in categorical_cols / continuous_cols, so age must be
# listed explicitly; being a column of covars is not enough.
categorical_cols = ['sex']
continuous_cols = ['age']
# Name of the covars column that encodes the scanner/batch:
batch_col_mine = 'batch'

# Harmonization step (keep only the harmonized array from the result dict):
log_data_combat = neuroCombat(dat=log_data,
    covars=covars,
    batch_col=batch_col_mine,
    categorical_cols=categorical_cols,
    continuous_cols=continuous_cols)["data"]

In [None]:
# Show the harmonized log-data array.
log_data_combat

In [None]:
# Wrap the harmonized *log* array in a DataFrame. This must be built from
# log_data_combat: building it from data_combat (the non-log result) makes
# the whole log pipeline reproduce the earlier outputs exactly.
log_neurocombat = pd.DataFrame(log_data_combat)
log_neurocombat

In [None]:
# Keep the first two rows of the combined log table; a later cell labels
# them 'age' and 'sex'.
log_topper = log_bt.head(2)

In [None]:
# Clear the name attached to the columns axis, then display the result.
log_topper = log_topper.rename_axis(None, axis="columns")
log_topper

In [None]:
# Move the row numbers into a regular column and call that column 'char'.
log_topper = log_topper.reset_index(drop=False).rename(columns={"index": "char"})

In [None]:
# Label the two covariate rows. Assign the whole column at once: chained
# indexing (log_topper['char'][0] = ...) is not guaranteed to write back to
# log_topper, and writing strings one element at a time into the integer
# column relies on implicit upcasting.
log_topper['char'] = ['age', 'sex']

In [None]:
# Harmonized log values laid out like log_topper: the row number becomes the
# first ('char') column, then all columns take log_topper's labels. The
# labels come from log_topper (not the non-log topper) so this pipeline does
# not depend on the earlier one.
log_bottom = log_neurocombat.reset_index(drop=False)
log_bottom = log_bottom.rename(columns={"index": "char"})
log_bottom.columns = log_topper.columns
log_bottom

In [None]:
# Stack the covariate rows on top of the harmonized rows, then transpose so
# that each participant is a row again.
log_back_together = pd.concat([log_topper, log_bottom]).T
log_back_together

In [None]:
# Promote the first row of the log table to column headers, then drop that
# row. The header assigned is the one just taken from log_back_together, not
# new_header from the non-log pipeline.
log_new_header = log_back_together.iloc[0]
log_back_together.columns = log_new_header
log_back_together = log_back_together[1:]
log_back_together

In [None]:
# Look at the last 5 of the first 514 rows.
log_back_together.head(514).tail(5)

In [None]:
# Look at the first 5 of the last 527 rows.
log_back_together.tail(527).head(5)

In [None]:
# Split the combined log table back into the two cohorts. The StrokeMRI
# columns were concatenated first and the TOP columns last, so take the
# cohort sizes from the transposed selections (the same counts used to build
# the batch vector) instead of hard-coding 514 and 527.
n_log_mri_participants = len(log_mri_selection.columns)
n_log_top_participants = len(log_top_selection.columns)
log_neuro_harm_mri = log_back_together.head(n_log_mri_participants)
log_neuro_harm_top = log_back_together.tail(n_log_top_participants)

In [None]:
# Replace the integer column labels with the feature names.
log_neuro_harm_mri, log_neuro_harm_top = (
    frame.rename(feature_dict, axis='columns')
    for frame in (log_neuro_harm_mri, log_neuro_harm_top)
)

In [None]:
# log_neuro_harm_mri.to_csv('log_neuro_harm_mri.csv')
# log_neuro_harm_top.to_csv('log_neuro_harm_top.csv')

In [None]:
#log_neuro_harm_mri
# Count, per column, how many harmonized log-pipeline StrokeMRI values are
# negative.
(log_neuro_harm_mri[number_columns] < 0).sum()

In [None]:
## There are still negative numbers in the outcome...but we can test to see if there is a difference

In [None]:
# First three rows of the harmonized StrokeMRI table from the log pipeline.
log_neuro_harm_mri.head(3)# - neuro_harm_mri

In [None]:
# First three rows of the harmonized StrokeMRI table from the regular pipeline.
neuro_harm_mri.head(3)

In [None]:
# Element-wise difference between the log-pipeline and regular-pipeline
# harmonized StrokeMRI values, over the numeric columns.
dif_log_to_reg = log_neuro_harm_mri[number_columns] - neuro_harm_mri[number_columns]
dif_log_to_reg 

In [None]:
# Sum of all the differences over every column and row.
dif_log_to_reg.sum().sum()

# So changing the two columns to their log made literally no difference in the harmonized outcomes? Is there a bug??