# Neurocombat TOPMRI and SABRE harmonized datasets

Note: this notebook must be run in the `neuron` or `neuron_go` environment.

## Import libraries

In [None]:
import os
import sys
import pandas as pd
import numpy as np

sys.path.insert(0, '../../')  # path to functions
import cvasl.vendor.neurocombat.neurocombat as neurocombat
import cvasl.harmony as har

## Let's see how we would apply this to our data
We will have to flip it on its side to make it work.

In [None]:
# Datasets for this work: one training CSV each for SABRE, StrokeMRI and TOP.
# Paths are relative, so they resolve against the notebook's working directory.

SABRE = pd.read_csv('../new_data/TrainingDataComplete_SABRE.csv')
MRI = pd.read_csv('../new_data/TrainingDataComplete_StrokeMRI.csv')
TOP = pd.read_csv('../new_data/TrainingDataComplete_TOP.csv')

In [None]:
# Remove the 'Site' and 'ID' columns from TOP, then preview the first three rows.
TOP = TOP.drop(columns=['Site', 'ID'])
TOP.head(3)

In [None]:
# Remove the 'Site' and 'ID' columns from StrokeMRI, then preview the last three rows.
MRI = MRI.drop(columns=['Site', 'ID'])
MRI.tail(3)

In [None]:
# Stack the StrokeMRI rows and the TOP rows into one frame (StrokeMRI first).
# No ignore_index is passed, so each frame keeps its own original row labels.
TOPMRI = pd.concat([MRI ,TOP])
TOPMRI.head(3)

In [None]:
# Remove the 'Site' and 'ID' columns from SABRE, then preview the last three rows.
SABRE = SABRE.drop(columns=['Site', 'ID'])
SABRE.tail(3)

In [None]:
# Row and column counts of the combined StrokeMRI + TOP frame.
TOPMRI.shape

In [None]:
# Strip the same five columns from both frames, printing the row counts
# before and after so it is easy to see that no rows were lost.
dropped_columns = ['LD', 'PLD', 'Labelling', 'Readout', 'M0']
datasets1 = [TOPMRI, SABRE]
datasets = [frame.drop(columns=dropped_columns) for frame in datasets1]
print(len(datasets1[0]))
print(len(datasets1[1]))

# Rebind the working names to the trimmed copies.
TOPMRI, SABRE = datasets
print(len(TOPMRI))
print(len(SABRE))

In [None]:
# Row and column counts of TOPMRI at this point in the notebook.
TOPMRI.shape

In [None]:
# List the column labels currently present in TOPMRI.
TOPMRI.columns

In [None]:
# Columns that go into harmonization, either as features or as covariates.
to_be_harmonized_or_covar = [
    'Age',
    'Sex',
    'DeepWM_B_CoV',
    'ACA_B_CoV',
    'MCA_B_CoV',
    'PCA_B_CoV',
    'TotalGM_B_CoV',
    'DeepWM_B_CBF',
    'ACA_B_CBF',
    'MCA_B_CBF',
    'PCA_B_CBF',
    'TotalGM_B_CBF',
]

# Columns that are kept out of harmonization.
not_harmonized = [
    'GM_vol',
    'WM_vol',
    'CSF_vol',
    'GM_ICVRatio',
    'GMWM_ICVRatio',
    'WMHvol_WMvol',
    'WMH_count',
]

# "Semi features": what remains of each frame once the harmonized/covariate
# columns are removed.
TOPMRI_semi_features = TOPMRI.drop(columns=to_be_harmonized_or_covar)
SABRE_semi_features = SABRE.drop(columns=to_be_harmonized_or_covar)
# Disabled equivalents for other datasets:
# HELIUS_semi_features = HELIUS.drop(to_be_harmonized_or_covar, axis=1)
# INSI_semi_features = INSI.drop(to_be_harmonized_or_covar, axis=1)

# The frames passed on to harmonization lose the non-harmonized columns.
TOPMRI = TOPMRI.drop(columns=not_harmonized)
SABRE = SABRE.drop(columns=not_harmonized)
# HELIUS = HELIUS.drop(not_harmonized, axis=1)


In [None]:
# Display the TOPMRI columns that were held back from harmonization.
TOPMRI_semi_features

In [None]:
# Build the neuroCombat inputs from the two frames (TOPMRI first, SABRE second).
# How the six results are used later in this notebook:
#   both_togetherF - indexed by row label ('Sex', 'Age') to build the covariates
#   ftF            - written to CSV and read back as the numeric data matrix
#   btF            - passed to har.make_topper
#   feature_dictF  - used to rename the harmonized columns
#   len1, len2     - number of batch-1 and batch-2 entries
# NOTE(review): presumably len1/len2 are the TOPMRI/SABRE participant counts -
# confirm against cvasl.harmony.prep_for_neurocombat.
both_togetherF, ftF, btF, feature_dictF, len1, len2 = har.prep_for_neurocombat(TOPMRI, SABRE)

In [None]:
# Display the combined frame returned by prep_for_neurocombat.
both_togetherF

In [None]:
# Write ftF to disk; the next cell reads this file back as a plain numeric array.
# to_csv is called with defaults, so the index is written as the first column.
ftF.to_csv('ftF_top_mri_sabre.csv')

In [None]:
# Read the saved feature table back as a numeric array, skipping the header row.
csv_matrix = np.genfromtxt('ftF_top_mri_sabre.csv', delimiter=",", skip_header=1)
# Discard the first column, which holds the index that to_csv wrote out.
data = csv_matrix[:, 1:]
data

In [None]:
# Preview the first three rows of the TOPMRI frame that was passed to harmonization.
TOPMRI.head(3)

In [None]:
# Preview the first three rows of the combined frame.
both_togetherF.head(3)

In [None]:
# Covariate table for neuroCombat: the batch (scanner) label plus the Sex and
# Age covariates to preserve.
# Batch labels are 1 for the first len1 entries and 2 for the following len2.
first_columns_as_one = [1] * len1
last_columns_as_two = [2] * len2
covars = pd.DataFrame({
    'batch': first_columns_as_one + last_columns_as_two,
    'Sex': both_togetherF.loc['Sex', :].values.tolist(),
    'Age': both_togetherF.loc['Age', :].values.tolist(),
})

In [None]:
# Roles of the columns in `covars`.
batch_col_mine = 'batch'        # the scanner/batch covariate
categorical_cols = ['Sex']      # sex is specified as categorical
our_continuous_col = ['Age']    # age is specified as continuous

# Harmonization step: run neuroCombat and keep the "data" entry of its result.
combat_result = neurocombat.neuroCombat(
    dat=data,
    covars=covars,
    batch_col=batch_col_mine,
    categorical_cols=categorical_cols,
    continuous_cols=our_continuous_col,
)
data_combat = combat_result["data"]

In [None]:
neurocombat = pd.DataFrame(data_combat)
neurocombat

In [None]:
topperF = har.make_topper(btF,'Age', 'Sex')

In [None]:
bottom = neurocombat.reset_index(drop=False)
bottom = bottom.rename(columns={"index": "char"})
bottom.columns = topperF.columns

In [None]:
# Put the topper rows above the harmonized rows, then transpose the result.
stacked = pd.concat([topperF, bottom])
back_together = stacked.T

In [None]:
# Promote the first row to column labels, then drop that row from the data.
new_header = back_together.iloc[0] #grab the first row for the header
back_together.columns = new_header #set the header row as the df header
back_together = back_together[1:]
#back_together

In [None]:
# Preview the reassembled frame.
back_together.head(3)

In [None]:
# Split the reassembled frame back into the two datasets: the first len1 rows
# go to TOPMRI and the last len2 rows to SABRE. This matches the argument order
# given to prep_for_neurocombat and the batch labelling (1 first, then 2).
neuro_harm_sabre =back_together.tail(len2)
neuro_harm_topmri =back_together.head(len1)

In [None]:
# Relabel the columns of both harmonized frames using feature_dictF.
neuro_harm_topmri = neuro_harm_topmri.rename(columns=feature_dictF)
neuro_harm_sabre = neuro_harm_sabre.rename(columns=feature_dictF)

In [None]:
# Move the row index into a regular column and, where that column comes out
# named "index", call it participant_id; then display the result.
neuro_harm_sabre = neuro_harm_sabre.reset_index().rename(
    columns={"index": "participant_id"}
)
neuro_harm_sabre

In [None]:
# Move the row index into a regular column and, where that column comes out
# named "index", call it participant_id; then display the result.
neuro_harm_topmri = neuro_harm_topmri.reset_index().rename(
    columns={"index": "participant_id"}
)
neuro_harm_topmri

Save the harmonized datasets to CSV files.

In [None]:
# Write both harmonized frames to CSV under harm_results/.
# The directory is created first because to_csv does not create missing parent
# directories; exist_ok=True makes this a no-op when it is already there.
os.makedirs('harm_results', exist_ok=True)
neuro_harm_sabre.to_csv('harm_results/neuro_harm_sabre.csv')
neuro_harm_topmri.to_csv('harm_results/neuro_harm_topmri_from_sabre.csv')

In [None]:
#neuro_harm_sabre.columns

In [None]:
# Columns inspected for negative values in the data quality check.
number_columns = [
    'Age',
    'Sex',
    'GM_vol',
    'WM_vol',
    'CSF_vol',
    'GM_ICVRatio',
    'GMWM_ICVRatio',
    'WMHvol_WMvol',
    'WMH_count',
    'ACA_B_CoV',
    'MCA_B_CoV',
    'PCA_B_CoV',
    'TotalGM_B_CoV',
    'ACA_B_CBF',
    'MCA_B_CBF',
    'PCA_B_CBF',
    'TotalGM_B_CBF',
]

# Data quality check

In [None]:
# Display the pre-harmonization TOPMRI frame for comparison.
TOPMRI

In [None]:
# Display the harmonized TOPMRI frame.
neuro_harm_topmri

In [None]:
# Display the harmonized SABRE frame.
neuro_harm_sabre

In [None]:
# Total number of missing values across the whole harmonized SABRE frame.
neuro_harm_sabre.isna().sum().sum()

In [None]:
# Total number of missing values across the whole harmonized TOPMRI frame.
neuro_harm_topmri.isna().sum().sum()

In [None]:
# Count negative values per column in the harmonized SABRE data.
# `number_columns` also names volume columns (e.g. 'GM_vol') that were dropped
# from SABRE via `not_harmonized` before harmonization. Selecting a missing
# column raises KeyError, so the check is limited to the columns present.
sabre_numeric_columns = [
    col for col in number_columns if col in neuro_harm_sabre.columns
]
(neuro_harm_sabre[sabre_numeric_columns] < 0).sum()

In [None]:
# Count negative values per column in the harmonized TOPMRI data.
# `number_columns` also names volume columns (e.g. 'GM_vol') that were dropped
# from TOPMRI via `not_harmonized` before harmonization. Selecting a missing
# column raises KeyError, so the check is limited to the columns present.
topmri_numeric_columns = [
    col for col in number_columns if col in neuro_harm_topmri.columns
]
(neuro_harm_topmri[topmri_numeric_columns] < 0).sum()

## This new variable (`WMHvol_WMvol`) is again problematic, and so are several more columns

Specifically:

- `DeepWM_B_CoV`
- `ACA_B_CoV`
- `MCA_B_CoV`
- `PCA_B_CoV`
- `TotalGM_B_CoV`
- `DeepWM_B_CoV`
- `DeepWM_B_CBF`

We will hold off on log-transforming it until we see which other columns we may want to log-transform across all datasets.