# Neurocombat 5-way harmonized datasets
This notebook also prepares the input data for the RELIEF implementations.

Note: this notebook must be run in the `neuron` or `neuron_go` environment.

## import libraries

In [None]:
import os
import sys

import pandas as pd
import numpy as np


sys.path.insert(0, '../../')  # path to functions
import cvasl.vendor.neurocombat.neurocombat as neurocombat
import cvasl.harmony as har

## We read in our data


In [None]:
# Load the six per-site training tables; the files are read in the order listed.
EDIS, HELIUS, SABRE, MRI, TOP, INSI = map(
    pd.read_csv,
    (
        '../new_data/TrainingDataComplete_EDIS.csv',
        '../new_data/TrainingDataComplete_HELIUS.csv',
        '../new_data/TrainingDataComplete_SABRE.csv',
        '../new_data/TrainingDataComplete_StrokeMRI.csv',
        '../new_data/TrainingDataComplete_TOP.csv',
        '../new_data/TrainingDataComplete_Insight46.csv',
    ),
)

In [None]:
## We found one participant ID that appears at two separate sites (HELIUS and SABRE). Therefore we show both records and rename the HELIUS one.

In [None]:
# Show the HELIUS row(s) carrying the participant ID that also occurs in SABRE.
HELIUS.loc[HELIUS['participant_id'] == 'sub-153852_1']

In [None]:
# Show the SABRE row(s) carrying the participant ID that also occurs in HELIUS.
SABRE.loc[SABRE['participant_id'] == 'sub-153852_1']

In [None]:
# Suffix the HELIUS copy of the shared ID with 'H' so the HELIUS and SABRE
# records can be told apart once the datasets are combined.
is_shared_id = HELIUS['participant_id'] == 'sub-153852_1'
HELIUS.loc[is_shared_id, 'participant_id'] = 'sub-153852_1H'

In [None]:
# Preview the first three rows of HELIUS after the participant_id rename above.
HELIUS.head(3)

In [None]:
# Remove the 'Site' and 'ID' columns from TOP, then preview the first rows.
TOP = TOP.drop(columns=['Site', 'ID'])
TOP.head(3)

In [None]:
# Remove the 'Site' and 'ID' columns from MRI, then preview the last rows.
MRI = MRI.drop(columns=['Site', 'ID'])
MRI.tail(3)

In [None]:
# Stack TOP and MRI row-wise into a single frame that is treated as one dataset
# from here on. The original row labels are kept (no index reset).
TOPMRI = pd.concat(objs=(TOP, MRI), axis=0)
TOPMRI.head(3)

In [None]:
# Remove the 'Site' and 'ID' columns from HELIUS, then preview the last rows.
HELIUS = HELIUS.drop(columns=['Site', 'ID'])
HELIUS.tail(3)

In [None]:
# Remove the 'Site' and 'ID' columns from the remaining three datasets.
EDIS, INSI, SABRE = (
    frame.drop(columns=['Site', 'ID']) for frame in (EDIS, INSI, SABRE)
)

In [None]:
# List the columns left in EDIS after 'Site' and 'ID' were dropped.
EDIS.columns

In [None]:
# Drop the columns that are not used further on from every dataset, and print
# the row count of each dataset before and after as a sanity check.
dropped_columns = ['LD', 'PLD', 'Labelling', 'Readout', 'M0']
datasets1 = [TOPMRI, HELIUS, EDIS, SABRE, INSI]
datasets = [frame.drop(columns=dropped_columns) for frame in datasets1]

# Row counts of the frames as they were before the drop.
for frame in datasets1:
    print(len(frame))

# Rebind the working names to the reduced frames (same order as datasets1).
TOPMRI, HELIUS, EDIS, SABRE, INSI = datasets

# Row counts after the drop; these should match the five numbers above.
for frame in (TOPMRI, HELIUS, EDIS, SABRE, INSI):
    print(len(frame))

In [None]:
# (rows, columns) of the combined TOP + MRI frame after the column drops above.
TOPMRI.shape

In [None]:
# Column labels of the combined TOP + MRI frame after the column drops above.
TOPMRI.columns

In [None]:
# Keep a handle on each full frame; the working names are rebound further down,
# so the *_all names continue to refer to the frames with every column.
TOPMRI_all, HELIUS_all, EDIS_all, SABRE_all, INSI_all = (
    TOPMRI, HELIUS, EDIS, SABRE, INSI
)

# Columns that go into neuroCombat, either as features or as covariates.
to_be_harmonized_or_covar = [
    'Age', 'Sex',
    'DeepWM_B_CoV', 'ACA_B_CoV', 'MCA_B_CoV', 'PCA_B_CoV', 'TotalGM_B_CoV',
    'DeepWM_B_CBF', 'ACA_B_CBF', 'MCA_B_CBF', 'PCA_B_CBF', 'TotalGM_B_CBF',
]
# Columns that are carried through without harmonization.
not_harmonized = [
    'GM_vol', 'WM_vol', 'CSF_vol',
    'GM_ICVRatio', 'GMWM_ICVRatio',
    'WMHvol_WMvol', 'WMH_count',
]

# "Semi-feature" frames: everything except the harmonized/covariate columns.
# They are merged back onto the harmonized results on participant_id later.
TOPMRI_semi_features = TOPMRI.drop(columns=to_be_harmonized_or_covar)
HELIUS_semi_features = HELIUS.drop(columns=to_be_harmonized_or_covar)
EDIS_semi_features = EDIS.drop(columns=to_be_harmonized_or_covar)
SABRE_semi_features = SABRE.drop(columns=to_be_harmonized_or_covar)
INSI_semi_features = INSI.drop(columns=to_be_harmonized_or_covar)

# Working frames: the not-harmonized columns are removed before the prep step.
TOPMRI = TOPMRI.drop(columns=not_harmonized)
HELIUS = HELIUS.drop(columns=not_harmonized)
EDIS = EDIS.drop(columns=not_harmonized)
SABRE = SABRE.drop(columns=not_harmonized)
INSI = INSI.drop(columns=not_harmonized)

In [None]:
# Display INSI after the not_harmonized columns were dropped.
INSI

# neurocombat prep

In [None]:
# Run the project's 5-way prep helper on the five working frames. The nine
# return values are unpacked in order; len1..len5 are used further down as the
# per-dataset row counts, in the same order as the arguments.
(
    all_togetherF,
    ftF,
    btF,
    feature_dictF,
    len1,
    len2,
    len3,
    len4,
    len5,
) = har.prep_for_neurocombat_5way(TOPMRI, HELIUS, EDIS, SABRE, INSI)

In [None]:
# Display the combined frame returned by the prep step.
all_togetherF

In [None]:
# Display the five per-dataset lengths together with the feature dictionary.
len1, len2,len3,len4,len5, feature_dictF

In [None]:
# Display the feature dictionary on its own; it is used later to rename the
# columns of the harmonized frames.
feature_dictF

In [None]:
# Save the combined ("mother") frame to CSV for the RELIEF processing.
# NOTE(review): the file name is spelled 'all_togeherf5.csv'; it is left as-is
# because other code may read the file by this exact name - confirm before
# renaming.
all_togetherF.to_csv('all_togeherf5.csv')
# The other outputs of the prep step (ftF, btF, feature_dictF, len1..len5)
# are not written by this cell.

In [None]:
# Write ftF to CSV; the file is read back as a plain numeric array in the
# next cell.
ftF.to_csv('ftF_top5.csv')

In [None]:
# Read the feature table back as a float array: skip the header line and drop
# the first column, which holds the index labels written by to_csv.
data = np.genfromtxt('ftF_top5.csv', delimiter=",", skip_header=1)[:, 1:]
data

In [None]:
# Save the numeric feature array in .npy format for the RELIEF processing.
np.save('dat_var_for_RELIEF5.npy', data)

In [None]:
# Shape of the feature array that is passed to neuroCombat below.
data.shape

In [None]:
# Build the covariate table for neuroCombat: a batch (scanner/site) label per
# sample plus the Sex and Age covariates that should be preserved.
# Batch labels are 1..5, each repeated for the number of samples in the
# corresponding dataset, in the order the datasets were passed to the prep step.
batch_labels = []
for batch_id, n_samples in enumerate((len1, len2, len3, len4, len5), start=1):
    batch_labels += [batch_id] * n_samples

covars = pd.DataFrame(
    {
        'batch': batch_labels,
        'Sex': all_togetherF.loc['Sex', :].values.tolist(),
        'Age': all_togetherF.loc['Age', :].values.tolist(),
    }
)

In [None]:
# Save the covariate table (batch, Sex, Age) for the RELIEF processing.
# NOTE(review): the file name is spelled 'bath_and_mod_forRELIEF5.csv'; it is
# left as-is because other code may read the file by this exact name - confirm
# before renaming.
covars.to_csv('bath_and_mod_forRELIEF5.csv')


In [None]:
# Column roles inside `covars`:
batch_col_mine = 'batch'        # scanner/batch variable to harmonize across
categorical_cols = ['Sex']      # categorical covariate to preserve
our_continuous_col = ['Age']    # continuous covariate to preserve

# Harmonization step: run neuroCombat and keep only its "data" entry.
combat_output = neurocombat.neuroCombat(
    dat=data,
    covars=covars,
    batch_col=batch_col_mine,
    continuous_cols=our_continuous_col,
    categorical_cols=categorical_cols,
)
data_combat = combat_output["data"]

In [None]:
neurocombat = pd.DataFrame(data_combat)
neurocombat

In [None]:
topperF = har.make_topper(btF,'Age', 'Sex')

In [None]:
bottom = neurocombat.reset_index(drop=False)
bottom = bottom.rename(columns={"index": "char"})
bottom.columns = topperF.columns

In [None]:
# Stack the topper rows over the harmonized rows, then transpose the result.
back_together = pd.concat([topperF, bottom]).T

In [None]:
# Display the transposed frame; its first row is promoted to the header in the
# next cell.
back_together

In [None]:
# Promote the first row to column labels, then drop that row from the data.
new_header = back_together.iloc[0]
back_together.columns = new_header
back_together = back_together.iloc[1:]

In [None]:
# Display len5, used below as the number of rows that belong to the fifth
# dataset (INSI).
len5

In [None]:
# Cut the stacked, harmonized frame back into one frame per dataset. Rows are
# in the order TOPMRI, HELIUS, EDIS, SABRE, INSI, so each dataset is the last
# `n_rows` rows of the first `rows_so_far` rows.
site_frames = []
rows_so_far = 0
for n_rows in (len1, len2, len3, len4, len5):
    rows_so_far += n_rows
    site_frames.append(back_together.head(rows_so_far).tail(n_rows))

(
    neuro_harm_topmri,
    neuro_harm_HELIUS,
    neuro_harm_EDIS,
    neuro_harm_SABRE,
    neuro_harm_INSI,
) = site_frames

In [None]:
# Preview the first three rows of the harmonized TOP + MRI slice.
neuro_harm_topmri.head(3)

In [None]:
# Preview the first three rows of the harmonized HELIUS slice.
neuro_harm_HELIUS.head(3)

In [None]:
# Preview the first three rows of the harmonized EDIS slice.
neuro_harm_EDIS.head(3)

In [None]:
# Preview the first three rows of the harmonized SABRE slice.
neuro_harm_SABRE.head(3) 

In [None]:
# Preview the first three rows of the harmonized INSI slice.
neuro_harm_INSI.head(3)

In [None]:
# Relabel the columns of every per-site frame using the mapping returned by
# the prep step.
(
    neuro_harm_topmri,
    neuro_harm_EDIS,
    neuro_harm_SABRE,
    neuro_harm_INSI,
    neuro_harm_HELIUS,
) = (
    frame.rename(columns=feature_dictF)
    for frame in (
        neuro_harm_topmri,
        neuro_harm_EDIS,
        neuro_harm_SABRE,
        neuro_harm_INSI,
        neuro_harm_HELIUS,
    )
)
neuro_harm_HELIUS

In [None]:
# Move the row index into a regular column and name it "participant_id" so
# the frames can be merged with the semi-feature tables on that key.
id_column = {"index": "participant_id"}
neuro_harm_EDIS = neuro_harm_EDIS.reset_index().rename(columns=id_column)
neuro_harm_HELIUS = neuro_harm_HELIUS.reset_index().rename(columns=id_column)
neuro_harm_SABRE = neuro_harm_SABRE.reset_index().rename(columns=id_column)
neuro_harm_topmri = neuro_harm_topmri.reset_index().rename(columns=id_column)
neuro_harm_INSI = neuro_harm_INSI.reset_index().rename(columns=id_column)

In [None]:
# Re-attach the columns that were held out of harmonization by merging each
# harmonized frame with its semi-feature table on participant_id. The default
# (inner) join is used, so only IDs present on both sides are kept.
neuro_harm_EDIS = pd.merge(neuro_harm_EDIS, EDIS_semi_features, on="participant_id")
neuro_harm_HELIUS = pd.merge(neuro_harm_HELIUS, HELIUS_semi_features, on="participant_id")
neuro_harm_SABRE = pd.merge(neuro_harm_SABRE, SABRE_semi_features, on="participant_id")
neuro_harm_topmri = pd.merge(neuro_harm_topmri, TOPMRI_semi_features, on="participant_id")
neuro_harm_INSI = pd.merge(neuro_harm_INSI, INSI_semi_features, on="participant_id")
neuro_harm_INSI

Save the harmonized datasets to CSV files

In [None]:
# Preview the first three rows of the merged INSI frame before it is saved.
neuro_harm_INSI.head(3)

In [None]:
# Write each harmonized dataset to its own CSV file under harm_results/.
# The directory is created first if it is missing, because to_csv does not
# create parent directories and would otherwise fail on a fresh checkout.
os.makedirs('harm_results', exist_ok=True)

harmonized_outputs = {
    'harm_results/5neuro_harm_HELIUS.csv': neuro_harm_HELIUS,
    'harm_results/5neuro_harm_topmri.csv': neuro_harm_topmri,
    'harm_results/5neuro_harm_INSI.csv': neuro_harm_INSI,
    'harm_results/5neuro_harm_SABRE.csv': neuro_harm_SABRE,
    'harm_results/5neuro_harm_EDIS.csv': neuro_harm_EDIS,
}
for csv_path, harmonized_frame in harmonized_outputs.items():
    harmonized_frame.to_csv(csv_path)

In [None]:
# Column labels of the saved HELIUS frame (harmonized plus merged-back columns).
neuro_harm_HELIUS.columns

# Data quality check

In [None]:
# Numeric columns inspected by the data-quality checks below, built group by
# group.
# NOTE(review): 'DeepWM_B_CoV' and 'DeepWM_B_CBF' are harmonized above but are
# not part of this list - confirm whether that omission is intentional.
_covariate_columns = ['Age', 'Sex']
_unharmonized_columns = [
    'GM_vol', 'WM_vol', 'CSF_vol',
    'GM_ICVRatio', 'GMWM_ICVRatio', 'WMHvol_WMvol', 'WMH_count',
]
_cov_columns = ['ACA_B_CoV', 'MCA_B_CoV', 'PCA_B_CoV', 'TotalGM_B_CoV']
_cbf_columns = ['ACA_B_CBF', 'MCA_B_CBF', 'PCA_B_CBF', 'TotalGM_B_CBF']

number_columns = [
    *_covariate_columns,
    *_unharmonized_columns,
    *_cov_columns,
    *_cbf_columns,
]

In [None]:
# Display TOPMRI as it was passed to the prep step, for comparison with the
# harmonized frame shown in the next cell.
TOPMRI

In [None]:
# Display the harmonized TOP + MRI frame after the semi-feature merge.
neuro_harm_topmri

In [None]:
# Total number of missing values across all columns of the HELIUS result.
neuro_harm_HELIUS.isna().sum().sum()

In [None]:
# Total number of missing values across all columns of the TOP + MRI result.
neuro_harm_topmri.isna().sum().sum()

In [None]:
# Per-column count of negative values in the HELIUS result.
neuro_harm_HELIUS[number_columns].lt(0).sum()

In [None]:
# Per-column count of negative values in the TOP + MRI result.
neuro_harm_topmri[number_columns].lt(0).sum()

In [None]:
# Per-column count of negative values in the EDIS result.
neuro_harm_EDIS[number_columns].lt(0).sum()

In [None]:
# Per-column count of negative values in the INSI result.
neuro_harm_INSI[number_columns].lt(0).sum()

In [None]:
# Per-column count of negative values in the SABRE result.
neuro_harm_SABRE[number_columns].lt(0).sum()