# RELIEF 5 way harmonized datasets
This notebook includes the data preparation for the RELIEF implementation; the harmonization itself is run in an auxiliary notebook, and its final output is processed here.

## import libraries

In [None]:
import os
import sys

import pandas as pd
import numpy as np


sys.path.insert(0, '../../')  # path to functions
import cvasl.vendor.neurocombat.neurocombat as neurocombat
import cvasl.harmony as har

## We read in our data


In [None]:
# Datasets for this work: one directory per cohort, each holding a CSV with
# the same file name.
file_name = 'TrainingDataComplete.csv'

EDIS_path = '../our_datasets/EDIS/'
HELIUS_path = '../our_datasets/HELIUS/'
Insight46_path = '../our_datasets/Insight46/'
SABRE_path = '../our_datasets/SABRE/'
MRI_path = '../our_datasets/StrokeMRI/'
TOP_path = '../our_datasets/TOP/'

# Full path of each cohort's CSV file.
TOP_file = os.path.join(TOP_path, file_name)
MRI_file = os.path.join(MRI_path, file_name)
EDIS_file = os.path.join(EDIS_path, file_name)
SABRE_file = os.path.join(SABRE_path, file_name)
Insight46_file = os.path.join(Insight46_path, file_name)
HELIUS_file = os.path.join(HELIUS_path, file_name)

# Load every cohort table, using the first CSV column as the row index.
TOP, MRI, HELIUS, EDIS, SABRE, INSI = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        TOP_file,
        MRI_file,
        HELIUS_file,
        EDIS_file,
        SABRE_file,
        Insight46_file,
    )
)

In [None]:
## We found that one participant ID is used at two separate sites (HELIUS and SABRE). Therefore we show both records and rename the HELIUS one.

In [None]:
HELIUS[HELIUS['participant_id']=='sub-153852_1']

In [None]:
SABRE[SABRE['participant_id']=='sub-153852_1']

In [None]:
# 'sub-153852_1' is used by both HELIUS and SABRE; tag the HELIUS record with
# an 'H' suffix so the two IDs no longer clash.
is_clashing_id = HELIUS['participant_id'] == 'sub-153852_1'
HELIUS.loc[is_clashing_id, 'participant_id'] = 'sub-153852_1H'

In [None]:
HELIUS.Sex.unique()

In [None]:
HELIUS.head(3)

In [None]:
# Recode Sex from 1/2 to 0/1. Series.map turns any value that is not a key of
# the mapping into NaN.
sex_mapping = {1: 0, 2: 1}

recoded_sex = EDIS['Sex'].map(sex_mapping)
EDIS = EDIS.assign(Sex=recoded_sex)
EDIS.head(3)

In [None]:
# Apply the same Sex recoding to the Insight46 table.
INSI = INSI.assign(Sex=lambda frame: frame['Sex'].map(sex_mapping))
INSI.head(3)

In [None]:
# Apply the same Sex recoding to the SABRE table.
SABRE = SABRE.assign(Sex=lambda frame: frame['Sex'].map(sex_mapping))
SABRE.head(3)

In [None]:
# Remove the 'Site' and 'ID' columns from TOP.
TOP = TOP.drop(columns=['Site', 'ID'])
TOP.head(3)

In [None]:
# Remove the 'Site' and 'ID' columns from StrokeMRI.
MRI = MRI.drop(columns=['Site', 'ID'])
MRI.tail(3)

In [None]:
# Stack StrokeMRI and TOP row-wise (StrokeMRI rows first) into one table.
TOPMRI = pd.concat((MRI, TOP), axis=0)
TOPMRI.head(3)

In [None]:
# Remove the 'Site' and 'ID' columns from HELIUS.
HELIUS = HELIUS.drop(columns=['Site', 'ID'])
HELIUS.tail(3)

In [None]:
# Remove the 'Site' and 'ID' columns from the remaining cohorts.
unused_columns = ['Site', 'ID']
EDIS = EDIS.drop(columns=unused_columns)
INSI = INSI.drop(columns=unused_columns)
SABRE = SABRE.drop(columns=unused_columns)

In [None]:
EDIS.columns

In [None]:
TOPMRI.shape

# relief/neurocombat prep

In [None]:
# Prepare the five datasets for harmonization.
# NOTE(review): the meaning of each returned value is defined in cvasl.harmony;
# confirm there before relying on it.
(
    all_togetherF,
    ftF,
    btF,
    feature_dictF,
    len1,
    len2,
    len3,
    len4,
    len5,
) = har.prep_for_neurocombat_5way(TOPMRI, HELIUS, EDIS, SABRE, INSI)

In [None]:
all_togetherF

In [None]:
#len1, len2,len3,len4,len5, feature_dictF

In [None]:
#feature_dictF

In [None]:
# Save the combined ("mother") frame for RELIEF.
# NOTE(review): the file name is spelled 'all_togeherf5.csv'; presumably other
# notebooks read this exact name -- confirm before correcting the spelling.
all_togetherF.to_csv('all_togeherf5.csv')


In [None]:
# Write ftF to disk as CSV (index and header included by default).
ftF.to_csv('ftF_top5.csv')

In [None]:
# Read the saved CSV back as a bare numeric array: the header row is skipped
# while parsing, then the first column is sliced away.
raw_table = np.genfromtxt('ftF_top5.csv', delimiter=",", skip_header=1)
data = raw_table[:, 1:]
data

In [None]:
# Save the data array in NumPy .npy format for RELIEF processing.
np.save('dat_var_for_RELIEF5.npy', data)

In [None]:
data.shape

In [None]:
# Build the covariate table: the batch (scanner) label of every column of
# all_togetherF plus the Sex and Age covariates to preserve.
# Batch labels run from 1 to 5; label k is repeated len<k> times.
site_sizes = (len1, len2, len3, len4, len5)
batch_labels = [
    label
    for label, size in enumerate(site_sizes, start=1)
    for _ in range(size)
]
covars = pd.DataFrame({
    'batch': batch_labels,
    'Sex': all_togetherF.loc['Sex', :].values.tolist(),
    'Age': all_togetherF.loc['Age', :].values.tolist(),
})

In [None]:
# Save the covariate table for RELIEF processing.
# NOTE(review): the file name is spelled 'bath_and_mod_forRELIEF5.csv'
# ("bath", presumably "batch"); other notebooks may read this exact name --
# confirm before correcting the spelling.
covars.to_csv('bath_and_mod_forRELIEF5.csv')


In [None]:
# # At this point the prepared data is run through the auxiliary notebook to produce "relief1_for5_results.csv"

In [None]:
# Build the top block that is later stacked above the RELIEF output.
# NOTE(review): presumably make_topper extracts the 'Age' and 'Sex' rows from
# btF -- confirm in cvasl.harmony.
topperF = har.make_topper(btF,'Age', 'Sex')

In [None]:
topperF

In [None]:
# Load the RELIEF result file; the first CSV column becomes the row index.
bottom = pd.read_csv("relief1_for5_results.csv", index_col=0)
bottom

In [None]:
# Move the row index into an ordinary first column, then adopt topperF's column
# labels so the two frames can be stacked. No intermediate rename is needed:
# assigning `columns` replaces every label at once, including the one created
# by reset_index.
bottom = bottom.reset_index(drop=False)
bottom.columns = topperF.columns
bottom

In [None]:
# Stack topperF above bottom and transpose the result in one step.
back_together = pd.concat([topperF, bottom]).T

In [None]:
#back_together

In [None]:
# Promote the first row to column headers and drop it from the data rows.
new_header = back_together.iloc[0]
back_together = back_together.iloc[1:]
back_together.columns = new_header

In [None]:
#len5

In [None]:
# Split the stacked frame back into one frame per dataset. Rows are taken by
# position: the first len1 rows, then the next len2 rows, and so on.
end_topmri = len1
end_helius = end_topmri + len2
end_edis = end_helius + len3
end_sabre = end_edis + len4
end_insi = end_sabre + len5

relief_harm_topmri = back_together.head(end_topmri)
relief_harm_HELIUS = back_together.head(end_helius).tail(len2)
relief_harm_EDIS = back_together.head(end_edis).tail(len3)
relief_harm_SABRE = back_together.head(end_sabre).tail(len4)
relief_harm_INSI = back_together.head(end_insi).tail(len5)

In [None]:
relief_harm_topmri.head(3)

In [None]:
relief_harm_HELIUS.head(3)

In [None]:
relief_harm_EDIS.head(3)

In [None]:
relief_harm_SABRE.head(3) 

In [None]:
relief_harm_INSI.head(3)

In [None]:
feature_dictF

In [None]:
# Derive the column-renaming dictionary used on the per-dataset frames.
# NOTE(review): presumably increment_keys shifts every key of feature_dictF by
# one -- confirm in cvasl.harmony.
new_feature_dict =  har.increment_keys(feature_dictF)
new_feature_dict

In [None]:
relief_harm_topmri

In [None]:
# Rename the columns of every per-dataset frame using new_feature_dict.
(
    relief_harm_topmri,
    relief_harm_EDIS,
    relief_harm_SABRE,
    relief_harm_INSI,
    relief_harm_HELIUS,
) = (
    frame.rename(columns=new_feature_dict)
    for frame in (
        relief_harm_topmri,
        relief_harm_EDIS,
        relief_harm_SABRE,
        relief_harm_INSI,
        relief_harm_HELIUS,
    )
)
relief_harm_HELIUS

In [None]:
# Turn each frame's index into a regular column; a column that comes out named
# "index" is relabelled "participant_id".
id_column = {"index": "participant_id"}
relief_harm_EDIS = relief_harm_EDIS.reset_index().rename(columns=id_column)
relief_harm_HELIUS = relief_harm_HELIUS.reset_index().rename(columns=id_column)
relief_harm_SABRE = relief_harm_SABRE.reset_index().rename(columns=id_column)
relief_harm_topmri = relief_harm_topmri.reset_index().rename(columns=id_column)
relief_harm_INSI = relief_harm_INSI.reset_index().rename(columns=id_column)

Save the harmonized data to CSV files

In [None]:
relief_harm_INSI.head(3)

In [None]:
path = 'harm_results/relief/'
# Create the output directory if it is missing. The creation is attempted
# directly instead of testing os.path.exists() first, so there is no window in
# which the directory can appear between the check and the makedirs call.
try:
    os.makedirs(path)
except FileExistsError:
    pass  # Already present: nothing to do.
else:
    print("The new directory is created!")

In [None]:
# Write each harmonized frame to its own CSV file under harm_results/relief/.
harmonized_outputs = (
    ('harm_results/relief/5relief_harm_HELIUS.csv', relief_harm_HELIUS),
    ('harm_results/relief/5relief_harm_topmri.csv', relief_harm_topmri),
    ('harm_results/relief/5relief_harm_INSI.csv', relief_harm_INSI),
    ('harm_results/relief/5relief_harm_SABRE.csv', relief_harm_SABRE),
    ('harm_results/relief/5relief_harm_EDIS.csv', relief_harm_EDIS),
)
for csv_path, frame in harmonized_outputs:
    frame.to_csv(csv_path)

In [None]:
relief_harm_HELIUS.columns

In [None]:
relief_harm_HELIUS

In [None]:
# Columns passed to the negative-value checks, grouped by kind and
# concatenated in the original order.
demographic_columns = ['Age', 'Sex']
volume_columns = [
    'GM_vol',
    'WM_vol',
    'CSF_vol',
    'GM_ICVRatio',
    'GMWM_ICVRatio',
    'WMHvol_WMvol',
    'WMH_count',
]
cov_columns = ['ACA_B_CoV', 'MCA_B_CoV', 'PCA_B_CoV', 'TotalGM_B_CoV']
cbf_columns = ['ACA_B_CBF', 'MCA_B_CBF', 'PCA_B_CBF', 'TotalGM_B_CBF']
number_columns = demographic_columns + volume_columns + cov_columns + cbf_columns

# Data quality check

In [None]:
TOPMRI

In [None]:
relief_harm_topmri

In [None]:
relief_harm_HELIUS.isna().sum().sum()

In [None]:
relief_harm_topmri.isna().sum().sum()

In [None]:
# Count, per numeric column, how many harmonized HELIUS values are negative.
(relief_harm_HELIUS[number_columns] < 0).sum()

In [None]:
(relief_harm_topmri[number_columns] < 0).sum()

In [None]:
(relief_harm_EDIS[number_columns] < 0).sum()

In [None]:
(relief_harm_INSI[number_columns] < 0).sum()

In [None]:
(relief_harm_SABRE[number_columns] < 0).sum()