# Open nested combat: TOP and StrokeMRI combined (topmri) and EDIS open nested-harmonized datasets

Note: this notebook must be run in the `opnc` environment.

## import libraries

In [None]:
import os
import sys
#from itertools import permutations

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets


from sklearn.preprocessing import LabelEncoder


sys.path.insert(0, '../../') # path to cvasl functions
import cvasl.seperated as sep
import cvasl.harmony as har
import cvasl.vendor.open_nested_combat.nest as nest

## Read in our data

In [None]:
# Load the three per-dataset training tables used in this notebook.
# Paths are relative to the notebook's working directory.
TOP = pd.read_csv('../new_data/TrainingDataComplete_TOP.csv')
MRI = pd.read_csv('../new_data/TrainingDataComplete_StrokeMRI.csv')
EDIS = pd.read_csv('../new_data/TrainingDataComplete_EDIS.csv')


In [None]:
# Preview the first three rows of the TOP table.
TOP.head(3)

In [None]:
# Preview the last three rows of the StrokeMRI table.
MRI.tail(3)

In [None]:
# Stack StrokeMRI and TOP row-wise into one combined table ("topmri").
# Row labels of the two inputs are kept as they are (no ignore_index).
topmri = pd.concat([MRI, TOP], axis=0)
topmri.head(3)

In [None]:
# Preview the last three rows of the EDIS table.
EDIS.tail(3)

In [None]:
# Show (rows, columns) of the combined TOP + StrokeMRI table.
topmri.shape

In [None]:
# Normalise column names to lower case, then discard the 'site' and 'id'
# columns; a fresh 'site' code is assigned later in the notebook.
topmri = topmri.rename(columns=str.lower).drop(columns=['site', 'id'])
topmri.head(3)

In [None]:
# Normalise column names to lower case, then discard the 'site' and 'id'
# columns; a fresh 'site' code is assigned later in the notebook.
EDIS = EDIS.rename(columns=str.lower).drop(columns=['site', 'id'])
EDIS.tail(3)

In [None]:
# Batch/covariate table for the combined TOP + StrokeMRI cohort.
# .copy() makes this an independent frame, so adding a column below does not
# write into a selection of `topmri` (avoids pandas' SettingWithCopyWarning).
topmri_batch_testing_df = topmri[['participant_id', 'age', 'sex']].copy()
# Site code 0 marks TOP + StrokeMRI; the column goes right after participant_id.
topmri_batch_testing_df.insert(1, 'site', 0)
topmri_batch_testing_df.head(3)

In [None]:
# Batch/covariate table for the EDIS cohort.
# .copy() makes this an independent frame, so adding a column below does not
# write into a selection of `EDIS` (avoids pandas' SettingWithCopyWarning).
edis_batch_testing_df = EDIS[['participant_id', 'age', 'sex']].copy()
# Site code 1 marks EDIS; the column goes right after participant_id.
edis_batch_testing_df.insert(1, 'site', 1)
edis_batch_testing_df.head(3)

In [None]:
# One batch table for both cohorts, renumbered 0..n-1.
batch_testing_df = pd.concat(
    [topmri_batch_testing_df, edis_batch_testing_df],
    axis=0,
    ignore_index=True,
)
#batch_testing_df = sep.recode_sex_to_numeric(batch_testing_df)
batch_testing_df.tail(5)

In [None]:
# Column names handed to nest.OPNestedComBat further down.
# Clinical covariates, split by variable type:
continuous_testing_cols = ['age']
categorical_testing_cols = ['sex']
# Column(s) treated as the batch effect:
batch_testing_list = ['site']

In [None]:

# Columns that take part in harmonization, either as features to harmonize
# or as covariates (age, sex). Lower-cased to match the frames' column names.
to_be_harmonized_or_covar = [
    name.lower()
    for name in (
        'age', 'sex',
        'deepWM_B_CoV', 'ACA_B_CoV', 'MCA_B_CoV', 'PCA_B_CoV', 'TotalGM_B_CoV',
        'DeepWM_B_CBF', 'ACA_B_CBF', 'MCA_B_CBF', 'PCA_B_CBF', 'TotalGM_B_CBF',
    )
]
# Columns kept out of harmonization (also lower-cased).
not_harmonized = [
    name.lower()
    for name in (
        'GM_vol', 'WM_vol', 'CSF_vol', 'GM_ICVRatio', 'GMWM_ICVRatio',
        'WMHvol_WMvol', 'WMH_count',
        'ld', 'pld', 'labelling', 'readout', 'M0',
    )
]
# "Semi features": everything that is NOT harmonized or used as a covariate.
# These are merged back onto the harmonized output at the end of the notebook.
TOPMRI_semi_features = topmri.drop(columns=to_be_harmonized_or_covar)
EDIS_semi_features = EDIS.drop(columns=to_be_harmonized_or_covar)

# Frames that go into harmonization: the not-harmonized columns are removed.
# NOTE: EDIS is rebound here, so re-running this cell without reloading the
# data raises a KeyError (the columns are already gone).
TOPMRI = topmri.drop(columns=not_harmonized)
EDIS = EDIS.drop(columns=not_harmonized)

In [None]:
# Pass both semi-feature frames through cvasl's readout/labelling helper and
# rebind them to its outputs; ['m0'] is forwarded as the second argument.
TOPMRI_semi_features, EDIS_semi_features = sep.deal_with_readout_and_labelling(
    [TOPMRI_semi_features, EDIS_semi_features],
    ['m0'],
)

In [None]:
# ASL features of both cohorts in one frame; age and sex are removed here
# because they are carried separately in batch_testing_df.
data_testing_df = pd.concat([TOPMRI, EDIS]).drop(columns=['age', 'sex'])
data_testing_df.tail(3)

In [None]:
# Prepare the inputs for the ComBat routines.
# Renumber the rows, then discard every row that has a missing value.
data_testing_df = data_testing_df.reset_index(drop=True).dropna()
print(data_testing_df.shape)
# Keep only participants that also appear in the batch table.
data_testing_df = data_testing_df.merge(
    batch_testing_df['participant_id'], on='participant_id',
)
# Select the feature columns by name instead of by position, so the result
# does not depend on participant_id happening to be the first column.
dat_testing = data_testing_df.drop(columns='participant_id')
# Transpose so each column is one participant, and coerce values to numeric.
dat_testing = dat_testing.T.apply(pd.to_numeric)
caseno_testing = data_testing_df['participant_id']
# NOTE(review): covars_testing keeps every row of batch_testing_df, including
# participants removed by dropna() above, so it is only row-aligned with
# dat_testing when no rows were dropped -- confirm before relying on it.
covars_testing = batch_testing_df.drop(columns='participant_id')

In [None]:
# Show (rows, columns) of the batch table, for comparison with the feature table.
batch_testing_df.shape

In [None]:
# Show (rows, columns) of the feature table after dropna() and the merge.
data_testing_df.shape

In [None]:
# Split the covariates by type: categorical columns (as an independent copy)
# and continuous columns.
covars_testing_string = covars_testing[categorical_testing_cols].copy()
covars_testing_quant = covars_testing[continuous_testing_cols]

In [None]:
# Integer-encode every categorical covariate with its own LabelEncoder.
covars_testing_cat = pd.DataFrame()
for col_testing, stringcol_testing in covars_testing_string.items():
    le = LabelEncoder()
    # fit() returns the encoder itself, so fitting and transforming can chain.
    covars_testing_cat[col_testing] = le.fit(list(stringcol_testing)).transform(stringcol_testing)

In [None]:
# Encoded categorical covariates next to the continuous ones.
# NOTE(review): covars_testing_final is reassigned from the GMM merge in a
# later cell before it is used -- confirm whether this frame is still needed.
covars_testing_final = pd.concat(
    [covars_testing_cat, covars_testing_quant],
    axis='columns',
)

In [None]:
# For the GMM ComBat variants: add a GMM split to the batch effects.
filepath2 = 'Testing/OPPNComBat/ResultTesting'
# exist_ok=True creates the directory only when needed, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(filepath2, exist_ok=True)
# GMMSplit gets the feature matrix, the participant ids and the output
# directory; its result is merged on 'Patient' in a later cell.
gmm_testing_df = nest.GMMSplit(dat_testing, caseno_testing, filepath2)

In [None]:
# Peek at the batch table before merging it with the GMM output.
batch_testing_df.head(2)

In [None]:
# Peek at the GMM split output before merging it with the batch table.
gmm_testing_df.head(2)

In [None]:
# Attach the GMM grouping to each participant's batch/covariate row and
# expose it under the column name 'GMM'.
gmm_testing_df_merge = (
    batch_testing_df
    .merge(gmm_testing_df, left_on='participant_id', right_on='Patient')
    .assign(GMM=lambda merged: merged['Grouping'])
)
gmm_testing_df_merge

In [None]:
# Final covariate table: identifier columns and the raw 'Grouping' column are
# removed, leaving the batch column, the covariates and 'GMM'.
covars_testing_final = gmm_testing_df_merge.drop(
    columns=['participant_id', 'Patient', 'Grouping'],
)
# Register 'GMM' as a categorical covariate. The guard keeps the list free of
# duplicates when this cell is executed more than once.
if 'GMM' not in categorical_testing_cols:
    categorical_testing_cols = categorical_testing_cols + ['GMM']

In [None]:
# Run the harmonization on the feature matrix with the final covariates,
# the batch column list and the output directory.
output_testing_df = nest.OPNestedComBat(
    dat_testing,
    covars_testing_final,
    batch_testing_list,
    filepath2,
    categorical_cols=categorical_testing_cols,
    continuous_cols=continuous_testing_cols,
)

In [None]:
# Put the participant ids next to the harmonized features and write the
# result to file.
write_testing_df = pd.concat([caseno_testing, output_testing_df], axis='columns')
write_testing_df.to_csv(f'{filepath2}/Mfeatures_testing_NestedComBat.csv')
# Also save the inputs (features back in participants-as-rows layout, and
# the covariate table) next to the result.
dat_testing_input = dat_testing.T
dat_testing_input.to_csv(f'{filepath2}/Mfeatures_input_testing_NestedComBat.csv')
covars_testing_final.to_csv(f'{filepath2}/Mcovars_input_testing_NestedComBat.csv')

In [None]:
# Assemble the complete harmonized dataset: ids + harmonized features,
# followed by the covariate columns (joined on the row index).
complete_harmonised = pd.concat(
    [write_testing_df, covars_testing_final],
    axis='columns',
)
complete_harmonised.head(3)

In [None]:
# List the column names of the assembled harmonized table.
complete_harmonised.columns

# Split the dataframe back up into per-dataset parts for running
## From `complete_harmonised`

In [None]:
# EDIS rows carry site code 1; the bookkeeping columns are no longer needed.
edis_opn_harmonized = (
    complete_harmonised
    .loc[complete_harmonised['site'] == 1]
    .drop(columns=['site', 'GMM'])
)
edis_opn_harmonized.head(3)

In [None]:
# TOP + StrokeMRI rows carry site code 0; the bookkeeping columns are no
# longer needed.
topmri_opn_harmonized = (
    complete_harmonised
    .loc[complete_harmonised['site'] == 0]
    .drop(columns=['site', 'GMM'])
)

In [None]:
# Re-attach the columns that were kept out of harmonization, matching rows
# on participant_id.
topmri_opn_harmonized = pd.merge(
    topmri_opn_harmonized, TOPMRI_semi_features, on="participant_id",
)
edis_opn_harmonized = pd.merge(
    edis_opn_harmonized, EDIS_semi_features, on="participant_id",
)

In [None]:
# Display the harmonized TOP + StrokeMRI table with its semi-features merged in.
topmri_opn_harmonized

In [None]:
# Write the per-dataset harmonized tables to disk.
# The output directory is created first: DataFrame.to_csv does not create
# missing parent directories and would otherwise fail on a fresh checkout.
os.makedirs('harm_results/open_nested_combat', exist_ok=True)
topmri_opn_harmonized.to_csv('harm_results/open_nested_combat/topmri_opn_harmonized_to_edis.csv')
edis_opn_harmonized.to_csv('harm_results/open_nested_combat/edis_opn_harmonized.csv')

In [None]:
# Preview the first three rows of the harmonized EDIS table.
edis_opn_harmonized.head(3)

<!-- # split dataframe back up into parts for running
## from complete_harmonised -->