# Open nested combat: TOP and StrokeMRI open nested-harmonized datasets

Note: this notebook must be run in the `opnc` environment.

## import libraries

In [None]:
import os
import sys
from itertools import permutations

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets

#import neuroCombat as nC
from sklearn.preprocessing import LabelEncoder
from sklearn.mixture import GaussianMixture
from scipy.stats import ranksums, ttest_ind, ttest_rel, ks_2samp, anderson_ksamp

sys.path.insert(0, '../../') # path to cvasl functions
import cvasl.seperated as sep
import cvasl.harmony as har
import cvasl.vendor.open_nested_combat.nest as nest
import cvasl.vendor.neurocombat as nC

## Read in our data

In [None]:

# Both cohorts use the same CSV file name, each inside its own folder.
file_name = 'TrainingDataComplete.csv'
TOP_path = '../our_datasets/TOP/'
MRI_path = '../our_datasets/StrokeMRI/'

# Full paths to the TOP and StrokeMRI tables.
TOP_file, MRI_file = (
    os.path.join(folder, file_name) for folder in (TOP_path, MRI_path)
)

In [None]:
# Load both cohort tables; the first CSV column is used as the index.
mri, top = (
    pd.read_csv(csv_path, index_col=0) for csv_path in (MRI_file, TOP_file)
)

In [None]:
# Lower-case every column name, then discard the 'site' and 'id' columns.
top = top.rename(columns=str.lower).drop(columns=['site', 'id'])
top.head(3)

In [None]:
# Lower-case every column name, then discard the 'site' and 'id' columns.
mri = mri.rename(columns=str.lower).drop(columns=['site', 'id'])
mri.tail(3)

## Below are functions out of the Hannah Horng OPNested ComBat library
The library is here: https://github.com/hannah-horng/opnested-combat
As the library is unreleased and unversioned, we are using the MIT-licensed functions directly so that we can version-control them.

In [None]:
# Per-participant covariate table for the TOP cohort, labelled as site 0.
# .copy() makes this an independent frame, so adding the 'site' column below
# does not trigger pandas' SettingWithCopyWarning.
top_batch_testing_df = top[['participant_id', 'age', 'sex']].copy()
# Insert 'site' directly in second position, right after 'participant_id'.
top_batch_testing_df.insert(1, 'site', 0)
top_batch_testing_df.head(3)

In [None]:
# Per-participant covariate table for the StrokeMRI cohort, labelled as site 1.
# .copy() makes this an independent frame, so adding the 'site' column below
# does not trigger pandas' SettingWithCopyWarning.
mri_batch_testing_df = mri[['participant_id', 'age', 'sex']].copy()
# Insert 'site' directly in second position, right after 'participant_id'.
mri_batch_testing_df.insert(1, 'site', 1)
mri_batch_testing_df.head(3)

In [None]:
# Stack the TOP rows above the StrokeMRI rows and renumber the index from 0.
# (The sep.recode_sex_to_numeric step is currently disabled, so 'sex' keeps
# its original coding here.)
batch_testing_df = pd.concat(
    (top_batch_testing_df, mri_batch_testing_df),
    axis=0,
    ignore_index=True,
)
batch_testing_df.tail(5)

In [None]:
# Names of the clinical covariate columns, grouped by type.
continuous_testing_cols = ['age']
categorical_testing_cols = ['sex']
# Name of the column that carries the batch label.
batch_testing_list = ['site']

In [None]:
# ASL data: stack both cohorts and remove the 'age' and 'sex' covariates,
# which are kept separately in batch_testing_df.
data_testing_df = pd.concat([top, mri]).drop(columns=['age', 'sex'])
data_testing_df.tail(3)

In [None]:
#data_testing_df# = sep.recode_sex_to_numeric(data_testing_df)

In [None]:
#caseno_testing 

In [None]:
# Build the inputs for harmonization: the feature matrix (dat_testing), the
# participant ids (caseno_testing) and the covariate table (covars_testing).

# Renumber rows, then drop every row that contains a missing value.
data_testing_df = data_testing_df.reset_index(drop=True)
data_testing_df = data_testing_df.dropna()
# Keep only rows whose participant_id also appears in batch_testing_df.
data_testing_df = data_testing_df.merge(batch_testing_df['participant_id'], 
                                        left_on='participant_id', right_on='participant_id')
# Everything after the first column is taken as feature data.
# NOTE(review): assumes 'participant_id' is the first column -- confirm.
dat_testing = data_testing_df.iloc[:, 1:]
# Transpose, then convert each column of the transposed frame to numeric.
dat_testing = dat_testing.T.apply(pd.to_numeric)
caseno_testing = data_testing_df['participant_id']
# NOTE(review): taken from the full batch_testing_df, which is not filtered by
# the dropna() above, so its row count may differ from data_testing_df -- confirm.
covars_testing = batch_testing_df.drop('participant_id',axis=1)

In [None]:
# Display the column labels of data_testing_df as a sanity check.
data_testing_df.columns

In [None]:
# Merging batch effects, clinical covariates
covars_testing_string = pd.DataFrame()
covars_testing_string[categorical_testing_cols] = covars_testing[categorical_testing_cols].copy()
covars_testing_quant = covars_testing[continuous_testing_cols]
#covars_testing_quant

In [None]:
# Label-encode each categorical covariate, one column at a time, and collect
# the encoded columns in covars_testing_cat.
covars_testing_cat = pd.DataFrame()
for name, values in covars_testing_string.items():
    encoder = LabelEncoder().fit(list(values))
    covars_testing_cat[name] = encoder.transform(values)

In [None]:
# Place the encoded categorical and the continuous covariates side by side.
# NOTE: covars_testing_final is reassigned further down from the GMM merge.
covars_testing_final = pd.concat(
    [covars_testing_cat, covars_testing_quant], axis='columns'
)

In [None]:
# FOR GMM COMBAT VARIANTS:
# Adding GMM Split to batch effects
filepath2 = 'Testing/OPPNComBat/ResultTesting'
# exist_ok=True creates the folder (and any missing parents) only when needed,
# replacing the separate exists-then-create check with a single call.
os.makedirs(filepath2, exist_ok=True)
# The returned frame is merged later on its 'Patient' column, and its
# 'Grouping' column is used as the GMM label.
gmm_testing_df = nest.GMMSplit(dat_testing, caseno_testing, filepath2)

In [None]:
# Preview the first two rows of the covariate table before merging.
batch_testing_df.head(2)

In [None]:
# Preview the first two rows of the GMM split result before merging.
gmm_testing_df.head(2)

In [None]:
# Attach the GMM grouping to each participant's covariates by matching
# 'participant_id' with 'Patient', then expose the grouping as 'GMM'.
gmm_testing_df_merge = pd.merge(
    batch_testing_df,
    gmm_testing_df,
    left_on='participant_id',
    right_on='Patient',
).assign(GMM=lambda merged: merged['Grouping'])
gmm_testing_df_merge

In [None]:
# Final covariate table: the merged frame without the identifier columns and
# without 'Grouping', which is already duplicated as 'GMM'.
covars_testing_final = gmm_testing_df_merge.drop(
    columns=['participant_id', 'Patient', 'Grouping']
)
# Register 'GMM' as a categorical covariate. The guard keeps this cell
# idempotent: re-running it no longer appends 'GMM' a second time.
if 'GMM' not in categorical_testing_cols:
    categorical_testing_cols = categorical_testing_cols + ['GMM']

In [None]:
# Run OPNested ComBat on the feature matrix, using 'site' as the batch column
# and the covariates declared above; filepath2 is passed as the fourth argument.
output_testing_df = nest.OPNestedComBat(
    dat_testing,
    covars_testing_final,
    batch_testing_list,
    filepath2,
    categorical_cols=categorical_testing_cols,
    continuous_cols=continuous_testing_cols,
)

In [None]:
# OPNested ComBat output with the participant ids alongside, written to disk.
write_testing_df = pd.concat([caseno_testing, output_testing_df], axis='columns')
write_testing_df.to_csv(f'{filepath2}/Mfeatures_testing_NestedComBat.csv')

# Input features, transposed back from dat_testing's layout.
dat_testing_input = dat_testing.T
dat_testing_input.to_csv(f'{filepath2}/Mfeatures_input_testing_NestedComBat.csv')

# Covariates that were passed to the harmonization call.
covars_testing_final.to_csv(f'{filepath2}/Mcovars_input_testing_NestedComBat.csv')

In [None]:
# Assemble the harmonized dataset: ids and harmonized features joined
# column-wise with the covariates.
complete_harmonised = pd.concat(
    (write_testing_df, covars_testing_final), axis='columns'
)
complete_harmonised.head(3)

In [None]:
# Display the column labels of the assembled harmonized dataset.
complete_harmonised.columns

# split dataframe back up into parts for running
## from complete_harmonised

In [None]:
# StrokeMRI rows (site == 1) of the harmonized data, without the helper
# columns 'site' and 'GMM'.
mri_opn_harmonized = (
    complete_harmonised
    .loc[complete_harmonised['site'].eq(1)]
    .drop(columns=['site', 'GMM'])
)
mri_opn_harmonized.head(3)

In [None]:
# TOP rows (site == 0) of the harmonized data, without the helper columns
# 'site' and 'GMM'. (No preview is displayed for this cell.)
top_opn_harmonized = (
    complete_harmonised
    .loc[complete_harmonised['site'].eq(0)]
    .drop(columns=['site', 'GMM'])
)

In [None]:
# Save the per-cohort harmonized tables.
# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs('harm_results/open_nested_combat', exist_ok=True)
top_opn_harmonized.to_csv('harm_results/open_nested_combat/top_opn_harmonized.csv')
mri_opn_harmonized.to_csv('harm_results/open_nested_combat/mri_opn_harmonized.csv')

In [None]:
# Preview the first three rows of the harmonized StrokeMRI table.
mri_opn_harmonized.head(3)

In [None]:
# Column labels of the original StrokeMRI table, for comparison.
mri.columns

In [None]:
# Column labels of the harmonized StrokeMRI table, for comparison.
mri_opn_harmonized.columns

In [None]:
# Column labels of the combined harmonized dataset.
complete_harmonised.columns

In [None]:
# Un-harmonized counterpart of complete_harmonised: participant ids and input
# features first, then the same covariates joined column-wise.
complete_nonharmonised = pd.concat(
    [
        pd.concat([caseno_testing, dat_testing_input], axis='columns'),
        covars_testing_final,
    ],
    axis='columns',
)
complete_nonharmonised.columns

In [None]:
# Tag each dataset with its harmonization status:
# 'H' for harmonized, 'UH' for un-harmonized.
for frame, tag in ((complete_harmonised, 'H'), (complete_nonharmonised, 'UH')):
    frame['harmonization'] = tag

In [None]:
# Features to compare between the un-harmonized and harmonized datasets.
feature_list = [
    'gm_vol', 'wm_vol', 'csf_vol',
    'gm_icvratio', 'gmwm_icvratio', 'wmhvol_wmvol', 'wmh_count',
    'aca_b_cov', 'mca_b_cov', 'pca_b_cov', 'totalgm_b_cov',
    'aca_b_cbf', 'mca_b_cbf', 'pca_b_cbf', 'totalgm_b_cbf',
]
# Argument order is un-harmonized first, harmonized second.
har.compare_harm_multi_site_violins(complete_nonharmonised, complete_harmonised, feature_list)

<!-- # split dataframe back up into parts for running
## from complete_harmonised -->