# Autocombat TOP and StrokeMRI vs. EDIS harmonized datasets

Note this must be run in the `comscan6` environment

## import libraries

In [None]:
import os
import sys

import pandas as pd
import numpy as np

sys.path.insert(0, '../../') # path to functions

import cvasl.harmony as har
import cvasl.vendor.comscan.neurocombat as autocombat

## Let's see how we would apply this to our data
We will have to flip it on its side to make it work.

In [None]:
# Every cohort keeps its training table under the same file name,
# each inside its own directory.
file_name = 'TrainingDataComplete.csv'
MRI_path = '../our_datasets/StrokeMRI/'
TOP_path = '../our_datasets/TOP/'
EDIS_path = '../our_datasets/EDIS/'

# Full path to each cohort's CSV file.
TOP_file, MRI_file, EDIS_file = (
    os.path.join(folder, file_name)
    for folder in (TOP_path, MRI_path, EDIS_path)
)

# Load the three tables; the first CSV column becomes the row index.
TOP, MRI, EDIS = (
    pd.read_csv(csv_file, index_col=0)
    for csv_file in (TOP_file, MRI_file, EDIS_file)
)

In [None]:
# Recode the EDIS `Sex` column from 1/2 to 0/1.
# Values missing from the mapping become NaN (Series.map behaviour).
sex_mapping = {1: 0, 2: 1}
recoded_sex = EDIS['Sex'].map(sex_mapping)
EDIS = EDIS.assign(Sex=recoded_sex)
EDIS.head(3)

In [None]:
# Remove the `ID` column from TOP, then preview the first three rows.
TOP = TOP.drop(columns=['ID'])
TOP.head(3)

In [None]:
# Remove the `ID` column from MRI, then preview the last five rows.
MRI = MRI.drop(columns=['ID'])
MRI.tail(5)

In [None]:
# Remove the `ID` column from EDIS, then preview the last five rows.
EDIS = EDIS.drop(columns=['ID'])
EDIS.tail(5)

In [None]:
# Stack TOP and MRI row-wise and label every row of the result with Site 0.
TOPMRI = pd.concat([TOP, MRI]).assign(Site=0)
TOPMRI.head(3)

In [None]:
# Lower-case every column name of the combined TOP/MRI frame.
TOPMRI = TOPMRI.rename(columns=str.lower)

In [None]:
# Lower-case every column name of the EDIS frame.
EDIS = EDIS.rename(columns=str.lower)

In [None]:
# Stack the TOP/MRI rows and the EDIS rows into one frame and display it.
TOPMRIEDIS = pd.concat(objs=[TOPMRI, EDIS], axis=0)
TOPMRIEDIS

In [None]:
# Coarse age covariate: age divided by ten, rounded to a whole number.
# NOTE(review): this rounds to the nearest decade (e.g. 26 -> 3.0) rather
# than flooring -- confirm that is the intended binning.
TOPMRIEDIS['decade'] = TOPMRIEDIS['age'].div(10).round()

In [None]:
# Order rows by age and renumber them; the previous index is kept as a column.
TOPMRIEDIS = TOPMRIEDIS.sort_values(by='age').reset_index()

# Finer age covariate: half the sum of each row's age and the previous row's
# age. The first row has no predecessor, so its value is NaN at this point.
pairwise_age_sum = TOPMRIEDIS['age'].rolling(2).sum()
TOPMRIEDIS['fine_grain'] = pairwise_age_sum / 2
TOPMRIEDIS

In [None]:
# Chained-indexing assignment: the target is every second `fine_grain` entry
# of the frame sliced from row 2 onward; the value is the `fine_grain` entries
# at odd positions of the whole frame.
# NOTE(review): whether this write reaches TOPMRIEDIS depends on pandas
# view/copy semantics (it does not under copy-on-write), and a Series value may
# be aligned on index labels rather than by position -- verify the resulting
# column and consider one .loc/.iloc assignment on TOPMRIEDIS itself.
TOPMRIEDIS[2:].fine_grain.iloc[::2] = TOPMRIEDIS[:].fine_grain.iloc[1::2]
#TOPMRI['fine_grain']

In [None]:
# rolling(2) leaves the first row without a value, so copy the second row's
# value into it. Labels 0 and 1 are the first two rows because the index was
# reset after sorting by age.
# A single .loc assignment on the frame is used instead of chained indexing
# (`df['col'][0] = ...`), which may write to a temporary object and leave
# TOPMRIEDIS unchanged (always the case under pandas copy-on-write).
TOPMRIEDIS.loc[0, 'fine_grain'] = TOPMRIEDIS.loc[1, 'fine_grain']
TOPMRIEDIS['fine_grain']

In [None]:
# Show the distinct values present in the `decade` column.
TOPMRIEDIS['decade'].unique()

In [None]:
# Show the column names of the combined frame.
TOPMRIEDIS.columns

In [None]:
# Columns passed as `features` to both Combat models below.
harmonized_features = [
    'gm_vol', 'wm_vol', 'csf_vol',
    'gm_icvratio', 'gmwm_icvratio', 'wmhvol_wmvol', 'wmh_count',
    'aca_b_cov', 'mca_b_cov', 'pca_b_cov', 'totalgm_b_cov',
    'aca_b_cbf', 'mca_b_cbf', 'pca_b_cbf', 'totalgm_b_cbf',
]

# Model using `decade` as the continuous covariate.
combat = autocombat.Combat(
    features=list(harmonized_features),
    sites=['site'],
    discrete_covariates=['sex'],
    continuous_covariates=['decade'],
)

# Same configuration, but with `fine_grain` as the continuous covariate.
fg_combat = autocombat.Combat(
    features=list(harmonized_features),
    sites=['site'],
    discrete_covariates=['sex'],
    continuous_covariates=['fine_grain'],
)



In [None]:
# Fit both models on the combined frame and print whatever fit() returns.
for model in (combat, fg_combat):
    print(model.fit(TOPMRIEDIS))

In [None]:
# Print the fitted `gamma_star_` attribute of each model.
for fitted_model in (combat, fg_combat):
    print(fitted_model.gamma_star_)

In [None]:
# Apply the `decade`-covariate model to the combined frame and preview it.
transformed_TOPMRIEDIS = combat.transform(TOPMRIEDIS)
transformed_TOPMRIEDIS.head(3)

In [None]:
# Apply the `fine_grain`-covariate model to the combined frame and preview it.
fg_transformed_TOPMRIEDIS = fg_combat.transform(TOPMRIEDIS)
fg_transformed_TOPMRIEDIS.head(3)

In [None]:
# Preview the first three rows of the frame that was passed to transform().
TOPMRIEDIS.head(3)

In [None]:
# Apply the `decade`-covariate model to the combined frame and preview it.
# NOTE(review): an identical transform call already appears earlier in this
# notebook; confirm whether this re-run is needed.
transformed_TOPMRIEDIS = combat.transform(TOPMRIEDIS)
transformed_TOPMRIEDIS.head(3)

In [None]:
# Apply the `fine_grain`-covariate model to the combined frame and preview it.
# NOTE(review): an identical transform call already appears earlier in this
# notebook; confirm whether this re-run is needed.
fg_transformed_TOPMRIEDIS = fg_combat.transform(TOPMRIEDIS)
fg_transformed_TOPMRIEDIS.head(3)

In [None]:
# Show the column names of the transformed frame.
transformed_TOPMRIEDIS.columns

In [None]:
# Keep the site-0 (TOP + MRI) rows of the `decade`-model output and drop the
# helper columns that were added for harmonization.
is_topmri_row = transformed_TOPMRIEDIS['site'] == 0
TOPMRI_transformed = transformed_TOPMRIEDIS.loc[is_topmri_row].drop(
    columns=['site', 'decade', 'fine_grain', 'index']
)
TOPMRI_transformed.head(3)

In [None]:
# Keep the site-0 (TOP + MRI) rows of the `fine_grain`-model output and drop
# the helper columns that were added for harmonization.
is_fg_topmri_row = fg_transformed_TOPMRIEDIS['site'] == 0
fg_TOPMRI_transformed = fg_transformed_TOPMRIEDIS.loc[is_fg_topmri_row].drop(
    columns=['site', 'decade', 'fine_grain', 'index']
)
fg_TOPMRI_transformed.head(3)

In [None]:
# Keep the site-1 rows of the `decade`-model output and drop the helper
# columns that were added for harmonization.
is_edis_row = transformed_TOPMRIEDIS['site'] == 1
EDIS_transformed = transformed_TOPMRIEDIS.loc[is_edis_row].drop(
    columns=['site', 'decade', 'index', 'fine_grain']
)
EDIS_transformed.head(3)

In [None]:
# Keep the site-1 rows of the `fine_grain`-model output and drop the helper
# columns that were added for harmonization.
is_fg_edis_row = fg_transformed_TOPMRIEDIS['site'] == 1
fg_EDIS_transformed = fg_transformed_TOPMRIEDIS.loc[is_fg_edis_row].drop(
    columns=['site', 'decade', 'index', 'fine_grain']
)
fg_EDIS_transformed.head(3)

In [None]:
# Write the four harmonized tables to CSV.
# to_csv() does not create missing directories, so make sure the output
# folder exists first; exist_ok keeps re-runs of this cell harmless.
output_dir = 'harm_results/autocombat'
os.makedirs(output_dir, exist_ok=True)

# `decade`-model outputs.
TOPMRI_transformed.to_csv(f'{output_dir}/autocom_harm_topmri_v_e.csv')
EDIS_transformed.to_csv(f'{output_dir}/autocom_harm_EDIS.csv')

# `fine_grain`-model outputs.
# NOTE(review): the EDIS file name ends in "EDIS1" here but "EDIS" above;
# left unchanged in case downstream code reads this exact name.
fg_TOPMRI_transformed.to_csv(f'{output_dir}/fg_autocom_harm_topmri_v_e.csv')
fg_EDIS_transformed.to_csv(f'{output_dir}/fg_autocom_harm_EDIS1.csv')

# Data quality check

In [None]:
# Total number of missing values across all columns of the TOP/MRI output.
TOPMRI_transformed.isna().sum().sum()

In [None]:
# Total number of missing values across all columns of the EDIS output.
EDIS_transformed.isna().sum().sum()

In [None]:
# Columns inspected for negative values in the checks that follow.
number_columns = [
    'sex',
    # *_vol columns, ratios and count
    'gm_vol',
    'wm_vol',
    'csf_vol',
    'gm_icvratio',
    'gmwm_icvratio',
    'wmhvol_wmvol',
    'wmh_count',
    # *_cov columns
    'aca_b_cov',
    'mca_b_cov',
    'pca_b_cov',
    'totalgm_b_cov',
    # *_cbf columns
    'aca_b_cbf',
    'mca_b_cbf',
    'pca_b_cbf',
    'totalgm_b_cbf',
]

In [None]:
# Per-column count of negative values in the harmonized TOP/MRI output.
TOPMRI_transformed[number_columns].lt(0).sum()

In [None]:
# Per-column count of negative values in the harmonized EDIS output.
EDIS_transformed[number_columns].lt(0).sum()

In [None]:
# Display the harmonized EDIS frame.
EDIS_transformed

## So some variables are again problematic, because they are negative

 We will hold off on log-transforming them until we see which other columns we may want to log-transform across all datasets.