# Autocombat TOP and StrokeMRI vs. SABRE harmonized datasets

Note: this notebook must be run in the `comscan6` environment.

## import libraries

In [None]:
import os
import sys

import pandas as pd
import numpy as np

sys.path.insert(0, '../../') # path to functions

import cvasl.harmony as har
import cvasl.vendor.comscan.neurocombat as autocombat

## Let's see how we would apply this to our data
We will have to flip it on its side to make it work.

In [None]:
# Locations of the three cohort tables used in this comparison.
# Every cohort stores its table under the same file name.
file_name = 'TrainingDataComplete.csv'
MRI_path = '../our_datasets/StrokeMRI/'
TOP_path = '../our_datasets/TOP/'
SABRE_path = '../our_datasets/SABRE/'

TOP_file, MRI_file, SABRE_file = (
    os.path.join(folder, file_name)
    for folder in (TOP_path, MRI_path, SABRE_path)
)

# The first CSV column is used as the row index of each frame.
TOP, MRI, SABRE = (
    pd.read_csv(csv_file, index_col=0)
    for csv_file in (TOP_file, MRI_file, SABRE_file)
)

In [None]:
# Recode SABRE's 'Sex' column from 1/2 to 0/1.
# NOTE(review): presumably this matches the coding used by TOP/StrokeMRI - confirm.
sex_mapping = {1: 0, 2: 1}
SABRE = SABRE.assign(Sex=SABRE['Sex'].map(sex_mapping))

In [None]:
# Remove the 'ID' column from TOP, then preview the first rows.
TOP = TOP.drop(columns=['ID'])
TOP.head(n=3)

In [None]:
# Remove the 'ID' column from StrokeMRI, then preview the last rows.
MRI = MRI.drop(columns=['ID'])
MRI.tail(n=5)

In [None]:
# Remove the 'ID' column from SABRE, then preview the last rows.
SABRE = SABRE.drop(columns=['ID'])
SABRE.tail(n=5)

In [None]:
# Stack TOP and StrokeMRI row-wise and label every pooled row as site 0,
# so the two cohorts are treated as a single site.
TOPMRI = pd.concat([TOP, MRI]).assign(Site=0)
TOPMRI.head(n=3)

In [None]:
# Lower-case every column label of the pooled TOP + StrokeMRI table.
TOPMRI = TOPMRI.rename(columns=str.lower)

In [None]:
# Lower-case every column label of the SABRE table.
SABRE = SABRE.rename(columns=str.lower)

In [None]:
# Pool the TOP + StrokeMRI rows and the SABRE rows into one table.
# NOTE(review): only TOPMRI is given an explicit site label in this notebook;
# SABRE's 'site' values must come from its CSV. Later cells select
# site == 1 for SABRE - confirm the file really carries that value.
TOPMRISABRE = pd.concat([TOPMRI, SABRE], axis=0)
TOPMRISABRE

In [None]:
# Coarse age covariate: age divided by ten, rounded to a whole number.
TOPMRISABRE['decade'] = TOPMRISABRE['age'].div(10).round()

In [None]:
# Order the rows by age and renumber them 0..N-1; the previous row labels
# are kept as a regular column by reset_index().
TOPMRISABRE = TOPMRISABRE.sort_values(by='age').reset_index()

# Fine-grained age covariate: half the sum of each row's age and the age of
# the row before it. The first row has no predecessor and is left as NaN.
TOPMRISABRE['fine_grain'] = TOPMRISABRE['age'].rolling(window=2).sum() / 2

In [None]:
# Copy each odd-position fine_grain value onto the even-position row that
# follows it (row 2 <- row 1, row 4 <- row 3, ...).
#
# The write goes through a single .iloc call on the frame itself. The earlier
# chained form (frame[2:].fine_grain.iloc[::2] = ...) assigned into a
# temporary object, so whether TOPMRISABRE was actually modified depended on
# the pandas version. Plain array values are used so the copy is positional
# and cannot be re-aligned by row label.
fine_grain_pos = TOPMRISABRE.columns.get_loc('fine_grain')
odd_values = TOPMRISABRE['fine_grain'].values[1::2].copy()

# With an even row count there is one more odd-position source than there are
# even-position targets; trim the sources so both sides have equal length.
n_even_targets = len(range(2, len(TOPMRISABRE), 2))
TOPMRISABRE.iloc[2::2, fine_grain_pos] = odd_values[:n_even_targets]

In [None]:
# The first row has no rolling-window value; give it the value of the second
# row. A single .loc write on the frame replaces the chained
# frame['fine_grain'][0] = ... form, which may assign into a temporary copy
# and leave TOPMRISABRE unchanged.
TOPMRISABRE.loc[0, 'fine_grain'] = TOPMRISABRE.loc[1, 'fine_grain']
TOPMRISABRE['fine_grain']

In [None]:
# List the distinct decade values present in the pooled table.
TOPMRISABRE['decade'].unique()

In [None]:
# Display the column labels of the pooled table.
TOPMRISABRE.columns

In [None]:
# Columns to harmonize; both models below receive the same list.
_harmonized_features = [
    'gm_vol',
    'wm_vol',
    'csf_vol',
    'gm_icvratio',
    'gmwm_icvratio',
    'wmhvol_wmvol',
    'wmh_count',
    'aca_b_cov',
    'mca_b_cov',
    'pca_b_cov',
    'totalgm_b_cov',
    'aca_b_cbf',
    'mca_b_cbf',
    'pca_b_cbf',
    'totalgm_b_cbf',
]

# Two models that differ only in the continuous covariate: the coarse
# 'decade' column versus the 'fine_grain' column. Each gets its own copy of
# the feature list.
combat = autocombat.Combat(
    features=list(_harmonized_features),
    sites=["site"],
    discrete_covariates=['sex'],
    continuous_covariates=['decade'],
)

fg_combat = autocombat.Combat(
    features=list(_harmonized_features),
    sites=["site"],
    discrete_covariates=['sex'],
    continuous_covariates=['fine_grain'],
)


In [None]:
# Fit each model on the pooled table and print whatever fit() returns,
# decade-based model first.
for model in (combat, fg_combat):
    print(model.fit(TOPMRISABRE))

In [None]:
# Print the gamma_star_ attribute of each fitted model, decade-based first.
for model in (combat, fg_combat):
    print(model.gamma_star_)

In [None]:
# Apply the decade-based model to the pooled table and preview the result.
transformed_TOPMRISABRE = combat.transform(TOPMRISABRE)
transformed_TOPMRISABRE.head(n=3)

In [None]:
# Apply the fine_grain-based model to the pooled table and preview the result.
fg_transformed_TOPMRISABRE = fg_combat.transform(TOPMRISABRE)
fg_transformed_TOPMRISABRE.head(n=3)

In [None]:
# Preview the pooled table that is passed to both transform calls.
TOPMRISABRE.head(3)

In [None]:
# Recompute the decade-based transform of the pooled table and preview it.
# NOTE(review): this is the same call as an earlier cell in this notebook.
transformed_TOPMRISABRE = combat.transform(TOPMRISABRE)
transformed_TOPMRISABRE.head(n=3)

In [None]:
# Recompute the fine_grain-based transform of the pooled table and preview it.
# NOTE(review): this is the same call as an earlier cell in this notebook.
fg_transformed_TOPMRISABRE = fg_combat.transform(TOPMRISABRE)
fg_transformed_TOPMRISABRE.head(n=3)

In [None]:
# Display the column labels of the decade-harmonized table.
transformed_TOPMRISABRE.columns

In [None]:
# Keep the site-0 (TOP + StrokeMRI) rows of the decade-harmonized table and
# remove the helper columns added in earlier cells.
TOPMRI_transformed = (
    transformed_TOPMRISABRE
    .loc[transformed_TOPMRISABRE['site'] == 0]
    .drop(columns=['site', 'decade', 'fine_grain', 'index'])
)
TOPMRI_transformed.head(n=3)

In [None]:
# Keep the site-0 (TOP + StrokeMRI) rows of the fine_grain-harmonized table
# and remove the helper columns added in earlier cells.
fg_TOPMRI_transformed = (
    fg_transformed_TOPMRISABRE
    .loc[fg_transformed_TOPMRISABRE['site'] == 0]
    .drop(columns=['site', 'decade', 'fine_grain', 'index'])
)
fg_TOPMRI_transformed.head(n=3)

In [None]:
# Keep the site-1 rows (selected here as SABRE) of the decade-harmonized
# table and remove the helper columns added in earlier cells.
SABRE_transformed = (
    transformed_TOPMRISABRE
    .loc[transformed_TOPMRISABRE['site'] == 1]
    .drop(columns=['site', 'decade', 'index', 'fine_grain'])
)
SABRE_transformed.head(n=3)

In [None]:
# Keep the site-1 rows (selected here as SABRE) of the fine_grain-harmonized
# table and remove the helper columns added in earlier cells.
fg_SABRE_transformed = (
    fg_transformed_TOPMRISABRE
    .loc[fg_transformed_TOPMRISABRE['site'] == 1]
    .drop(columns=['site', 'decade', 'index', 'fine_grain'])
)
fg_SABRE_transformed.head(n=3)

In [None]:
# Write the four harmonized tables to disk.
# to_csv() does not create missing folders, so make sure the output folder
# exists first; exist_ok=True makes this a no-op when it is already there.
os.makedirs('harm_results/autocombat', exist_ok=True)

# Decade-based harmonization results.
TOPMRI_transformed.to_csv('harm_results/autocombat/autocom_harm_topmri_v_s.csv')
SABRE_transformed.to_csv('harm_results/autocombat/autocom_harm_sabre.csv')

# Fine_grain-based harmonization results.
# NOTE(review): the 'sabre1' suffix differs from 'sabre' above - confirm it is
# intentional before relying on the file name downstream.
fg_TOPMRI_transformed.to_csv('harm_results/autocombat/fg_autocom_harm_topmri_v_s.csv')
fg_SABRE_transformed.to_csv('harm_results/autocombat/fg_autocom_harm_sabre1.csv')

# Data quality check

In [None]:
# Total number of missing cells in the harmonized TOP + StrokeMRI table.
TOPMRI_transformed.isna().values.sum()

In [None]:
# Total number of missing cells in the harmonized SABRE table.
SABRE_transformed.isna().values.sum()

In [None]:
# Columns inspected for negative values in the cells that follow.
number_columns = [
    'sex',
    # volumes and volume ratios
    'gm_vol',
    'wm_vol',
    'csf_vol',
    'gm_icvratio',
    'gmwm_icvratio',
    # white-matter hyperintensity columns
    'wmhvol_wmvol',
    'wmh_count',
    # *_cov columns
    'aca_b_cov',
    'mca_b_cov',
    'pca_b_cov',
    'totalgm_b_cov',
    # *_cbf columns
    'aca_b_cbf',
    'mca_b_cbf',
    'pca_b_cbf',
    'totalgm_b_cbf',
]

In [None]:
# Per-column count of negative values in the harmonized TOP + StrokeMRI table.
TOPMRI_transformed[number_columns].lt(0).sum()

In [None]:
# Per-column count of negative values in the harmonized SABRE table.
SABRE_transformed[number_columns].lt(0).sum()

In [None]:
# Display the decade-harmonized SABRE table.
SABRE_transformed

## So some variables are again problematic, because they are negative

We will hold off on logging (log-transforming) them until we see which other columns we may want to log across all datasets.