# Autocombat TOP and StrokeMRI harmonized datasets

Note: this notebook must be run in the `comscan6` environment.

## import libraries

In [None]:
import os
import sys

import pandas as pd
import numpy as np

sys.path.insert(0, '../../') # path to cvasl functions

import cvasl.harmony as har
import cvasl.seperated as sep
import cvasl.vendor.comscan.neurocombat as autocombat

## Let's see how we would apply this to our data
We will have to flip it on its side to make it work.

In [None]:
# Load the two training cohorts that will be harmonized against each other.
TOP = pd.read_csv('../new_data/TrainingDataComplete_TOP.csv')
MRI = pd.read_csv('../new_data/TrainingDataComplete_StrokeMRI.csv')

In [None]:
# Discard the identifier column and tag every TOP row with site code 0.
TOP = TOP.drop(columns=['ID'])
TOP['Site'] = 0
TOP.head(3)

In [None]:
# Discard the identifier column from the StrokeMRI cohort.
# NOTE(review): no site code is assigned to MRI in this notebook, while the
# split further down selects MRI rows with site == 1 -- presumably the CSV
# already carries that value; confirm.
MRI = MRI.drop(columns=['ID'])
MRI.tail(5)

In [None]:
# Preview the first three TOP rows again (ID dropped, Site set to 0 above).
TOP.head(3)

In [None]:
# Run both cohorts through the project helper together with the column list
# ['M0']; it returns one processed frame per input, in the same order
# (they are indexed as new_frames[0] / new_frames[1] below).
# NOTE(review): what the helper does with 'M0' is not visible here -- see cvasl.seperated.
datasets = [TOP, MRI]
new_frames = sep.deal_with_readout_and_labelling(datasets, ['M0'])

In [None]:
# Make sure things are in proper order: print the row count of every
# processed frame, then the row counts of the original TOP and MRI frames.
for n_rows in map(len, new_frames):
    print(n_rows)
for original in (TOP, MRI):
    print(len(original))

In [None]:
# Continue with the processed frames under the original cohort names
# (first frame is TOP, second is MRI, matching the order of `datasets`).
TOP, MRI = new_frames[0], new_frames[1]


In [None]:
# Lower-case the column names of both cohorts in place so that later cells
# can refer to columns such as 'age', 'sex' and 'site'.
new_frame_datasets = [TOP, MRI]
for cohort in new_frame_datasets:
    cohort.columns = cohort.columns.str.lower()
TOP.head(3)

In [None]:
# Redundant: the loop over `new_frame_datasets` above already lower-cased
# MRI's column names in place, so this reassignment changes nothing.
MRI.columns = MRI.columns.str.lower()

In [None]:
# Redundant: the loop over `new_frame_datasets` above already lower-cased
# TOP's column names in place, so this reassignment changes nothing.
TOP.columns = TOP.columns.str.lower()

In [None]:
# Stack the TOP rows on top of the MRI rows into one combined frame.
# Each cohort keeps its own row labels here; the frame is re-indexed after
# sorting by age further down.
TOPMRI = pd.concat([TOP, MRI])
TOPMRI.head(3) 

In [None]:
# Redundant: TOPMRI was concatenated from frames whose column names were
# already lower-cased, so this reassignment changes nothing.
TOPMRI.columns = TOPMRI.columns.str.lower()

In [None]:
# Coarse age covariate: age divided by ten and rounded to the nearest
# whole number.
TOPMRI['decade'] = TOPMRI['age'].div(10).round()
#TOPMRI['decade']

In [None]:
# Order the rows by age and renumber them 0..n-1; the previous row labels
# are kept in a new 'index' column (dropped again before export).
TOPMRI = TOPMRI.sort_values(by='age').reset_index()
# Finer age covariate: mean age of each row and the row before it.
# The first row has no predecessor, so its value is NaN at this point.
TOPMRI['fine_grain'] = TOPMRI['age'].rolling(window=2).sum().div(2)
#TOPMRI

In [None]:
# Give every even-positioned row (2, 4, 6, ...) the 'fine_grain' value of the
# odd-positioned row just before it (1, 3, 5, ...).
#
# The previous form assigned through a chained slice
# (`TOPMRI[2:].fine_grain.iloc[::2] = ...`). pandas may apply such a write to
# a temporary object and leave TOPMRI untouched, and the two selections have
# different lengths when the frame has an even number of rows. Assign by
# position on TOPMRI itself and trim the source to the number of target rows.
fine_grain_col = TOPMRI.columns.get_loc('fine_grain')
odd_row_values = TOPMRI['fine_grain'].iloc[1::2].to_numpy()
n_even_targets = len(range(2, len(TOPMRI), 2))
TOPMRI.iloc[2::2, fine_grain_col] = odd_row_values[:n_even_targets]
#TOPMRI['fine_grain']

In [None]:
# Fill the first row's 'fine_grain' with the second row's value (the rolling
# window leaves the first row without a value of its own).
# A single .loc call writes into TOPMRI directly; the chained form
# `TOPMRI['fine_grain'][0] = ...` may modify a temporary Series instead.
# Labels 0 and 1 are the first two rows because the index was reset after sorting.
TOPMRI.loc[0, 'fine_grain'] = TOPMRI.loc[1, 'fine_grain']
TOPMRI['fine_grain']

In [None]:
# List the distinct decade values present in the combined data.
TOPMRI['decade'].unique()

In [None]:
# Show the available column names before choosing the features to harmonize.
TOPMRI.columns

In [None]:
def _make_combat(age_covariate):
    """Build a Combat model that uses `age_covariate` as its only continuous covariate.

    Both models in this notebook share the same features, site column and
    discrete covariate; a fresh list is created on every call so the two
    models never share list objects.
    """
    return autocombat.Combat(
        features=[
            # Currently excluded:
            # 'gm_vol', 'wm_vol', 'csf_vol',
            # 'gm_icvratio', 'gmwm_icvratio', 'wmhvol_wmvol', 'wmh_count',
            # 'ld', 'pld', 'labelling', 'readout',
            'aca_b_cov', 'mca_b_cov', 'pca_b_cov', 'totalgm_b_cov',
            'aca_b_cbf', 'mca_b_cbf', 'pca_b_cbf', 'totalgm_b_cbf',
        ],
        sites=['site'],
        discrete_covariates=['sex'],
        continuous_covariates=[age_covariate],
    )


# Model with the coarse age covariate ('decade').
combat = _make_combat('decade')

# Model with the finer age covariate ('fine_grain').
fg_combat = _make_combat('fine_grain')


In [None]:
# Fit both models on the combined frame and print whatever fit() returns,
# the decade model first and the fine-grained model second.
for model in (combat, fg_combat):
    print(model.fit(TOPMRI))

In [None]:
# Print the fitted `gamma_star_` attribute of each model, the decade model
# first and the fine-grained model second.
for fitted_model in (combat, fg_combat):
    print(fitted_model.gamma_star_)

In [None]:
# Apply the fitted decade-covariate model to the combined frame and preview
# the first three rows of the result.
transformed_TOPMRI= combat.transform(TOPMRI)
transformed_TOPMRI.head(3)

In [None]:
# Apply the fitted fine-grained-covariate model to the combined frame and
# preview the first three rows of the result.
fg_transformed_TOPMRI= fg_combat.transform(TOPMRI)
fg_transformed_TOPMRI.head(3)

In [None]:
# Preview the untransformed combined frame for comparison with the
# transformed outputs.
TOPMRI.head(3)

In [None]:
# Repeats the earlier `combat.transform(TOPMRI)` cell and rebinds
# `transformed_TOPMRI`.
# NOTE(review): presumably yields the same frame as before -- confirm that
# transform() is deterministic, then this cell can be removed.
transformed_TOPMRI= combat.transform(TOPMRI)
transformed_TOPMRI.head(3)

In [None]:
# Repeats the earlier `fg_combat.transform(TOPMRI)` cell and rebinds
# `fg_transformed_TOPMRI`.
# NOTE(review): presumably yields the same frame as before -- confirm that
# transform() is deterministic, then this cell can be removed.
fg_transformed_TOPMRI= fg_combat.transform(TOPMRI)
fg_transformed_TOPMRI.head(3)

In [None]:
# Show the columns of the transformed frame; the helper columns listed here
# are dropped in the per-site splits below.
transformed_TOPMRI.columns

In [None]:
# Decade model output: keep the TOP rows (site 0) and strip the helper
# columns that were added only for harmonization.
TOP_transformed = (
    transformed_TOPMRI
    .loc[lambda frame: frame['site'] == 0]
    .drop(columns=['site', 'decade', 'fine_grain', 'index'])
)
TOP_transformed.head(3)

In [None]:
# Fine-grained model output: keep the TOP rows (site 0) and strip the helper
# columns that were added only for harmonization.
fg_TOP_transformed = (
    fg_transformed_TOPMRI
    .loc[lambda frame: frame['site'] == 0]
    .drop(columns=['site', 'decade', 'fine_grain', 'index'])
)
fg_TOP_transformed.head(3)

In [None]:
# Decade model output: keep the rows with site code 1 and strip the helper
# columns that were added only for harmonization.
# NOTE(review): assumes the StrokeMRI rows carry site == 1 -- confirm.
MRI_transformed = (
    transformed_TOPMRI
    .loc[lambda frame: frame['site'] == 1]
    .drop(columns=['site', 'decade', 'index', 'fine_grain'])
)
MRI_transformed.head(3)

In [None]:
# Fine-grained model output: keep the rows with site code 1 and strip the
# helper columns that were added only for harmonization.
# NOTE(review): assumes the StrokeMRI rows carry site == 1 -- confirm.
fg_MRI_transformed = (
    fg_transformed_TOPMRI
    .loc[lambda frame: frame['site'] == 1]
    .drop(columns=['site', 'decade', 'index', 'fine_grain'])
)
fg_MRI_transformed.head(3)

In [None]:
# Write the four harmonized frames to disk.
# to_csv does not create missing directories, so make sure the output
# folder exists first (no-op when it is already there).
os.makedirs('harm_results/autocombat', exist_ok=True)

# Output of the decade-covariate model.
TOP_transformed.to_csv('harm_results/autocombat/autocom_harm_top1.csv')
MRI_transformed.to_csv('harm_results/autocombat/autocom_harm_mri1.csv')

# Output of the fine-grained-covariate model.
fg_TOP_transformed.to_csv('harm_results/autocombat/fg_autocom_harm_top1.csv')
fg_MRI_transformed.to_csv('harm_results/autocombat/fg_autocom_harm_mri1.csv')

# Data quality check

In [None]:
#TOP

In [None]:
#neuro_harm_top

In [None]:
# Total number of missing values across all columns of the harmonized TOP
# frame (sum per column, then sum of those sums).
TOP_transformed.isna().sum().sum()

In [None]:
# Total number of missing values across all columns of the harmonized MRI
# frame (sum per column, then sum of those sums).
MRI_transformed.isna().sum().sum()

In [None]:
# Columns inspected by the negative-value checks below.
number_columns = [
    'sex',
    'gm_vol',
    'wm_vol',
    'csf_vol',
    'gm_icvratio',
    'gmwm_icvratio',
    'wmhvol_wmvol',
    'wmh_count',
    'aca_b_cov',
    'mca_b_cov',
    'pca_b_cov',
    'totalgm_b_cov',
    'aca_b_cbf',
    'mca_b_cbf',
    'pca_b_cbf',
    'totalgm_b_cbf',
]

In [None]:
#neuro_harm_top.head(3)
# Per-column count of negative values in the harmonized TOP frame.
(TOP_transformed[number_columns] < 0).sum()

In [None]:
# Per-column count of negative values in the harmonized MRI frame.
(MRI_transformed[number_columns] < 0).sum()

In [None]:
# Display the harmonized MRI frame for a final visual inspection.
MRI_transformed