# CovBat: TOP and StrokeMRI vs. SABRE harmonized datasets

Note: this notebook must be run in the `covbat` environment. The environment file is located inside `vendor/covbat`.

## import libraries

In [None]:
import os
import sys

import pandas as pd
import numpy as np

import patsy

sys.path.insert(0, '../../') # path to functions

import cvasl.harmony as har
import cvasl.vendor.covbat.covbat as covbat

## import data

In [None]:
# Load the three cohort tables used in this notebook.
# TOP and StrokeMRI are later pooled into one batch; SABRE is the other batch.
TOP = pd.read_csv('../new_data/TrainingDataComplete_TOP.csv')
MRI = pd.read_csv('../new_data/TrainingDataComplete_StrokeMRI.csv')
SABRE = pd.read_csv('../new_data/TrainingDataComplete_SABRE.csv')


In [None]:
# Preview the first three rows of the SABRE table.
SABRE.head(3)

In [None]:
# Remove the 'ID' column from TOP, then preview the result.
TOP = TOP.drop(columns=['ID'])
TOP.head(3)

In [None]:
# Remove the 'ID' column from StrokeMRI, then show its last five rows.
MRI = MRI.drop(columns=['ID'])
MRI.tail(5)

In [None]:
# Remove the 'ID' column and label every SABRE row as batch/site 3.
SABRE = SABRE.drop(columns=['ID']).assign(Site=3)
SABRE.tail(5)

In [None]:
# Pool TOP and StrokeMRI into a single table and label every row as
# batch/site 2, so the two cohorts are treated as one batch against SABRE.
TOPMRI = pd.concat([TOP, MRI]).assign(Site=2)
TOPMRI.head(3)

In [None]:
# Show the last three rows of the pooled TOP + StrokeMRI table.
TOPMRI.tail(3)

In [None]:
# Columns removed to obtain the *_semi_features tables; what remains there is
# merged back onto the harmonized values later in the notebook.
to_be_harmonized_or_covar = [
    'Age', 'Sex',
    'ACA_B_CoV', 'MCA_B_CoV', 'PCA_B_CoV', 'DeepWM_B_CoV',
    'ACA_B_CBF', 'MCA_B_CBF', 'PCA_B_CBF',
    'LD', 'PLD',
    'DeepWM_B_CBF',
    'Labelling', 'Readout', 'M0',
    'TotalGM_B_CoV',
    'TotalGM_B_CBF',
]

# Columns removed from the tables that go on to harmonization.
# 'LD', 'PLD', 'Labelling', 'Readout', 'M0', 'DeepWM_B_CoV' and 'DeepWM_B_CBF'
# appear in both lists, so they end up in neither set of tables.
not_harmonized = [
    'GM_vol', 'WM_vol', 'CSF_vol',
    'GM_ICVRatio', 'GMWM_ICVRatio',
    'WMHvol_WMvol', 'WMH_count',
    'LD', 'PLD',
    'Labelling', 'Readout', 'M0',
    'DeepWM_B_CoV', 'DeepWM_B_CBF',
]

# The side tables must be taken before TOPMRI / SABRE are overwritten below.
TOPMRI_semi_features = TOPMRI.drop(columns=to_be_harmonized_or_covar)
SABRE_semi_features = SABRE.drop(columns=to_be_harmonized_or_covar)

TOPMRI = TOPMRI.drop(columns=not_harmonized)
SABRE = SABRE.drop(columns=not_harmonized)

In [None]:
# Stack the TOP + StrokeMRI batch on top of the SABRE batch. The rows keep
# their original index labels, since ignore_index is not passed.
TOPMRISABRE = pd.concat([TOPMRI, SABRE])

In [None]:
# Covariate table: Age, Sex and Site for every participant, indexed by
# participant_id.
phenoTOPMRISABRE = TOPMRISABRE.set_index('participant_id')[['Age', 'Sex', 'Site']]
phenoTOPMRISABRE.head(3)

In [None]:
# Index by participant_id and transpose, giving one row per variable and one
# column per participant.
dat_TOPMRISABRE = TOPMRISABRE.set_index('participant_id').T

In [None]:
# Show the first three rows (variables) of the transposed table.
dat_TOPMRISABRE.head(3)

In [None]:
# Show the first three rows of the covariate table again.
phenoTOPMRISABRE.head(3)

In [None]:
# Design matrix for the formula "~ Age + Sex", built from the covariate table
# and returned as a DataFrame.
modZ = patsy.dmatrix(
    "~ Age + Sex",
    data=phenoTOPMRISABRE,
    return_type="dataframe",
)
modZ.head(3)

In [None]:
# Run the vendored `combat` routine on the last ten rows of dat_TOPMRISABRE,
# passing the Site column as the second argument (presumably the batch
# labels — confirm), modZ as the model and "Age" as the numerical covariate.
# NOTE(review): tail(10) selects rows by position, so it relies on the column
# order of the input CSVs — confirm which variables it actually covers.
# NOTE(review): this calls `covbat.combat`, not `covbat.covbat`; confirm that
# this entry point is the one intended for a CovBat notebook.
covbatTOPMRISABRE = covbat.combat(
    dat_TOPMRISABRE.tail(10),
    phenoTOPMRISABRE['Site'],
    model=modZ,
    numerical_covariates="Age",
)

In [None]:
# Display the raw output of the harmonization call.
covbatTOPMRISABRE

In [None]:
# Discard the first two rows of the harmonization output by position.
# NOTE(review): presumably these are non-feature rows that were swept in by
# the tail(10) selection passed to combat — confirm against the row order of
# dat_TOPMRISABRE.
covbatTOPMRISABRE = covbatTOPMRISABRE[2:]
covbatTOPMRISABRE

In [None]:
# Show the first three rows of the un-harmonized table; these are the rows
# that get prepended to the harmonized rows in the next cell.
dat_TOPMRISABRE.head(3)

In [None]:
# Put the first three original rows back on top of the harmonized rows,
# transpose to one row per participant, and turn the participant_id index
# back into an ordinary column.
covbatTOPMRISABRE = (
    pd.concat([dat_TOPMRISABRE.head(3), covbatTOPMRISABRE])
    .T
    .reset_index()
)

In [None]:
# Display the reassembled per-participant table.
covbatTOPMRISABRE

In [None]:
# Split the harmonized table back into its two batches using the Site label
# (2 = TOP + StrokeMRI, 3 = SABRE).
TOPMRI_adjusted = covbatTOPMRISABRE.loc[covbatTOPMRISABRE['Site'].eq(2)]
SABRE_adjusted = covbatTOPMRISABRE.loc[covbatTOPMRISABRE['Site'].eq(3)]

In [None]:
# The Site label is no longer needed on the TOP + StrokeMRI table.
TOPMRI_adjusted = TOPMRI_adjusted.drop(columns='Site')
TOPMRI_adjusted.head(3)

In [None]:
# The Site label is no longer needed on the SABRE table.
SABRE_adjusted = SABRE_adjusted.drop(columns='Site')
SABRE_adjusted.tail(3)

In [None]:
# Re-attach the columns that were set aside in the *_semi_features tables,
# matching rows on participant_id.
TOPMRI_adjusted = pd.merge(TOPMRI_adjusted, TOPMRI_semi_features, on='participant_id')
SABRE_adjusted = pd.merge(SABRE_adjusted, SABRE_semi_features, on='participant_id')

In [None]:
# Drop the 'Site' column from both merged tables.
TOPMRI_adjusted = TOPMRI_adjusted.drop(columns='Site')
SABRE_adjusted = SABRE_adjusted.drop(columns='Site')

In [None]:
# Write both harmonized tables to disk. The output directory is created first
# so that a fresh checkout does not fail because the folder is missing.
os.makedirs('harm_results/covbat', exist_ok=True)
TOPMRI_adjusted.to_csv('harm_results/covbat/topmri_covbat_a_SABRE.csv')
SABRE_adjusted.to_csv('harm_results/covbat/sabre_covbat_a_topmri.csv')

In [None]:
# Display the final TOP + StrokeMRI table that was written to disk.
TOPMRI_adjusted

In [None]:
# Report the (rows, columns) size of the final TOP + StrokeMRI table.
TOPMRI_adjusted.shape

In [None]:
# Sanity check: list any expected column that is missing from the final
# TOP + StrokeMRI table. An empty set means every expected column is present.
ultraset = {
    'participant_id', 'Age', 'Sex',
    'ACA_B_CoV', 'MCA_B_CoV', 'PCA_B_CoV', 'TotalGM_B_CoV',
    'ACA_B_CBF', 'MCA_B_CBF', 'PCA_B_CBF', 'TotalGM_B_CBF',
    'GM_vol', 'WM_vol', 'CSF_vol',
    'GM_ICVRatio', 'GMWM_ICVRatio',
    'WMHvol_WMvol', 'WMH_count',
}
TOP_final_set = set(TOPMRI_adjusted.columns)
ultraset.difference(TOP_final_set)
