# Neurocombat TOP-StrokeMRI (TOPMRI) and SABRE harmonized datasets

Note: this notebook must be run in the `neuron` environment.

## import libraries

In [None]:
import os

import pandas as pd
import numpy as np

from neuroCombat import neuroCombat

## Let's see how we would apply this to our data
We will have to flip it on its side (transpose it) to make it work.

In [None]:
# Directory that holds the cleaned input tables.
filepath = '../open_work/internal_results/cleaned_pvc2s/'
# One CSV per cohort, all located in the directory above.
filename_mri, filename_top, filename_sabre = (
    os.path.join(filepath, csv_name)
    for csv_name in (
        'StrokeMRI_pvc2c.csv',
        'TOP_pvc2c.csv',
        'SABRE_pvc2_cleaned.csv',
    )
)

In [None]:
# Load the three cohort tables (StrokeMRI, TOP, SABRE) and preview SABRE.
our_mri_data, our_top_data, our_sabre_data = (
    pd.read_csv(csv_path)
    for csv_path in (filename_mri, filename_top, filename_sabre)
)
our_sabre_data.head(3)

In [None]:
# Keep only the SABRE columns that also appear in the StrokeMRI table,
# in the StrokeMRI column order (raises KeyError if SABRE lacks one).
shared_columns = our_mri_data.columns
our_sabre_data = our_sabre_data[shared_columns]
our_sabre_data.head(3)

In [None]:
# Count missing values per column in the SABRE table.
our_sabre_data.isna().sum()

In [None]:
# Preview the first rows of the StrokeMRI table.
our_mri_data.head(3)

In [None]:
# Stack the StrokeMRI and TOP tables row-wise into one combined TOPMRI table.
our_topmri_data = pd.concat([our_mri_data , our_top_data ])
our_topmri_data.head(3)

In [None]:
# Count missing values per column in the combined TOPMRI table.
our_topmri_data.isna().sum()

In [None]:
# Write the combined table to the working directory. The row index is
# written too (to_csv default), which read_csv later loads as 'Unnamed: 0'.
our_topmri_data.to_csv('topmri_for_sabre.csv')

In [None]:
# File name of the combined TOPMRI table saved by the previous cell.
filename_topmri = 'topmri_for_sabre.csv'

In [None]:
def prep_for_neurocombat(dataframename1, dataframename2):
    """
    Prepare two cvasl-format CSV files for the neuroCombat algorithm.

    Both files are read, an ``Unnamed: 0`` column (a saved pandas index) is
    discarded when present, and each table is transposed so that
    participants become columns and variables become rows.  The transposed
    tables are then joined side by side on the rows they have in common.

    Parameters
    ----------
    dataframename1, dataframename2 : str or path-like
        CSV files that contain a ``participant_id`` column.  After
        transposition the first two rows are treated as covariates and
        everything below them as features (the calling notebook relies on
        those two rows being age and sex -- verify for new inputs).

    Returns
    -------
    both_togetherF : pandas.DataFrame
        Variables x participants; participants of the first file come first.
    ftF : pandas.DataFrame
        Feature rows only (all rows after the first two), with a positional
        index and rows containing NaN removed.
    btF : pandas.DataFrame
        All rows of ``both_togetherF`` with a positional index and rows
        containing NaN removed.
    feature_dictF : dict
        Maps the position of every feature row (numbered before NaN rows
        are removed, i.e. matching the index of ``ftF``) to its name, so
        harmonized rows can be re-identified.
    len1, len2 : int
        Number of participants contributed by the first and second file.

    Raises
    ------
    KeyError
        If either file has no ``participant_id`` column.
    """
    transposed = []
    for csv_path in (dataframename1, dataframename2):
        frame = pd.read_csv(csv_path)
        # 'Unnamed: 0' only exists when the CSV was saved with its index;
        # tolerate files written with index=False instead of raising.
        frame = frame.drop(columns=['Unnamed: 0'], errors='ignore')
        transposed.append(frame.set_index('participant_id').T)
    one_selection, two_selection = transposed

    both_togetherF = pd.concat([one_selection, two_selection], axis=1, join="inner")
    print("Nan count",both_togetherF.isna().sum().sum())

    # Rows 0 and 1 are the covariates; the rest are the features to harmonize.
    features_only = both_togetherF.iloc[2:]
    feature_dictF = dict(enumerate(features_only.index))

    # Replace the variable names by a positional index, then drop NaN rows.
    # The index is reset *before* dropna so surviving rows keep the position
    # that feature_dictF refers to.
    ftF = features_only.reset_index(drop=True).dropna()
    btF = both_togetherF.reset_index(drop=True).dropna()

    len1 = len(one_selection.columns)
    len2 = len(two_selection.columns)
    return both_togetherF, ftF, btF, feature_dictF, len1, len2

In [None]:
# Build the neuroCombat inputs from the TOPMRI (first) and SABRE (second) CSV files.
both_togetherF, ftF, btF, feature_dictF, len1, len2 = prep_for_neurocombat(filename_topmri, filename_sabre)

In [None]:
# Save the feature-only matrix; to_csv also writes the row index as the first column.
ftF.to_csv('ftF_topmri_sabre.csv')

In [None]:
# Reload the saved feature matrix as a plain array (header row skipped) and
# cut off the first column, which holds the row index written by to_csv.
data = np.genfromtxt(
    'ftF_topmri_sabre.csv',
    delimiter=",",
    skip_header=1,
)[:, 1:]
data

In [None]:
# Covariate table for neuroCombat: the batch (scanner) label of every
# participant -- 1 for the first dataset, 2 for the second -- together with
# the sex and age rows of the combined frame, which are to be preserved.
first_columns_as_one = [1] * len1
last_columns_as_two = [2] * len2
covars = pd.DataFrame(
    {
        'batch': first_columns_as_one + last_columns_as_two,
        'sex': both_togetherF.loc['sex', :].values.tolist(),
        'age': both_togetherF.loc['age', :].values.tolist(),
    }
)

In [None]:
# Covariate roles: sex is categorical, age is continuous.
categorical_cols = ['sex']
our_continuous_col = ['age']
# Name of the covariate column that encodes the scanner/batch.
batch_col_mine = 'batch'
# Harmonization step: run neuroCombat and keep only the harmonized data.
data_combat = neuroCombat(
    dat=data,
    covars=covars,
    batch_col=batch_col_mine,
    categorical_cols=categorical_cols,
    continuous_cols=our_continuous_col,
)["data"]

# (n_dat, covars, batch_col, continuous_cols=continuous_cols,
#                                     categorical_cols=categorical_cols)

In [None]:
# Wrap the harmonized array in a DataFrame (default integer labels) and display it.
neurocombat = pd.DataFrame(data_combat)
neurocombat

In [None]:
def make_topper(btF, row0, row1):
    """
    Build the two labelled top rows that go above harmonized data.

    The first two rows of ``btF`` (the frame produced by
    ``prep_for_neurocombat(dataframename1, dataframename2)``) are copied,
    the name of the column axis is cleared, and a leading ``char`` column
    is added that carries the label of each row.

    Parameters
    ----------
    btF : pandas.DataFrame
        Frame whose first two rows are the rows to keep; it is not modified.
    row0, row1 : str
        Labels for the first and second row (e.g. ``'age'`` and ``'sex'``).

    Returns
    -------
    pandas.DataFrame
        Two rows indexed 0 and 1, with ``char`` as the first column followed
        by the columns of ``btF``.

    Raises
    ------
    ValueError
        If ``btF`` has fewer than two rows, or already has a ``char`` column.
    """
    topperF = (
        btF.head(2)
        .rename_axis(None, axis="columns")
        .reset_index(drop=True)
    )
    # Insert the labels as one whole column.  The former chained assignment
    # (topperF['char'][0] = ...) is not guaranteed to write back to the frame
    # and does nothing under pandas Copy-on-Write.
    topperF.insert(0, 'char', [row0, row1])
    return topperF

In [None]:
# Rebuild the two top rows of btF, labelled 'age' and 'sex'.
topperF = make_topper(btF,'age', 'sex')

In [None]:
# Move the harmonized rows' index into a leading "char" column, then adopt
# the column labels of topperF so that the two frames can be stacked.
bottom = (
    neurocombat
    .reset_index(drop=False)
    .rename(columns={"index": "char"})
)
bottom.columns = topperF.columns

In [None]:
# Put the labelled top rows above the harmonized rows, then transpose so
# that the former columns (participants) become rows again.
back_together = pd.concat([topperF, bottom]).T

In [None]:
# The first row holds the "char" labels: use it as the column header ...
new_header = back_together.iloc[0]
back_together.columns = new_header
# ... and keep only the rows below it as data.
back_together = back_together.iloc[1:]

In [None]:
# Preview the reassembled table.
back_together.head(3)

In [None]:
# Split by position: the last len2 rows belong to SABRE (second file),
# the first len1 rows to TOPMRI (first file).
neuro_harm_sabre =back_together.tail(len2)
neuro_harm_topmri =back_together.head(len1)

In [None]:
# Replace the positional column labels with the feature names recorded in feature_dictF.
neuro_harm_topmri = neuro_harm_topmri.rename(feature_dictF, axis='columns')
neuro_harm_sabre = neuro_harm_sabre.rename(feature_dictF, axis='columns')

In [None]:
# Turn the index back into a regular "participant_id" column and display.
neuro_harm_topmri = (
    neuro_harm_topmri
    .reset_index()
    .rename(columns={"index": "participant_id"})
)
neuro_harm_topmri

Save off to harmonized csv files

In [None]:
# Turn the index back into a regular "participant_id" column and display.
neuro_harm_sabre = (
    neuro_harm_sabre
    .reset_index()
    .rename(columns={"index": "participant_id"})
)
neuro_harm_sabre

In [None]:
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing the two harmonized tables.
os.makedirs('newly', exist_ok=True)
neuro_harm_topmri.to_csv('newly/neuro_harm_topmri_4sabre.csv')
neuro_harm_sabre.to_csv('newly/neuro_harm_sabre_4topmri.csv')

In [None]:
# Count missing values per column in the harmonized TOPMRI table.
neuro_harm_topmri.isna().sum()

In [None]:
# Count missing values per column in the harmonized SABRE table.
neuro_harm_sabre.isna().sum()

In [None]:
#(rerun 10/11/2023)