# Neurocombat harmonized synthetic datasets

Note: this notebook must be run in the `neuron` environment.

## Import libraries

In [None]:
import os

import pandas as pd
import numpy as np

from neuroCombat import neuroCombat

## Let's make perturbed data
We will use our example data and perturb it.

In [None]:
# filepath = '../open_work/internal_results/cleaned_pvc2s/' 
# filename_mri = os.path.join(filepath,'StrokeMRI_pvc2c.csv') 
# mri= pd.read_csv(filename_mri)
# mri

In [None]:
# Load the sample dataset that serves as the starting point for both synthetic sites.
# (os.path.join with a single argument returns that path unchanged.)
filename_standard = os.path.join('../researcher_interface/sample_sep_values/showable_standard.csv') 
standard = pd.read_csv(filename_standard)

In [None]:
# Preview the first three rows of the loaded table.
standard.head(3)

In [None]:
# Work on an independent deep copy so that edits to alt_standard do not touch `standard`.
alt_standard = standard.copy(deep=True)
#alt_standard

In [None]:
# Synthetic participant ids for the copy: one per row, counting up from 100.
list_participants = list(range(100, 100 + len(alt_standard)))

In [None]:
# Overwrite the participant ids of the copy with the generated list.
alt_standard['participant_id'] = list_participants 
#alt_standard['participant_id']


In [None]:
# alt_standard = alt_standard.drop(['session_id','run_id'], axis=1)
# standard = standard.drop(['session_id','run_id'], axis=1)
# #two_selection = our_2_frame.drop(['Unnamed: 0'],axis=1)

In [None]:
# Tag every row of each dataset with a constant site label (copy -> 1, original -> 2).
# NOTE(review): prep_for_neurocombat_synth only drops 'session_id' and 'run_id',
# so this 'site' column is carried along as a row of the transposed table and,
# being added last, ends up among the feature rows -- confirm this is intended.
alt_standard[ 'site'] = 1
standard['site'] = 2

In [None]:
#alt_standard.isna().sum()
# Names of the numeric measurement columns that get perturbed below.
number_columns = [
    # volumes and ratios
    'gm_vol',
    'wm_vol',
    'csf_vol',
    'gm_icv_ratio',
    'gmwm_icv_ratio',
    'wmh_vol',
    'wmh_count',
    # cbf_* columns
    'cbf_gm_pvc0',
    'cbf_gm_pvc2',
    'cbf_wm_pvc0',
    'cbf_wm_pvc2',
    'cbf_aca_pvc0',
    'cbf_mca_pvc0',
    'cbf_pca_pvc0',
    'cbf_aca_pvc2',
    'cbf_mca_pvc2',
    'cbf_pca_pvc2',
    # cov_* columns
    'cov_gm_pvc0',
    'cov_gm_pvc2',
    'cov_wm_pvc0',
    'cov_wm_pvc2',
    'cov_aca_pvc0',
    'cov_mca_pvc0',
    'cov_pca_pvc0',
    'cov_aca_pvc2',
    'cov_mca_pvc2',
    'cov_pca_pvc2',
    # extra columns
    'Additional_1',
    'Additional_2',
]

In [None]:
# Perturb the copy: scale every numeric column by 10 and shift it by 2.
alt_standard[number_columns] = alt_standard[number_columns] * 10 + 2

In [None]:
# Perturb the original frame as well (scale by 5, shift by 2), so neither
# dataset keeps the raw values from the CSV.
standard[number_columns] = standard[number_columns] * 5 + 2

In [None]:
# Summary statistics of the x5 + 2 dataset.
standard.describe()

In [None]:
# Summary statistics of the x10 + 2 dataset, for comparison with the one above.
alt_standard.describe()

In [None]:
def prep_for_neurocombat_synth(our_2_frame, our_1_frame):
    """
    Reshape two dataframes in the cvasl format into the pieces needed
    to run the neurocombat algorithm and to re-identify its output.

    Both frames have their 'session_id' and 'run_id' columns dropped, are
    indexed by 'participant_id' and transposed, so that every column is one
    participant. The participants of `our_1_frame` are placed before those of
    `our_2_frame`; only rows present in both frames are kept (inner join).
    The total NaN count of the combined table is printed.

    Rows from position 2 onward are treated as features. Presumably the
    first two rows are the covariates (e.g. age and sex) -- verify this
    against the column order of the input data.

    :param our_2_frame: dataframe whose participants become the last columns
    :param our_1_frame: dataframe whose participants become the first columns
    :returns: tuple of six items:
        both_togetherF -- combined table, one column per participant,
        ftF -- feature rows only, positional index, rows containing NaN removed,
        btF -- all rows, positional index, rows containing NaN removed,
        feature_dictF -- dict mapping feature row position to feature name,
        len1 -- number of participants taken from `our_1_frame`,
        len2 -- number of participants taken from `our_2_frame`
    """
    unused_columns = ['session_id', 'run_id']
    two_selection = our_2_frame.drop(unused_columns, axis=1)
    one_selection = our_1_frame.drop(unused_columns, axis=1)
    # Transpose so that participants are columns and measurements are rows.
    one_selection = one_selection.set_index('participant_id').T
    two_selection = two_selection.set_index('participant_id').T
    both_togetherF = pd.concat([one_selection, two_selection], axis=1, join="inner")
    print("Nan count", both_togetherF.isna().sum().sum())
    features_only = both_togetherF[2:]
    # Remember which feature name sat at which row position, because the
    # names are discarded when the index is reset below.
    feature_dictF = dict(enumerate(features_only.index))
    # Replace the name index by a positional one, then drop incomplete rows.
    ftF = features_only.reset_index(drop=True).dropna()
    btF = both_togetherF.reset_index(drop=True).dropna()
    len1 = len(one_selection.columns)
    len2 = len(two_selection.columns)
    return both_togetherF, ftF, btF, feature_dictF, len1, len2

In [None]:
# `standard` is passed as our_2_frame and `alt_standard` as our_1_frame, so the
# alt_standard participants form the first len1 columns and standard the last len2.
both_togetherF, ftF, btF, feature_dictF, len1, len2 = prep_for_neurocombat_synth(standard, alt_standard)

In [None]:
# our_mri_data = pd.read_csv(filename_mri)
# our_top_data = pd.read_csv(filename_top)

In [None]:
# # save off csv
# both_together.to_csv('both_top_mri_together.csv')

In [None]:
# # make and save of csv of features only
# features_only = both_together[2:]
# #features_only.to_csv('features_only_top_mri.csv')

In [None]:
# Write the feature table to disk (in the current working directory).
ftF.to_csv('ftF_synth.csv')

In [None]:
# Read the feature table back as a plain numeric array, skipping the header row.
data = np.genfromtxt('ftF_synth.csv', delimiter=",", skip_header=1)
# Drop the first column, which holds the row index that to_csv wrote out.
data = data[:, 1:]
data

In [None]:
# Covariate table for neuroCombat: one row per participant column of the data,
# with the batch (scanner) label plus the sex and age rows of the combined table.
first_columns_as_one = [1] * len1
last_columns_as_two = [2] * len2
covars = pd.DataFrame(
    {
        'batch': first_columns_as_one + last_columns_as_two,
        'sex': both_togetherF.loc['sex', :].values.tolist(),
        'age': both_togetherF.loc['age', :].values.tolist(),
    }
)

In [None]:
# Specify sex as categorical
categorical_cols = ['sex']
# To specify the name of the variable that encodes for the scanner/batch covariate:
batch_col_mine = 'batch'

# Harmonization step; only the harmonized array ("data") of the result is kept.
# NOTE(review): covars also contains an 'age' column, but it is not passed here
# (e.g. via continuous_cols) -- confirm whether age should be preserved as well.
data_combat = neuroCombat(dat=data,
    covars=covars,
    batch_col=batch_col_mine,
    categorical_cols=categorical_cols)["data"]

In [None]:
# Wrap the harmonized array in a DataFrame (default integer row and column labels).
neurocombat = pd.DataFrame(data_combat)
neurocombat

In [None]:
def make_topper(btF, row0, row1):
    """
    This function makes the top rows for a harmonized table out of the
    btF part produced by the prep_for_neurocombat-style functions.

    The first two rows of `btF` are taken, the name of the column axis is
    removed, and the old index is turned into a leading column called
    'char' whose entries are replaced by the given row labels.

    :param btF: dataframe whose first two rows are to be labelled
    :param row0: label to give the first row (e.g. 'age')
    :param row1: label to give the second row (e.g. 'sex')
    :returns: new dataframe with a 'char' label column followed by the
        columns of `btF`; `btF` itself is not modified
    """
    topperF = btF.head(2).rename_axis(None, axis="columns")
    topperF = topperF.reset_index(drop=False)
    topperF = topperF.rename(columns={"index": "char"})
    # Replace the whole label column in one assignment. The previous chained
    # form (topperF['char'][0] = ...) writes to a temporary object under
    # pandas Copy-on-Write and would leave topperF unchanged.
    labels = [row0, row1]
    # Trim the labels so that a btF with fewer than two rows still works.
    topperF['char'] = labels[:len(topperF)]
    return topperF

In [None]:
# Label the first two rows of btF as 'age' and 'sex'.
# NOTE(review): this assumes the first two rows of btF are age then sex, in
# that order -- verify against the column order of the input data.
topperF = make_topper(btF,'age', 'sex')

In [None]:
# Give the harmonized rows a leading 'char' column holding their row number,
# then relabel all columns to match the topper so the two tables can be stacked.
bottom = neurocombat.reset_index(drop=False).rename(columns={"index": "char"})
bottom.columns = topperF.columns

In [None]:
# Stack the labelled top rows above the harmonized rows, then transpose the result.
back_together = pd.concat([topperF, bottom]).T

In [None]:
# After the transpose, the first row holds the former 'char' column (the row
# labels); promote it to column names and then drop it from the data.
new_header = back_together.iloc[0] #grab the first row for the header
back_together.columns = new_header #set the header row as the df header
back_together = back_together[1:]
#back_together

In [None]:
# Split the rows back into the two datasets by position: the first len1 rows
# and the last len2 rows, matching the column order used when combining them.
neuro_harm_top =back_together.tail(len2)
neuro_harm_mri =back_together.head(len1)

In [None]:
# Map the numeric column labels back to feature names using the
# position -> name dictionary built during preparation.
neuro_harm_top_synth = neuro_harm_top.rename(feature_dictF, axis='columns')
neuro_harm_mri_synth = neuro_harm_mri.rename(feature_dictF, axis='columns')

In [None]:
# Harmonized rows taken from the head of the combined table (first len1 participants).
neuro_harm_mri_synth

In [None]:
# Harmonized rows taken from the tail of the combined table (last len2 participants).
neuro_harm_top_synth

In [None]:
# neuro_harm_mri.to_csv('neuro_harm_mri.csv')
# neuro_harm_top.to_csv('neuro_harm_top.csv')

# Investigate neurocombat results