In [None]:
import os
import sys

import pandas as pd
import numpy as np

from neuroCombat import neuroCombat

sys.path.insert(0, '../../') # path to functions

import cvasl.harmony as har

In [None]:
# Datasets for this work: one directory per cohort, each holding a CSV
# with the same file name.
EDIS_path = '../our_datasets/EDIS/'
HELIUS_path = '../our_datasets/HELIUS/'
Insight46_path = '../our_datasets/Insight46/'
SABRE_path = '../our_datasets/SABRE/'
MRI_path = '../our_datasets/StrokeMRI/'
TOP_path = '../our_datasets/TOP/'
file_name = 'TrainingDataComplete.csv'

# Full path of each cohort's CSV
TOP_file = os.path.join(TOP_path, file_name)
MRI_file = os.path.join(MRI_path, file_name)
EDIS_file = os.path.join(EDIS_path, file_name)
SABRE_file = os.path.join(SABRE_path, file_name)
Insight46_file = os.path.join(Insight46_path, file_name)
HELIUS_file = os.path.join(HELIUS_path, file_name)

# Load every cohort; the first CSV column becomes the row index.
TOP = pd.read_csv(TOP_file, index_col=0)
MRI = pd.read_csv(MRI_file, index_col=0)
HELIUS = pd.read_csv(HELIUS_file, index_col=0)
EDIS = pd.read_csv(EDIS_file, index_col=0)
SABRE = pd.read_csv(SABRE_file, index_col=0)
INSI = pd.read_csv(Insight46_file, index_col=0)

In [None]:
# Show the HELIUS row(s) whose participant_id is 'sub-153852_1'.
HELIUS[HELIUS['participant_id']=='sub-153852_1']

In [None]:
# Show the SABRE row(s) whose participant_id is 'sub-153852_1'.
SABRE[SABRE['participant_id']=='sub-153852_1']

In [None]:
# Rename participant 'sub-153852_1' in HELIUS by appending an 'H' suffix.
# NOTE(review): presumably this avoids a clash with the identical id looked up
# in SABRE, since participant ids later become column labels -- confirm.
HELIUS.loc[HELIUS['participant_id']=='sub-153852_1', 'participant_id'] = 'sub-153852_1H'

In [None]:
# List the distinct values used in the HELIUS 'Sex' column.
HELIUS.Sex.unique()

In [None]:
# Preview the first three HELIUS rows.
HELIUS.head(3)

In [None]:
# Recode the Sex column from the codes {1, 2} to {0, 1}.
# Caution: Series.map turns every value that is not a key of the dict into
# NaN, so running this cell again on the already-recoded frame corrupts Sex.
sex_mapping = {1: 0, 2: 1}

EDIS = EDIS.assign(Sex=lambda frame: frame['Sex'].map(sex_mapping))
EDIS.head(3)

In [None]:
# Apply the same Sex recoding to INSI (not safe to run twice: values that are
# not keys of sex_mapping become NaN).
INSI = INSI.assign(Sex=lambda frame: frame['Sex'].map(sex_mapping))
INSI.head(3)

In [None]:
# Apply the same Sex recoding to SABRE (not safe to run twice: values that are
# not keys of sex_mapping become NaN).
SABRE = SABRE.assign(Sex=lambda frame: frame['Sex'].map(sex_mapping))
SABRE.head(3)

In [None]:
# Remove the 'Site' and 'ID' columns from TOP.
# Re-running this cell raises KeyError because the columns are already gone.
TOP = TOP.drop(columns=['Site', 'ID'])
TOP.head(3)

In [None]:
# Remove the 'Site' and 'ID' columns from MRI.
# Re-running this cell raises KeyError because the columns are already gone.
MRI = MRI.drop(columns=['Site', 'ID'])
MRI.tail(3)

In [None]:
# Stack MRI on top of TOP (row-wise) to form a single TOPMRI dataset.
TOPMRI = pd.concat([MRI, TOP], axis=0)
TOPMRI.head(3)

In [None]:
# Remove the 'Site' and 'ID' columns from HELIUS.
# Re-running this cell raises KeyError because the columns are already gone.
HELIUS = HELIUS.drop(columns=['Site', 'ID'])
HELIUS.tail(3)

In [None]:
# Remove the 'Site' and 'ID' columns from the remaining three datasets.
# Re-running this cell raises KeyError because the columns are already gone.
EDIS = EDIS.drop(columns=['Site', 'ID'])
INSI = INSI.drop(columns=['Site', 'ID'])
SABRE = SABRE.drop(columns=['Site', 'ID'])

In [None]:

def prep_for_neurocombat_5way(dataframe1, dataframe2, dataframe3, dataframe4, dataframe5):
    """
    This function takes five dataframes in the cvasl format,
    then turns them into the items needed for the
    neurocombat algorithm with re-identification.

    Each frame is indexed by its ``participant_id`` column and transposed,
    so participants become columns and variables become rows. The five
    transposed frames are then placed side by side, keeping only the rows
    (variables) that occur in all of them.

    :param dataframe1: first dataset, must contain a ``participant_id`` column
    :type dataframe1: `~pandas.DataFrame`
    :param dataframe2: second dataset, same layout
    :type dataframe2: `~pandas.DataFrame`
    :param dataframe3: third dataset, same layout
    :type dataframe3: `~pandas.DataFrame`
    :param dataframe4: fourth dataset, same layout
    :type dataframe4: `~pandas.DataFrame`
    :param dataframe5: fifth dataset, same layout
    :type dataframe5: `~pandas.DataFrame`

    :returns: ``(all_togetherF, ftF, btF, feature_dictF, len1, len2, len3, len4, len5)``
        where ``all_togetherF`` is the joined variables-by-participants frame,
        ``ftF`` is that frame without its first two rows, with a fresh integer
        index and rows containing NaN removed, ``btF`` is the same but keeping
        all rows, ``feature_dictF`` maps row position to feature name (built
        before NaN rows are removed), and ``len1`` .. ``len5`` are the
        participant counts of the five inputs, in argument order.
    :rtype: tuple
    :raises KeyError: if a frame has no ``participant_id`` column
    """
    # TODO:(makeda) make so it can take frame name or frame

    inputs = (dataframe1, dataframe2, dataframe3, dataframe4, dataframe5)

    # index by participant ID, then turn each dataframe on its side
    sideways = [frame.set_index('participant_id').T for frame in inputs]

    # join the five dataframes side by side on their shared rows
    all_togetherF = pd.concat(sideways, axis=1, join="inner")

    # features-only frame: everything after the first two rows, which are
    # taken positionally (expected to be age and sex in the cvasl format)
    features_only = all_togetherF[2:]

    # row position -> feature name, used later for re-identification
    feature_dictF = dict(enumerate(features_only.index))

    # drop=True discards the old row labels directly; this also works when
    # the row index carries a name
    ftF = features_only.reset_index(drop=True).dropna()
    btF = all_togetherF.reset_index(drop=True).dropna()

    # participants per dataset, needed to build batch labels afterwards
    len1, len2, len3, len4, len5 = (len(frame.columns) for frame in sideways)

    return all_togetherF, ftF, btF, feature_dictF, len1, len2, len3, len4, len5

In [None]:
# Build the neuroCombat inputs with the function defined in this notebook.
# The argument order determines which dataset gets which length (len1..len5).
all_togetherF, ftF, btF, feature_dictF, len1, len2, len3, len4, len5 = prep_for_neurocombat_5way(TOPMRI, HELIUS, EDIS, SABRE, INSI)

In [None]:
# Same call, but through cvasl.harmony; running this cell overwrites the
# results produced by the notebook-local function with the packaged version's.
all_togetherF, ftF, btF, feature_dictF, len1, len2, len3, len4, len5 = har.prep_for_neurocombat_5way(TOPMRI, HELIUS, EDIS, SABRE, INSI)

`ftF` is the features-only frame: it is stripped down so that it can be turned into a pure numeric array.

In [None]:
# Display the joined frame (variables as rows, participants as columns).
all_togetherF

In [None]:
#both_togetherF

In [None]:
# Write the features-only frame to 'ftF_top5.csv' in the working directory.
# to_csv also writes the row index as the first column by default.
ftF.to_csv('ftF_top5.csv')

In [None]:
# Read the feature table back as a NumPy array: skip the header row and slice
# off the first column, which holds the row index written by to_csv.
data = np.genfromtxt('ftF_top5.csv', delimiter=",", skip_header=1)[:, 1:]
data

In [None]:
# Check the array dimensions (rows, columns) before harmonization.
data.shape

In [None]:
# Specifying the batch (scanner variable) as well as a sex covariate to preserve:
first_columns_as_one = [1] * len1
second_columns_as_two = [2] * len2
third_columns_as_three = [3] * len3
fourth_columns_as_four = [4] * len4
fifth_columns_as_five = [5] * len5
covars = {'batch':first_columns_as_one + second_columns_as_two + third_columns_as_three + fourth_columns_as_four + fifth_columns_as_five,
          #'Site': both_togetherF.loc['Site',:].values.tolist(), 
          'Sex':all_togetherF.loc['Sex',:].values.tolist(),
           'Age':all_togetherF.loc['Age',:].values.tolist(),} 
covars = pd.DataFrame(covars) 

In [None]:
# specifify sex as categorical
categorical_cols = ['Sex']
# To specify the name of the variable that encodes for the scanner/batch covariate:
batch_col_mine = 'batch'
our_continuous_col=['Age']
#Harmonization step:
data_combat = neuroCombat(dat=data,
    covars=covars,
    batch_col=batch_col_mine,
    continuous_cols=our_continuous_col,
    categorical_cols=categorical_cols)["data"]

# (n_dat, covars, batch_col, continuous_cols=continuous_cols,
#                                     categorical_cols=categorical_cols)

In [None]:
# Wrap the harmonized array in a DataFrame for display and further reshaping.
neurocombat = pd.DataFrame(data_combat)
neurocombat

In [None]:
# har.make_topper comes from cvasl.harmony and is not shown in this notebook.
# NOTE(review): presumably it extracts the header plus the 'Age' and 'Sex' rows
# of btF so they can be stacked above the harmonized features -- confirm.
topperF = har.make_topper(btF,'Age', 'Sex')

In [None]:
# Turn the row index of the harmonized frame into a leading "char" column,
# then relabel all columns with topperF's column labels so both frames line up.
bottom = neurocombat.reset_index(drop=False).rename(columns={"index": "char"})
bottom.columns = topperF.columns

In [None]:
# Stack topperF above the harmonized rows and flip the result back so that
# the former columns become rows again.
back_together = pd.concat([topperF, bottom]).T

In [None]:
# Display the re-assembled frame before its first row is promoted to header.
back_together

In [None]:
# Promote the first row to column labels, then drop that row from the data.
# Not safe to run twice: each run consumes one more row of back_together.
new_header = back_together.iloc[0] # grab the first row for the header
back_together.columns = new_header # set the header row as the df header
back_together = back_together[1:] # keep everything after the header row
#back_together

In [None]:
# Number of participants contributed by the fifth dataset.
len5

In [None]:
# Split the harmonized frame back into one frame per dataset, in join order.
# head(n) takes the first n rows (all datasets up to and including this one);
# tail(m) then keeps the last m of those, i.e. only this dataset's rows.
# NOTE(review): this relies on back_together having exactly
# len1+len2+len3+len4+len5 rows in join order -- confirm.
neuro_harm_topmri =back_together.head(len1)
neuro_harm_HELIUS =back_together.head(len1+len2).tail(len2)
neuro_harm_EDIS =back_together.head(len1+len2+ len3).tail(len3)
neuro_harm_SABRE =back_together.head(len1+len2+ len3 +len4).tail(len4)
neuro_harm_INSI =back_together.head(len1+len2+ len3 +len4+ len5).tail(len5)

In [None]:
# Display the harmonized rows assigned to INSI.
neuro_harm_INSI

In [None]:
# Display the harmonized rows assigned to EDIS.
neuro_harm_EDIS

In [None]:
# Display the harmonized rows assigned to SABRE.
neuro_harm_SABRE

In [None]:
# Display the harmonized rows assigned to the combined TOP + StrokeMRI set.
neuro_harm_topmri