# Let's make a dataframe, split it a couple of ways by age, then see how harmonization with neuroCombat changes it



## import libraries

In [None]:
import os

import pandas as pd
import numpy as np
from neuroCombat import neuroCombat

## prepare data

In [None]:
# Directory that holds the cleaned input tables, and the StrokeMRI CSV inside it.
filepath_mri = '../open_work/internal_results/cleaned_pvc2s/' 
filename_mri = os.path.join(filepath_mri,'StrokeMRI_pvc2c.csv') 

In [None]:
# Load the StrokeMRI table. The commented-out lines are disabled preprocessing
# steps (dropping the first column, mapping sex 'F'/'M' to 0/1).
StrokeMRI  = pd.read_csv(filename_mri)
#StrokeMRI = StrokeMRI.drop(StrokeMRI.columns[0],axis=1)
#sex_mapping = {'F':0,'M':1}
#StrokeMRI = StrokeMRI.assign(sex = StrokeMRI.sex.map(sex_mapping))
StrokeMRI.head(3)

In [None]:
# Summary statistics of the age column, the column both splits below are made on.
StrokeMRI.age.describe()

In [None]:
def split_frame_half_balanced_by_column(frame, column):
    """Split a frame into two halves that cover the same range of ``column``.

    The rows are ordered by ``column`` (handy for continuous values such as
    age) and then dealt out alternately.  The row labels the frame had before
    sorting are kept in an ``index`` column of both halves.

    Returns a tuple ``(even_rows, odd_rows)``: the rows at even and at odd
    positions of the sorted frame.
    """
    ordered = frame.sort_values(column).reset_index()
    # Every second row starting at position 0, then every second row from 1.
    return ordered.iloc[::2], ordered.iloc[1::2]

In [None]:
# Sort and split once, then unpack both halves, instead of running the
# helper twice just to pick one element of the returned tuple each time.
stroke_even_unharmonized, stroke_odd_unharmonized = (
    split_frame_half_balanced_by_column(StrokeMRI, 'age')
)

In [None]:
# The split helper calls reset_index(), which leaves the pre-sort row labels
# in an 'index' column; remove that column from both halves.
stroke_even_unharmonized= stroke_even_unharmonized.drop('index', axis=1)
stroke_odd_unharmonized = stroke_odd_unharmonized.drop('index', axis=1)

In [None]:
# Write both halves to CSV files in the current working directory.
stroke_even_unharmonized.to_csv('stroke_even_unharmonized.csv')
stroke_odd_unharmonized.to_csv('stroke_odd_unharmonized.csv') 

In [None]:
# Preview the full table next to the even half for a quick visual comparison.
StrokeMRI.head(2)

In [None]:
stroke_even_unharmonized.head(2)

In [None]:
#pd.read_csv('stroke_even_unharmonized.csv')

In [None]:
def top_and_bottom_by_column(frame, column):
    """Cut a frame in two at the midpoint after sorting by ``column``.

    Useful for a column with continuous values (e.g. age) when the lowest
    and the highest values should end up in separate frames.

    Returns a tuple ``(top, bottom)``: ``top`` holds the first
    ``len(frame) // 2`` rows of the ascending sort, ``bottom`` holds the
    remaining rows.  The original row labels are preserved.
    """
    ordered = frame.sort_values(column)
    cut = len(ordered) // 2
    return ordered.head(cut), ordered.iloc[cut:]

In [None]:
# Sort and split once, then unpack both parts, instead of running the
# helper twice just to pick one element of the returned tuple each time.
stroke_top_unharmonized, stroke_bottom_unharmonized = (
    top_and_bottom_by_column(StrokeMRI, 'age')
)

In [None]:
#stroke_bottom_unharmonized= stroke_bottom_unharmonized.drop('index', axis=1)
#stroke_top_unharmonized= stroke_top_unharmonized.drop('index', axis=1)

In [None]:
# Write both parts of the sorted-by-age split to CSV files in the current working directory.
stroke_top_unharmonized.to_csv('stroke_top_unharmonized.csv')
stroke_bottom_unharmonized.to_csv('stroke_bottom_unharmonized.csv') 

In [None]:
# Preview the first part of the ascending sort (the lowest ages).
stroke_top_unharmonized.head(3)

In [None]:
def _load_cvasl_selection(source):
    """Return ``source`` indexed by participant, without leftover CSV index columns."""
    # Accept either an in-memory frame or the path of a CSV file to read.
    if isinstance(source, (str, os.PathLike)):
        frame = pd.read_csv(source)
    else:
        frame = source
    # Each CSV round trip adds another 'Unnamed: 0', 'Unnamed: 0.1', ... column;
    # drop all of them so none ends up as a spurious row after transposing.
    leftovers = [name for name in frame.columns if str(name).startswith('Unnamed:')]
    return frame.drop(columns=leftovers).set_index('participant_id')


def prep_for_neurocombat(dataframename1, dataframename2):
    """
    This function takes two dataframes in the cvasl format,
    then turns them into the items needed for the
    neurocombat algorithm with re-identification.

    Parameters
    ----------
    dataframename1, dataframename2 : pandas.DataFrame, str or os.PathLike
        Either a frame or the path of a CSV file readable by
        ``pandas.read_csv``.  Each must contain a ``participant_id``
        column; columns whose name starts with ``Unnamed:`` are discarded.

    Returns
    -------
    both_togetherF : pandas.DataFrame
        Both inputs transposed (one column per participant) and placed
        side by side, restricted to the rows present in both.
    ftF : pandas.DataFrame
        ``both_togetherF`` without its first two rows, with the row labels
        replaced by positions and rows containing NaN removed.
    btF : pandas.DataFrame
        ``both_togetherF`` with the row labels replaced by positions and
        rows containing NaN removed.
    feature_dictF : dict
        Maps the position of each feature row to its name.  It is built
        before NaN rows are dropped from ``ftF``.
    len1, len2 : int
        Number of participants (columns) contributed by each input.

    Notes
    -----
    The first two rows of the combined frame are skipped when selecting
    features; this assumes the two columns following ``participant_id`` in
    the inputs are covariates (the notebook uses age and sex).
    """
    one_selection = _load_cvasl_selection(dataframename1).T
    two_selection = _load_cvasl_selection(dataframename2).T
    both_togetherF = pd.concat([one_selection, two_selection], axis=1, join="inner")
    print("Nan count",both_togetherF.isna().sum().sum())
    # Everything below the two covariate rows is treated as a feature.
    features_only = both_togetherF.iloc[2:]
    feature_dictF = dict(enumerate(features_only.index))
    # Replace the row labels by positions, then discard incomplete rows.
    ftF = features_only.reset_index(drop=True).dropna()
    btF = both_togetherF.reset_index(drop=True).dropna()
    len1 = len(one_selection.columns)
    len2 = len(two_selection.columns)
    return both_togetherF, ftF, btF, feature_dictF, len1, len2

In [None]:
# Prepare the even/odd split for neuroCombat: combined frame, feature-only
# frame, cleaned combined frame, feature-name lookup and the two group sizes.
both_togetherF, ftF, btF, feature_dictF, len1, len2 = prep_for_neurocombat(stroke_even_unharmonized, stroke_odd_unharmonized)

In [None]:
ftF.head(3)

In [None]:
# Write the feature table to CSV so it can be re-read as a plain numeric array.
ftF.to_csv('ftF1_UH.csv')

In [None]:
# skip_header=1 skips the header line; the slice then removes the first
# column, which holds the row index written by to_csv.
data = np.genfromtxt('ftF1_UH.csv', delimiter=",", skip_header=1)
data = data[:, 1:]
data

In [None]:
# Specifying the batch (scanner variable) as well as the sex and age covariates to preserve:
# the first len1 columns are labelled batch 1 and the following len2 columns batch 2;
# sex and age are read from the rows of the same names in both_togetherF.
first_columns_as_one = [1] * len1
last_columns_as_two = [2] * len2
covars = {'batch':first_columns_as_one + last_columns_as_two,
          'sex':both_togetherF.loc['sex',:].values.tolist(),
           'age':both_togetherF.loc['age',:].values.tolist(),} 
covars = pd.DataFrame(covars) 

In [None]:
# Inspect the covariate table and the data array that are passed to neuroCombat.
covars

In [None]:
data

In [None]:
# specify sex as categorical
categorical_cols = ['sex']
# to specify the name of the variable that encodes for the scanner/batch covariate:
batch_col_mine = 'batch'
# age is passed as a continuous covariate
our_continuous_col=['age']
# harmonization step: only the "data" entry of the returned result is kept
data_combat = neuroCombat(dat=data,
    covars=covars,
    batch_col=batch_col_mine,
    continuous_cols=our_continuous_col,
    categorical_cols=categorical_cols)["data"]

In [None]:
# Wrap the harmonized values in a DataFrame for the reassembly steps below.
neurocombat = pd.DataFrame(data_combat)
neurocombat.head(2)

In [None]:
def make_topper(btF, row0, row1):
    """
    This function makes top rows for something harmonized
    out of the btF part produced with prep_for_neurocombat(dataframename1, dataframename2)

    Parameters
    ----------
    btF : pandas.DataFrame
        Frame whose first two rows are used.  It is not modified.
    row0, row1 : object
        Labels stored in the ``char`` column for the first and the second
        row (the notebook passes 'age' and 'sex').

    Returns
    -------
    pandas.DataFrame
        A new two-row frame with a leading ``char`` column followed by the
        columns of ``btF``; the column axis carries no name and the rows
        are labelled 0 and 1.

    Raises
    ------
    ValueError
        If ``btF`` has fewer than two rows.
    """
    if len(btF) < 2:
        raise ValueError("btF must contain at least two rows")
    topperF = btF.head(2).rename_axis(None, axis="columns").reset_index(drop=True)
    # Insert the labels as one complete column.  Chained assignment such as
    # topperF['char'][0] = ... is unreliable: pandas warns about it and, with
    # copy-on-write enabled, it no longer updates the frame.
    topperF.insert(0, 'char', [row0, row1])
    return topperF

In [None]:
# Two covariate rows (labelled 'age' and 'sex') taken from the top of btF.
topperF = make_topper(btF,'age', 'sex')

In [None]:
# Give the harmonized values the same column layout as topperF
# ('char' label column first) so the two frames can be stacked.
bottom = neurocombat.reset_index(drop=False)
bottom = bottom.rename(columns={"index": "char"})
bottom.columns = topperF.columns

In [None]:
# Stack the covariate rows on top of the harmonized rows, then transpose so
# that every former column becomes a row.
back_together = pd.concat([topperF, bottom])
back_together = back_together.T
#back_together

In [None]:
# After the transpose the first row holds the 'char' labels: promote it to
# the column header and remove it from the data.
new_header = back_together.iloc[0] #grab the first row for the header
back_together.columns = new_header #set the header row as the df header
back_together = back_together[1:]
#back_together

In [None]:
# The first len1 rows come from the first frame given to prep_for_neurocombat
# (evens) and the last len2 rows from the second (odds).
odds_harmonized = back_together.tail(len2)
evens_harmonized = back_together.head(len1)

In [None]:
#evens

In [None]:
# Replace the positional feature labels with the names recorded in feature_dictF.
evens_harmonized = evens_harmonized.rename(feature_dictF, axis='columns')
odds_harmonized= odds_harmonized.rename(feature_dictF, axis='columns')

Save off to harmonized csv files

In [None]:
# Display the harmonized even half.
evens_harmonized

In [None]:
# Saving is currently disabled; uncomment to write both harmonized halves to CSV.
# evens_harmonized.to_csv('evens_harmonized.csv')
# odds_harmonized.to_csv('odds_harmonized.csv')

Now we can analyze the difference between these two sets:

In [None]:
# Harmonized even half without its first two columns (the 'age' and 'sex'
# columns that come from the topper rows).
evens_harmonized.iloc[:,2:] 

In [None]:
# Unharmonized even half without its first four columns
# (presumably the id and covariate columns - TODO confirm against the CSV layout).
stroke_even_unharmonized.iloc[:,4:] 

In [None]:
# Repeat the preparation for the top/bottom split, passing the in-memory
# frames built above rather than the names of the CSV files they were saved to.
# NOTE: this rebinds both_togetherF, btF, feature_dictF, len1 and len2 from
# the even/odd run; only the feature table gets a new name (ftF2).
both_togetherF, ftF2, btF, feature_dictF, len1, len2 = prep_for_neurocombat(stroke_top_unharmonized, stroke_bottom_unharmonized)

In [None]:
# Save the feature table of the top/bottom split (ftF2); writing ftF here
# would store the even/odd features under the second file name.
ftF2.to_csv('ftF2_UH.csv')

In [None]:
# Re-read the saved feature table as a plain numeric array: skip_header=1
# skips the header line and the slice removes the first column, which holds
# the row index written by to_csv.
data2 = np.genfromtxt('ftF2_UH.csv', delimiter=",", skip_header=1)
data2 = data2[:, 1:]
data2