# Negative testing: initial experiments

## Let's take a cvasl dataframe, split it a couple of ways by age, then see how harmonization with ComBat-GAM changes it



# Important Note: this must be run in the `neurogamyplus` environment



## import libraries

In [None]:
import os
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from neuroHarmonize import harmonizationLearn
import seaborn as sns

sys.path.insert(0, '../') # path to some functions
import cvasl.harmony as har

## prepare data

In [None]:
filepath_mri = '../open_work/internal_results/cleaned_pvc2s/' 
filename_mri = os.path.join(filepath_mri,'StrokeMRI_pvc2c.csv') 

In [None]:
StrokeMRI  = pd.read_csv(filename_mri)
StrokeMRI.head(3)

In [None]:
#sns.displot(StrokeMRI, x='age', binwidth=3, hue= 'sex')

# Plain neuro-ComBat assumes, among other things, that the vectors of regression coefficients follow independent normal distributions. ComBat-GAM/GMM may leapfrog this and give better results; this is what we are looking into.

In [None]:
#sns.displot(StrokeMRI, x='wmh_vol', hue= 'sex')

In [None]:
#StrokeMRI.wmh_vol.describe()

In [None]:
#sns.displot(StrokeMRI, x='wmh_count', hue= 'sex')

In [None]:
loged_StrokeMRI = har.log_out_columns(StrokeMRI, ['wmh_vol', 'wmh_count'])

In [None]:
#sns.displot(loged_StrokeMRI, x='wmh_count', hue= 'sex')

In [None]:
#sns.displot(loged_StrokeMRI, x='wmh_vol', hue= 'sex')

## We could try using log, but let's try with existing values first

In [None]:
# Split once and index the stored result: this avoids doing the split twice
# and guarantees both halves come from the same call (which matters if the
# helper is not deterministic).
age_balanced_halves = har.split_frame_half_balanced_by_column(StrokeMRI, 'age')
stroke_even_unharmonized = age_balanced_halves[0]
stroke_odd_unharmonized = age_balanced_halves[1]

In [None]:
stroke_even_unharmonized= stroke_even_unharmonized.drop('index', axis=1)
stroke_odd_unharmonized = stroke_odd_unharmonized.drop('index', axis=1)

In [None]:
stroke_even_unharmonized.to_csv('stroke_even_unharmonized.csv')
stroke_odd_unharmonized.to_csv('stroke_odd_unharmonized.csv') 

In [None]:
stroke_even_unharmonized.tail(2)

In [None]:
# Split once and index the stored result: this avoids doing the split twice
# and guarantees both parts come from the same call.
age_top_and_bottom = har.top_and_bottom_by_column(StrokeMRI, 'age')
stroke_top_unharmonized = age_top_and_bottom[0]
stroke_bottom_unharmonized = age_top_and_bottom[1]

In [None]:
stroke_top_unharmonized.to_csv('stroke_top_unharmonized.csv')
stroke_bottom_unharmonized.to_csv('stroke_bottom_unharmonized.csv') 

In [None]:
stroke_bottom_unharmonized.head(3)

In [None]:
stroke_top_unharmonized['SITE'] = 0
stroke_bottom_unharmonized['SITE'] = 1
stroke_even_unharmonized['SITE'] = 0
stroke_odd_unharmonized['SITE'] = 1 
top_bot_together = pd.concat([stroke_top_unharmonized, stroke_bottom_unharmonized])
odd_even_together =pd.concat([stroke_odd_unharmonized, stroke_even_unharmonized])
odd_even_together.head(3)

In [None]:
# Encode sex numerically (M -> 0, F -> 1) in both combined frames.
sex_mapping = dict(M=0, F=1)
recoded_tbt_sex = top_bot_together['sex'].map(sex_mapping)
recoded_oet_sex = odd_even_together['sex'].map(sex_mapping)
top_bot_together = top_bot_together.assign(sex=recoded_tbt_sex)
odd_even_together = odd_even_together.assign(sex=recoded_oet_sex)

In [None]:
# Feature columns that are passed to harmonization, grouped by kind.
common_features = [
    # volumes and volume ratios
    'gm_vol',
    'wm_vol',
    'csf_vol',
    'gm_ivc_ratio',
    'gmwm_ivc_ratio',
    # white matter hyperintensity columns
    'wmh_vol',
    'wmh_count',
    # *_b_cov columns
    'deepwm_b_cov',
    'aca_b_cov',
    'mca_b_cov',
    'pca_b_cov',
    'totalgm_b_cov',
    # *_b columns
    'deepwm_b',
    'aca_b',
    'mca_b',
    'pca_b',
    'totalgm_b',
]

In [None]:
top_bot_together_features = top_bot_together[common_features]
top_bot_together_covariates = top_bot_together[['age', 'sex','SITE']]
top_bot_together_covariates.head(3)

In [None]:
#odd_even_together
odd_even_together_features = odd_even_together[common_features]
odd_even_together_covariates = odd_even_together[['age', 'sex','SITE']]
odd_even_together_covariates.head(3)

In [None]:
top_bot_together_features_array = np.array(top_bot_together_features)
#top_bot_together_features_array


In [None]:
odd_even_together_features_array = np.array(odd_even_together_features)
#odd_even_together_features_array


In [None]:
# Run harmonization on the top/bottom split: keep the fitted model in
# my_model_TBT and the adjusted feature array in my_data_adj_TBT.
my_model_TBT, my_data_adj_TBT = harmonizationLearn(top_bot_together_features_array, top_bot_together_covariates)

In [None]:
# Run harmonization on the odd/even split: keep the fitted model in
# my_model_OET and the adjusted feature array in my_data_adj_OET.
my_model_OET, my_data_adj_OET = harmonizationLearn(odd_even_together_features_array, odd_even_together_covariates)

In [None]:
# Turn the adjusted array into a dataframe with the feature column names,
# then append the covariates, then the participant IDs.
neuroharmonized_top_bot_together = pd.DataFrame(
    my_data_adj_TBT, 
    columns = common_features
)


# reset_index() moves the old index into a column so the covariates line up
# positionally with the fresh frame above; that helper column is then dropped
# (the drop assumes it is named 'index', i.e. the original index is unnamed).
neuroharmonized_top_bot_together =pd.concat([neuroharmonized_top_bot_together, top_bot_together_covariates.reset_index()], axis=1)
neuroharmonized_top_bot_together = neuroharmonized_top_bot_together.drop('index', axis=1)

# Same positional append for the participant IDs.
neuroharmonized_top_bot_together = pd.concat([neuroharmonized_top_bot_together, top_bot_together.participant_id.reset_index()], axis=1)
neuroharmonized_top_bot_together = neuroharmonized_top_bot_together.drop('index', axis=1)
neuroharmonized_top_bot_together.head(3)

In [None]:
# Turn the adjusted array into a dataframe with the feature column names,
# then append the covariates, then the participant IDs.
neuroharmonized_odd_even_together = pd.DataFrame(
    my_data_adj_OET, 
    columns = common_features
)


# reset_index() moves the old index into a column so the covariates line up
# positionally with the fresh frame above; that helper column is then dropped
# (the drop assumes it is named 'index', i.e. the original index is unnamed).
neuroharmonized_odd_even_together =pd.concat([neuroharmonized_odd_even_together, odd_even_together_covariates.reset_index()], axis=1)
neuroharmonized_odd_even_together = neuroharmonized_odd_even_together.drop('index', axis=1)

# Same positional append for the participant IDs.
neuroharmonized_odd_even_together = pd.concat([neuroharmonized_odd_even_together, odd_even_together.participant_id.reset_index()], axis=1)
neuroharmonized_odd_even_together = neuroharmonized_odd_even_together.drop('index', axis=1)
neuroharmonized_odd_even_together.head(3)

In [None]:
# create adjusted csvs
neuroharmonized_tbt_top = neuroharmonized_top_bot_together[neuroharmonized_top_bot_together.SITE == 0]
neuroharmonized_tbt_bottom = neuroharmonized_top_bot_together[neuroharmonized_top_bot_together.SITE == 1] 

neuroharmonized_oet_odd = neuroharmonized_odd_even_together[neuroharmonized_odd_even_together.SITE == 1]
neuroharmonized_oet_even= neuroharmonized_odd_even_together[neuroharmonized_odd_even_together.SITE == 0] 

neuroharmonized_oet_odd = neuroharmonized_oet_odd.drop(['SITE'],axis=1) 
neuroharmonized_oet_even = neuroharmonized_oet_even.drop(['SITE'],axis=1)

In [None]:
sex_mapping = {'M':0,'F':1}
stroke_even_unharmonized = stroke_even_unharmonized.assign(sex = stroke_even_unharmonized.sex.map(sex_mapping))
stroke_even_unharmonized.head(3)


In [None]:
neuroharmonized_oet_even.head(3)

In [None]:
# Row-by-row difference between the unharmonized and harmonized "even" half.
# drop=True discards the old index instead of turning it into a column, so the
# result holds only feature differences (no meaningless 'index' difference).
stroke_even_unharmonized[common_features].reset_index(drop=True) - neuroharmonized_oet_even[common_features].reset_index(drop=True)
#(stroke_even_unharmonized.reset_index()['age'].values  - evens_harmonized['age'].values).sum()#- neuroharmonized_oet_even[common_features]

In [None]:
neuroharmonized_oet_even[common_features]

In [None]:
# NOTE: the rest of this notebook is junk notes, to be removed or readjusted

## Let's try using the logged vals as they are closer to normally distributed

In [None]:
# Split the logged frame once and index the stored result: this avoids doing
# the split twice and guarantees both halves come from the same call.
logged_age_balanced_halves = har.split_frame_half_balanced_by_column(loged_StrokeMRI, 'age')
stroke_even_unharmonized = logged_age_balanced_halves[0]
stroke_odd_unharmonized = logged_age_balanced_halves[1]

In [None]:
stroke_even_unharmonized= stroke_even_unharmonized.drop('index', axis=1)
stroke_odd_unharmonized = stroke_odd_unharmonized.drop('index', axis=1)

In [None]:
stroke_even_unharmonized.to_csv('stroke_even_unharmonized.csv')
stroke_odd_unharmonized.to_csv('stroke_odd_unharmonized.csv') 

In [None]:
stroke_even_unharmonized.tail(6)

In [None]:
# Split the logged frame once and index the stored result: this avoids doing
# the split twice and guarantees both parts come from the same call.
logged_age_top_and_bottom = har.top_and_bottom_by_column(loged_StrokeMRI, 'age')
stroke_top_unharmonized = logged_age_top_and_bottom[0]
stroke_bottom_unharmonized = logged_age_top_and_bottom[1]

In [None]:
stroke_top_unharmonized.to_csv('stroke_top_unharmonized.csv')
stroke_bottom_unharmonized.to_csv('stroke_bottom_unharmonized.csv') 

In [None]:
stroke_bottom_unharmonized.head(6)

In [None]:
both_togetherF, ftF, btF, feature_dictF, len1, len2 = har.prep_for_neurocombat(stroke_even_unharmonized, stroke_odd_unharmonized)

In [None]:
ftF.head(3)

In [None]:
ftF.to_csv('ftF1_UH.csv')

In [None]:
data = np.genfromtxt('ftF1_UH.csv', delimiter=",", skip_header=1)
data = data[:, 1:]
data

In [None]:
# Specifying the batch (scanner variable) as well as a sex covariate to preserve:
first_columns_as_one = [1] * len1
last_columns_as_two = [2] * len2
covars = {'batch':first_columns_as_one + last_columns_as_two,
          'sex':both_togetherF.loc['sex',:].values.tolist(),
           'age':both_togetherF.loc['age',:].values.tolist(),} 
covars = pd.DataFrame(covars) 

In [None]:
covars

In [None]:
data

In [None]:
# specify sex as categorical
categorical_cols = ['sex']
# to specify the name of the variable that encodes for the scanner/batch covariate:
batch_col_mine = 'batch'
our_continuous_col=['age']
# harmonization step:
# NOTE(review): `neuroCombat` is not imported in this notebook's import cell
# (only `harmonizationLearn` from neuroHarmonize is), so this call raises
# NameError unless the name is brought into scope some other way.
data_combat = neuroCombat(dat=data,
    covars=covars,
    batch_col=batch_col_mine,
    continuous_cols=our_continuous_col,
    categorical_cols=categorical_cols)["data"]

In [None]:
neurocombat = pd.DataFrame(data_combat)
neurocombat.head(2)

In [None]:
topperF = har.make_topper(btF,'age', 'sex')

In [None]:
bottom = neurocombat.reset_index(drop=False)
bottom = bottom.rename(columns={"index": "char"})
bottom.columns = topperF.columns

In [None]:
back_together = pd.concat([topperF, bottom])
back_together = back_together.T
#back_together

In [None]:
new_header = back_together.iloc[0] #grab the first row for the header
back_together.columns = new_header #set the header row as the df header
back_together = back_together[1:]
#back_together

In [None]:
odds_harmonized = back_together.tail(len2)
evens_harmonized = back_together.head(len1)

In [None]:
evens_harmonized.head(3)

In [None]:
evens_harmonized = evens_harmonized.rename(feature_dictF, axis='columns')
odds_harmonized= odds_harmonized.rename(feature_dictF, axis='columns')

Save off to harmonized csv files

In [None]:
evens_harmonized.head(3)

In [None]:
# evens_harmonized.to_csv('evens_harmonized.csv')
# odds_harmonized.to_csv('odds_harmonized.csv')

Now we can analyze the difference between these two sets:

In [None]:
#evens_harmonized.iloc[:,2:] 
stroke_even_unharmonized.head(3)

In [None]:
stroke_even_unharmonized.iloc[:,4:] 

## We do see some differences, especially in the wmh volume, but nothing over 100% at first glance

In [None]:
both_togetherF2, ftF2, btF2, feature_dictF2, slen1, slen2 = har.prep_for_neurocombat(stroke_top_unharmonized, stroke_bottom_unharmonized)

In [None]:
ftF2.to_csv('ftF2_UH.csv')

In [None]:
data2 = np.genfromtxt('ftF2_UH.csv', delimiter=",", skip_header=1)
data2 = data2[:, 1:]
data2

In [None]:
# Specifying the batch (scanner variable) as well as a sex covariate to preserve:
first_columns_as_one = [1] * slen1
last_columns_as_two = [2] * slen2
covars2 = {'batch':first_columns_as_one + last_columns_as_two,
          'sex':both_togetherF2.loc['sex',:].values.tolist(),
           'age':both_togetherF2.loc['age',:].values.tolist(),} 
covars2 = pd.DataFrame(covars2) 

In [None]:
covars2

In [None]:
# specify sex as categorical
categorical_cols = ['sex']
# to specify the name of the variable that encodes for the scanner/batch covariate:
batch_col_mine = 'batch'
our_continuous_col=['age']
# harmonization step:
# NOTE(review): `neuroCombat` is not imported in this notebook's import cell
# (only `harmonizationLearn` from neuroHarmonize is), so this call raises
# NameError unless the name is brought into scope some other way.
data_combat2 = neuroCombat(dat=data2,
    covars=covars2,
    batch_col=batch_col_mine,
    continuous_cols=our_continuous_col,
    categorical_cols=categorical_cols)["data"]

In [None]:
neurocombat2 = pd.DataFrame(data_combat2)
neurocombat2.head(2)

In [None]:
topperF2 = har.make_topper(btF2,'age', 'sex')

In [None]:
bottom2 = neurocombat2.reset_index(drop=False)
bottom2 = bottom2.rename(columns={"index": "char"})
bottom2.columns = topperF2.columns

In [None]:
back_together2 = pd.concat([topperF2, bottom2])
back_together2 = back_together2.T
#back_together2

In [None]:
new_header2 = back_together2.iloc[0] #grab the first row for the header
back_together2.columns = new_header2 #set the header row as the df header
back_together2 = back_together2[1:]
#back_together2

In [None]:
bottom_harmonized = back_together2.tail(slen2)
top_harmonized = back_together2.head(slen1)

In [None]:
top_harmonized = top_harmonized.rename(feature_dictF2, axis='columns')
bottom_harmonized= bottom_harmonized.rename(feature_dictF2, axis='columns')

Save off to harmonized csv files

In [None]:
top_harmonized = top_harmonized.reset_index()
top_harmonized= top_harmonized.rename(columns= {'index': 'participant_id'})
top_harmonized

In [None]:
# change names of top and bottom so not confusing

In [None]:
top_harmonized['age'].isna().sum()

In [None]:
#stroke_top_unharmonized

In [None]:
stroke_top_unharmonized = stroke_top_unharmonized.drop('Unnamed: 0', axis=1)#.iloc[:,4:] 
#stroke_top_unharmonized = stroke_top_unharmonized.reset_index('participant_id')
stroke_top_unharmonized.head(3)

## So we see in this case we got a tenfold difference on the wmh column...
# This deserves a bit more investigation, and some graphing IMHO
Note: this shows that algorithms that work well on the positive test (correcting inherently different sets) do not always work well on the negative test (not over-correcting similar or identical datasets).

In [None]:
#stroke_top_unharmonized.merge(top_harmonized, on= 'participant_id')

In [None]:
stroke_top_unharmonized['age'].isna().sum()

# rename variables so there is only one TOP

In [None]:
top_harmonized['age'].isna().sum()

In [None]:
# tso do put 
(stroke_even_unharmonized.reset_index()['age'].values  - evens_harmonized['age'].values).sum()

In [None]:
stroke_even_unharmonized.reset_index()['age'].plot(color='blue', alpha= 0.5)
evens_harmonized['age'].plot(color= 'red', alpha= 0.5)

In [None]:
stroke_even_unharmonized.reset_index()['wmh_vol'].describe()

In [None]:
differences = (stroke_even_unharmonized.reset_index()['wmh_vol'].values - evens_harmonized['wmh_vol'].values)#.describe()

In [None]:
diff_frame = pd.DataFrame(differences)
diff_frame.mean()

In [None]:
plt.plot(diff_frame)

In [None]:
# floating point differences should be on the order of 10 to the minus 8 or 10 to the minus 16

In [None]:
## use creators test case, and see if we get the same kinds of results

In [None]:
stroke_even_unharmonized.reset_index()['wmh_vol'].plot(color='blue', alpha= 0.5)
evens_harmonized['wmh_vol'].plot(color= 'red', alpha= 0.5)

In [None]:
stroke_even_unharmonized.reset_index()['wmh_count'].plot(color='blue', alpha= 0.5)
evens_harmonized['wmh_count'].plot(color= 'red', alpha= 0.5)

In [None]:
stroke_top_unharmonized.reset_index()['age'].plot(color='blue', alpha= 0.5)
top_harmonized['age'].plot(color= 'red', alpha= 0.5)

In [None]:
stroke_top_unharmonized.reset_index()['wmh_vol'].plot(color='blue', alpha= 0.5)
top_harmonized['wmh_vol'].plot(color= 'red', alpha= 0.5)

In [None]:
difference = (stroke_top_unharmonized.reset_index()['wmh_vol'].values - top_harmonized['wmh_vol'].values)
difference

In [None]:
plt.plot(difference, stroke_top_unharmonized.reset_index()['wmh_vol'].values, '.')

In [None]:
stroke_top_unharmonized.reset_index()['wmh_vol'].values - top_harmonized['wmh_vol'].values

In [None]:
stroke_bottom_unharmonized.reset_index()['wmh_vol'].plot(color='blue', alpha= 0.5)
bottom_harmonized['wmh_vol'].plot(color= 'red', alpha= 0.5)

In [None]:
stroke_top_unharmonized.reset_index()['wmh_count'].plot(color='blue', alpha= 0.5)
top_harmonized['wmh_count'].plot(color= 'red', alpha= 0.5)

In [None]:
stroke_bottom_unharmonized.reset_index()['wmh_count'].plot(color='blue', alpha= 0.5)
bottom_harmonized['wmh_count'].plot(color= 'red', alpha= 0.5)

In [None]:
bottom_harmonized['wmh_count'].hist(alpha = 0.5)

top_harmonized['wmh_count'].hist(alpha = 0.5)

## This is not the whole story, we must apply logarithm after the data split...then do again

then the features may not pull apart....


In [None]:
# Split the un-logged frame once and index the stored result: this avoids
# doing the split twice and guarantees both parts come from the same call.
no_log_age_top_and_bottom = har.top_and_bottom_by_column(StrokeMRI, 'age')
stroke_top_unharmonized_no_log = no_log_age_top_and_bottom[0]
stroke_bottom_unharmonized_no_log = no_log_age_top_and_bottom[1]

In [None]:
stroke_top_unharmonized_proper_log = har.log_out_columns(stroke_top_unharmonized_no_log, ['wmh_vol', 'wmh_count'])
stroke_bottom_unharmonized_proper_log = har.log_out_columns(stroke_bottom_unharmonized_no_log, ['wmh_vol', 'wmh_count'])

In [None]:
both_togetherF3, ftF3, btF3, feature_dictF3, len13, len23 = har.prep_for_neurocombat(
    stroke_top_unharmonized_proper_log,
    stroke_bottom_unharmonized_proper_log)

In [None]:
ftF3.to_csv('ftF3_UH.csv')

In [None]:
data3 = np.genfromtxt('ftF3_UH.csv', delimiter=",", skip_header=1)
data3 = data3[:, 1:]
data3

In [None]:
# Specifying the batch (scanner variable) as well as a sex covariate to preserve:
first_columns_as_one = [1] * len13
last_columns_as_two = [2] * len23
covars3 = {'batch':first_columns_as_one + last_columns_as_two,
          'sex':both_togetherF3.loc['sex',:].values.tolist(),
           'age':both_togetherF3.loc['age',:].values.tolist(),} 
covars3 = pd.DataFrame(covars3) 

In [None]:
covars3

In [None]:
# specify sex as categorical
categorical_cols = ['sex']
# to specify the name of the variable that encodes for the scanner/batch covariate:
batch_col_mine = 'batch'
our_continuous_col=['age']
# harmonization step:
# NOTE(review): `neuroCombat` is not imported in this notebook's import cell
# (only `harmonizationLearn` from neuroHarmonize is), so this call raises
# NameError unless the name is brought into scope some other way.
data_combat3 = neuroCombat(dat=data3,
    covars=covars3,
    batch_col=batch_col_mine,
    continuous_cols=our_continuous_col,
    categorical_cols=categorical_cols)["data"]

In [None]:
neurocombat3 = pd.DataFrame(data_combat3)
neurocombat3.head(2)

In [None]:
topperF3 = har.make_topper(btF3,'age', 'sex')

In [None]:
bottom3 = neurocombat3.reset_index(drop=False)
bottom3 = bottom3.rename(columns={"index": "char"})
bottom3.columns = topperF3.columns

In [None]:
back_together3 = pd.concat([topperF3, bottom3])
back_together3 = back_together3.T
#back_together3

In [None]:
new_header3 = back_together3.iloc[0] #grab the first row for the header
back_together3.columns = new_header3 #set the header row as the df header
back_together3 = back_together3[1:]
#back_together2

In [None]:
bottom_pl_harmonized = back_together3.tail(len23)
top_pl_harmonized = back_together3.head(len13)

In [None]:
top_pl_harmonized = top_pl_harmonized.rename(feature_dictF3, axis='columns')
bottom_pl_harmonized= bottom_pl_harmonized.rename(feature_dictF3, axis='columns')

Save off to harmonized csv files?

In [None]:
stroke_top_unharmonized_proper_log

In [None]:
#StrokeMRI['wmh_vol'].describe()

In [None]:
#stroke_top_unharmonized_proper_log['wmh_vol'].describe()

In [None]:
# top_pl_harmonized['wmh_vol'].min()

In [None]:
stroke_top_unharmonized_proper_log.reset_index()['wmh_vol'].plot(color='blue', alpha= 0.5)
top_pl_harmonized['wmh_vol'].plot(color= 'red', alpha= 0.5)

In [None]:
# harmonized_dfs = [top_pl_harmonized, bottom_pl_harmonized]
# complete_harmonised = pd.concat(harmonized_dfs)
# complete_harmonised 

In [None]:
stroke_top_unharmonized_proper_log.head(3)

In [None]:
top_pl_unharm_order = stroke_top_unharmonized_proper_log.drop('Unnamed: 0', axis=1)
top_pl_unharm_order['harmonization'] ="UH"
top_pl_unharm_order['site'] = "s"
top_pl_unharm_order.head(3)

In [None]:
top_pl_harm_order = top_pl_harmonized.reset_index()
top_pl_harm_order = top_pl_harm_order.rename(columns={'index': 'participant_id'})
top_pl_harm_order['harmonization'] = "H"
top_pl_harm_order['site'] = "s"
top_pl_harm_order.head(3)

The below cell is added as a hypothetical to demo graphing capabilities

In [None]:
# Hypothetical batches for the graphing demo: everything is batch 1, except
# the first 40 rows of each frame, which become batch 2.
top_pl_harm_order["batch"] = 1
top_pl_unharm_order["batch"] = 1
# Assign through a single positional indexer. The chained form
# ``frame[:40]["batch"] = 2`` writes to a temporary slice and is not
# guaranteed to update the original frame (SettingWithCopyWarning; it never
# propagates under pandas copy-on-write).
top_pl_harm_order.iloc[:40, top_pl_harm_order.columns.get_loc("batch")] = 2
top_pl_unharm_order.iloc[:40, top_pl_unharm_order.columns.get_loc("batch")] = 2

In [None]:
def compare_harm_one_site_violins(
        unharmonized_df,
        harmonized_df,
        feature_list,
        chosen_feature="sex"
):
    """
    Create a violin plot on single site harmonization by features,
    split on a binary feature of choice which defaults to sex.

    The two frames are stacked row-wise and one figure is drawn and shown
    per entry in ``feature_list``, with the ``harmonization`` column on the
    x axis. Both frames therefore need a ``harmonization`` column, the
    ``chosen_feature`` column and every column named in ``feature_list``.

    :param unharmonized_df: dataframe holding the values before harmonization
    :param harmonized_df: dataframe holding the values after harmonization
    :param feature_list: column names to plot, one figure per name
    :param chosen_feature: column used as hue to split each violin
    :returns: None; each figure is displayed with ``plt.show()``
    """
    # The stacked frame and the plot style do not depend on the feature,
    # so set them up once instead of on every iteration.
    complete_merg = pd.concat(
        [unharmonized_df, harmonized_df]).reset_index(drop=True)
    sns.set_style("whitegrid")

    for feat in feature_list:
        # Cast to float so the column plots (and min/max) numerically.
        complete_merg[feat] = complete_merg[feat].astype('float64')
        sns.catplot(
            data=complete_merg,
            x='harmonization', y=feat, hue=chosen_feature,
            split=True, inner='quartile', kind='violin',
            height=4, aspect=0.7, palette=['pink', 'blue'], alpha=0.4)

        lowest_on_graph = complete_merg[feat].min() - 0.5
        highest_value = complete_merg[feat].max()
        # Scaling by 1.5 only gives headroom for a positive maximum; for a
        # maximum <= 0 (e.g. logged columns) it would put the upper limit at
        # or below the data, so fall back to additive headroom there.
        if highest_value > 0:
            highest_on_graph = highest_value * 1.5
        else:
            highest_on_graph = highest_value + 0.5
        plt.ylim((lowest_on_graph, highest_on_graph))
        plt.title(feat)
        plt.show()

In [None]:
compare_harm_one_site_violins(top_pl_unharm_order, top_pl_harm_order, ['gm_vol', 'wm_vol', 'wmh_count', 'wmh_vol'])

In [None]:
# 1. make sure this is not an artifact of the density estimate
# 2. make a plot of differences between harmonization, and real, over age
# 3. histograms, and differences
# 4. compare to a case on different datasets, and see how much harmonization changed


In [None]:
# now let's just pretend to show other graphing
har.compare_harm_multi_site_violins(top_pl_unharm_order, top_pl_harm_order, ['gm_vol', 'wm_vol', 'wmh_count', 'wmh_vol'])