# Various experimental visualizations

## How do the datasets differ at baseline?

We use several different datasets here for convenience. This notebook must be run in the `graphing_env` environment.

In [None]:
import os       # using operating system dependent functionality (folders)
import sys
import glob
from functools import reduce

import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import matplotlib.pyplot as plt
import ipywidgets as widgets
from ipywidgets import interactive
import seaborn as sns
from ipywidgets import Layout, Button, Box, FloatText, Textarea, Dropdown, Label, IntSlider

sys.path.insert(0, '../../') # path to functions
import cvasl.harmony as har

In [None]:
# Datasets for this work
# Each dataset sits in its own folder under ../our_datasets/ and is stored
# under the same CSV file name (file_name).
EDIS_path = '../our_datasets/EDIS/'
HELIUS_path = '../our_datasets/HELIUS/'
Insight46_path = '../our_datasets/Insight46/'
SABRE_path = '../our_datasets/SABRE/'
MRI_path = '../our_datasets/StrokeMRI/'
TOP_path = '../our_datasets/TOP/'
file_name = 'TrainingDataComplete.csv'

# Full path to each dataset's CSV file
TOP_file = os.path.join(TOP_path, file_name)
MRI_file = os.path.join(MRI_path, file_name)
EDIS_file = os.path.join(EDIS_path, file_name)
HELIUS_file = os.path.join(HELIUS_path, file_name)
Insight46_file = os.path.join(Insight46_path, file_name)
SABRE_file = os.path.join(SABRE_path, file_name)

# Load the raw frames (suffix _n); the first CSV column becomes the index
EDIS_n = pd.read_csv(EDIS_file, index_col=0)
HELIUS_n = pd.read_csv(HELIUS_file, index_col=0)
Insight46_n = pd.read_csv(Insight46_file, index_col=0)
SABRE_n = pd.read_csv(SABRE_file, index_col=0)
TOP_n = pd.read_csv(TOP_file, index_col=0)
MRI_n = pd.read_csv(MRI_file, index_col=0)

In [None]:
# Working copies of every raw frame without the 'ID' and 'Site' columns
TOP = TOP_n.drop(columns=['ID', 'Site'])
StrokeMRI = MRI_n.drop(columns=['ID', 'Site'])
SABRE = SABRE_n.drop(columns=['ID', 'Site'])
EDIS = EDIS_n.drop(columns=['ID', 'Site'])
Insight46 = Insight46_n.drop(columns=['ID', 'Site'])
HELIUS = HELIUS_n.drop(columns=['ID', 'Site'])
#TOP.head(2)

In [None]:
# Names offered by the dataset-selection widgets; the selected name is later
# used as a key into dataset_dictionary, so the two must stay in sync.
datasets_names = ['EDIS', 'SABRE', 'Insight46', 'TOP', 'StrokeMRI', 'HELIUS','TOPMRI']

In [None]:
# Pool TOP and StrokeMRI into one combined frame by stacking their rows
# (index labels of the two inputs are kept as they are).
TOPMRI= pd.concat([TOP, StrokeMRI], sort=False)

In [None]:
# Lower-case the column names of every working frame, in place
for _frame in (TOP, TOPMRI, StrokeMRI, Insight46, EDIS, SABRE, HELIUS):
    _frame.columns = _frame.columns.str.lower()

In [None]:
# Column names of EDIS as a plain list; used as the options of the feature pickers
features = list(EDIS.columns)

In [None]:
# Inspect the distinct values of the `sex` column in every dataset
datasets = [EDIS, SABRE, Insight46, TOP, StrokeMRI, HELIUS, TOPMRI]
for qset in datasets:
    print(qset.sex.unique())

In [None]:
# Recode sex from 1/2 to 0/1 in Insight46, EDIS and SABRE.
# NOTE(review): Series.map gives NaN for any value that is not a key of
# sex_mapping, so running this cell a second time on the already recoded
# frames would not leave them unchanged - confirm before re-running.
# NOTE: .assign returns new frames, so the `datasets` list built earlier
# still holds the frames with the old coding.
sex_mapping = {1:0,2:1}
Insight46 = Insight46.assign(sex = Insight46.sex.map(sex_mapping))
EDIS =EDIS.assign(sex = EDIS.sex.map(sex_mapping))
SABRE=SABRE.assign(sex = SABRE.sex.map(sex_mapping))

In [None]:
# Look for participant ids that occur in both SABRE and HELIUS (a known repeater)
sabres, heliar = set(SABRE.participant_id), set(HELIUS.participant_id)
x = sabres & heliar
print(x)

In [None]:
# Rename the one repeated participant id in HELIUS (suffix 'H') to separate it
HELIUS.loc[HELIUS['participant_id']=='sub-153852_1', 'participant_id'] = 'sub-153852_1H'

In [None]:
# Lookup table from dataset name to its DataFrame; the dataset-selection
# widgets index into it with the name the user picked.
dataset_dictionary= {
    'SABRE':SABRE,
    'EDIS':EDIS,
    'TOP':TOP,
    'HELIUS':HELIUS,
    'StrokeMRI':StrokeMRI,
    'Insight46':Insight46,
    'TOPMRI': TOPMRI
    
}

In [None]:
# Confirm that SABRE and HELIUS no longer share any participant id
sabres, heliar = set(SABRE.participant_id), set(HELIUS.participant_id)
x = sabres & heliar
print(x)

## Recommended plots about excluded data

### plotting widgets for any two datasets

In [None]:
# Pickers for the two datasets to compare
Dataset1 = widgets.Select(
    options=datasets_names,
    value='EDIS',
    description='Dataset 1:',
    disabled=False
)
Dataset2 = widgets.Select(
    options=datasets_names,
    value='SABRE',
    description='Dataset 2:',
    disabled=False
)

# Pickers for the two features to plot against each other
Feature1 = widgets.Select(
    options=features,
    value='age',
    description='Feature 1:',
    disabled=False
)
Feature2 = widgets.Select(
    options=features,
    value='gm_vol',
    description='Feature 2:',
    disabled=False
)

# Free-text labels shown on the plot for each dataset
label_dataset1 = widgets.Textarea(
    value='EDIS',
    placeholder='Type something',
    description='label dataset1:',
    disabled=False
)
label_dataset2 = widgets.Textarea(
    value='SABRE',
    placeholder='Type something',
    # Was the placeholder text 'String:'; named to match label_dataset1 so
    # the user can tell which dataset this label belongs to.
    description='label dataset2:',
    disabled=False
)
box2 = Box(children=[label_dataset1, label_dataset2])

# Show the dataset and feature pickers; box2 is displayed in its own cell
box = Box(children=[Dataset1, Dataset2, Feature1, Feature2])
box

In [None]:
# Display the two free-text label inputs
box2

In [None]:
# Compare the two selected datasets on the two selected features, using the
# labels typed into the text boxes.
# NOTE (from the original author): this helper cannot be used on more than
# two datasets, and cannot add linear regressions or hues.
har.show_diff_on_var(
    dataset_dictionary[Dataset1.value],
    label_dataset1.value,
    dataset_dictionary[Dataset2.value],
    label_dataset2.value,
    Feature1.value,
    Feature2.value)

In [None]:
# Overlay a scatter plus linear-regression fit for each selected dataset.
# Each series carries the user-supplied label and a legend is drawn, so the
# two fits can be told apart on the shared axes.
sns.regplot(
    x=dataset_dictionary[Dataset1.value][Feature1.value],
    y=dataset_dictionary[Dataset1.value][Feature2.value],
    scatter_kws={'alpha':0.2},
    label=label_dataset1.value,
)
sns.regplot(
    x=dataset_dictionary[Dataset2.value][Feature1.value],
    y=dataset_dictionary[Dataset2.value][Feature2.value],
    scatter_kws={'alpha':0.2},
    label=label_dataset2.value,
)
plt.legend()
plt.show()

In [None]:
# # can not do it on more than one (it on three) and add linear regressions and hues! too bad
# har.show_diff_on_var3(EDIS,
#     'EDIS',
#     TOPMRI,
#     'TOPMRI',              
#     SABRE,
#     'SABRE',
#     'age',
#     'gm_vol',
# )

# Recommended plots about the demographics of all datasets

In [None]:
# Here we pick the two features for a joint plot of all datasets.
# Feature1 and Feature2 are rebound to new widgets, replacing the pickers
# created for the two-dataset comparison.
Feature1 = widgets.Select(
    options=features,
    value='age',
    description='Feature 1:',
    disabled=False
)
Feature2 = widgets.Select(
    options=features,
    value='gm_vol',
    description='Feature 2:',
    disabled=False
)
box = Box(children=[Feature1, Feature2])
box

In [None]:
# Compare five datasets (TOP and StrokeMRI pooled as TOPMRI) on the two
# features currently selected in the Feature1 / Feature2 pickers.
har.show_diff_on_var5(EDIS,
    'EDIS',
    TOPMRI,
    'TOPMRI',   
    HELIUS,
    'HELIUS',
    Insight46,
    'Insight46', 
    SABRE,
    'SABRE',
    Feature1.value,
    Feature2.value,
)

In [None]:
# Overlay a scatter plus linear-regression fit per dataset on shared axes.
# Every series is labelled and a legend is drawn so the five fits can be
# identified; the plotting order (and so the colour order) is unchanged.
for _label, _frame in (
    ('HELIUS', HELIUS),
    ('EDIS', EDIS),
    ('SABRE', SABRE),
    ('TOPMRI', TOPMRI),
    ('Insight46', Insight46),
):
    sns.regplot(
        x=_frame[Feature1.value],
        y=_frame[Feature2.value],
        scatter_kws={'alpha':0.1},
        label=_label,
    )
plt.legend()
plt.show()


OK, now we need some graphs about harmonization... let's build on what we have.

In [None]:
# Load the harmonized result for each dataset from the harm_results folder;
# the first CSV column becomes the index.
# NOTE(review): file names suggest these come from a five-dataset
# neuroHarmonize run - confirm against the harmonization notebooks.
neuro_harm_HELIUS= pd.read_csv('../../extended_harm_paper/harmonizations/harm_results/5neuro_harm_HELIUS.csv', index_col=0)
neuro_harm_topmri= pd.read_csv('../../extended_harm_paper/harmonizations/harm_results/5neuro_harm_topmri.csv', index_col=0)
neuro_harm_INSI  = pd.read_csv('../../extended_harm_paper/harmonizations/harm_results/5neuro_harm_INSI.csv', index_col=0)
neuro_harm_SABRE = pd.read_csv('../../extended_harm_paper/harmonizations/harm_results/5neuro_harm_SABRE.csv', index_col=0)
neuro_harm_EDIS  = pd.read_csv('../../extended_harm_paper/harmonizations/harm_results/5neuro_harm_EDIS.csv', index_col=0)

In [None]:
# Lower-case the column names of every harmonized frame, in place
for _harmonized in (
    neuro_harm_HELIUS,
    neuro_harm_topmri,
    neuro_harm_INSI,
    neuro_harm_SABRE,
    neuro_harm_EDIS,
):
    _harmonized.columns = _harmonized.columns.str.lower()

In [None]:
def general_compare_harm_one_site_violins(
        unharmonized_df,
        harmonized_df,
        feature_list,
        chosen_feature="sex"
):
    """
    Create a violin plot on single site harmonization by features,
    split on a binary feature of choice which defaults to sex.

    One figure is drawn and shown per entry of ``feature_list``. The x axis
    separates unharmonized ('No') from harmonized ('Yes') rows and each
    violin is split by ``chosen_feature``. The input frames are left
    untouched.

    :param unharmonized_df: data of one site before harmonization
    :type unharmonized_df: pandas.DataFrame
    :param harmonized_df: data of the same site after harmonization
    :type harmonized_df: pandas.DataFrame
    :param feature_list: column names to plot, one figure each
    :type feature_list: list
    :param chosen_feature: column used as hue to split the violins
    :type chosen_feature: str

    :returns: None; the figures are displayed with ``plt.show()``
    """
    # Tag the rows on copies (DataFrame.assign returns a new frame) so the
    # caller's frames do not gain a 'harmonization' column as a side effect.
    # The merged frame does not depend on the feature, so build it once.
    merged = pd.concat(
        [
            unharmonized_df.assign(harmonization='No'),
            harmonized_df.assign(harmonization='Yes'),
        ]
    ).reset_index(drop=True)

    for feat in feature_list:
        # Cast only the plotted column, on a per-figure copy, so one
        # feature's cast never leaks into the figures of the others.
        plot_df = merged.copy()
        plot_df[feat] = plot_df[feat].astype('float64')

        sns.set_style("whitegrid")
        sns.catplot(
            data=plot_df,
            x='harmonization', y=feat, hue=chosen_feature,
            split=True, inner='quartile', kind='violin',
            height=5, aspect=0.6, palette=['pink', 'blue'], alpha=0.4)

        lowest = plot_df[feat].min()
        highest = plot_df[feat].max()
        # Scaling the maximum by 1.5 only adds headroom for positive values;
        # for a non-positive maximum it would put the limit below the data,
        # so fall back to a fixed margin in that case.
        upper = highest * 1.5 if highest > 0 else highest + 0.5
        plt.ylim((lowest - 0.5, upper))
        plt.title(feat)
        plt.show()

In [None]:
# Violin plot of gm_vol in HELIUS before vs. after harmonization, split by sex
general_compare_harm_one_site_violins(
        HELIUS,
        neuro_harm_HELIUS,
        ['gm_vol'],
        chosen_feature="sex"
)

In [None]:
### There is better code for this in R; to be added...