# This notebook examines the cleaned TOP and MRI datasets and makes alternative distributions

Note this must be run in the normal 'mrilanding' environment

## import libraries and data

In [None]:
import os 
import pandas as pd
import numpy as np

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Locations of the two cleaned datasets (both live in the same results folder).
filepath_mri = '../open_work/internal_results/cleaned_pvc2s/'
filepath_top = '../open_work/internal_results/cleaned_pvc2s/'

filename_mri = os.path.join(filepath_mri, 'StrokeMRI_pvc2c.csv')
filename_top = os.path.join(filepath_top, 'TOP_pvc2c.csv')

# Read each CSV into its own DataFrame.
stroke_mri = pd.read_csv(filename_mri)
top = pd.read_csv(filename_top)

## Let's see the distributions

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the TOP dataset.
top.describe()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the StrokeMRI dataset.
stroke_mri.describe()

In [None]:
# Numeric columns inspected in the checks and histograms below.
# The sub-lists only group related names for readability; the resulting
# order is what the rest of the notebook relies on.
numeric_columns = (
    ['age', 'gm_vol', 'wm_vol', 'csf_vol']
    + ['gm_ivc_ratio', 'gmwm_ivc_ratio']
    + ['wmh_vol', 'wmh_count']
    + ['deepwm_b_cov', 'aca_b_cov', 'mca_b_cov', 'pca_b_cov', 'totalgm_b_cov']
    + ['deepwm_b', 'aca_b', 'mca_b', 'pca_b', 'totalgm_b']
)

In [None]:
# How many entries in the numeric StrokeMRI columns are zero or negative?
# (A single "<= 0" comparison counts the negative and the zero entries together.)
(stroke_mri[numeric_columns] <= 0).sum().sum()

In [None]:
# How many entries in the numeric TOP columns are zero or negative?
# (A single "<= 0" comparison counts the negative and the zero entries together.)
(top[numeric_columns] <= 0).sum().sum()

In [None]:
# One histogram per numeric column of the TOP dataset, stacked vertically.
fig, axis = plt.subplots(len(numeric_columns), 1, figsize=(7, 15))
top[numeric_columns].hist(ax=axis, color='orange', alpha=0.8)
fig.suptitle('TOP distributions ', fontsize=20)
# Tighten the layout only after the histograms (and their per-column titles)
# exist; `rect` keeps a strip at the top free for the figure-level title.
fig.tight_layout(rect=(0, 0, 1, 0.97))

In [None]:
# One histogram per numeric column of the StrokeMRI dataset, stacked vertically.
fig, axis = plt.subplots(len(numeric_columns), 1, figsize=(7, 15))
stroke_mri[numeric_columns].hist(ax=axis, alpha=0.8)
fig.suptitle('StrokeMRI distributions ', fontsize=20)
# Tighten the layout only after the histograms (and their per-column titles)
# exist; `rect` keeps a strip at the top free for the figure-level title.
fig.tight_layout(rect=(0, 0, 1, 0.97))

In [None]:
# Overlay of both datasets, one histogram per numeric column
# (orange = TOP, blue = StrokeMRI). Both calls draw onto the same axes.
fig, axis = plt.subplots(len(numeric_columns), 1, figsize=(7, 15))
top[numeric_columns].hist(ax=axis, color='orange', alpha=0.6)
stroke_mri[numeric_columns].hist(ax=axis, color='blue', alpha=0.6)
fig.suptitle('both distributions ', fontsize=20)
# Tighten the layout only after the histograms (and their per-column titles)
# exist; `rect` keeps a strip at the top free for the figure-level title.
fig.tight_layout(rect=(0, 0, 1, 0.97))

In [None]:
#top['deepwm_b'].describe()

In [None]:
#stroke_mri['deepwm_b'].describe()

In [None]:
# Summary statistics of wmh_count in the StrokeMRI dataset.
stroke_mri['wmh_count'].describe()

In [None]:
# Summary statistics of wmh_count in the TOP dataset.
top['wmh_count'].describe()

In [None]:
# Histogram of wmh_vol in TOP (still the untransformed values when the
# notebook is run top to bottom; the column is overwritten with logs later).
top['wmh_vol'].hist()

In [None]:
# Histogram of wmh_vol in StrokeMRI (still the untransformed values when the
# notebook is run top to bottom; the column is overwritten with logs later).
stroke_mri['wmh_vol'].hist()

In [None]:
# Base-2 logarithm of the StrokeMRI wmh_vol column, stored as a new column
# and shown as a histogram.
stroke_mri['log_base2_wmh_vol'] = stroke_mri['wmh_vol'].pipe(np.log2)
stroke_mri['log_base2_wmh_vol'].hist()

In [None]:
# Base-10 logarithm of the StrokeMRI wmh_vol column, drawn over the base-2
# version for comparison (orange = log10, blue = log2).
stroke_mri['log_base10_wmh_vol'] = stroke_mri['wmh_vol'].pipe(np.log10)
stroke_mri['log_base10_wmh_vol'].hist(color='orange', alpha=0.6)
stroke_mri['log_base2_wmh_vol'].hist(color='blue', alpha=0.6)

So a log base 10 transform should spread out the WMH values more evenly, but it will make some values negative.

But what about the case of wmh_count, where one dataset was normally distributed and the other was not?

In [None]:
# Base-10 logarithm of wmh_count in both datasets, stored as new columns and
# overlaid as histograms (orange = StrokeMRI, blue = TOP).
stroke_mri['log_base10_wmh_count'] = stroke_mri['wmh_count'].pipe(np.log10)
top['log_base10_wmh_count'] = top['wmh_count'].pipe(np.log10)
stroke_mri['log_base10_wmh_count'].hist(color='orange', alpha=0.6)
top['log_base10_wmh_count'].hist(color='blue', alpha=0.6)

Still looks OK. Let us try using the log base 10 for both, and see if it improves harmonized predictions

In [None]:
# Range of wmh_vol in StrokeMRI; the minimum shows whether any value is zero
# before the log transform is applied.
stroke_mri['wmh_vol'].describe()

In [None]:
# Range of wmh_vol in TOP; the minimum shows whether any value is zero
# before the log transform is applied.
top['wmh_vol'].describe()

### Luckily for us, the values of wmh_vol are never exactly zero, and the same holds for wmh_count, so there is no need to worry about taking the log of zero

In [None]:
# Replace the raw WMH columns with their base-10 logs in both datasets.
# NOTE: 'wmh_vol' and 'wmh_count' are overwritten in place, so from this cell
# onwards those columns hold log10 values rather than the raw measurements.
if 'log_base10_wmh_vol' not in top.columns:
    # Derive the log column from the raw values only once. Without this guard,
    # re-running the cell would take log10 of the already-logged 'wmh_vol'.
    top['log_base10_wmh_vol'] = np.log10(top['wmh_vol'])

stroke_mri['wmh_count'] = stroke_mri['log_base10_wmh_count']
stroke_mri['wmh_vol'] = stroke_mri['log_base10_wmh_vol']
top['wmh_count'] = top['log_base10_wmh_count']
top['wmh_vol'] = top['log_base10_wmh_vol']

In [None]:
# Columns carried over into the exported datasets, in output order.
# The sub-lists only group related names for readability.
list_of_parameters = (
    ['participant_id', 'age', 'sex']
    + ['gm_vol', 'wm_vol', 'csf_vol']
    + ['gm_ivc_ratio', 'gmwm_ivc_ratio']
    + ['wmh_vol', 'wmh_count']
    + ['deepwm_b_cov', 'aca_b_cov', 'mca_b_cov', 'pca_b_cov', 'totalgm_b_cov']
    + ['deepwm_b', 'aca_b', 'mca_b', 'pca_b', 'totalgm_b']
)
# Restrict both datasets to the selected columns (all rows kept).
top_loged = top.loc[:, list_of_parameters]
stroke_mri_loged = stroke_mri.loc[:, list_of_parameters]

In [None]:
# Check that the log transform produced no missing or infinite values.
# np.log10(0) yields -inf (not NaN) and the log of a negative number yields
# NaN, so both are counted here; isna() on its own does not flag -inf.
# Only numeric columns are passed to np.isinf, since it is not defined for
# string/object columns.
print('top Nan numbers')
print(top_loged.isna().sum().sum())
print('top infinite numbers')
print(np.isinf(top_loged.select_dtypes(include='number')).sum().sum())
print('strokeMRI nan numbers')
print(stroke_mri_loged.isna().sum().sum())
print('strokeMRI infinite numbers')
print(np.isinf(stroke_mri_loged.select_dtypes(include='number')).sum().sum())


In [None]:
# save off log based datasets

# Write each log-transformed dataset to a CSV in the current working directory.
# The DataFrame index is written as the first column (pandas default).
for csv_name, logged_frame in (
    ('top_loged_mon.csv', top_loged),
    ('stroke_loged_mon.csv', stroke_mri_loged),
):
    logged_frame.to_csv(csv_name)

In [None]:
# Summary statistics of the exported TOP dataset (wmh_vol / wmh_count are log10 values here).
top_loged.describe()

In [None]:
# Summary statistics of the exported StrokeMRI dataset (wmh_vol / wmh_count are log10 values here).
stroke_mri_loged.describe()