In [1]:
import pandas as pd
import numpy as np
import gc
import sys
from pathlib import Path

from saiva.model.shared.utils import get_client_class, get_memory_usage
from saiva.model.shared.constants import CLIENT
import seaborn as sns
from scipy import stats as ss
import matplotlib.pyplot as plt
import timeit
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000)
pd.set_option('max_colwidth', -1)

  from ipykernel import kernelapp as app


In [2]:
processed_path = Path('/data/processed')
total_df = pd.read_parquet(processed_path/'02-result.parquet')

In [3]:
print(total_df.shape)
df = total_df.sample(frac=.5)
print(df.shape)
df['hosp_target_3_day_hosp'] = df.hosp_target_3_day_hosp.fillna(False)
df['hosp_target_7_day_hosp'] = df.hosp_target_7_day_hosp.fillna(False)

(1096254, 4097)
(548127, 4097)


## ================== Types of columns ===================

In [4]:
cumsum_cols = [col for col in df.columns if col.startswith('cumsum')]
dx_cols = [col for col in cumsum_cols if '_dx_' in col]
med_cols = [col for col in cumsum_cols if '_med_' in col]
order_cols = [col for col in cumsum_cols if '_order_' in col]
alert_cols = [col for col in cumsum_cols if '_alert_' in col]
labs_cols = [col for col in cumsum_cols if '_labs__' in col]

print('cumsum columns',len(cumsum_cols))
print('dx columns',len(dx_cols))
print('med columns',len(med_cols))
print('order columns',len(order_cols))
print('alerts columns',len(alert_cols))
print('lab columns',len(labs_cols))

na_indictator_cols = [col for col in df.columns if col.startswith('na_indictator_')]
print('na_indictator columns',len(na_indictator_cols))
diff_cols = [col for col in df.columns if col.startswith('diff_')]
print('vitals diff columns',len(diff_cols))
rol_cols = [col for col in df.columns if col.startswith('rol')]
print('vitals rol columns',len(rol_cols))
vtl_cols = [col for col in df.columns if col.startswith('vtl_')]
print('vitals columns',len(vtl_cols))
demo_cols = [col for col in df.columns if col.startswith('demo_')]
print('demo columns',len(demo_cols))
date_cols = [col for col in df.columns if col.startswith('censusdate_') or col.startswith('dateofbirth_')]
print('date columns',len(date_cols))
hosp_cols = ['hosp_count_prior_hosp', 'hosp_days_since_last_hosp']
print('hosp columns',len(hosp_cols))
identity_cols = ['censusdate', 'masterpatientid', 'facilityid', 'bedid', 'client']
target_cols = [col for col in df.columns if 'target' in col]
print('identity columns',len(identity_cols))
print('target columns',len(target_cols))
target_cols = ['hosp_target_3_day_hosp']

cumsum columns 2868
dx columns 381
med columns 1413
order columns 378
alerts columns 15
lab columns 681
na_indictator columns 990
vitals diff columns 44
vitals rol columns 44
vitals columns 22
demo columns 99
date columns 22
hosp columns 2
identity columns 5
target columns 2


In [6]:
cols = cumsum_cols + target_cols + identity_cols + demo_cols + rol_cols + diff_cols + vtl_cols + na_indictator_cols + hosp_cols + date_cols
# get remaining columns
remaining_df = df[df.columns.difference(cols)]
remaining_df.columns

Index(['hosp_target_7_day_hosp'], dtype='object')

## =================== Correlation ===================

In [5]:
def correlation(dataset, threshold):
    lst =[]
    corr_matrix = dataset.corr()
    column_length = len(corr_matrix.columns)
    for i in range(column_length):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) >= threshold: # we are interested in absolute coeff value
                colname1 = corr_matrix.columns[i]
                colname2 = corr_matrix.columns[j]
                lst.extend([[colname1,colname2, corr_matrix.iloc[i, j]]])
    temp_df = pd.DataFrame(lst,columns=['column1','column2','correlation'])
    temp_df.sort_values('correlation',ascending=False,inplace=True)
                
    return temp_df, corr_matrix 

def target_correlation(dataset, target_cols, threshold):
    lst =[]
    corr_matrix = dataset.corr()
    column_length = len(corr_matrix.columns)
    for i in range(column_length):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) >= threshold:
                colname1 = corr_matrix.columns[i]
                colname2 = corr_matrix.columns[j]
                if colname1 in target_cols or colname2 in target_cols:
                    lst.extend([[colname1,colname2, corr_matrix.iloc[i, j]]])
    temp_df = pd.DataFrame(lst,columns=['column1','column2','correlation'])
    temp_df.sort_values('correlation',ascending=False,inplace=True)
                
    return temp_df, corr_matrix 

In [9]:
_df = df[date_cols]
df_corr,corr_matrix = correlation(_df,.8)
df_corr

# more analysis on the frequency of data - vitals
# 30 day window & all time window
# Na filling 
# hyper opt training
# parallel processing 

Unnamed: 0,column1,column2,correlation
0,censusdate_Week,censusdate_Month,0.977162
1,dateofbirth_Week,dateofbirth_Month,0.962279


In [None]:
df_corr.to_csv('cr_date.csv', header=True)

In [59]:
_df = df[date_cols + target_cols]
df_corr,corr_matrix = target_correlation(_df,target_cols, 0.5)
df_corr

Unnamed: 0,column1,column2,correlation


## =================== Correlation Heat Map ===================

In [None]:
f, ax = plt.subplots(figsize=(11, 15)) 
heatmap = sns.heatmap(corr_matrix, 
                      square = True,
                      linewidths = .5,
                      cmap = 'coolwarm',
                      cbar_kws = {'shrink': .4, 
                                'ticks' : [-1, -.5, 0, 0.5, 1]},
                      vmin = -1, 
                      vmax = 1,
                      annot = True,
                      annot_kws = {"size": 12})
#add the column names as labels
ax.set_yticklabels(corr_matrix.columns, rotation = 0)
ax.set_xticklabels(corr_matrix.columns)
sns.set_style({'xtick.bottom': True}, {'ytick.left': True})

## =================== Categorical Correlation ===================

In [None]:
def cramers_v(col1, col2):
    confusion_matrix = pd.crosstab(col1,col2)
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2/n
    r,k = confusion_matrix.shape
    phi2corr = max(0, phi2-((k-1)*(r-1))/(n-1))
    rcorr = r-((r-1)**2)/(n-1)
    kcorr = k-((k-1)**2)/(n-1)
    return np.sqrt(phi2corr/min((kcorr-1),(rcorr-1)))


cat_cols = demo_cols
lst =[]
for i in range(len(cat_cols)):
    for j in range(len(cat_cols)):
        if i != j:
            colname1 = cat_cols[i]
            colname2 = cat_cols[j]
            corr = cramers_v(df[colname1], df[colname2])
            lst.extend([[colname1,colname2,corr]])
temp_df = pd.DataFrame(lst,columns=['column1','column2','correlation'])
temp_df.sort_values('correlation',ascending=False,inplace=True)
temp_df[temp_df['correlation'] > 0.5]

## ========== Mutual Information ==================

In [6]:
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.model_selection import train_test_split

# calculate the mutual information between the variables and the target
# this returns the mutual information value of each feature.
# the smaller the value the less information the feature has about the target

X_train = df[diff_cols]
y_train = df[target_cols]
mi = mutual_info_classif(X_train.fillna(0), y_train)
mi

  y = column_or_1d(y, warn=True)


array([6.29390013e-04, 4.01312087e-04, 2.97718926e-05, 1.24987139e-03,
       2.57054523e-05, 8.94196900e-04, 1.11550361e-03, 4.65247559e-04,
       2.08214724e-04, 3.25365184e-04, 5.57329018e-04, 0.00000000e+00,
       3.56419500e-04, 3.81318941e-04, 2.50198797e-04, 5.46098608e-04,
       4.89236811e-04, 2.07178553e-04, 1.20349541e-03, 3.26883026e-04,
       3.05834570e-04, 0.00000000e+00, 7.06512385e-04, 8.21554014e-04,
       1.24135693e-04, 1.61377511e-03, 4.21487853e-04, 1.22482124e-03,
       1.71350643e-03, 6.91066961e-04, 7.31633365e-04, 4.38595262e-04,
       6.82785919e-04, 3.50232967e-04, 7.95134028e-04, 4.52938125e-04,
       6.70349997e-04, 7.76446491e-04, 2.76156803e-04, 2.16806924e-04,
       1.31087379e-03, 4.60974199e-04, 3.45691144e-04, 2.08660022e-04])

In [7]:
# let's add the variable names and order the features
# according to the MI for clearer visualisation
mi = pd.Series(mi)
mi.index = X_train.columns
mi = mi.sort_values(ascending=False)
mi.to_csv('mi-vitals-diff2.csv', header=True)

## =============== Play ==================

In [8]:
processed_path = Path('/data/processed')
combined = pd.read_parquet(processed_path/'01-result.parquet')

In [17]:
vtl_cols = [col for col in combined.columns if col.startswith('vtl')]

ffilled = combined.groupby('masterpatientid')[vtl_cols].fillna(method='ffill')
ffilled['masterpatientid'] = df.masterpatientid
vtl_cols = [col for col in vtl_cols if '_min_' not in col and '_max_' not in col]
vtl_cols

['vtl_median_BP - Systolic',
 'vtl_median_Blood Sugar',
 'vtl_median_Height',
 'vtl_median_O2 sats',
 'vtl_median_Pain Level',
 'vtl_median_Pulse',
 'vtl_median_Respiration',
 'vtl_median_Temperature',
 'vtl_median_Weight',
 'vtl_std_BP - Systolic',
 'vtl_std_Blood Sugar',
 'vtl_std_Height',
 'vtl_std_O2 sats',
 'vtl_std_Pain Level',
 'vtl_std_Pulse',
 'vtl_std_Respiration',
 'vtl_std_Temperature',
 'vtl_std_Weight',
 'vtl_median_diastolicvalue',
 'vtl_std_diastolicvalue',
 'vtl_bmi']

## ============== check why labs features not coming up =============

In [None]:
master_patient_lookup = result_dict['master_patient_lookup'].copy(deep=True)
census = result_dict['patient_census'].copy(deep=True)

base = census.merge(
    master_patient_lookup,
    how='inner',
    left_on=['masterpatientid'],
    right_on=['masterpatientid']
)
base.shape



labs_df = result_dict['patient_lab_results'].copy(deep=True)

labs_df['resultdate'] = labs_df['resultdate'].dt.normalize()
base['censusdate'] = base['censusdate'].dt.normalize()

merged_df = base.merge(
        labs_df,
        how='inner',
        left_on=['masterpatientid', 'censusdate'],
        right_on=['masterpatientid', 'resultdate']
    )
print(labs_df.shape)
print(merged_df.shape)


# Labs starting and ending date
labs_df = result_dict['patient_lab_results'].copy(deep=True)
labs_df['resultdate'] = labs_df['resultdate'].dt.normalize()
for fid in list(labs_df.facilityid.unique()):
    print(f"{fid}  - {labs_df[labs_df['facilityid'] == fid]['resultdate'].min()} - {labs_df[labs_df['facilityid'] == fid]['resultdate'].max()}")