In [None]:
! pip install matplotlib plotly

In [None]:
import pandas as pd
import numpy as np
# Show every column when a wide DataFrame is displayed (None = no column limit).
pd.set_option('display.max_columns',None)
import json
import os
import sys
import pandas as pd
from pathlib import Path
import boto3
from eliot import log_message
from sqlalchemy import create_engine
from sqlalchemy.engine.url import URL
from sqlalchemy import text
import pickle
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

## Load config

In [None]:
from shared.constants import LOCAL_TRAINING_CONFIG_PATH
from shared.utils import load_config

# Load the configuration found at LOCAL_TRAINING_CONFIG_PATH and keep a handle
# on its `training_config` section, which the following cells read from.
config = load_config(LOCAL_TRAINING_CONFIG_PATH)
training_config = config.training_config

In [None]:
processed_path = Path('/data/processed')
data_path = Path('/data/raw')

# Single label covering every organisation in this run, joined with "+".
CLIENT = "+".join(org_config.organization_id for org_config in training_config.ml_model_org_configs)

# Work on a copy so the shift below is applied exactly once: mutating the config's
# own object would push the start date a further 30 days on every re-run of this cell.
# NOTE(review): assumes `experiment_dates` is a dict-like mapping — confirm.
EXPERIMENT_DATES = dict(training_config.training_metadata.experiment_dates)

# Start training from day 31 so that the 2/7/14/30-day cumulative-sum windows
# are all fully populated from the first training row.
EXPERIMENT_DATES['train_start_date'] = str(
    (pd.to_datetime(EXPERIMENT_DATES['train_start_date']) + pd.DateOffset(days=30)).date()
)

EXPERIMENT_DATES

In [None]:
%%time

# overall_df = pd.read_parquet(processed_path/'final_cleaned_df.parquet')


with open(processed_path/'final-train_x.pickle','rb') as f: train_x = pickle.load(f)
with open(processed_path/'final-train_target_3_day.pickle','rb') as f: train_target_3_day = pickle.load(f)
train_x['hosp_target_3_day_hosp'] = train_target_3_day


with open(processed_path/'final-valid_x.pickle','rb') as f: valid_x = pickle.load(f)
with open(processed_path/'final-valid_target_3_day.pickle','rb') as f: valid_target_3_day = pickle.load(f)       
valid_x['hosp_target_3_day_hosp'] = valid_target_3_day


with open(processed_path/'final-test_x.pickle','rb') as f: test_x = pickle.load(f)
with open(processed_path/'final-test_target_3_day.pickle','rb') as f: test_target_3_day = pickle.load(f)      
test_x['hosp_target_3_day_hosp'] = test_target_3_day

In [None]:
def get_mm_distribution(df, dtype):
    """Summarise the positive/negative split of ``hosp_target_3_day_hosp`` in ``df``.

    A row is positive when ``hosp_target_3_day_hosp == 1``; every other row
    (including rows with a missing target) is counted as negative.

    Args:
        df: DataFrame containing a ``hosp_target_3_day_hosp`` column.
        dtype: Label for the data split (e.g. ``'TRAIN'``), returned unchanged.

    Returns:
        ``[total, positive, negative, positive_percent, negative_percent, dtype]``.
        Both percentages are ``0.0`` when ``df`` has no rows.
    """
    total = df.shape[0]
    positive = int((df['hosp_target_3_day_hosp'] == 1).sum())
    negative = total - positive

    if total == 0:
        # An empty split has no meaningful ratio; avoid a ZeroDivisionError.
        positive_percent = negative_percent = 0.0
    else:
        positive_percent = (100 * positive) / total
        negative_percent = (100 * negative) / total

    return [total, positive, negative, positive_percent, negative_percent, dtype]


# One summary row per data split, in TRAIN / VALID / TEST order.
data_list = [
    get_mm_distribution(split_frame, split_label)
    for split_frame, split_label in (
        (train_x, 'TRAIN'),
        (valid_x, 'VALID'),
        (test_x, 'TEST'),
    )
]

# Tabulate the class balance of the three splits.
df = pd.DataFrame(
    data=data_list,
    columns=["Total", "Positive", "Negative", "Positive%", "Negative%","TYPE"],
)

df.head()

In [None]:
def get_stay_length(staylength):
    """Cap a length of stay at 120; values of 120 or less are returned unchanged."""
    return 120 if staylength > 120 else staylength
    
def get_metrics_df(df, data_type):
    """Count all / positive / negative rows of ``df`` per length-of-stay (LOS) bucket.

    LOS is read from ``admissions_days_since_last_admission``, capped at 120 and
    truncated to an integer, giving buckets 0..120 (bucket 120 therefore holds
    every row with a value of 120 or more). A row is positive when
    ``hosp_target_3_day_hosp == 1`` and negative otherwise.

    Args:
        df: DataFrame with ``admissions_days_since_last_admission`` and
            ``hosp_target_3_day_hosp`` columns.
        data_type: Prefix for the output column names (e.g. ``'TRAIN'``).

    Returns:
        DataFrame indexed 0..120 (the LOS bucket) with columns
        ``{data_type}_ALL``, ``{data_type}_POSITIVE``, ``{data_type}_NEGATIVE``,
        ``{data_type}_positive_percent`` and ``{data_type}_negative_percent``.
        Percentages are 0 for buckets that contain no rows.

    Raises:
        ValueError: if any LOS value is missing, or is negative after
            truncation. Negative values used to wrap around (negative list
            indexing) and be silently counted in the long-stay buckets.
    """
    max_los = 120

    los = df['admissions_days_since_last_admission']
    if los.isna().any():
        raise ValueError(
            "admissions_days_since_last_admission contains missing values"
        )

    # Cap first, then truncate toward zero.
    buckets = los.clip(upper=max_los).astype(int).to_numpy()
    if (buckets < 0).any():
        raise ValueError(
            "admissions_days_since_last_admission contains negative values"
        )

    is_positive = (df['hosp_target_3_day_hosp'] == 1).to_numpy()

    # Vectorised per-bucket counts; index i holds the count for LOS == i.
    total = np.bincount(buckets, minlength=max_los + 1)
    positive = np.bincount(buckets[is_positive], minlength=max_los + 1)

    metric_df = pd.DataFrame(
        {"ALL": total, "POSITIVE": positive, "NEGATIVE": total - positive}
    )

    # Class share within each LOS bucket (0/0 yields NaN, filled with 0 below).
    metric_df['positive_percent'] = (metric_df['POSITIVE'] / metric_df['ALL']) * 100
    metric_df['negative_percent'] = (metric_df['NEGATIVE'] / metric_df['ALL']) * 100

    metric_df.columns = [f'{data_type}_{col}' for col in metric_df.columns]
    return metric_df.fillna(0)



In [None]:
# Per-LOS metrics for each split, then placed side by side (one row per LOS bucket).
_df1, _df2, _df3 = (
    get_metrics_df(split_frame, split_label)
    for split_frame, split_label in (
        (train_x, 'TRAIN'),
        (valid_x, 'VALID'),
        (test_x, 'TEST'),
    )
)

final_df = pd.concat([_df1, _df2, _df3], axis=1)
final_df.head()

## ===============================================================

## Distribution of patient length of stay (LOS) in the train / valid / test data

In [None]:
import plotly.express as px
import plotly.offline as pyo
import numpy as np
# Initialise plotly's offline notebook mode before any figure is shown.
pyo.init_notebook_mode()

In [None]:
# Parameters for the single-split LOS bar chart below.
graph_height = 750
data_type = 'Test'
pclass = 'Positive'
# Derive the final_df column names from the two choices above so that changing
# `data_type` or `pclass` cannot leave the column selectors out of sync.
selectedClass = f'{data_type.upper()}_{pclass.upper()}'
allClass = f'{data_type.upper()}_ALL'

In [None]:
print('Experiment: avante-3-day-hosp-V6-run1')
print(f'total {data_type} dataset size for {CLIENT}= ', final_df[allClass].sum())
print(f'total {pclass} patient days in {data_type} data = ', final_df[selectedClass].sum())

# Exclude the last row (LOS=120): it aggregates every stay capped at 120.
# Use `_final_df = final_df` instead to keep it in the plot.
_final_df = final_df.iloc[:-1]

# Horizontal bar chart: one bar per LOS value, bar length = number of patient days.
fig = px.bar(
    _final_df,
    x=[selectedClass],
    y=list(_final_df.index),
    height=graph_height,
    orientation='h',
    title=f'LOS Histogram for {pclass} patient days in {data_type} dataset for {CLIENT}',
    labels={'y': 'Length Of Stay'},
    color_discrete_sequence=['green'],
)
# Put LOS 0 at the top of the chart and label the count axis.
fig.update_yaxes(autorange="reversed")
fig.update_xaxes(title_text="Count")
fig.show()



In [None]:
# Parameters for the normalised train/valid/test comparison line chart.
graph_height = 750
pclass = 'Negative'
# Derive the normalised column names from `pclass` so the plotted columns and
# the plot title/axis text always refer to the same class.
selectedClasses = [f"{split}_{pclass.upper()}_nor" for split in ("TRAIN", "VALID", "TEST")]


In [None]:

# Rescale the per-LOS counts of each split to the 0-100 range so the three
# splits can be compared on one axis. MinMaxScaler scales every column on its own.
for _raw_cols, _scaled_cols in (
    (
        ["TRAIN_POSITIVE","VALID_POSITIVE","TEST_POSITIVE"],
        ["TRAIN_POSITIVE_nor","VALID_POSITIVE_nor","TEST_POSITIVE_nor"],
    ),
    (
        ["TRAIN_NEGATIVE","VALID_NEGATIVE","TEST_NEGATIVE"],
        ["TRAIN_NEGATIVE_nor","VALID_NEGATIVE_nor","TEST_NEGATIVE_nor"],
    ),
):
    _scaler = MinMaxScaler(feature_range=(0, 100))
    final_df[_scaled_cols] = _scaler.fit_transform(final_df[_raw_cols])

final_df.head()

In [None]:
# Exclude the last row (LOS=120): it aggregates every stay capped at 120.
_final_df = final_df.iloc[:-1]

# One line per split, showing the normalised per-LOS counts.
fig = px.line(
    _final_df,
    y=selectedClasses,
    x=list(_final_df.index),
    labels={
        'x':'Length Of Stay',
    },
    height=graph_height,
    title=f'LOS Histogram for Normalised {pclass} patient days across Train, Valid & Test dataset for {CLIENT}',
)
fig.update_yaxes(title_text=f"{pclass} Patient day Normalised value between 0 to 100")
# fig['layout']['yaxis']['tickformat'] = ',.0%'
# fig['layout']['yaxis']['range'] = [0,1]


fig.show()

## ======================= MATPLOT ===============================

In [None]:
# Positive rows (hosp_target_3_day_hosp == 1) of the training split.
_df = train_x.query('hosp_target_3_day_hosp == 1')
print('total RTH in train data = ', _df.shape[0])

# Single source of truth for the bin edges, reused for the tick positions.
# Values above the last edge (200) fall outside every bin and are not drawn.
los_bin_edges = [0, 30, 60, 100, 150, 200]

fig, ax = plt.subplots(1, 1)
ax.hist(_df['admissions_days_since_last_admission'], bins=los_bin_edges)
ax.set_title("histogram for Train data")
ax.set_xlabel("admissions_days_since_last_admission")
ax.set_ylabel("Count")
ax.set_xticks(los_bin_edges)
plt.show()

## ===============================================================