## Open this in Jupyter Tree 

### Restart the kernel after the packages below are installed

In [None]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns',None)
import json
import os
import sys
import pandas as pd
from pathlib import Path
import boto3
from eliot import log_message
from sqlalchemy import create_engine
from sqlalchemy.engine.url import URL
from sqlalchemy import text
import pickle
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
from PIL import Image

from saiva.model.shared.constants import MODEL_TYPE
from saiva.model.shared.utils import get_client_class
from saiva.training import load_x_y_idens

## Load config

In [None]:
from saiva.model.shared.constants import saiva_api, LOCAL_TRAINING_CONFIG_PATH
from saiva.training.utils import load_config

# Load the training configuration from the local config path and keep a
# shortcut to its `training_config` section, which the cells below read.
config = load_config(LOCAL_TRAINING_CONFIG_PATH)
training_config = config.training_config

In [None]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo
import plotly.io as pio
import numpy as np
# Initialise plotly's offline notebook mode so figures can be rendered inline.
pyo.init_notebook_mode()

In [None]:
# Locations of the processed feature artifacts and of the raw input data.
processed_path = Path('/data/processed')
data_path = Path('/data/raw')

# Client label: every organization id in the training config, joined with "+".
# The loop variable is deliberately not called `config`, to avoid confusion
# with the loaded configuration object of the same name.
CLIENT = "+".join([org_config.organization_id for org_config in training_config.organization_configs])

# Work on a copy of the configured dates: the shift below used to be applied
# to the config's own mapping, so re-running this cell moved the start date
# forward by a further 30 days each time.
# NOTE(review): assumes `experiment_dates` is a dict-like mapping — confirm.
EXPERIMENT_DATES = dict(training_config.training_metadata.experiment_dates)
# Start training on day 31 so that the 2/7/14/30-day cumsum windows are
# already correct for the first training row.
EXPERIMENT_DATES['train_start_date'] = str(
    (pd.to_datetime(EXPERIMENT_DATES['train_start_date']) + pd.DateOffset(days=30)).date()
)

EXPERIMENT_DATES

In [None]:
# Normalise the model type to lower case; it is used below to build column
# names such as f'target_3_day_{MODEL_TYPE}' and is passed to load_x_y_idens.
MODEL_TYPE = MODEL_TYPE.lower()
print('MODEL:', MODEL_TYPE)

In [None]:
# Load the features and 3-day targets of each split from the processed data directory.
# The third return value is not used in this notebook
# (presumably the row identifiers, going by the loader's name — confirm).
train_x, train_target_3_day, _ = load_x_y_idens(processed_path, MODEL_TYPE, 'train')
valid_x, valid_target_3_day, _ = load_x_y_idens(processed_path, MODEL_TYPE, 'valid')
test_x, test_target_3_day, _ = load_x_y_idens(processed_path, MODEL_TYPE, 'test')

In [None]:
# True when the loader returned plain numpy arrays instead of DataFrames; in
# that case the arrays are wrapped in DataFrames using the pickled feature names.
np_mode = isinstance(train_x, np.ndarray)

In [None]:
if np_mode:
    # Numpy arrays carry no column information, so read the pickled metadata
    # stored next to the processed data: the categorical column names, the
    # feature names, and the category values of each categorical column.
    cate_columns = pickle.loads((processed_path / 'cate_columns.pickle').read_bytes())
    feature_names = pickle.loads((processed_path / 'feature_names.pickle').read_bytes())
    pandas_categorical = pickle.loads((processed_path / 'pandas_categorical.pickle').read_bytes())

    # Wrap every split in a DataFrame labelled with the feature names.
    train_x, valid_x, test_x = (
        pd.DataFrame(_split_array, columns=feature_names)
        for _split_array in (train_x, valid_x, test_x)
    )

In [None]:
# Store each split's 3-day target next to its features, under a column named
# after the model type.
_target_column = f'target_3_day_{MODEL_TYPE}'
for _features, _targets in (
    (train_x, train_target_3_day),
    (valid_x, valid_target_3_day),
    (test_x, test_target_3_day),
):
    _features[_target_column] = _targets

In [None]:
def get_facilities_from_data(df=None, categorical_features=None, pandas_categorical=None):
    """Return the facility values known to the data.

    Two sources are supported:

    * ``df`` given: the unique values of its ``facility`` column, in order of
      first appearance.
    * ``df`` omitted: the entry of ``pandas_categorical`` found at the position
      of ``'facility'`` within ``categorical_features``.

    Args:
        df: DataFrame with a ``facility`` column, or None.
        categorical_features: sequence of categorical column names; must
            contain ``'facility'`` when ``df`` is None.
        pandas_categorical: sequence of per-column category values, indexed
            in parallel with ``categorical_features``.

    Returns:
        The facility values (a list when read from ``df``; otherwise whatever
        ``pandas_categorical`` stores for the facility column).

    Raises:
        ValueError: if neither ``df`` nor both categorical arguments are given,
            or if ``'facility'`` is missing from ``categorical_features``.
    """
    if df is not None:
        return list(df['facility'].unique())

    if categorical_features is None or pandas_categorical is None:
        raise ValueError(
            "Pass either `df` or both `categorical_features` and `pandas_categorical`."
        )
    return pandas_categorical[categorical_features.index('facility')]

# Fetch the raw facility values: from the pickled categorical metadata in numpy
# mode, otherwise straight from the training DataFrame.
if np_mode:
    _raw_facilities = get_facilities_from_data(
        categorical_features=cate_columns,
        pandas_categorical=pandas_categorical,
    )
else:
    _raw_facilities = get_facilities_from_data(train_x)

# Keep only the part between the first and second underscore of each value.
facilities = [_name.split('_')[1] for _name in _raw_facilities]

# Two-row summary table: the facility list and the number of facilities.
client_df = pd.DataFrame(
    data=[
        ['Facilities', ','.join(facilities)],
        ['Facility count', len(facilities)],
    ],
    columns=["Client", CLIENT],
)

def get_mm_distribution(df, dtype, start_date, end_date, target_col=None):
    """Summarise the positive/negative balance of one data split.

    A row is positive when its target equals 1; every other row (including a
    missing target) is negative.

    Args:
        df: DataFrame of patient days containing the target column.
        dtype: label of the split, e.g. 'TRAIN'; returned unchanged.
        start_date: start date of the split; returned unchanged.
        end_date: end date of the split; returned unchanged.
        target_col: name of the target column. Defaults to
            f'target_3_day_{MODEL_TYPE}', read from the module-level MODEL_TYPE.

    Returns:
        A list ``[total_patient_days, positive, negative, positive_percent,
        negative_percent, n2p, dtype, start_date, end_date]``. Percentages and
        the negative-to-positive ratio are rounded to 2 decimals. The ratio is
        NaN when the split has no positive rows, and both percentages are 0.0
        when the split is empty (previously both cases raised
        ZeroDivisionError).
    """
    if target_col is None:
        target_col = f'target_3_day_{MODEL_TYPE}'

    total_patient_days = df.shape[0]
    # A single boolean pass replaces the two `query` scans; "not positive"
    # is exactly what the `!= 1` query selected.
    positive = int((df[target_col] == 1).sum())
    negative = total_patient_days - positive

    n2p = round(negative / positive, 2) if positive else float('nan')
    if total_patient_days:
        positive_percent = round((100 * positive) / total_patient_days, 2)
        negative_percent = round((100 * negative) / total_patient_days, 2)
    else:
        positive_percent = negative_percent = 0.0

    return [total_patient_days, positive, negative, positive_percent, negative_percent, n2p, dtype, start_date, end_date]


# One distribution row per split, each paired with its experiment date range.
_split_specs = (
    (train_x, 'TRAIN', 'train_start_date', 'train_end_date'),
    (valid_x, 'VALID', 'validation_start_date', 'validation_end_date'),
    (test_x, 'TEST', 'test_start_date', 'test_end_date'),
)
data_list = [
    get_mm_distribution(_frame, _label, EXPERIMENT_DATES[_start_key], EXPERIMENT_DATES[_end_key])
    for _frame, _label, _start_key, _end_key in _split_specs
]

_dist_columns = [
    "Patient days", "Positive", "Negative", "Positive%", "Negative%",
    "N2P Ratio", "TYPE", "start_date", "end_date",
]
dist_df = pd.DataFrame(data=data_list, columns=_dist_columns)

# Length of each split's date range in whole days.
_date_span = pd.to_datetime(dist_df['end_date']) - pd.to_datetime(dist_df['start_date'])
dist_df['days'] = _date_span.dt.days


# Only the last expression of a cell is displayed, so this first head() is
# evaluated but not shown.
dist_df.head()

client_df.head()

In [None]:
def get_stay_length(staylength):
    """Cap a length of stay at 120; smaller or equal values pass through unchanged."""
    # min() keeps its first argument unless the second is strictly smaller,
    # which is the same decision as the `staylength > 120` test.
    return min(staylength, 120)
    
def get_metrics_df(df, data_type, target_col=None, max_stay=120):
    """Count patient days per length-of-stay (LOS) bucket, split by target class.

    Row ``i`` of the result describes the patient days whose
    ``days_since_last_admission`` truncates to ``i``. Stays longer than
    ``max_stay`` are pooled into the last bucket. Rows with a missing LOS are
    skipped. A negative LOS is counted in bucket 0; the previous list-based
    implementation silently placed it in a bucket near the end through
    negative indexing.

    Args:
        df: DataFrame with a ``days_since_last_admission`` column and the
            target column.
        data_type: prefix for the output column names, e.g. 'TRAIN'.
        target_col: name of the target column. Defaults to
            f'target_3_day_{MODEL_TYPE}', read from the module-level MODEL_TYPE.
        max_stay: largest LOS bucket; the result has ``max_stay + 1`` rows.

    Returns:
        DataFrame indexed 0..max_stay with the columns ``{data_type}_ALL``,
        ``{data_type}_POSITIVE``, ``{data_type}_NEGATIVE``,
        ``{data_type}_positive_percent`` and ``{data_type}_negative_percent``.
        Percentages are rounded to 2 decimals and are 0 for empty buckets.
    """
    if target_col is None:
        target_col = f'target_3_day_{MODEL_TYPE}'

    # Vectorised replacement for the per-row iterrows() loop.
    los = pd.to_numeric(df['days_since_last_admission']).to_numpy(dtype=float, na_value=np.nan)
    is_positive = df[target_col].eq(1).fillna(False).to_numpy(dtype=bool)

    known = ~np.isnan(los)  # a missing LOS is rare but can happen
    # Cap at max_stay, then truncate toward zero exactly like int() did.
    buckets = np.clip(los[known], 0, max_stay).astype(np.int64)

    n_buckets = max_stay + 1
    total = np.bincount(buckets, minlength=n_buckets)
    positive = np.bincount(buckets[is_positive[known]], minlength=n_buckets)

    # Every counted row whose target is not 1 is negative.
    metric_df = pd.DataFrame({"ALL": total, "POSITIVE": positive, "NEGATIVE": total - positive})

    # Share of each class within a LOS bucket; empty buckets give NaN here
    # and are set to 0 below.
    metric_df['positive_percent'] = ((metric_df['POSITIVE'] / metric_df['ALL']) * 100).round(2)
    metric_df['negative_percent'] = ((metric_df['NEGATIVE'] / metric_df['ALL']) * 100).round(2)

    metric_df.columns = [data_type + '_' + col for col in metric_df.columns]
    metric_df = metric_df.fillna(0)

    return metric_df



In [None]:
# The LOS metrics only need the LOS column and the model-specific target.
_metric_columns = ['days_since_last_admission', f'target_3_day_{MODEL_TYPE}']
_df1, _df2, _df3 = (
    get_metrics_df(_split[_metric_columns], _label)
    for _split, _label in ((train_x, 'TRAIN'), (valid_x, 'VALID'), (test_x, 'TEST'))
)

# Place the three per-split tables side by side; they share the LOS index.
final_df = pd.concat([_df1, _df2, _df3], axis=1)
final_df.head()

In [None]:

# Rescale the positive and the negative counts of every split to 0-100.
# MinMaxScaler scales each column independently, so the splits become
# comparable in shape regardless of their size.
# NOTE(review): the last row (the pooled LOS bucket) takes part in the scaling
# although the plots below leave that row out — confirm this is intended.
for _klass in ("POSITIVE", "NEGATIVE"):
    _count_columns = [f"{_split}_{_klass}" for _split in ("TRAIN", "VALID", "TEST")]
    _scaled_columns = [f"{_column}_nor" for _column in _count_columns]
    _scaler = MinMaxScaler(feature_range=(0, 100))
    final_df[_scaled_columns] = _scaler.fit_transform(final_df[_count_columns])

final_df.head()

## =========== Generate Graph to be pushed to MLFlow ===============

In [None]:
def get_bar_graph(final_df, pclass, data_type, selectedClass, client, colour):
    """Build a vertical bar chart of patient-day counts per length of stay (LOS).

    Args:
        final_df: per-LOS metrics table; its index is used as the LOS value.
        pclass: class name used in the title, e.g. 'Positive'.
        data_type: split name used in the title, e.g. 'Train'.
        selectedClass: name of the count column of ``final_df`` to plot.
        client: client name used in the title.
        colour: colour of the bars.

    Returns:
        The plotly express bar figure, with LOS on the x axis and the count on
        the y axis.
    """
    # Leave out the last row, i.e. the final LOS bucket.
    _final_df = final_df.drop(final_df.tail(1).index)

    fig = px.bar(
        _final_df,
        y=[selectedClass],
        x=list(_final_df.index),
        title=f'LOS Histogram for {pclass} patient days in {data_type} dataset for {client}',
        # 'x' names the list passed as x; 'value' names the plotted wide-form
        # column. The previous keys matched nothing in this figure.
        labels={
            'x': 'Length Of Stay',
            'value': 'Count',
        },
        color_discrete_sequence=[colour]
    )
    # The bars are vertical: LOS runs along x and the count along y. The x
    # axis used to be titled "Count".
    fig.update_layout(xaxis_title='Length Of Stay', yaxis_title='Count')

    return fig

def get_line_graph(final_df, selectedClasses, pclass, client):
    """Build a line chart of normalised patient-day counts per length of stay (LOS).

    Args:
        final_df: per-LOS metrics table; its index is used as the LOS value.
        selectedClasses: list of normalised column names to draw, one line each.
        pclass: class name used in the title and y-axis label, e.g. 'Positive'.
        client: client name used in the title.

    Returns:
        The plotly express line figure.
    """
    # Leave out the last row, i.e. the final LOS bucket. drop() already
    # returns a new frame, so the extra copy made before it was redundant.
    _final_df = final_df.drop(final_df.tail(1).index)

    fig = px.line(
        _final_df,
        y=selectedClasses,
        x=list(_final_df.index),
        labels={
            'x': 'Length Of Stay',
        },
        title=f'LOS Histogram for Normalised {pclass} patient days across Train, Valid & Test dataset for {client}',
    )
    fig['layout']['yaxis']['title'] = f"{pclass} Patient day Normalised value between 0 to 100"

    return fig
    

In [None]:
# 2x4 grid: the top row shows positive patient days, the bottom row negative
# ones. Columns 1-3 hold the Train / Valid / Test bar charts and column 4 the
# normalised line chart.
fig = make_subplots(
    rows=2,
    cols=4,
    subplot_titles=(
        "POS Train", "POS Valid", "POS Test", "Normalised POS patient days",
        "NEG Train", "NEG Valid", "NEG Test", "Normalised NEG patient days",
    ),
)

plot1 = get_bar_graph(final_df, 'Positive', 'Train', 'TRAIN_POSITIVE', CLIENT, 'blue')
plot2 = get_bar_graph(final_df, 'Positive', 'Valid', 'VALID_POSITIVE', CLIENT, 'Red')
plot3 = get_bar_graph(final_df, 'Positive', 'Test', 'TEST_POSITIVE', CLIENT, 'Green')

plot4 = get_bar_graph(final_df, 'Negative', 'Train', 'TRAIN_NEGATIVE', CLIENT, 'blue')
plot5 = get_bar_graph(final_df, 'Negative', 'Valid', 'VALID_NEGATIVE', CLIENT, 'Red')
plot6 = get_bar_graph(final_df, 'Negative', 'Test', 'TEST_NEGATIVE', CLIENT, 'Green')

plot7 = get_line_graph(final_df, ["TRAIN_POSITIVE_nor", "VALID_POSITIVE_nor", "TEST_POSITIVE_nor"], 'Positive', CLIENT)
plot8 = get_line_graph(final_df, ["TRAIN_NEGATIVE_nor", "VALID_NEGATIVE_nor", "TEST_NEGATIVE_nor"], 'Negative', CLIENT)

# (source figure, subplot row, subplot column), in the order traces are added.
_placements = (
    (plot1, 1, 1), (plot2, 1, 2), (plot3, 1, 3), (plot7, 1, 4),
    (plot4, 2, 1), (plot5, 2, 2), (plot6, 2, 3), (plot8, 2, 4),
)
for _source, _row, _col in _placements:
    # A bar figure holds one trace (one plotted column); a line figure holds
    # one trace per split, i.e. three.
    for _trace in _source["data"]:
        fig.add_trace(_trace, row=_row, col=_col)

fig.update_layout(
    height=900,
    width=1024,
    title_text=f"LOS Histogram for patient days for {CLIENT}",
)

# Temporary image; it is merged into the final report image further down.
pio.write_image(fig, 'distribution_plot.png')


In [None]:
# Render the distribution table and the client table as PNG images.

_table_size = dict(autosize=False, width=1000, height=300)

layout = go.Layout(**_table_size)

# Same size, plus a footnote-style title explaining the shifted training start.
layout2 = go.Layout(
    title="* We discard first 30 days of training data to get correct cumsum 2/7/14/30 days calculations",
    **_table_size,
)


def _table_figure(frame, figure_layout):
    """Return a plotly table figure listing every column of ``frame``."""
    header = dict(
        values=list(frame.columns),
        fill_color='paleturquoise',
        align='left',
    )
    # go.Table takes one list of cell values per column, hence the transpose.
    cells = dict(
        values=frame.transpose().values.tolist(),
        fill_color='lavender',
        align='left',
    )
    return go.Figure(data=[go.Table(header=header, cells=cells)], layout=figure_layout)


fig1 = _table_figure(dist_df, layout2)
fig2 = _table_figure(client_df, layout)

# Temporary images; they are merged into the final report image further down.
pio.write_image(fig1, 'distribution_table.png')
pio.write_image(fig2, 'client_table.png')


# fig1.show()
# fig2.show()


In [None]:
# Merge the 3 images generated by the cells above into one report image,
# stacked vertically: client table, distribution table, LOS plots.

# Open the files through context managers so their handles are released before
# the temporary files are deleted (the handles were previously left open).
with Image.open('./client_table.png') as _client_img, \
        Image.open('./distribution_table.png') as _table_img, \
        Image.open('./distribution_plot.png') as image2:
    # Stretch both tables to the width of the plot image, keeping their heights.
    image0 = _client_img.resize((image2.width, _client_img.height))
    image1 = _table_img.resize((image2.width, _table_img.height))

    dst = Image.new('RGB', (image1.width, image0.height + image1.height + image2.height), (250,250,250))
    dst.paste(image0, (0, 0))
    dst.paste(image1, (0, image0.height))
    dst.paste(image2, (0, image1.height+image0.height))
    dst.save(f"distribution_{CLIENT}.png","PNG")

dst.show()

# The intermediate images are no longer needed once the merged file is saved.
os.remove("distribution_plot.png")
os.remove("client_table.png")
os.remove("distribution_table.png")


## ===========================END======================================

## Distribution of patients LOS in train / valid / test data

In [None]:
# Parameters for the single-split LOS bar chart drawn in the next cell.
graph_height = 750
# NOTE(review): this replaces the CLIENT value derived from the training config
# earlier in the notebook with a hard-coded name — confirm this is intended.
CLIENT = 'Vintage'
data_type = 'Test'  # Test, Valid, Train
pclass = 'Positive' # Negative, Positive
# Column of final_df to plot; it should match data_type and pclass above.
selectedClass = 'TEST_POSITIVE' # TEST_NEGATIVE, TEST_POSITIVE, TRAIN_POSITIVE, TRAIN_NEGATIVE, VALID_POSITIVE, VALID_NEGATIVE
# Column of final_df holding the total patient days of the chosen split.
allClass = 'TEST_ALL'  # VALID_ALL, TEST_ALL, TRAIN_ALL


In [None]:
# Headline numbers for the chosen split and class.
print('Experiment: Vintage-3-day-hosp-V6-hp-base-vintage')
print(f'total {data_type} dataset size for {CLIENT}= ', final_df[allClass].sum())
print(f'total {pclass} patient days in {data_type} data = ', final_df[selectedClass].sum())

# Leave out the last row, i.e. LOS=120.
_final_df = final_df.drop(final_df.index[-1:])

# Horizontal bars: the count runs along x, the length of stay along y.
fig = px.bar(
    _final_df,
    x=[selectedClass],
    y=list(_final_df.index),
    orientation='h',
    height=graph_height,
    color_discrete_sequence=['green'],
    title=f'LOS Histogram for {pclass} patient days in {data_type} dataset for {CLIENT}',
    labels={
        'y':'Length Of Stay',
        'caught_rth': 'LOS Count'
    },
)
# Reverse the y axis so the shortest stays appear at the top.
fig.update_layout(yaxis_autorange="reversed", xaxis_title="Count")
fig.show()



## ================ LOS Histogram for Normalised =================

In [None]:
# Parameters for the normalised LOS line chart drawn in the next cell.
graph_height = 750
# NOTE(review): hard-coded client name; it replaces the config-derived CLIENT — confirm.
CLIENT = 'Vintage'
pclass = 'Negative' # Negative, Positive
# Normalised columns of final_df to draw; pick the list that matches pclass.
selectedClasses = ["TRAIN_NEGATIVE_nor","VALID_NEGATIVE_nor","TEST_NEGATIVE_nor"]
    # ["TRAIN_NEGATIVE_nor","VALID_NEGATIVE_nor","TEST_NEGATIVE_nor"]
    # ["TRAIN_POSITIVE_nor","VALID_POSITIVE_nor","TEST_POSITIVE_nor"]

In [None]:
# Leave out the last row (the final LOS bucket) before plotting.
_final_df = final_df.drop(final_df.index[-1:])

# One line per selected normalised column, with the length of stay on x.
fig = px.line(
    _final_df,
    x=list(_final_df.index),
    y=selectedClasses,
    height=750,
    title=f'LOS Histogram for Normalised {pclass} patient days across Train, Valid & Test dataset for {CLIENT}',
    labels={
        'x':'Length Of Stay',
    },
)
fig.update_layout(yaxis_title=f"{pclass} Patient day Normalised value between 0 to 100")

fig.show()