In [1]:
!pip install matplotlib seaborn

Collecting matplotlib
  Downloading matplotlib-3.4.2-cp38-cp38-manylinux1_x86_64.whl (10.3 MB)
[K     |████████████████████████████████| 10.3 MB 12.7 MB/s eta 0:00:01
[?25hCollecting seaborn
  Downloading seaborn-0.11.1-py3-none-any.whl (285 kB)
[K     |████████████████████████████████| 285 kB 63.6 MB/s eta 0:00:01
[?25hCollecting pillow>=6.2.0
  Downloading Pillow-8.2.0-cp38-cp38-manylinux1_x86_64.whl (3.0 MB)
[K     |████████████████████████████████| 3.0 MB 61.9 MB/s eta 0:00:01
Collecting kiwisolver>=1.0.1
  Downloading kiwisolver-1.3.1-cp38-cp38-manylinux1_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 62.2 MB/s eta 0:00:01
[?25hCollecting cycler>=0.10
  Downloading cycler-0.10.0-py2.py3-none-any.whl (6.5 kB)
Installing collected packages: pillow, kiwisolver, cycler, matplotlib, seaborn
Successfully installed cycler-0.10.0 kiwisolver-1.3.1 matplotlib-3.4.2 pillow-8.2.0 seaborn-0.11.1


In [None]:
# Installs a JupyterLab extension, presumably so plotly figures render inline.
# NOTE(review): this extension package may be superseded on recent JupyterLab/plotly
# versions — confirm against the target environment before running.
!jupyter labextension install @jupyterlab/plotly-extension

In [3]:
!pip install plotly --quiet

In [None]:
import pandas as pd
import numpy as np
import gc
import sys
from pathlib import Path
sys.path.insert(0, '/src')

from utils.database import DbEngine
from utils.load_data import DataLoader, PatientCensus
from datetime import timedelta
import timeit
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import random
import plotly.express as px
import plotly.figure_factory as ff

from pylab import *
from sklearn.cluster import KMeans
from scipy.stats import gaussian_kde
from sklearn.neighbors import KernelDensity

# Display frames without truncation; `None` removes the corresponding limit.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# `None` (not the deprecated -1) is the supported way to disable
# column-width truncation.
pd.set_option('display.max_colwidth', None)

In [None]:
import os

# Client database to analyse and the census date range (both dates parsed to Timestamps).
CLIENT = 'trio'
START_DATE = pd.Timestamp('2020-01-01')
END_DATE = pd.Timestamp('2021-05-18')

# Echo the effective configuration; SAIVA_ENV falls back to 'dev' when unset.
print(CLIENT)
print(os.environ.get('SAIVA_ENV', 'dev'))
print(START_DATE, END_DATE)

In [None]:
# Create the project DB helper and obtain the engines used by the cells below.
engine = DbEngine()
# Engine returned by get_postgresdb_engine(); not referenced again in the visible cells.
saiva_engine = engine.get_postgresdb_engine()
# Engine for the database of the client selected by CLIENT.
client_sql_engine = engine.get_sqldb_engine(clientdb_name=CLIENT)

In [None]:
# Verify connectivity of the client SQL engine before loading any data
engine.verify_connectivity(client_sql_engine)

### Load patient data

In [None]:
# Build the patient census for the configured START_DATE..END_DATE range.
census = PatientCensus(
    client_sql_engine=client_sql_engine, 
    start_date=START_DATE, 
    end_date=END_DATE
)
# `df` is the census frame; later cells read its `masterpatientid` and `censusdate`
# columns (and `transfered` / `carelevelcode` via the frame derived from it).
df = census.get_patient_census()
# Patient ids handed to the DataLoader to scope what it loads.
masterpatient_ids = df['masterpatientid']

In [None]:
# Loader restricted to the census patients, with END_DATE passed as the census date.
data = DataLoader(
    client_sql_engine=client_sql_engine, 
    masterpatientid_list=masterpatient_ids, 
    census_date=END_DATE
)

# Raw alerts and diagnosis frames consumed by the count calculations further down.
alerts_df = data.load_alerts()
diagnosis_df = data.load_diagnosis()


### Calculate Alerts count

In [None]:
%%time 

_alerts_df = alerts_df.merge(df, on=['masterpatientid'])

# Filter for alerts less than census_date & greater than last 3 days from census_date
condition = (_alerts_df.createddate < _alerts_df.censusdate) & (
        _alerts_df.createddate >= (_alerts_df.censusdate - timedelta(days=3)))

last_alerts_df = _alerts_df[condition]

# Do Cumulative summation and pick the last row to get the overall alerts count for last 3 days
last_alerts_df['alerts_count'] = last_alerts_df.groupby(
    ['masterpatientid', 'censusdate']).createddate.cumcount() + 1
last_alerts_df.drop_duplicates(subset=['masterpatientid', 'censusdate'], keep='last', inplace=True)

last_alerts_df = last_alerts_df[['masterpatientid', 'censusdate','alerts_count']]

final_df = df.merge(
            last_alerts_df,
            how='left',
            left_on=['masterpatientid', 'censusdate'],
            right_on=['masterpatientid', 'censusdate']
        )
final_df['alerts_count'].fillna(0, inplace=True)

print(final_df.shape)
final_df.head(3)

### Calculate Diagnosis count

In [None]:
%%time 

_diagnosis_df = diagnosis_df.merge(df, on=['masterpatientid'])

# Filter for diagnosis less than census_date & greater than last 7 days from census_date
condition = (_diagnosis_df.onsetdate < _diagnosis_df.censusdate) & (
        _diagnosis_df.onsetdate >= (_diagnosis_df.censusdate - timedelta(days=7)))

last_diagnosis_df = _diagnosis_df[condition]

# Do Cumulative summation and pick the last row to get the overall alerts count for last 7 days
last_diagnosis_df['diagnosis_count'] = last_diagnosis_df.groupby(
    ['masterpatientid', 'censusdate']).onsetdate.cumcount() + 1
last_diagnosis_df.drop_duplicates(subset=['masterpatientid', 'censusdate'], keep='last', inplace=True)

last_diagnosis_df = last_diagnosis_df[['masterpatientid', 'censusdate','diagnosis_count']]

final_df = final_df.merge(
            last_diagnosis_df,
            how='left',
            left_on=['masterpatientid', 'censusdate'],
            right_on=['masterpatientid', 'censusdate']
        )
final_df['diagnosis_count'].fillna(0, inplace=True)

print(final_df.shape)
final_df.head(3)

In [None]:
# First few census rows whose diagnosis_count exceeds 39.
final_df.query('diagnosis_count > 39').head(5)

In [None]:
# Spot-check: list the raw alerts of one patient in a fixed date range so the
# computed alerts_count can be compared by hand.
# NOTE(review): the range here is (2020-12-25, 2021-01-02], which is not the 3-day
# half-open window used when alerts_count is computed — confirm the intended dates.
alerts_df.loc[
    (alerts_df['masterpatientid'] == 602992)
    & (alerts_df['createddate'] > '2020-12-25')
    & (alerts_df['createddate'] <= '2021-01-02')
]



In [None]:
# Spot-check: list the raw diagnoses of one patient in a fixed date range so the
# computed diagnosis_count can be compared by hand.
# NOTE(review): the range here is (2021-02-27, 2021-03-03], which is not the 7-day
# half-open window used when diagnosis_count is computed — confirm the intended dates.
diagnosis_df.loc[
    (diagnosis_df['masterpatientid'] == 414983)
    & (diagnosis_df['onsetdate'] > '2021-02-27')
    & (diagnosis_df['onsetdate'] <= '2021-03-03')
]



In [None]:
# Persist the frame with both count columns to 'final_df.parquet' in the working
# directory; a later cell reads it back from the same path.
final_df.to_parquet('final_df.parquet')

In [None]:
# Reload the frame from 'final_df.parquet' (rebinding final_df) and show its shape.
final_df = pd.read_parquet('final_df.parquet')
final_df.shape

In [None]:
# Order rows by patient, then by census date.
final_df = final_df.sort_values(by=['masterpatientid', 'censusdate'])

In [None]:
# X: the two count features; Y: the `transfered` label column.
X = final_df.loc[:, ['alerts_count', 'diagnosis_count']]
Y = final_df.loc[:, 'transfered']

In [None]:
# Count features split by label: rows with transfered == 1 vs rows with transfered == 0.
transfered_x = final_df.loc[final_df['transfered'] == 1, ['diagnosis_count', 'alerts_count']]
safe_x = final_df.loc[final_df['transfered'] == 0, ['diagnosis_count', 'alerts_count']]

In [None]:
# Summary statistics of both count columns for rows with transfered == 1.
transfered_x.describe()

In [None]:
# Summary statistics of both count columns for rows with transfered == 0.
safe_x.describe()

## Plot graphs

### Distribution & density

In [None]:
# Distribution of diagnosis_count for rows with transfered == 1, shown three ways:
# density histogram with KDE overlay, KDE only, and plain count histogram.
x = transfered_x['diagnosis_count']

sns.set(rc={"figure.figsize": (13, 8)})

# Explicit figure/axes instead of the bare `subplot` name that only exists through
# `from pylab import *`; histplot/kdeplot replace the deprecated `sns.distplot`.
fig = plt.figure()

ax = fig.add_subplot(2, 2, 1)
sns.histplot(x, kde=True, stat="density", ax=ax)
ax.set_title('diagnosis_count (transfered = 1): histogram + KDE')

ax = fig.add_subplot(2, 2, 2)
sns.kdeplot(x, ax=ax)
ax.set_title('diagnosis_count (transfered = 1): KDE')

ax = fig.add_subplot(2, 2, 3)
sns.histplot(x, ax=ax)
ax.set_title('diagnosis_count (transfered = 1): histogram')

fig.tight_layout()
plt.show()

In [None]:
# Distribution of diagnosis_count for rows with transfered == 0, shown three ways:
# density histogram with KDE overlay, KDE only, and plain count histogram.
x = safe_x['diagnosis_count']

sns.set(rc={"figure.figsize": (13, 8)})

# Explicit figure/axes instead of the bare `subplot` name that only exists through
# `from pylab import *`; histplot/kdeplot replace the deprecated `sns.distplot`.
fig = plt.figure()

ax = fig.add_subplot(2, 2, 1)
sns.histplot(x, kde=True, stat="density", ax=ax)
ax.set_title('diagnosis_count (transfered = 0): histogram + KDE')

ax = fig.add_subplot(2, 2, 2)
sns.kdeplot(x, ax=ax)
ax.set_title('diagnosis_count (transfered = 0): KDE')

ax = fig.add_subplot(2, 2, 3)
sns.histplot(x, ax=ax)
ax.set_title('diagnosis_count (transfered = 0): histogram')

fig.tight_layout()
plt.show()

In [None]:
# Distribution of alerts_count for rows with transfered == 1, shown three ways:
# density histogram with KDE overlay, KDE only, and plain count histogram.
x = transfered_x['alerts_count']

sns.set(rc={"figure.figsize": (13, 8)})

# Explicit figure/axes instead of the bare `subplot` name that only exists through
# `from pylab import *`; histplot/kdeplot replace the deprecated `sns.distplot`.
fig = plt.figure()

ax = fig.add_subplot(2, 2, 1)
sns.histplot(x, kde=True, stat="density", ax=ax)
ax.set_title('alerts_count (transfered = 1): histogram + KDE')

ax = fig.add_subplot(2, 2, 2)
sns.kdeplot(x, ax=ax)
ax.set_title('alerts_count (transfered = 1): KDE')

ax = fig.add_subplot(2, 2, 3)
sns.histplot(x, ax=ax)
ax.set_title('alerts_count (transfered = 1): histogram')

fig.tight_layout()
plt.show()

In [None]:
# Distribution of alerts_count for rows with transfered == 0, shown three ways:
# density histogram with KDE overlay, KDE only, and plain count histogram.
x = safe_x['alerts_count']

sns.set(rc={"figure.figsize": (13, 7)})

# Explicit figure/axes instead of the bare `subplot` name that only exists through
# `from pylab import *`; histplot/kdeplot replace the deprecated `sns.distplot`.
fig = plt.figure()

ax = fig.add_subplot(2, 2, 1)
sns.histplot(x, kde=True, stat="density", ax=ax)
ax.set_title('alerts_count (transfered = 0): histogram + KDE')

ax = fig.add_subplot(2, 2, 2)
sns.kdeplot(x, ax=ax)
ax.set_title('alerts_count (transfered = 0): KDE')

ax = fig.add_subplot(2, 2, 3)
sns.histplot(x, ax=ax)
ax.set_title('alerts_count (transfered = 0): histogram')

fig.tight_layout()
plt.show()

In [None]:
# diagnosis_count per care level code, coloured by the `transfered` label.
fig = px.scatter(
    data_frame=final_df,
    x="carelevelcode",
    y="diagnosis_count",
    color="transfered",
)
fig.show()
# kde density plot

In [None]:
# alerts_count per care level code, coloured by the `transfered` label.
fig = px.scatter(
    data_frame=final_df,
    x="carelevelcode",
    y="alerts_count",
    color="transfered",
)
fig.show()

## Clustering 

In [None]:
# K-means in short:
# - select random centroids
# - take each point and find the nearest centroid, measuring distance with
#   Euclidean distance
# - K-means is the most frequently used form of clustering due to its speed and simplicity.

# Fixed seed so the centroids and labels are reproducible across runs.
kmeans = KMeans(n_clusters=2, random_state=42)

# Fit once and keep the labels of that same fit. Fitting a second time (as a
# separate fit + fit_predict would) can yield a different clustering, so the
# printed centroids and `y_km` would no longer be guaranteed to match.
y_km = kmeans.fit_predict(X)

# Location of the clusters learned by the fitted model.
print(kmeans.cluster_centers_)

In [None]:
# Copy of final_df whose `transfered` column is replaced by the k-means labels,
# so the scatter below is coloured by cluster id rather than by the real label.
_final_df = final_df.assign(transfered=y_km)
fig = px.scatter(
    data_frame=_final_df,
    x="diagnosis_count",
    y="alerts_count",
    color="transfered",
)
fig.show()

In [None]:
# Same feature plane, coloured by the actual `transfered` label.
fig = px.scatter(
    data_frame=final_df,
    x="diagnosis_count",
    y="alerts_count",
    color="transfered",
)
fig.show()

### Kernel density estimation (KDE)

In [None]:

# 2D density of alerts_count (x) against the `transfered` label (y).
colorscale = ['#7A4579', '#D56073']

fig = ff.create_2d_density(
    x=X['alerts_count'],
    y=Y,
    colorscale=colorscale,
    hist_color='rgb(255, 237, 222)',
    point_size=3,
)
fig.show()

In [None]:

# 2D density of diagnosis_count (x) against the `transfered` label (y).
colorscale = ['#7A4579', '#D56073']

fig = ff.create_2d_density(
    x=X['diagnosis_count'],
    y=Y,
    colorscale=colorscale,
    hist_color='rgb(255, 237, 222)',
    point_size=3,
)
fig.show()

### Seaborn KDE plots
Reference: [seaborn.kdeplot documentation](https://seaborn.pydata.org/generated/seaborn.kdeplot.html)

In [None]:
# KDE of diagnosis_count for rows with transfered == 0.
# NOTE(review): `data` is rebound here; earlier in the notebook the same name holds
# the DataLoader instance, so the loader cell must be re-run before calling its
# load_* methods again after this cell.
data = safe_x['diagnosis_count']
sns.set_style('whitegrid')
# `bw_method` replaces the deprecated `bw` keyword of kdeplot.
sns.kdeplot(np.array(data), bw_method=0.5)

In [None]:
# KDE of the columns of transfered_x (rows with transfered == 1), passed as a whole frame.
sns.set_style('whitegrid')
sns.kdeplot(data=transfered_x)

In [None]:
# KDE of the columns of safe_x (rows with transfered == 0), passed as a whole frame.
sns.set_style('whitegrid')
sns.kdeplot(data=safe_x)