In [None]:
import pandas as pd
import json
import boto3
from eliot import log_message
# Show full cell contents. `None` means "no truncation"; the former sentinel
# -1 is deprecated since pandas 1.0 and rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)
import itertools
import numpy as np

## Experiment Knobs

* CLIENT_LIST = [ 'avante', 'champion', 'hsm', 'marquis', 'midwest', 'mmh', 'trio', 'uch', 'vintage' ]
* NOTE: `LENGTHOFSTAY` keeps only RTH events whose length of stay is less than or equal to `LENGTHOFSTAY` days.
* To keep all RTH events, set `LENGTHOFSTAY = None`.

In [None]:
# Clients whose transfers are included in this run.
CLIENT_LIST = ["trio"]

# Inclusive evaluation window, compared against `dateoftransfer` / `censusdate`.
START_DATE = "2021-06-01"
END_DATE = "2021-08-31"

# Upper bound (in days) on length of stay; None keeps every event.
# Example of an active filter: LENGTHOFSTAY = 30
LENGTHOFSTAY = None


* The dataframe below holds each transfer event together with its reported event within the last 3 days.
* The query that produces it resides in the 01 notebook.


In [None]:
# Load the transfer events and restrict them to the configured clients,
# date window and (optionally) maximum length of stay.
base_df = pd.read_csv('s3://saiva-playground/gill/RTH_data.csv')
for date_col in ('dateoftransfer', 'censusdate', 'lastadmissiondate'):
    base_df[date_col] = pd.to_datetime(base_df[date_col])

# Day difference between last admission and transfer, plus one.
base_df['lengthofstay'] = (base_df['dateoftransfer'] - base_df['lastadmissiondate']).dt.days + 1

# Only None disables the filter. A plain truthiness test would also treat
# LENGTHOFSTAY = 0 as "keep everything", which contradicts the knob's notes.
if LENGTHOFSTAY is not None:
    base_df = base_df[base_df['lengthofstay'] <= LENGTHOFSTAY]

in_scope = (
    base_df['client'].isin(CLIENT_LIST)
    & (base_df['dateoftransfer'] >= START_DATE)
    & (base_df['dateoftransfer'] <= END_DATE)
)
kept_columns = [
    'client', 'facilityname', 'facilityid', 'censusdate',
    'dateoftransfer', 'masterpatientid', 'rank_cutoff', 'resident_transferred',
]
# .copy() makes the result an independent frame, so later column assignments
# do not trigger SettingWithCopyWarning.
base_df = base_df.loc[in_scope, kept_columns].copy()
base_df.head()

* Divide the dataframe into two dataframes: ranked transfers and unranked transfers.

In [None]:
# Rows with a rank_cutoff are "ranked" transfers; rows without one are only
# counted per facility.
# NOTE: this cell overwrites base_df, so re-running it without re-running the
# load cell leaves unranked_base_df empty.
has_rank = base_df['rank_cutoff'].notna()

unranked_base_df = (
    base_df[~has_rank]
    .groupby(['client', 'facilityname', 'facilityid'])['resident_transferred']
    .count()
    .reset_index()
)
base_df = base_df[has_rank]
unranked_base_df.head()

## Calculating Recall

In [None]:
# One row per transfer key, carrying the highest and lowest rank_cutoff
# recorded for that key.
transfer_grouping = base_df.groupby(
    ['client', 'facilityid', 'facilityname', 'masterpatientid', 'dateoftransfer', 'resident_transferred']
)
base_recall_df = (
    transfer_grouping['rank_cutoff']
    .agg(['max', 'min'])
    .reset_index()
    .rename(columns={'max': 'max_rank', 'min': 'min_rank'})
)
base_recall_df.head()

In [None]:
"""
following code snippet perform the below operation for each date and each facilityid - 
1) It merges(right join) the dataframe slice(each date, each facilityid) 
with another dataframe 'one_to_max' which contains single column of 1 to max rank for that facility during the whole timeperiod.
2) 'client','facilityid','dateoftransfer' is filled by ffill.
3) 'rank' is copied on 'rank_cutoff' and all the null values are filled by 0.
All the dataframe slices are appended together to form a single dataframe.
"""

combined_list =[]
facilitynames = base_recall_df.facilityname.unique().tolist()
for date in pd.date_range(start=START_DATE, end=END_DATE):
    for facilityname in facilitynames:
        one_to_max = pd.Series(range(1, base_recall_df[base_recall_df['facilityname']==facilityname]['max_rank'].max().astype(int)+1))
        one_to_max = pd.DataFrame({'rank':one_to_max})
        
        specific_date_facility_df = base_recall_df[(base_recall_df['dateoftransfer']==date) & (base_recall_df['facilityname']==facilityname)]
        
        if len(specific_date_facility_df):
            specific_date_facility_df = specific_date_facility_df.merge(one_to_max,left_on='min_rank', right_on='rank', how='right')
            specific_date_facility_df.loc[:,['client', 'facilityname', 'facilityid','dateoftransfer']] = specific_date_facility_df.loc[:,['client', 'facilityname', 'facilityid','dateoftransfer']].ffill()
            specific_date_facility_df['resident_transferred'].fillna(0,inplace=True)
            specific_date_facility_df['min_rank'] = specific_date_facility_df['rank']
            specific_date_facility_df = specific_date_facility_df.drop('rank',axis=1)
            specific_date_facility_df = specific_date_facility_df.sort_values(by=['dateoftransfer','client','facilityname', 'facilityid','min_rank'])
            combined_list.append(specific_date_facility_df)
            
combined_base_recall_df = pd.concat(combined_list)
combined_base_recall_df = combined_base_recall_df.sort_values(by=['client','dateoftransfer','facilityname', 'facilityid','min_rank'])
combined_base_recall_df = combined_base_recall_df.reset_index(drop=True)
combined_base_recall_df = combined_base_recall_df[~combined_base_recall_df['client'].isna()]
combined_base_recall_df.head()

* `hospitalized_cumsum` is the cumulative number of transfer events at or before that particular rank (on a particular date and facility).
* `total_relevant` is the total number of transfer events that occurred on a particular date and facility.

In [None]:
def recall_at_k(group):
    """Add running and total transfer counts to one groupby slice.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single group, in rank order, with a `resident_transferred`
        column.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index and two extra columns:
        `hospitalized_cumsum` (cumulative sum of `resident_transferred`) and
        `total_relevant` (its total over the group).
    """
    transferred = group['resident_transferred']
    # Build a new frame rather than writing into `group`: functions passed to
    # groupby.apply must not mutate the object they receive.
    return group.assign(
        hospitalized_cumsum=transferred.cumsum(),
        total_relevant=transferred.sum(),
    ).reset_index(drop=True)

# Apply recall_at_k to every (client, facility, transfer date) slice and
# stitch the results back into one flat frame.
per_facility_day = combined_base_recall_df.groupby(
    ["client", "facilityname", "facilityid", "dateoftransfer"]
)
performance_base = per_facility_day.apply(recall_at_k).reset_index(drop=True)

performance_base.head()

In [None]:
"""
'hospitalized_cumsum' signifies the cumulative summation of the patients transferred per facility per date
'total_relevant' signifies the total residents transferred to the hospital per facility per date
"""
def recall_at_k(group):
    group.loc[:, "hospitalized_cumsum"] = group.resident_transferred.cumsum()
    group.loc[:, "total_relevant"] = group.resident_transferred.sum()
    return group.reset_index(drop=True)

# Apply recall_at_k to every (client, facility, transfer date) slice and
# flatten the result.
per_facility_day = combined_base_recall_df.groupby(
    ["client", "facilityname", "facilityid", "dateoftransfer"]
)
performance_base = per_facility_day.apply(recall_at_k).reset_index(drop=True)


# Recall@k per facility: sum over days of the transfers caught at rank <= k,
# divided by the sum over days of all transfers (ranked plus unranked).
rank_keys = ['client', 'facilityname', 'facilityid', 'min_rank']
facility_keys = ['client', 'facilityname', 'facilityid']

predicted_df = (
    performance_base.groupby(rank_keys)['hospitalized_cumsum']
    .sum()
    .reset_index(name='total_residents_predicted')
)

transferred_df = (
    performance_base.groupby(rank_keys)['total_relevant']
    .sum()
    .reset_index(name='total_residents_transferred')
)

if len(unranked_base_df):
    # Left join: a facility with no unranked transfers must stay in the
    # result with zero added. An inner join would drop it from recall_df.
    transferred_df = transferred_df.merge(unranked_base_df, how='left', on=facility_keys)
    transferred_df['total_residents_transferred'] += transferred_df['resident_transferred'].fillna(0)
    transferred_df = transferred_df.drop(columns='resident_transferred')

# NOTE(review): if several transfers share the same min_rank on one facility
# and date, that rank appears on several rows and each is summed here.
# Confirm this is the intended weighting.
recall_df = predicted_df.merge(transferred_df, how='inner', on=rank_keys)
recall_df['recall_at_k'] = recall_df['total_residents_predicted'] / recall_df['total_residents_transferred']
recall_df.head()

In [None]:
# Roll the facility-level counts up to client level and recompute recall@k.
# Columns are selected with a list: the bare tuple form
# groupby(...)['a', 'b'] was deprecated in pandas 1.0 and removed in 2.0.
clientwise_recall_df = (
    recall_df.groupby(['client', 'min_rank'])[
        ['total_residents_predicted', 'total_residents_transferred']
    ]
    .sum()
    .reset_index()
)
clientwise_recall_df['recall_at_k'] = (
    clientwise_recall_df['total_residents_predicted']
    / clientwise_recall_df['total_residents_transferred']
)
clientwise_recall_df.head(5)

## Calculating Precision


###### Calculating precision numerator

In [None]:
# Number of ranked rows at each exact rank_cutoff, per facility.
rows_per_rank = base_df.groupby(
    ["client", "facilityname", "facilityid", 'rank_cutoff']
)['rank_cutoff'].count()
numerator_precision_df = rows_per_rank.reset_index(name='rank_count')
numerator_precision_df.head()

In [None]:
# Expand each facility's per-rank counts onto the full rank axis 1..max_rank
# (ranks with no rows get rank_count = 0), then accumulate so that
# rank_lessthan_equalto_k is the number of ranked rows with rank <= k.
#
# The expansion does not depend on the date, so it runs once per facility
# rather than once per facility per day followed by de-duplication.
facility_keys = ['client', 'facilityname', 'facilityid']

combined_list = []
facilitynames = numerator_precision_df.facilityname.unique().tolist()
for facilityname in facilitynames:
    facility_counts = numerator_precision_df[numerator_precision_df['facilityname'] == facilityname]
    if facility_counts.empty:
        continue

    max_rank = int(facility_counts['rank_cutoff'].max())
    one_to_max = pd.DataFrame({'rank': pd.Series(range(1, max_rank + 1))})

    expanded = facility_counts.merge(one_to_max, left_on='rank_cutoff', right_on='rank', how='right')
    # ffill alone leaves ranks below the facility's first observed rank with
    # NaN identifiers, which later breaks the integer cast of facilityid.
    expanded[facility_keys] = expanded[facility_keys].ffill().bfill()
    # Assign instead of chained fillna(inplace=True), which is not guaranteed
    # to write through to the frame.
    expanded['rank_count'] = expanded['rank_count'].fillna(0)
    expanded['rank_cutoff'] = expanded['rank']
    combined_list.append(expanded.drop('rank', axis=1))

concat_numerator_precision_df = (
    pd.concat(combined_list)
    .sort_values(by=['client', 'facilityname', 'facilityid', 'rank_cutoff'])
    .drop_duplicates()
    .reset_index(drop=True)
)

# groupby().cumsum() is index-aligned with its input, so it can be assigned
# directly without a round-trip merge on a helper index column.
numerator_precision_df = concat_numerator_precision_df.copy()
numerator_precision_df['rank_lessthan_equalto_k'] = (
    numerator_precision_df.groupby(facility_keys)['rank_count'].cumsum()
)
numerator_precision_df['facilityid'] = numerator_precision_df['facilityid'].astype(int)
numerator_precision_df.head()

#### calculating clientwise precision numerator 

In [None]:
# Number of ranked rows at each exact rank_cutoff, per client.
clientwise_numerator_precision_df = (
    base_df.groupby(["client", "rank_cutoff"])['rank_cutoff']
    .count()
    .reset_index(name='rank_count')
)

# Expand each client's counts onto the full rank axis 1..max_rank (missing
# ranks get rank_count = 0). The expansion does not depend on the date, so it
# runs once per client rather than once per client per day.
combined_list = []
clients = clientwise_numerator_precision_df.client.unique().tolist()
for client in clients:
    client_counts = clientwise_numerator_precision_df[clientwise_numerator_precision_df['client'] == client]
    if client_counts.empty:
        continue

    max_rank = int(client_counts['rank_cutoff'].max())
    one_to_max = pd.DataFrame({'rank': pd.Series(range(1, max_rank + 1))})

    expanded = client_counts.merge(one_to_max, left_on='rank_cutoff', right_on='rank', how='right')
    # Every row of this slice belongs to `client`; set it directly so ranks
    # below the first observed rank are labelled too (ffill would leave them
    # NaN).
    expanded['client'] = client
    # Assign instead of chained fillna(inplace=True), which is not guaranteed
    # to write through to the frame.
    expanded['rank_count'] = expanded['rank_count'].fillna(0)
    expanded['rank_cutoff'] = expanded['rank']
    combined_list.append(expanded.drop('rank', axis=1))

concat_clientwise_numerator_precision_df = (
    pd.concat(combined_list)
    .sort_values(by=['client', 'rank_cutoff'])
    .drop_duplicates()
    .reset_index(drop=True)
)

# groupby().cumsum() is index-aligned with its input, so it can be assigned
# directly without a round-trip merge on a helper index column.
clientwise_numerator_precision_df = concat_clientwise_numerator_precision_df.copy()
clientwise_numerator_precision_df['rank_lessthan_equalto_k'] = (
    clientwise_numerator_precision_df.groupby(['client'])['rank_count'].cumsum()
)
clientwise_numerator_precision_df.head()

###### Calculating precision denominator

In [None]:
# Load the `ranked_d` counts (one row per client, facility and census date)
# and keep only the configured clients and date window.
ranked_d = pd.read_csv('s3://saiva-playground/gill/ranked_d.csv')
ranked_d['censusdate'] = pd.to_datetime(ranked_d['censusdate'])

in_scope = (
    ranked_d['client'].isin(CLIENT_LIST)
    & (ranked_d['censusdate'] >= START_DATE)
    & (ranked_d['censusdate'] <= END_DATE)
)
# .copy() so the column added below is written to an independent frame
# instead of a filtered slice (avoids SettingWithCopyWarning).
ranked_d = ranked_d[in_scope].copy()

# Largest ranked_d value seen for each facility within the window.
ranked_d['max_ranked_d'] = ranked_d.groupby(['client', 'facilityid'])['ranked_d'].transform('max')
ranked_d.head()

In [None]:
def lambda_func(row):
    """Attach `rank_per_day` to a row.

    The list counts 1, 2, ..., ranked_d and is then padded with the value
    ranked_d until it has max_ranked_d entries.
    """
    ranked_today = row['ranked_d']
    padding_length = row['max_ranked_d'] - ranked_today

    per_rank = list(range(1, ranked_today + 1))
    per_rank.extend([ranked_today] * padding_length)

    row['rank_per_day'] = per_rank
    return row

# Build the per-day rank lists, then add them position by position within
# each facility so that entry k-1 holds the total over days for cutoff k.
ranked_with_lists = ranked_d.apply(lambda_func, axis=1)
per_facility = ranked_with_lists.groupby(['client', 'facilityid'])
grouped_ranked_d = per_facility.apply(
    lambda days: [sum(same_position) for same_position in zip(*days['rank_per_day'])]
).reset_index(name='rank_per_day')
grouped_ranked_d.head()


##### calculating clientwise precision denominator

In [None]:

def lambda_func(row):
    """Attach `clientwise_rank_per_day` to a row.

    The list counts 1, 2, ..., ranked_d and is then padded with the value
    ranked_d until it has clientwise_max_ranked_d entries.
    """
    ranked_today = row['ranked_d']
    padding_length = row['clientwise_max_ranked_d'] - ranked_today

    per_rank = list(range(1, ranked_today + 1))
    per_rank.extend([ranked_today] * padding_length)

    row['clientwise_rank_per_day'] = per_rank
    return row

# Same construction as the facility-level denominator, but padded to the
# client's largest ranked_d and summed over all rows of the client.
client_max = ranked_d.groupby(['client'])['ranked_d'].transform('max')
ranked_d['clientwise_max_ranked_d'] = client_max
ranked_d = ranked_d.apply(lambda_func, axis=1)

per_client = ranked_d.groupby(['client'])
clientwise_grouped_ranked_d = per_client.apply(
    lambda days: [sum(same_position) for same_position in zip(*days['clientwise_rank_per_day'])]
).reset_index(name='clientwise_rank_per_day')
clientwise_grouped_ranked_d.head()


##### calculating precision

In [None]:
def func(row):
    """Attach `min_k_ranked_d`: the facility's `rank_per_day` entry for this rank.

    Looks up the row's (client, facilityid) in the global `grouped_ranked_d`
    and takes element `rank_cutoff - 1` of its `rank_per_day` list.
    """
    same_facility = (
        (grouped_ranked_d['client'] == row['client'])
        & (grouped_ranked_d['facilityid'] == row['facilityid'])
    )
    rank_totals = grouped_ranked_d.loc[same_facility, 'rank_per_day'].values[0]
    row['min_k_ranked_d'] = rank_totals[row['rank_cutoff'] - 1]
    return row


# Precision@k per facility = ranked rows with rank <= k / min_k_ranked_d.
precision_df = numerator_precision_df.apply(func, axis=1)
hits_up_to_k = precision_df['rank_lessthan_equalto_k']
precision_df['precision_at_k'] = hits_up_to_k / precision_df['min_k_ranked_d']
precision_df.head()

##### calculating clientwise precision

In [None]:
def func(row):
    """Attach `min_k_ranked_d`: the client's `clientwise_rank_per_day` entry for this rank.

    Looks up the row's client in the global `clientwise_grouped_ranked_d` and
    takes element `rank_cutoff - 1` of its list. When the client is absent,
    the rank lies beyond the list, or the rank is not an integer, the value
    is set to NaN instead.
    """
    try:
        same_client = clientwise_grouped_ranked_d['client'] == row['client']
        rank_totals = clientwise_grouped_ranked_d.loc[same_client, 'clientwise_rank_per_day'].values[0]
        row['min_k_ranked_d'] = rank_totals[row['rank_cutoff'] - 1]
    except (IndexError, TypeError):
        # Catch only the lookup failures. A bare `except:` would also hide
        # KeyboardInterrupt and genuine programming errors.
        row['min_k_ranked_d'] = np.nan
    return row


# Precision@k per client. Rows whose denominator lookup failed keep NaN in
# min_k_ranked_d and are not filtered out here.
clientwise_precision_df = clientwise_numerator_precision_df.apply(func, axis=1)
hits_up_to_k = clientwise_precision_df['rank_lessthan_equalto_k']
clientwise_precision_df['precision_at_k'] = hits_up_to_k / clientwise_precision_df['min_k_ranked_d']
clientwise_precision_df.head()

In [None]:
# Row/column counts of the two frames that are merged next.
(recall_df.shape, precision_df.shape)

In [None]:
# Join recall and precision on facility and rank, then export to CSV.
recall_keys = ['client', 'facilityname', 'facilityid', 'min_rank']
precision_keys = ['client', 'facilityname', 'facilityid', 'rank_cutoff']

precision_recall_at_k = recall_df.merge(
    precision_df, how='inner', left_on=recall_keys, right_on=precision_keys
)
output_path = f'precision_recall_at_k_{START_DATE}_{END_DATE}.csv'
precision_recall_at_k.to_csv(output_path, index=False)
precision_recall_at_k.head()

In [None]:
# Join client-level recall and precision on client and rank, then export to CSV.
clientwise_precision_recall_at_k = clientwise_recall_df.merge(
    clientwise_precision_df,
    how='inner',
    left_on=['client', 'min_rank'],
    right_on=['client', 'rank_cutoff'],
)
output_path = f'clientwise_precision_recall_at_k_{START_DATE}_{END_DATE}.csv'
clientwise_precision_recall_at_k.to_csv(output_path, index=False)
clientwise_precision_recall_at_k.head()