## This notebook compares test-set performance of the models trained under different experiment IDs

In [None]:
import pandas as pd
import pickle
from pathlib import Path
import timeit
import boto3
import subprocess
s3 = boto3.resource('s3')
import gc
import pickle
from sklearn.metrics import roc_auc_score
import os

In [None]:
from dataclasses import dataclass
from typing import Any

@dataclass
class BaseModel:
    """Thin wrapper that gives differently-typed base models one ``predict`` call.

    ``model_type`` selects how the wrapped ``model`` is invoked:

    * ``"rf"``  -> ``model.predict_proba(x)[:, 1]`` (second column of the
      probability output).
    * ``"lgb"`` -> ``model.predict(x)``.
    """

    # Human-readable identifier of the wrapped model.
    model_name: str
    # Dispatch key used by ``predict``; "rf" and "lgb" are supported.
    model_type: str
    # The underlying fitted model object.
    model: Any

    def predict(self, x) -> Any:
        """Return the wrapped model's scores for ``x``.

        Args:
            x: Feature matrix passed straight through to the wrapped model.

        Returns:
            Whatever the wrapped model returns for the selected call (one
            score per input row for the supported model types).

        Raises:
            NotImplementedError: If ``model_type`` is not "rf" or "lgb".
        """
        if self.model_type == "rf":
            return self.model.predict_proba(x)[:, 1]
        if self.model_type == "lgb":
            return self.model.predict(x)
        # Name the offending type so a mis-tagged model is easy to track down.
        raise NotImplementedError(
            f"predict is not implemented for model_type={self.model_type!r}"
        )

In [None]:
# reading the test dataset.

# Directory holding the pickled test-set artifacts (created if it is missing).
processed_path = Path('/data/processed')
processed_path.mkdir(parents=True, exist_ok=True)

# Test features.
with (processed_path / '03-test_x.pickle').open('rb') as f:
    test_x = pickle.load(f)

# Test targets for the 3-day and 7-day files.
with (processed_path / '03-test_target_3_day.pickle').open('rb') as f:
    test_target_3_day = pickle.load(f)
with (processed_path / '03-test_target_7_day.pickle').open('rb') as f:
    test_target_7_day = pickle.load(f)

# Identifier columns that accompany the test rows.
with (processed_path / '03-test_idens.pickle').open('rb') as f:
    test_idens = pickle.load(f)

# NA-fill values stored alongside the processed data.
with (processed_path / '03-na_filler.pickle').open('rb') as f:
    na_filler = pickle.load(f)

* Fill `experiment_id_name_dict` with each experiment ID and its corresponding experiment name.
* The models of every listed experiment ID will be evaluated against the test dataset.

In [None]:
# Maps experiment id (string, also the directory name under
# /data/model_comparison) -> experiment name. The name is used in the log
# output and to label the result columns and the output CSV file.
# Fill in at least one entry before running the cells below, e.g.:
experiment_id_name_dict ={
#     '60': 'meridian_less_data',
#     '63': 'meridian_snf-and-alf_test',

}

In [None]:
def precision_recall_at_k(group):
    """Add cumulative-hit and recall columns to one ranked group of rows.

    The rows are processed in the order given, so the caller is expected to
    have sorted them by rank beforehand. Despite the name, only recall is
    computed here.

    Args:
        group: DataFrame with a ``hospitalized_within_pred_range`` column.
            It is not modified; the result is built on a copy.

    Returns:
        A new DataFrame with a fresh 0..n-1 index and three extra columns:
        ``hospitalized_cumsum`` (running sum of the target column),
        ``total_relevant`` (sum of the target column over the whole group) and
        ``recall_at_k`` (their ratio; NaN when the group has no positive rows).
    """
    hospitalized = group.hospitalized_within_pred_range
    # ``assign`` returns a copy, so the caller's frame (often a groupby slice)
    # is never written to.
    result = group.assign(
        hospitalized_cumsum=hospitalized.cumsum(),
        total_relevant=hospitalized.sum(),
    )
    result["recall_at_k"] = result.hospitalized_cumsum / result.total_relevant

    return result.reset_index(drop=True)

* Uncomment the command in the cell below.
* Manually change the experiment ID in the command and run it once for every key in `experiment_id_name_dict`.


In [None]:
# !aws s3 cp 's3://saiva-models/63' /data/model_comparison/63/  --recursive --include=".pickle"

In [None]:
#     function fills the na of the testset with the na_filler.pickle of the models.
def fill_na(df, path, model_id):
    """Fill missing values in ``df`` using a model's own NA-filler artifact.

    Args:
        df: DataFrame whose missing values should be filled.
        path: Directory that contains one sub-directory per model id.
        model_id: Name of the model sub-directory; the filler is read from
            ``<path>/<model_id>/artifacts/na_filler.pickle``.

    Returns:
        The result of ``df.fillna`` with the unpickled filler values.
    """
    filler_file = f'{path}/{model_id}/artifacts/na_filler.pickle'
    with open(filler_file, 'rb') as handle:
        model_na_filler = pickle.load(handle)
    filled = df.fillna(model_na_filler)
    return filled


In [None]:
# function is replica of the 'objective' function of 04 notebook.
# here it is run on test dataset.

def main_testing_function(experiment_id, df):
    """Score every downloaded model of one experiment against the test set.

    Relies on the notebook-level globals ``experiment_id_name_dict``,
    ``test_target_3_day`` and ``test_idens`` and on the helpers ``fill_na``
    and ``precision_recall_at_k``.

    Args:
        experiment_id: Key of ``experiment_id_name_dict``; also the name of the
            directory under ``/data/model_comparison`` that holds one
            sub-directory per model.
        df: Test feature frame. A copy is taken per model, so it is not
            modified.

    Returns:
        A list of ``[model_id, facilityid, auc_roc_score,
        agg_recall_at_15_percent]`` rows, one per (model, facility) pair for
        which both metrics could be computed.
    """
    print(f'***********Processing for {experiment_id_name_dict[experiment_id]}******************')
    processed_path = Path(f'/data/model_comparison/{experiment_id}/')
    processed_path.mkdir(parents=True, exist_ok=True)
    model_ids = os.listdir(processed_path)
    print('model_ids----> ',model_ids)
    print('processed_path----> ',processed_path)

    # The facility list does not depend on the model, so compute it once.
    facility_ids = sorted(test_idens.facilityid.unique())

    dataframe_list = []
    for model_id in model_ids:
        test_x = df.copy()
        print('model_id--->', model_id)
        # NOTE: unpickling executes arbitrary code; only load trusted artifacts.
        with open(f'{processed_path}/{model_id}/artifacts/{model_id}.pickle','rb') as f:
            model = pickle.load(f)

        # Report the actual number of missing cells before and after filling
        # (the mean of the feature values says nothing about nulls).
        print('total_null_in_test---->', test_x.isna().sum().sum())
        test_x = fill_na(test_x, processed_path, model_id)
        print('total_null_in_test_after fill na---->', test_x.isna().sum().sum())

        test_preds = model.predict(test_x)
        total_test_aucroc = roc_auc_score(test_target_3_day, test_preds)
        print('total_test_aucroc---->', total_test_aucroc)

        test_base = test_idens.copy()
        test_base['predictionvalue'] = test_preds
        test_base['hospitalized_within_pred_range'] = test_target_3_day
        # Rank 1 = highest predicted value within each (censusdate, facility).
        test_base['predictionrank'] = test_base.groupby(['censusdate', 'facilityid']).predictionvalue.rank(ascending=False)
        # Sort by rank so the per-group cumulative sums run from best to worst.
        test_base = test_base.sort_values('predictionrank', ascending=True)

        performance_base = (
            test_base.groupby(["facilityid", "censusdate"])
            .apply(precision_recall_at_k)
            .reset_index(drop=True)
        )

        # Per facility: median over census dates of the largest rank seen that day.
        facility_pats = (
            performance_base.groupby(['censusdate', 'facilityid'])
            .predictionrank.max()
            .reset_index()
            .groupby('facilityid')
            .predictionrank.median()
            .reset_index()
        )

        for facilityid in facility_ids:
            mask = (test_idens.facilityid == facilityid).values
            # Rank cut-off at 15% of the facility's median daily maximum rank.
            k_at_15_percent = round(facility_pats.loc[facility_pats.facilityid == facilityid].predictionrank * .15).values[0]

            rank_subset = performance_base.loc[(performance_base.facilityid==facilityid)]

            try:
                # Rows sitting exactly at the cut-off rank, across all census dates.
                rows_at_k = rank_subset.loc[rank_subset.predictionrank == k_at_15_percent]
                agg_recall_at_15_percent = (
                    rows_at_k.hospitalized_cumsum.sum() / rows_at_k.total_relevant.sum()
                )
                auc_roc_score = roc_auc_score(test_target_3_day[mask], test_preds[mask])

                dataframe_list.append([model_id, facilityid, auc_roc_score, agg_recall_at_15_percent])
            except Exception as e:
                # workaround for infinity-benchmark because you cannot calculate facility level
                # metric for one facility.  This workaround will just skip calculating that
                # facility level metric - it will print the exception, but continue
                print(f'skipping facility {facilityid} for model {model_id}: {e}')
                continue
    return dataframe_list

In [None]:
# The above function is being ran for each model of each experiment-id.
for experiment_id, experiment_name in experiment_id_name_dict.items():
    # One row per (model, facility) for this experiment.
    dataframe_list = main_testing_function(experiment_id, test_x)

    # Metric columns are prefixed with the experiment name.
    result_columns = [
        'modelid',
        'facilityid',
        f'{experiment_name}_auc_roc_score',
        f'{experiment_name}_recall_at_15',
    ]
    df = pd.DataFrame(dataframe_list, columns=result_columns)
    df = df.sort_values(by=['facilityid'])

    # Written to the current working directory as "<experiment name>.csv".
    df.to_csv(f'{experiment_name}.csv', index=False)
