[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/anawatbk/hospital-capacity-forecasting/blob/main/final_dev.ipynb)

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Group-Name" data-toc-modified-id="Group-Name-2">Group Name</a></span></li><li><span><a href="#Student-Names" data-toc-modified-id="Student-Names-3">Student Names</a></span></li><li><span><a href="#Load-Data" data-toc-modified-id="Load-Data-4">Load Data</a></span></li><li><span><a href="#Fit-scikit-learn-model" data-toc-modified-id="Fit-scikit-learn-model-5">Fit scikit-learn model</a></span></li><li><span><a href="#Evaluation-Metric" data-toc-modified-id="Evaluation-Metric-6">Evaluation Metric</a></span></li></ul></div>

Name: Anawat Putwanphen
----------

Research Question / Hypothesis
----
California COVID-19 Hospital bed capacity forecasting (weekly average).



Load Data
-----

COVID-19 Reported Patient Impact and Hospital Capacity by Facility by Department of Health and Human Services
<br>
https://healthdata.gov/dataset/covid-19-reported-patient-impact-and-hospital-capacity-facility

In [221]:
import pandas as pd
import numpy as np

from sklearn.compose            import *
from sklearn.preprocessing      import *
from sklearn.base import TransformerMixin, BaseEstimator, clone
from sklearn.impute import *
from sklearn.pipeline import Pipeline
from imblearn.pipeline import make_pipeline 

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Lasso, SGDRegressor
from sklearn.impute import *
from sklearn.metrics import *
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.multioutput import MultiOutputRegressor

In [51]:
# Load Data
data_source_path = '../data/reported_hospital_capacity_admissions_facility_level_weekly_average_timeseries_20210228.csv'
df = pd.read_csv(data_source_path, parse_dates = ['collection_week'])\
       .query('state == "CA"').sort_values(by=['hospital_pk', 'collection_week'])

In [52]:
def pre_pipeline_preprocessing(df):
    '''
    All Preprocessing process which are not supported by Pipeline
    1. drop rows of inconsistent covid cases report,
       For example, covid cases > combined cases (which consist of non-covid + covid cases)
    2. select rows of hospital who report covid cases in the last 4 months
       (2020-11-06 until 2021-02-19) 
    '''
    # Drop all inconsistent covid cases report rows 
    inconsistent_hospital_pk = df[(
        (df['inpatient_beds_used_7_day_sum'] / df['inpatient_beds_used_7_day_sum']) <
        (df['total_pediatric_patients_hospitalized_confirmed_and_suspected_covid_7_day_sum']
            / df['total_pediatric_patients_hospitalized_confirmed_and_suspected_covid_7_day_coverage'])
        )]['hospital_pk'].unique()
    df = df[~np.isin(df['hospital_pk'], inconsistent_hospital_pk)]

    # keep only the hospitals who reported capacity every week for the last 4 month (2020-11-06 until 2021-02-19)
    max_week_count = df.loc[df['collection_week'] > '2020-11-01']\
                       .groupby('hospital_pk').count().max()['collection_week']
    hospital_pk_array = df.loc[df['collection_week'] > '2020-11-01']\
                          .groupby('hospital_pk').count().index.values
    complete_hostital_mask = (df.loc[df['collection_week'] > '2020-11-01']\
                                .groupby('hospital_pk').count()['collection_week'] == max_week_count)
    hospital_pk_array = hospital_pk_array[complete_hostital_mask]
    df = df[np.isin(df['hospital_pk'], hospital_pk_array)].copy()
    return df
    
def pre_pipeline_generate_multi_step_y(df):
    '''
    generate  1 month-ahead (4 step-ahead) output target.
    
    y | y+1 | y+2 | y+3
    '''
    
    # Transform target y into 4-step-ahead
    y = df[['hospital_pk', 'inpatient_beds_used_7_day_sum', 
            'inpatient_beds_used_7_day_coverage']].copy()
    y['inpatient_bed_used'] = (y['inpatient_beds_used_7_day_sum'] / 
                               y['inpatient_beds_used_7_day_coverage'])
    y['inpatient_bed_used'] = y['inpatient_bed_used'].fillna(0)
    y.loc[y['inpatient_bed_used'] < 0, 'inpatient_bed_used'] = 0 

    step_ahead = 3 # y_t, y_t+1, y_t+2, y_t+3
    step_backward = 4
    col = 'inpatient_bed_used'
    for step in range(1, step_ahead+1):
        y = y.assign(**{f'{col}+{step}': y.groupby('hospital_pk').shift(-step)[col]})
    ##for step in range(1, step_backward+1):
    ##    y = y.assign(**{f'{col}-{step}': y.groupby('hospital_pk').shift(step)[col]})
    # Drop last 4 week of each hospital 
    y = y.drop(['inpatient_beds_used_7_day_sum', 
                'inpatient_beds_used_7_day_coverage', 'hospital_pk'], 
               axis=1)
    no_target_week_mask = y.isnull().sum(axis=1) > 0
    df = df[~no_target_week_mask].copy()
    y = y.dropna()
    y = y[['inpatient_bed_used', 'inpatient_bed_used+1', 
           'inpatient_bed_used+2', 'inpatient_bed_used+3']]
    df['inpatient_bed_used'] = y['inpatient_bed_used']
    return df.reset_index(drop=True), y.reset_index(drop=True)

In [53]:
class CreateCalculatedColumns(BaseEstimator, TransformerMixin):
    """ A DataFrame transformer that provides column selection
    
    Allows to select columns by name from pandas dataframes in scikit-learn
    pipelines.
    
    Parameters
    ----------
    columns : list of str, names of the dataframe columns to select
        Default: [] 
    
    """
    def __init__(self):
        pass

    def transform(self, X, **transform_params):
        """ Selects columns of a DataFrame
        
        Parameters
        ----------
        X : pandas DataFrame
            
        Returns
        ----------
        
        trans : pandas DataFrame
            contains selected columns of X      
        """
        self.before_shape = X.shape
        # Inpatient Bed Capacity
        X['inpatient_bed_capacity'] = X['inpatient_beds_7_day_sum'] / X['inpatient_beds_7_day_coverage']
        # ICU Bed Capacity
        X['icu_bed_capacity'] = X['total_icu_beds_7_day_sum'] / X['total_icu_beds_7_day_coverage']
        # Inpatient Bed Used
        X['inpatient_bed_used'] = X['inpatient_beds_used_7_day_sum'] / X['inpatient_beds_used_7_day_coverage']
        # Adult covid inpatient   
        X['adult_inpatients_confirmed_n_suspected_covid'] = (
            X['total_adult_patients_hospitalized_confirmed_and_suspected_covid_7_day_sum'] 
             / X['total_adult_patients_hospitalized_confirmed_and_suspected_covid_7_day_coverage'])
        # Pediatric covid inpatient   
        X['pediatric_inpatrients_confirmed_n_suspected_covid'] = (
            X['total_pediatric_patients_hospitalized_confirmed_and_suspected_covid_7_day_sum']
            / X['total_pediatric_patients_hospitalized_confirmed_and_suspected_covid_7_day_coverage'])
        # total covid inpatient (adult+pediatric)
        X['total_inpatients_confirmed_n_suspected_covid'] = (
            X['adult_inpatients_confirmed_n_suspected_covid'] 
            + X['pediatric_inpatrients_confirmed_n_suspected_covid'])
        # elimanates -999999 error from data source
        X.loc[X['total_inpatients_confirmed_n_suspected_covid'] < 0, 'total_inpatients_confirmed_n_suspected_covid'] = 0
        X.loc[X['inpatient_bed_used'] < 0, 'inpatient_bed_used'] = 0
        # Average Inpatient Bed Used
        avg_inpatient_bed_used = X.groupby('hospital_pk').mean()[['inpatient_bed_used']]\
                                  .rename(columns={'inpatient_bed_used':'avg_inpatient_bed_used'})
        X = X.join(avg_inpatient_bed_used, on='hospital_pk')
        trans = X.copy() 
        self.after_shape = trans.shape
        return trans

    def fit(self, X, y=None, **fit_params):
        """ Do nothing function
        
        Parameters
        ----------
        X : pandas DataFrame
        y : default None
                
        
        Returns
        ----------
        self  
        """
        return self

In [54]:
class SelectColumnsTransfomer(BaseEstimator, TransformerMixin):
    """ A DataFrame transformer that provides column selection
    
    Allows to select columns by name from pandas dataframes in scikit-learn
    pipelines.
    
    Parameters
    ----------
    columns : list of str, names of the dataframe columns to select
        Default: [] 
    
    """
    def __init__(self, columns=[], feature=False):
        self.columns = columns
        self.feature = feature
    def transform(self, X, **transform_params):
        """ Selects columns of a DataFrame
        
        Parameters
        ----------
        X : pandas DataFrame
            
        Returns
        ----------
        
        trans : pandas DataFrame
            contains selected columns of X      
        """
        if self.feature:
            X = X.drop(self.columns, axis=1)
            return X
        else: 
            X = X[self.columns].copy() 
            return X

    def fit(self, X, y=None, **fit_params):
        """ Do nothing function
        
        Parameters
        ----------
        X : pandas DataFrame
        y : default None
                
        
        Returns
        ----------
        self  
        """
        return self

In [156]:
class GenerateLagValues(BaseEstimator, TransformerMixin):
    """ A DataFrame transformer that provides column selection
    
    Allows to select columns by name from pandas dataframes in scikit-learn
    pipelines.
    
    Parameters
    ----------
    columns : list of str, names of the dataframe columns to select
        Default: [] 
    
    """
    def __init__(self, columns=[], lags=4):
        self.columns = columns
        self.lags = lags
    def transform(self, X, **transform_params):
        """ Selects columns of a DataFrame
        
        Parameters
        ----------
        X : pandas DataFrame
            
        Returns
        ----------
        
        trans : pandas DataFrame
            contains selected columns of X      
        """
        self.before_shape = X.shape
        
        for col in self.columns:
            for lag in range(1, self.lags+1):
                X = X.assign(**{f'{col}-{lag}': X.groupby('hospital_pk').shift(lag)[col]})
        
        for col in self.columns:
            X = X.assign(delta_4 =  X[f'{col}-{1}'] - X[f'{col}-{self.lags}'])
        return X.drop(self.columns, axis=1)

    def fit(self, X, y=None, **fit_params):
        """ Do nothing function
        
        Parameters
        ----------
        X : pandas DataFrame
        y : default None
                
        
        Returns
        ----------
        self  
        """
        return self

In [56]:
class GroupByImputer(BaseEstimator, TransformerMixin):
    '''
    using median of group to impute (hospital_pk)
    fill by 0 if all values in the group are Nan
    '''
    def __init__(self, group_column, targets=[]):
        self.group_column = group_column
        self.targets = targets
    
    def fit(self, X, y=None):
        
        impute_map = X.groupby(self.group_column)[self.targets].median().reset_index(drop=False)
        self.impute_map_ = impute_map
        
        return self 
    
    def transform(self, X, y=None):

        X = X.copy()
        
        for index, row in self.impute_map_.iterrows(): # loop through each hospital
            group_mask = row[self.group_column] == X[self.group_column]
            for col in self.targets:
                X.loc[group_mask, col] = X.loc[group_mask, col].fillna(row[col])
        X[self.targets] = X[self.targets].fillna(0)
        return X

In [70]:
class DummyEstimator(BaseEstimator):
    def fit(self): pass
    def score(self): pass

## Preprocessing

In [63]:
relevant_columns = [
    'hospital_pk', 'collection_week', 'hospital_subtype', 'is_metro_micro',
    'inpatient_bed_capacity', 'icu_bed_capacity', 
    'inpatient_bed_used', 'avg_inpatient_bed_used',
    'total_inpatients_confirmed_n_suspected_covid',
    ]

non_feature_columns = ['hospital_pk', 'collection_week', 
                       'inpatient_bed_capacity', 'icu_bed_capacity']

time_dependant_columns = ['inpatient_bed_used', 'total_inpatients_confirmed_n_suspected_covid']

df = pre_pipeline_preprocessing(df)
X_original, y_original = pre_pipeline_generate_multi_step_y(df)

## Time Series train/test Split

In [121]:
def time_series_split(X, y):
    test_start_date =  X.collection_week.max()
    train_last_date  = test_start_date - pd.to_timedelta(4,unit='W')    
    test_idxs = X.loc[X.collection_week >= test_start_date].index
    train_idx = X.loc[X.collection_week <= train_last_date].index
    X_test, y_test = X.loc[test_idxs], y.loc[test_idxs]
    X_train, y_train = X.loc[train_idx], y.loc[train_idx]
    return X_train, y_train, X_test, y_test

def time_series_cv(X, y):
    '''
    Time Series cross validation
    '''
    X = X.reset_index(drop=True)
    valid_start_date =  X.collection_week.max()
    train_last_date  = valid_start_date - pd.to_timedelta(4,unit='W') 
    valid_idxs = X.loc[X.collection_week >= valid_start_date].index
    train_idx = X.loc[X.collection_week <= train_last_date].index
    return [tuple([list(train_idx), list(valid_idxs)])]

In [65]:
X_train, y_train, X_test, y_test = time_series_split(X_original, y_original)

# Fit scikit-learn model



1. RandomForestRegressor (Multi-output)
2. GradientBoostingRegressor  (Multi-output)

In [166]:
feature_cat_cols = ['hospital_subtype', 'is_metro_micro']
feature_con_cols = ['inpatient_bed_used-1', 'inpatient_bed_used-2',
                    'inpatient_bed_used-3', 'inpatient_bed_used-4',
                    'total_inpatients_confirmed_n_suspected_covid-1',
                    'total_inpatients_confirmed_n_suspected_covid-2',
                    'total_inpatients_confirmed_n_suspected_covid-3',
                    'total_inpatients_confirmed_n_suspected_covid-4',
                    'avg_inpatient_bed_used', 'delta_4']

con_pipe = Pipeline([('scaler', StandardScaler())]) 

cat_pipe = Pipeline([('imputer', SimpleImputer(strategy="most_frequent", add_indicator=True)),
                     ('ohe', OneHotEncoder())])

to_feature = ColumnTransformer([('continuous',  con_pipe, feature_con_cols),
                                ('categorical', cat_pipe, feature_cat_cols)])

# Final pipeline
pipeline = Pipeline([('calculateColumns', CreateCalculatedColumns()),
                             ('selectColumns_1', SelectColumnsTransfomer(relevant_columns)),
                             ('createTimelag', GenerateLagValues(time_dependant_columns)),
                             ('custom_imputer', GroupByImputer('hospital_pk', targets=feature_con_cols)),
                             ('selectColumns_2', SelectColumnsTransfomer(non_feature_columns, feature=True)),
                             ('finalProcessing', to_feature),
                             ('model', DummyEstimator)])
                             


In [217]:
# Search Best Models
cv_time_series = time_series_cv(X_train, y_train)
hyperparameters = [{'model': [Lasso(max_iter=3000)],
                    'model__alpha': np.arange(0.001,11,10)},
                   
                   {'model': [SGDRegressor(max_iter=3000, early_stopping=True)],
                    'model__loss': ['squared_loss', 'huber'],
                    'model__penalty': ['l2', 'elasticnet'],
                    'model__alpha': np.arange(0.001,11,10)},
                   
                   {'model': [RandomForestRegressor()],
                    'model__n_estimators': np.arange(25, 301, 25),
                    'model__max_depth': np.arange(5,31,5),
                    'model__min_samples_leaf': np.arange(1,16,3)},
    
                    {'model': [GradientBoostingRegressor()],
                     'model__loss': ['ls', 'huber'],
                     'model__n_estimators': np.arange(25,301,25),
                     'model__max_depth': np.arange(5,31,5),
                     'model__subsample': [0.8,0.9,1.0],
                     'model__min_samples_leaf': np.arange(1,16,3)}]

regr_rand_cv = RandomizedSearchCV(estimator=pipeline, 
                              param_distributions=hyperparameters, 
                              n_iter=30, 
                              cv=cv_time_series, 
                              scoring='neg_mean_absolute_error',
                              n_jobs=-1,
                              verbose=True)

In [218]:
regr_rand_cv.fit(X_train, y_train)

Fitting 1 folds for each of 30 candidates, totalling 30 fits


          nan          nan          nan          nan          nan
          nan          nan          nan          nan          nan
          nan          nan          nan          nan          nan
          nan          nan          nan          nan -12.96169515
          nan          nan          nan          nan          nan]


RandomizedSearchCV(cv=[([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                         16, 17, 18, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, ...],
                        [22, 45, 68, 91, 114, 137, 160, 183, 206, 229, 252, 275,
                         298, 321, 344, 367, 390, 413, 436, 459, 482, 505, 528,
                         551, 574, 597, 620, 643, 666, 689, ...])],
                   estimator=Pipeline(steps=[('calculateColumns',
                                              CreateCalculatedColumns()),
                                             ('selectColumns_1',
                                              SelectColumnsTransfomer(...
                                         'model__n_estimators': array([ 25,  50,  75, 100, 125, 150, 175, 200, 225, 250, 275, 300])},
                                        {'model': [GradientBoostingRegressor()],
                                         'model__loss': ['ls', 'huber'],
                                      

Evaluation Metric
----

In [219]:
regr_rand_cv.best_params_

{'model__n_estimators': 150,
 'model__min_samples_leaf': 13,
 'model__max_depth': 30,
 'model': RandomForestRegressor(max_depth=30, min_samples_leaf=13, n_estimators=150)}

In [220]:
y_pred = regr_rand_cv.best_estimator_.predict(X_test)
mean_absolute_error(y_test.values, y_pred, multioutput='uniform_average')

9.852630000668665