<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Final-Project-Check-in" data-toc-modified-id="Final-Project-Check-in-1">Final Project Check-in</a></span></li><li><span><a href="#Group-Name" data-toc-modified-id="Group-Name-2">Group Name</a></span></li><li><span><a href="#Student-Names" data-toc-modified-id="Student-Names-3">Student Names</a></span></li><li><span><a href="#Load-Data" data-toc-modified-id="Load-Data-4">Load Data</a></span></li><li><span><a href="#Fit-scikit-learn-model" data-toc-modified-id="Fit-scikit-learn-model-5">Fit scikit-learn model</a></span></li><li><span><a href="#Evaluation-Metric" data-toc-modified-id="Evaluation-Metric-6">Evaluation Metric</a></span></li></ul></div>

Final Project Check-in
------

Name: Anawat Putwanphen
----------





Research Question / Hypothesis
----
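Forecasting hospital bed usage (weekly averages) for California hospitals during COVID-19: given a hospital's recent weekly reports, predict its inpatient bed usage for the current week and the following three weeks.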



Load Data
-----

In [32]:
import pandas as pd
import numpy as np
from sklearn.base import TransformerMixin, BaseEstimator, clone
from sklearn.impute import *
from imblearn.pipeline import make_pipeline #
from sklearn.pipeline import Pipeline



In [33]:
# Load Data
data_source_path = '../data/reported_hospital_capacity_admissions_facility_level_weekly_average_timeseries_20210228.csv'
df = pd.read_csv(data_source_path, parse_dates = ['collection_week'])\
       .query('state == "CA"').sort_values(by=['hospital_pk', 'collection_week'])

In [34]:
def pre_pipeline_preprocessing(df, recent_start='2020-11-01'):
    '''
    Row-dropping preprocessing that cannot be expressed as a Pipeline step.

    1. Drop every hospital that has at least one inconsistent report, i.e. a
       week whose average pediatric COVID inpatients (confirmed + suspected)
       exceeds the average inpatient beds used (COVID + non-COVID combined).
    2. Keep only the hospitals that reported in every recent week. A hospital
       is "complete" when its number of reports after ``recent_start`` equals
       the largest such count of any hospital. With the default cut-off this
       covers the last 4 months of the data (2020-11-06 until 2021-02-19).

    Parameters
    ----------
    df : pandas DataFrame
        Must contain 'hospital_pk', 'collection_week' and the
        '*_7_day_sum' / '*_7_day_coverage' columns used below.
    recent_start : str or datetime-like, default '2020-11-01'
        Only weeks strictly after this date count towards completeness.

    Returns
    -------
    pandas DataFrame
        A copy of the rows of ``df`` belonging to the remaining hospitals.
    '''
    ### Drop all hospitals with an inconsistent covid cases report
    # Weekly sums are divided by their coverage (number of reporting days) to
    # obtain per-day averages. The denominator of the first ratio must be the
    # coverage column; dividing the sum by itself always yields 1.
    beds_used_avg = (df['inpatient_beds_used_7_day_sum']
                     / df['inpatient_beds_used_7_day_coverage'])
    pediatric_covid_avg = (
        df['total_pediatric_patients_hospitalized_confirmed_and_suspected_covid_7_day_sum']
        / df['total_pediatric_patients_hospitalized_confirmed_and_suspected_covid_7_day_coverage'])
    # Comparisons involving NaN are False, so rows with missing data are not flagged
    inconsistent_hospital_pk = df.loc[beds_used_avg < pediatric_covid_avg,
                                      'hospital_pk'].unique()
    df = df[~df['hospital_pk'].isin(inconsistent_hospital_pk)]

    #### Keep only the hospitals that reported every week after `recent_start`
    weeks_reported = (df.loc[df['collection_week'] > recent_start]
                        .groupby('hospital_pk')['collection_week']
                        .count())
    complete_hospital_pk = weeks_reported[weeks_reported == weeks_reported.max()].index
    return df[df['hospital_pk'].isin(complete_hospital_pk)].copy()
    
def pre_pipeline_generate_multi_step_y(df):
    '''
    Build the multi-step forecasting target (current week plus 3 weeks ahead).

    Output columns of y:  y | y+1 | y+2 | y+3

    The per-day average 'inpatient_bed_used' is sum / coverage, with missing
    values and negative values replaced by 0. Rows that lack any of the 3
    following weeks or any of the 4 preceding weeks within the same hospital
    are removed from both the returned features and the returned target.

    Returns
    -------
    (df, y) : tuple of pandas DataFrame
        Both re-indexed from 0; ``df`` additionally carries the cleaned
        'inpatient_bed_used' column.
    '''
    target = 'inpatient_bed_used'
    n_ahead = 3      # y_t+1, y_t+2, y_t+3
    n_backward = 4   # history each kept row must have

    work = df[['hospital_pk', 'inpatient_beds_used_7_day_sum',
               'inpatient_beds_used_7_day_coverage']].copy()
    daily_avg = (work['inpatient_beds_used_7_day_sum']
                 / work['inpatient_beds_used_7_day_coverage'])
    work[target] = daily_avg.fillna(0)
    work.loc[work[target] < 0, target] = 0

    # Shift within each hospital so values never leak across hospitals
    per_hospital = work.groupby('hospital_pk')[target]
    shifted = {f'{target}+{k}': per_hospital.shift(-k)
               for k in range(1, n_ahead + 1)}
    shifted.update({f'{target}-{k}': per_hospital.shift(k)
                    for k in range(1, n_backward + 1)})
    work = work.assign(**shifted)

    work = work.drop(columns=['inpatient_beds_used_7_day_sum',
                              'inpatient_beds_used_7_day_coverage',
                              'hospital_pk'])

    # A NaN in any lead/lag column means the row is too close to the start or
    # the end of its hospital's series
    incomplete_rows = work.isnull().any(axis=1)
    df = df[~incomplete_rows].copy()
    y = work.dropna()[[target, f'{target}+1', f'{target}+2', f'{target}+3']]
    df[target] = y[target]
    return df.reset_index(drop=True), y.reset_index(drop=True)

In [50]:
class CreateCalculatedColumns(BaseEstimator, TransformerMixin):
    """ A DataFrame transformer that adds per-day average columns

    Each weekly ``*_7_day_sum`` column is divided by its matching
    ``*_7_day_coverage`` column. The following columns are added:

    - inpatient_bed_capacity
    - icu_bed_capacity
    - inpatient_bed_used
    - adult_inpatients_confirmed_n_suspected_covid
    - pediatric_inpatrients_confirmed_n_suspected_covid
    - total_inpatients_confirmed_n_suspected_covid (adult + pediatric)

    The transformer has no parameters and learns nothing in ``fit``.
    """
    def __init__(self):
        pass

    @staticmethod
    def _daily_average(frame, prefix):
        """ Return ``<prefix>_7_day_sum / <prefix>_7_day_coverage`` """
        return frame[f'{prefix}_7_day_sum'] / frame[f'{prefix}_7_day_coverage']

    def transform(self, X, **transform_params):
        """ Adds the calculated columns to a copy of X

        Parameters
        ----------
        X : pandas DataFrame
            must contain the ``*_7_day_sum`` / ``*_7_day_coverage`` columns

        Returns
        ----------

        trans : pandas DataFrame
            copy of X with the calculated columns added; X itself is left
            untouched
        """
        self.before_shape = X.shape
        # Copy first: writing the new columns into X would silently modify
        # the frame the caller passed in.
        trans = X.copy()

        # Inpatient Bed Capacity
        trans['inpatient_bed_capacity'] = self._daily_average(trans, 'inpatient_beds')
        # ICU Bed Capacity
        trans['icu_bed_capacity'] = self._daily_average(trans, 'total_icu_beds')
        # Inpatient Bed Used
        trans['inpatient_bed_used'] = self._daily_average(trans, 'inpatient_beds_used')
        # Adult covid inpatient
        trans['adult_inpatients_confirmed_n_suspected_covid'] = self._daily_average(
            trans, 'total_adult_patients_hospitalized_confirmed_and_suspected_covid')
        # Pediatric covid inpatient
        trans['pediatric_inpatrients_confirmed_n_suspected_covid'] = self._daily_average(
            trans, 'total_pediatric_patients_hospitalized_confirmed_and_suspected_covid')
        # total covid inpatient (adult+pediatric)
        trans['total_inpatients_confirmed_n_suspected_covid'] = (
            trans['adult_inpatients_confirmed_n_suspected_covid']
            + trans['pediatric_inpatrients_confirmed_n_suspected_covid'])

        # Negative values (the data source uses -999999 as an error code) are
        # replaced by 0 in the two columns used downstream
        for col in ('total_inpatients_confirmed_n_suspected_covid', 'inpatient_bed_used'):
            trans.loc[trans[col] < 0, col] = 0

        self.after_shape = trans.shape
        return trans

    def fit(self, X, y=None, **fit_params):
        """ Do nothing function

        Parameters
        ----------
        X : pandas DataFrame
        y : default None


        Returns
        ----------
        self
        """
        return self

In [72]:
class SelectColumnsTransfomer(BaseEstimator, TransformerMixin):
    """ A DataFrame transformer that keeps or removes columns by name

    Parameters
    ----------
    columns : list of str, names of the dataframe columns to act on
        Default: []
    feature : bool
        False (default): the result contains only ``columns``.
        True: the result contains every column except ``columns``.

    """
    def __init__(self, columns=[], feature=False):
        self.columns = columns
        self.feature = feature

    def fit(self, X, y=None, **fit_params):
        """ Nothing is learned; present for Pipeline compatibility

        Parameters
        ----------
        X : pandas DataFrame
        y : default None

        Returns
        ----------
        self
        """
        return self

    def transform(self, X, **transform_params):
        """ Keeps or drops ``columns`` depending on ``feature``

        Parameters
        ----------
        X : pandas DataFrame

        Returns
        ----------
        pandas DataFrame
            new frame; X is not modified
        """
        if not self.feature:
            # selection mode: keep only the listed columns
            return X[self.columns].copy()
        # exclusion mode: keep everything but the listed columns
        return X.drop(columns=self.columns)

In [44]:
class GenerateLagValues(BaseEstimator, TransformerMixin):
    """ A DataFrame transformer that replaces columns by their lagged values

    For every column in ``columns`` and every lag ``k`` in ``1..lags`` a new
    column named ``'<column>-<k>'`` is added, holding the value found ``k``
    rows earlier within the same group. The original ``columns`` are then
    dropped, so only past values remain.

    Lags are taken by row position inside each group, so rows are expected to
    already be in chronological order within a group.

    Parameters
    ----------
    columns : list of str, names of the dataframe columns to lag
        Default: []
    lags : int, number of lags to generate per column
        Default: 4
    group_column : str, column identifying the series each row belongs to
        Default: 'hospital_pk'

    """
    def __init__(self, columns=[], lags=4, group_column='hospital_pk'):
        self.columns = columns
        self.lags = lags
        self.group_column = group_column

    def transform(self, X, **transform_params):
        """ Adds the lag columns and drops the original columns

        Parameters
        ----------
        X : pandas DataFrame
            must contain ``group_column`` and every name in ``columns``

        Returns
        ----------

        trans : pandas DataFrame
            new frame; the first ``lags`` rows of each group contain NaN in
            the lag columns
        """
        self.before_shape = X.shape

        # Group once and shift only the column that is needed, instead of
        # regrouping and shifting the whole frame for every column/lag pair.
        grouped = X.groupby(self.group_column)
        lagged = {f'{col}-{lag}': grouped[col].shift(lag)
                  for col in self.columns
                  for lag in range(1, self.lags + 1)}

        return X.assign(**lagged).drop(self.columns, axis=1)

    def fit(self, X, y=None, **fit_params):
        """ Do nothing function

        Parameters
        ----------
        X : pandas DataFrame
        y : default None


        Returns
        ----------
        self
        """
        return self

In [200]:
class GroupByImputer(BaseEstimator, TransformerMixin):
    '''
    Impute missing values with the median of the row's group (e.g. hospital_pk).

    Values that are still missing afterwards are filled with 0. This happens
    when every value of the group was NaN during fit, or when the group was
    not present during fit.

    Parameters
    ----------
    group_column : str, column that identifies the group of each row
    targets : list of str, columns to impute
        Default: []

    Attributes
    ----------
    impute_map_ : pandas DataFrame
        one row per group seen in fit: ``group_column`` plus the median of
        every target column
    '''
    def __init__(self, group_column, targets=[]):
        self.group_column = group_column
        self.targets = targets

    def fit(self, X, y=None):
        """ Learn the per-group median of every target column """
        impute_map = X.groupby(self.group_column)[self.targets].median().reset_index(drop=False)
        self.impute_map_ = impute_map

        return self

    def transform(self, X, y=None):
        """ Return a copy of X with the target columns imputed """
        X = X.copy()

        # Look the medians up per row instead of looping over every group and
        # building a boolean mask each time.
        medians = self.impute_map_.set_index(self.group_column)
        groups = X[self.group_column]
        for col in self.targets:
            # NaN for groups unseen in fit and for groups whose median is NaN
            group_median = groups.map(medians[col])
            # pass an array so the fill is positional, not index-aligned
            X[col] = X[col].where(X[col].notna(), group_median.to_numpy())
        X[self.targets] = X[self.targets].fillna(0)
        return X

## Preprocessing

In [35]:
# Columns kept after the calculated columns have been created
relevant_columns = [
    'hospital_pk',
    'collection_week',
    'hospital_subtype',
    'is_metro_micro',
    'inpatient_bed_capacity',
    'icu_bed_capacity',
    'inpatient_bed_used',
    'total_inpatients_confirmed_n_suspected_covid',
]

# Columns needed by intermediate steps but removed before modelling
non_feature_columns = [
    'hospital_pk',
    'collection_week',
    'inpatient_bed_capacity',
    'icu_bed_capacity',
]

# Columns that are turned into lag features
time_dependant_columns = [
    'inpatient_bed_used',
    'total_inpatients_confirmed_n_suspected_covid',
]

# Row-level filtering happens outside the Pipeline; afterwards the multi-step
# target is built and rows without a complete target/history are dropped.
df = pre_pipeline_preprocessing(df)
X_original, y_original = pre_pipeline_generate_multi_step_y(df)

## Time Series train/test Split

In [9]:
def time_series_split(X, y, gap_weeks=4):
    '''
    Split features and targets chronologically into a train and a test part.

    The test part is the most recent collection week. The train part is every
    week at least ``gap_weeks`` weeks older than that; the weeks in between
    are left out of both parts.

    Parameters
    ----------
    X : pandas DataFrame
        must contain a datetime 'collection_week' column
    y : pandas DataFrame or Series
        must share its index with X
    gap_weeks : int, default 4
        number of weeks between the last training week and the test week

    Returns
    -------
    X_train, y_train, X_test, y_test
    '''
    test_start_date = X['collection_week'].max()
    train_last_date = test_start_date - pd.to_timedelta(gap_weeks, unit='W')
    test_idx = X.loc[X['collection_week'] >= test_start_date].index
    train_idx = X.loc[X['collection_week'] <= train_last_date].index
    return X.loc[train_idx], y.loc[train_idx], X.loc[test_idx], y.loc[test_idx]

def time_series_cv(X, y, gap_weeks=4):
    '''
    Time Series cross validation (single chronological split).

    Validation rows are those of the most recent collection week; training
    rows are those at least ``gap_weeks`` weeks older. Row positions refer to
    X after its index has been reset to 0..n-1.

    Parameters
    ----------
    X : pandas DataFrame
        must contain a datetime 'collection_week' column
    y : unused, kept for a signature parallel to ``time_series_split``
    gap_weeks : int, default 4
        number of weeks between the last training week and the validation week

    Returns
    -------
    list
        ``[train_positions, validation_positions]``, each a list of ints
    '''
    X = X.reset_index(drop=True)
    valid_start_date = X['collection_week'].max()
    train_last_date = valid_start_date - pd.to_timedelta(gap_weeks, unit='W')
    valid_idx = X.loc[X['collection_week'] >= valid_start_date].index
    train_idx = X.loc[X['collection_week'] <= train_last_date].index
    return [list(train_idx), list(valid_idx)]

In [51]:
# Hold out the most recent collection week as the test set; training data
# stops 4 weeks before it (see time_series_split).
X_train, y_train, X_test, y_test = time_series_split(X_original, y_original)

# Fit ML models



In [173]:
from   sklearn.compose            import *
from   sklearn.ensemble           import RandomForestRegressor 
from   sklearn.experimental       import enable_iterative_imputer
from   sklearn.impute             import *
from   sklearn.metrics            import balanced_accuracy_score # Evaluation metric 2.0 
from   sklearn.pipeline           import Pipeline
from   sklearn.preprocessing      import *



In [210]:
# Categorical features: imputed with the most frequent value, then one-hot encoded
feature_cat_cols = ['hospital_subtype', 'is_metro_micro']

# Continuous features: lags 1-4 of every time-dependent column, named
# '<column>-<lag>' as produced by GenerateLagValues
feature_con_cols = [f'{col}-{lag}'
                    for col in time_dependant_columns
                    for lag in range(1, 5)]

# No imputer here: the lag columns are imputed earlier by GroupByImputer
con_pipe = Pipeline(steps=[('scaler', StandardScaler())])

cat_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="most_frequent", add_indicator=True)),
    ('ohe', OneHotEncoder()),
])

to_feature = ColumnTransformer(transformers=[
    ('continuous', con_pipe, feature_con_cols),
    ('categorical', cat_pipe, feature_cat_cols),
])

# Final pipeline: derive columns -> keep relevant ones -> build lags ->
# impute per hospital -> drop helper columns -> scale / encode
pipeline = Pipeline(steps=[
    ('calculateColumns', CreateCalculatedColumns()),
    ('selectColumns_1', SelectColumnsTransfomer(relevant_columns)),
    ('createTimelag', GenerateLagValues(time_dependant_columns)),
    ('custom_imputer', GroupByImputer('hospital_pk', targets=feature_con_cols)),
    ('selectColumns_2', SelectColumnsTransfomer(non_feature_columns, feature=True)),
    ('finalProcessing', to_feature),
])
                             


In [226]:
# Fit the preprocessing steps on the training split.
# NOTE(review): the last step of `pipeline` is a ColumnTransformer, not a
# model, so this call fits transformers only — confirm that is intended.
pipeline.fit(X_train,y_train)

Pipeline(steps=[('calculateColumns', CreateCalculatedColumns()),
                ('selectColumns_1',
                 SelectColumnsTransfomer(columns=['hospital_pk',
                                                  'collection_week',
                                                  'hospital_subtype',
                                                  'is_metro_micro',
                                                  'inpatient_bed_capacity',
                                                  'icu_bed_capacity',
                                                  'inpatient_bed_used',
                                                  'total_inpatients_confirmed_n_suspected_covid'])),
                ('createTimelag',
                 GenerateLagValues(columns=['inpatie...
                                                   'total_inpatients_confirmed_n_suspected_covid-1',
                                                   'total_inpatients_confirmed_n_suspected_covid-2',
                

## Old code

In [15]:
# Validation set = the single week 2021-01-22; all other weeks are training data.
# NOTE(review): `df_features` is not defined in any cell visible in this
# notebook — confirm where it comes from before re-running.
date_mask = df_features['collection_week'] == '2021-01-22'

validate = df_features.loc[date_mask].drop(columns='collection_week')
train = df_features.loc[~date_mask].drop(columns='collection_week')

In [16]:
# Training target is inpatient_bed_used; the target and the two same-week
# COVID columns are removed from the features.
X_train = train.drop(
    columns=['inpatient_bed_used',
             'ratio_covid_inpatients',
             'total_inpatients_confirmed_n_suspected_covid'],
).copy()
y_train = train['inpatient_bed_used'].to_numpy()

In [17]:
# Same feature/target separation for the held-out validation week
X_test = validate.drop(
    columns=['inpatient_bed_used',
             'ratio_covid_inpatients',
             'total_inpatients_confirmed_n_suspected_covid'],
).copy()
y_test = validate['inpatient_bed_used'].to_numpy()

In [18]:
# Boolean mask over the columns of X_train: True where the dtype is object or bool
categorical_columns = X_train.dtypes.eq(object) | X_train.dtypes.eq(bool)

In [19]:
# NOTE(review): this cell rebinds `cat_pipe` and `con_pipe`, which are also
# defined in the pipeline-construction cell earlier in the notebook.

# Categorical columns: most-frequent imputation (with missing indicator), then one-hot
cat_pipe = Pipeline([('imputer', SimpleImputer(strategy="most_frequent", add_indicator=True)),
                     ('ohe', OneHotEncoder())
                     ])
# Continuous columns: median imputation (with missing indicator), then standardisation
con_pipe = Pipeline([('imputer', SimpleImputer(strategy='median', add_indicator=True)),
                      ('scaler', StandardScaler())])
preprocessing = ColumnTransformer([('continuous',  con_pipe, ~categorical_columns),
                                   ('categorical', cat_pipe,  categorical_columns)])
# regressor
# random_state is fixed so the fitted forest, and therefore the reported
# error, is the same on every run.
reg = RandomForestRegressor(n_estimators=200, min_samples_leaf=9, n_jobs=-1,
                            random_state=42)

# The final step keeps its historical name 'classifier' although it is a regressor
pipe = Pipeline([('preprocessing', preprocessing),
                 ('classifier', reg)])
pipe.fit(X_train, y_train)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('continuous',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(add_indicator=True,
                                                                                 strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  hospital_subtype                                      False
inpatient_bed_used_lag_1                               True
inpatient_bed_used_lag_2                               True
inpatient_bed_used_lag_3                               True
inpatient_bed_used_lag_4                               True
is_metro_micro                                        False
ratio_cov...
ratio_covid_inpatients_lag_4     

Evaluation Metric
----

In [20]:
from  sklearn.metrics import mean_absolute_error

In [21]:
# Mean of the true target on the held-out week (y_test holds actual values,
# not predictions); gives a sense of scale for the error reported below.
print(f'Average actual value: {y_test.mean():.2f}')

Average Predicted value: 149.54


In [22]:
# Predict the held-out week with the fitted pipeline and score the
# predictions with mean absolute error.
y_pred = pipe.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae:.2f}')

Mean Absolute Error: 10.80
