In [8]:
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
import numpy as np

In [9]:
data = pd.read_csv('../data/regional_multivariate_data.csv')
data['date'] = pd.to_datetime(data['date'])

In [24]:
def calculate_feature_importance(data, threshold=0.1, target="new_confirmed", plot=False):
    X = data.drop([target, 'region'], axis=1)
    y = data[target]
    # model = xgb.XGBRegressor(objective ='reg:squarederror')
    model = xgb.XGBRegressor()
    model.fit(X, y)

    stats = pd.DataFrame({'importance': model.feature_importances_}, index=X.columns)
    stats = stats.sort_values('importance', ascending=False)
    print(stats.iloc[:5])
    print(stats[stats["importance"] >= threshold])

    if plot:
        plt.figure(figsize=(28, 8))
        # plt.title(name)
        plt.xticks(rotation=45)
        plt.bar(X.columns, model.feature_importances_)
        plt.show()

In [11]:
# split data into train and test sets
groups = data.groupby('region')
target = "new_confirmed"
for name, group in groups:
    print(name)
    # split data into X and y
    group = group.set_index('date')
    X = group.drop([target, 'region'], axis=1)
    y = group[target]
    # model = xgb.XGBRegressor(objective ='reg:squarederror')
    model = xgb.XGBRegressor()
    model.fit(X, y)

    stats = pd.DataFrame({'importance': model.feature_importances_}, index=X.columns)
    stats = stats.sort_values('importance', ascending=False)
    print(stats.iloc[:5])

    # plt.figure(figsize=(28, 8))
    # plt.title(name)
    # plt.xticks(rotation=45)
    # plt.bar(X.columns, model.feature_importances_)
    # plt.show()

    print('\n\n')

East North Central
                                             importance
current_hospitalized_patients                  0.384517
cumulative_persons_fully_vaccinated_moderna    0.146271
workplace_closing                              0.097206
new_hospitalized_patients                      0.062908
new_deceased                                   0.037831



East South Central
                                       importance
current_hospitalized_patients            0.377714
cumulative_confirmed                     0.151957
cumulative_vaccine_doses_administered    0.089578
new_hospitalized_patients                0.067852
new_deceased                             0.033674



Mid-Atlantic
                               importance
stay_at_home_requirements        0.524141
new_hospitalized_patients        0.350233
workplace_closing                0.053362
cumulative_confirmed             0.019563
current_hospitalized_patients    0.012737



Mountain
                           importance
new_h

## explore logs

In [15]:
# split data into train and test sets
groups = data.groupby('region')
target = "new_confirmed"
for name, group in groups:
    print(name)
    # split data into X and y
    group = group.set_index('date')
    X = group.drop([target, 'region'], axis=1)
    X = X.replace(0, np.nan)
    X = X.dropna()  # or use another method to fill nan values
    X = np.log(X)
    y = group.loc[X.index, target]
    # model = xgb.XGBRegressor(objective ='reg:squarederror')
    model = xgb.XGBRegressor()
    model.fit(X, y)

    stats = pd.DataFrame({'importance': model.feature_importances_}, index=X.columns)
    stats = stats.sort_values('importance', ascending=False)
    print(stats.iloc[:5])

    # plt.figure(figsize=(28, 8))
    # plt.title(name)
    # plt.xticks(rotation=45)
    # plt.bar(X.columns, model.feature_importances_)
    # plt.show()

    print('\n\n')

East North Central


  result = func(self.values, **kwargs)


                                      importance
new_hospitalized_patients               0.521556
cumulative_persons_vaccinated           0.349303
cumulative_confirmed                    0.054645
new_persons_fully_vaccinated_janssen    0.011019
new_persons_fully_vaccinated_moderna    0.009378



East South Central


  result = func(self.values, **kwargs)


                                        importance
cumulative_vaccine_doses_administered     0.629367
new_hospitalized_patients                 0.158942
cumulative_persons_fully_vaccinated       0.122420
cumulative_confirmed                      0.037467
new_vaccine_doses_administered_moderna    0.018005



Mid-Atlantic


  result = func(self.values, **kwargs)


                                             importance
new_hospitalized_patients                      0.501195
cumulative_persons_vaccinated                  0.298588
cumulative_confirmed                           0.084465
cumulative_persons_fully_vaccinated_janssen    0.059213
new_persons_fully_vaccinated_janssen           0.011713



Mountain


  result = func(self.values, **kwargs)


                                            importance
cumulative_persons_vaccinated                 0.499167
new_hospitalized_patients                     0.294200
cumulative_persons_fully_vaccinated_pfizer    0.133099
cumulative_persons_fully_vaccinated           0.045388
cumulative_confirmed                          0.016396



New England


  result = func(self.values, **kwargs)


                                              importance
cumulative_vaccine_doses_administered_pfizer    0.301423
stringency_index                                0.259080
cumulative_confirmed                            0.158443
cumulative_persons_vaccinated                   0.105916
new_hospitalized_patients                       0.060354



Pacific


  result = func(self.values, **kwargs)


                                importance
new_hospitalized_patients         0.448093
contact_tracing                   0.200261
cumulative_confirmed              0.118866
new_vaccine_doses_administered    0.036357
dew_point                         0.026099



South Atlantic


  result = func(self.values, **kwargs)


                              importance
new_hospitalized_patients       0.568712
cumulative_confirmed            0.385497
new_deceased                    0.006513
school_closing                  0.004649
new_persons_fully_vaccinated    0.004332



West North Central


  result = func(self.values, **kwargs)


                           importance
contact_tracing              0.477028
new_hospitalized_patients    0.314620
cumulative_confirmed         0.047007
income_support               0.036148
relative_humidity            0.030816



West South Central


  result = func(self.values, **kwargs)


                                 importance
new_hospitalized_patients          0.714090
relative_humidity                  0.129717
current_intensive_care_patients    0.038959
cumulative_confirmed               0.022739
current_hospitalized_patients      0.014698





## check out lag terms 

In [13]:
enc = pd.read_csv('../data/regional_datasets/multivariate/enc_data_multi.csv')
esc = pd.read_csv('../data/regional_datasets/multivariate/esc_data_multi.csv')
mid_atlantic = pd.read_csv('../data/regional_datasets/multivariate/mid_atlantic_data_multi.csv')
mountain = pd.read_csv('../data/regional_datasets/multivariate/mountain_data_multi.csv')
new_england = pd.read_csv('../data/regional_datasets/multivariate/new_england_data_multi.csv')
pacific = pd.read_csv('../data/regional_datasets/multivariate/pacific_data_multi.csv')
south_atlantic = pd.read_csv('../data/regional_datasets/multivariate/south_atlantic_data_multi.csv')
wnc = pd.read_csv('../data/regional_datasets/multivariate/wnc_data_multi.csv')
wsc = pd.read_csv('../data/regional_datasets/multivariate/wsc_data_multi.csv')
datasets = [enc, esc, mid_atlantic, mountain, new_england, pacific, south_atlantic, wnc, wsc]
for df in datasets:
    df.drop('Unnamed: 0', axis=1, inplace=True)
    df['date'] = pd.to_datetime(df['date'])
    df.set_index('date', inplace=True)

In [25]:
for df in datasets:
    print(df['region'].iloc[0])
    calculate_feature_importance(df, threshold=0.05, target="new_confirmed")
    
    print('\n')

East North Central
                                 importance
7_day_shift                        0.654887
current_intensive_care_patients    0.072584
school_closing                     0.031775
contact_tracing                    0.028264
current_hospitalized_patients      0.024152
                                 importance
7_day_shift                        0.654887
current_intensive_care_patients    0.072584


East South Central
                      importance
7_day_avg               0.596956
7_day_shift             0.139206
cancel_public_events    0.044987
school_closing          0.030782
new_deceased            0.024986
             importance
7_day_avg      0.596956
7_day_shift    0.139206


Mid-Atlantic
                               importance
7_day_avg                        0.837090
2_day_shift                      0.077875
1_day_shift                      0.042509
cumulative_confirmed             0.016230
cumulative_persons_vaccinated    0.006906
             importance
7_d

In [30]:
def calculate_importance(data, threshold=0.1, target="new_confirmed", plot=False):
    X = data.drop([target, 'region'], axis=1)
    y = data[target]
    # model = xgb.XGBRegressor(objective ='reg:squarederror')
    model = xgb.XGBRegressor()
    model.fit(X, y)

    stats = pd.DataFrame({'importance': model.feature_importances_}, index=X.columns)
    stats = stats.sort_values('importance', ascending=False)
    stats["cum_importance"] = np.cumsum(stats['importance'])

    # Determine the number of features needed for 95% of total importance
    n_95_percent = np.where(stats["cum_importance"] > 0.95)[0][0] + 1

    # Get the most important features accounting for 95% of the total
    important_features = stats[:n_95_percent]

    print(f"Number of features for 95% importance: {n_95_percent}, out of {len(stats)}")
    print(f"Important features: {list(important_features.index)}")

for df in datasets:
    print(df['region'].iloc[0])
    calculate_importance(df, threshold=0.05, target="new_confirmed")
    
    print('\n')

East North Central
Number of features for 95% importance: 19, out of 51
Important features: ['7_day_shift', 'current_intensive_care_patients', 'school_closing', 'contact_tracing', 'current_hospitalized_patients', 'new_hospitalized_patients', 'new_persons_fully_vaccinated_moderna', '1_day_shift', 'stay_at_home_requirements', 'new_vaccine_doses_administered_moderna', 'cumulative_persons_fully_vaccinated_moderna', 'cumulative_persons_fully_vaccinated', '7_day_avg', '2_day_shift', 'new_deceased', 'cumulative_confirmed', 'income_support', 'testing_policy', 'cumulative_persons_vaccinated']


East South Central
Number of features for 95% importance: 14, out of 52
Important features: ['7_day_avg', '7_day_shift', 'cancel_public_events', 'school_closing', 'new_deceased', '2_day_shift', 'income_support', 'minimum_temperature_celsius', '1_day_shift', 'average_temperature_celsius', 'new_vaccine_doses_administered_janssen', 'current_intensive_care_patients', 'rainfall_mm', 'stringency_index']


Mid-