# Predicting Re-Opening Success

In [64]:
import itertools

from matplotlib.pylab import plt
from matplotlib.ticker import FuncFormatter
import matplotlib.colors as mcolors
import matplotlib.dates as mdates
from matplotlib import pyplot, lines
from matplotlib.patches import Patch
import matplotlib

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_validate, cross_val_score, cross_val_predict
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import LeaveOneOut
from sklearn.feature_selection import RFECV

from sklearn.metrics import confusion_matrix

from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS



from scipy.stats import ttest_ind

import researchpy as rpy

from loguru import logger



import pandas as pd
import numpy as np

import seaborn as sns

import datetime 


%matplotlib inline

# Parameters

In [65]:
# Name of the lockdown subset under analysis; it is also embedded in the names of saved files.
dataset = 'eu_us'

# Pickle file holding the expanded lockdown records that this notebook reads.
lockdown_dataset = '../data/processed/expanded_lockdowns.pkl'

# Start and stop of the rebound-day horizons covered by the exhaustive feature search.
# They are passed to range(), so the stop value itself is not evaluated.
from_rebound_days = 28
to_rebound_days = 85

In [66]:
# Destination of the pickled EFS results; the dataset tag is substituted into the file name.
efs_dataset = '../data/processed/efs_{}.pkl'.format(dataset)

# Load Lockdown Data

In [67]:
# Announce which file is about to be read.
logger.info('Loading lockdown data @ %s' % lockdown_dataset)

# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
expanded_lockdowns = pd.read_pickle(lockdown_dataset)
# Display (rows, columns) of the loaded frame.
expanded_lockdowns.shape

2020-07-31 13:46:42.166 | INFO     | __main__:<module>:1 - Loading lockdown data @ ../data/expanded_lockdowns.pkl


(15533, 68)

In [68]:
# Focus on individual countries rather than regional aggregations.
ignore_regional_aggregations = expanded_lockdowns['aggregation']!=expanded_lockdowns['region']
expanded_lockdowns = expanded_lockdowns[ignore_regional_aggregations].set_index('aggregation')
expanded_lockdowns.shape

(13849, 67)

# Select the Lockdowns to Use

In [69]:
# Default: analyse every lockdown. For the 'eu_us' dataset, narrow the selection to rows
# whose 'region' is either 'eu' or 'us'.
use_expanded_lockdowns = expanded_lockdowns
if dataset == 'eu_us':
    use_expanded_lockdowns = expanded_lockdowns.loc[expanded_lockdowns['region'].isin(['eu', 'us'])]

use_expanded_lockdowns.shape

(6404, 67)

In [70]:
# TODO: The last two lockdowns for each country are duplicates; the root cause still needs
# to be investigated. Until then, keep the first occurrence of each duplicated row.
# Rows are compared on their column values only (the index is not part of the comparison).
use_expanded_lockdowns = use_expanded_lockdowns.drop_duplicates(keep='first')

# Show the resulting shape and confirm that no duplicated rows remain.
(use_expanded_lockdowns.shape, use_expanded_lockdowns.duplicated().sum())

((6398, 67), 0)

# Predicting Re-Opening Success

## CV Evaluation

In [71]:
# Candidate predictor columns, grouped by the kind of signal they describe.
use_features = [
    # Mobility during the lockdown
    'lockdown_min_mobility_level',
    'lockdown_mean_mobility_level',

    # Length of the lockdown
    'lockdown_duration_days',

    # Case-based features
    'lockdown_days_to_peak_cases',
    'lockdown_days_from_peak_cases',
    'lockdown_entry_level_cases',
    'lockdown_exit_level_cases',
    'lockdown_peak_value_cases_per_million',
    'lockdown_mean_value_cases_per_million',

    # Death-based features
    'lockdown_days_to_peak_deaths',
    'lockdown_days_from_peak_deaths',
    'lockdown_entry_level_deaths',
    'lockdown_exit_level_deaths',
    'lockdown_peak_value_deaths_per_100k',
    'lockdown_mean_value_deaths_per_100k',
]

# Number of candidate features.
len(use_features)

15

### Test EU/US Lockdowns
We need to pick a fixed number of days after reopening; classification performance is then evaluated with respect to this number of days.

In [72]:
# Horizon (in days after re-opening) used for this single evaluation.
rebound_days = 42

# Keep only the lockdown rows recorded for exactly that rebound duration.
use_lockdowns_for_days = use_expanded_lockdowns.loc[
    use_expanded_lockdowns['rebound_duration_days'].eq(rebound_days)
]

# Feature matrix and binary target.
X = use_lockdowns_for_days.loc[:, use_features]
y = use_lockdowns_for_days.loc[:, 'is_increasing_rebound']

# Standardise every feature column, keeping the original column labels and row index.
# The scaling is fitted on all rows before the cross-validation split.
scaled_X = pd.DataFrame(preprocessing.scale(X), columns=X.columns, index=X.index)

# 10-fold cross-validated accuracy of a default gradient-boosting classifier.
clf = GradientBoostingClassifier()
scores = cross_validate(clf, scaled_X, y, scoring='accuracy', cv=10, n_jobs=-1)

# Mean accuracy across the folds.
scores['test_score'].mean()

0.7696428571428571

## Brute-Force Feature Selection Analysis
We use the `mlxtend` library to do an exhaustive feature selection analysis. This takes the form of a simple wrapper that can be used to wrap a CV evaluation for a given classifier.

### The Exhaustive Feature Search
We use the EFS wrapper to perform a 10-fold CV for all possible combinations of features in `use_features` and we perform this for a range of different rebound days.

In [60]:
# Exhaustive feature search (EFS): for every rebound-day horizon, run a 10-fold CV of a
# gradient-boosting classifier over every non-empty subset of `use_features`, and collect
# one (rebound_days, n_lockdowns, max_class_prob, fitted_efs) tuple per horizon.
logger.info('Running exhaustive feature search (%s - %s days). This will take a while ... (~12 hours)' % (from_rebound_days, to_rebound_days))

efs_results = []

# NOTE: range() excludes its stop value, so `to_rebound_days` itself is not evaluated.
check_rebound_days = range(from_rebound_days, to_rebound_days, 1)

for rebound_days in check_rebound_days:

    # The lockdowns for the number of rebound days.
    use_lockdowns_for_days = use_expanded_lockdowns[use_expanded_lockdowns['rebound_duration_days']==rebound_days]

    # Progress message on every 10th horizon. The previous test (`// 10 == 0`) was only true
    # for horizons 0-9, so it never fired for this range. The message is also emitted after
    # the selection above, so the reported lockdown count belongs to the current horizon
    # rather than to the previous iteration.
    if rebound_days % 10 == 0:
        logger.info('EFS for %s rebound days for %s lockdowns' % (rebound_days, len(use_lockdowns_for_days)))

    # The feature and target class data
    X = use_lockdowns_for_days[use_features]
    y = use_lockdowns_for_days['is_increasing_rebound']

    # Standardise the features, keeping column labels and row index.
    scaled_X = pd.DataFrame(preprocessing.scale(X), columns=X.columns, index=X.index)

    clf = GradientBoostingClassifier()

    # Score every feature subset of size 1..len(use_features) by 10-fold CV accuracy.
    efs = EFS(clf,
               min_features=1,
               max_features=len(use_features),
               scoring='accuracy',
               print_progress=False,
               cv=10,
               n_jobs=-1)

    efs = efs.fit(scaled_X, y)

    # Majority-class share, stored alongside the results as a baseline for the accuracies.
    # NOTE(review): this uses 'is_successful_rebound' while the target above is
    # 'is_increasing_rebound' - presumably the two are complementary; confirm.
    success_rate = use_lockdowns_for_days['is_successful_rebound'].mean()
    max_class_prob = max(success_rate, 1 - success_rate)

    efs_results.append((rebound_days, len(use_lockdowns_for_days), max_class_prob, efs))


2020-07-28 13:23:43.539 | INFO     | __main__:<module>:1 - Running exhaustive feature search (28 - 85 days. This will take a while ... (~12 hours)


### The EFS results

In [61]:
def get_efs_results(efs_results):
    """Flatten a list of fitted EFS runs into one sorted results DataFrame.

    Parameters
    ----------
    efs_results : iterable of (rebound_days, size, max_class_prob, efs_result)
        `efs_result` must provide `get_metric_dict()`; each entry of that dict
        becomes one row of the output.

    Returns
    -------
    pandas.DataFrame
        One row per evaluated feature subset per run, with these added columns:
        `rebound_days`, `dataset_size`, `max_class_prob`, `n` (length of
        `feature_names`) and `rank` (percentile rank of `avg_score` over ALL rows,
        best score = smallest rank). Rows are sorted by `rebound_days` then
        `avg_score`, both descending, with a fresh 0..N-1 index.
    """
    # One frame per run, tagged with that run's rebound days, dataset size and baseline.
    per_run_frames = [
        pd.DataFrame.from_dict(efs_result.get_metric_dict()).T.assign(
            rebound_days=rebound_days,
            dataset_size=size,
            max_class_prob=max_class_prob,
        )
        for rebound_days, size, max_class_prob, efs_result in efs_results
    ]

    # Stack all runs into a single frame.
    combined = pd.concat(per_run_frames, ignore_index=True)

    # Derived columns: subset size and global percentile rank of the mean CV score.
    combined['n'] = combined['feature_names'].map(len)
    combined['rank'] = combined['avg_score'].rank(ascending=False, pct=True)

    # The metric columns come out of the transposed frame as object dtype, which gets in
    # the way of aggregation, so cast them to float as they should have been in the first place.
    float_columns = ['avg_score', 'ci_bound', 'std_dev', 'std_err']
    combined = combined.astype({column: float for column in float_columns})

    ordered = combined.sort_values(by=['rebound_days', 'avg_score'], ascending=False)
    return ordered.reset_index(drop=True)

# Build the results frame in this cell so the log line below does not depend on an `efs_df`
# left over from an earlier kernel session (on a fresh kernel it would raise NameError).
efs_df = get_efs_results(efs_results)
logger.info('Finished with %s results' % len(efs_df))

2020-07-28 13:49:34.810 | INFO     | __main__:<module>:30 - Finished with 65534 results


In [62]:
# Flatten the per-horizon EFS runs into one sorted results frame and preview its first rows.
efs_df = get_efs_results(efs_results)
efs_df.head()

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err,rebound_days,dataset_size,max_class_prob,n,rank
0,"(6, 7)","[1.0, 0.875, 0.875, 0.75, 0.75, 1.0, 0.875, 1....",0.898214,"(lockdown_exit_level_cases, lockdown_peak_valu...",0.069941,0.09417,0.03139,29,77,0.545455,2,7.6e-05
1,"(1, 5, 6, 14)","[1.0, 1.0, 0.875, 0.75, 0.75, 0.875, 0.875, 1....",0.898214,"(lockdown_mean_mobility_level, lockdown_entry_...",0.069941,0.09417,0.03139,29,77,0.545455,4,7.6e-05
2,"(5, 6, 8, 9)","[1.0, 1.0, 0.875, 0.75, 0.75, 1.0, 0.75, 1.0, ...",0.898214,"(lockdown_entry_level_cases, lockdown_exit_lev...",0.081336,0.109512,0.036504,29,77,0.545455,4,7.6e-05
3,"(3, 4, 5, 6, 9, 12, 14)","[1.0, 1.0, 0.875, 0.75, 0.75, 0.875, 0.875, 1....",0.898214,"(lockdown_days_to_peak_cases, lockdown_days_fr...",0.069941,0.09417,0.03139,29,77,0.545455,7,7.6e-05
4,"(6,)","[1.0, 1.0, 0.75, 0.75, 0.75, 0.75, 0.875, 1.0,...",0.8875,"(lockdown_exit_level_cases,)",0.087584,0.117925,0.039308,29,77,0.545455,1,0.000252


# Save EFS Data

In [63]:
# Log the destination path before writing.
logger.info('Saving EFS dataset -> %s' % efs_dataset)

# Persist the combined EFS results frame as a pickle at that path.
efs_df.to_pickle(efs_dataset)
# Display (rows, columns) of the frame that was saved.
efs_df.shape

2020-07-28 13:49:49.274 | INFO     | __main__:<module>:1 - Saving EFS dataset -> ../data/efs_eu_us.pkl


(65534, 12)