# Response measure models

### Load relevant libraries

In [1]:
import pandas as pd
import numpy as np
import utils
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from tqdm import tqdm
import pickle as pkl
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
rs = 10

## Training data
Get training data generated from https://www.kaggle.com/josephassaker/covid19-global-dataset?select=worldometer_coronavirus_daily_data.csv. All features are calculated per 100k in the given country.

### Features:
- *country*: Country for which the data is registered for
- *date*: Entry date for registered data
- *active_cases*: Number of active cases of COVID-19 for the given date and country
- *cumulative_total_cases*: Number of cumulative cases up to the given date for the given country
- *cases_past_week*: Sum of registered cases of COVID-19 the past week (-7 days < t <= current date) for the given date and country
- *cases_2w_ago*: Sum of registered cases of COVID-19 for the previous week (-14 days < t <= -7 days) for the given date and country
- *cumulative_total_deaths*: Number of cumulative deaths up to the given date for the given country
- *deaths_past_week*: Sum of registered deaths from COVID-19 the past week (-7 days < t <= current date) for the given date and country
- *deaths_2w_ago*: Sum of registered deaths from COVID-19 for the previous week (-14 days < t <= -7 days) for the given date and country

In [3]:
df_training = pd.read_csv('data/response_measures/training_data_response_measures.csv', parse_dates=['date']).drop(columns=['response_measures'])

In [4]:
df_training.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106712 entries, 0 to 106711
Data columns (total 9 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   date                     106712 non-null  datetime64[ns]
 1   country                  106712 non-null  object        
 2   active_cases             106712 non-null  float64       
 3   cumulative_total_cases   106712 non-null  float64       
 4   cases_past_week          106712 non-null  float64       
 5   cases_2w_ago             106712 non-null  float64       
 6   cumulative_total_deaths  106712 non-null  float64       
 7   deaths_past_week         106712 non-null  float64       
 8   deaths_2w_ago            106712 non-null  float64       
dtypes: datetime64[ns](1), float64(7), object(1)
memory usage: 7.3+ MB


## Target data
Use data from https://ourworldindata.org/policy-responses-covid to retrieve target response data for the models. Data exists between 2020-01-01 to 2021-04-29.

### Stringency
This is a composite measure based on nine response indicators including school closures, workplace closures, and
travel bans, rescaled to a value from 0 to 100 (100 = strictest). The nine metrics used to calculate the Stringency Index are: school closures; workplace closures; cancellation of public events; restrictions on public gatherings; closures of public transport; stay-at-home requirements; public information campaigns; restrictions on internal movements; and international travel controls. It’s important to note that this index simply records the strictness of government policies. It does not measure or imply the appropriateness or effectiveness of a country’s response. A higher score does not necessarily mean that a country’s response is ‘better’ than others lower on the index.

### Internal movement
Restrictions on internal movement during the COVID-19 pandemic. The measures are classified in the range 0-2:
- **0**: No measures
- **1**: Recommend movement restriction
- **2**: Restrict movement

### Public gatherings
Restrictions on public gatherings in the COVID-19 pandemic. Restrictions are given based on the size of public gatherings as follows:
- **0**: No measures
- **1**: Restrictions on very large gatherings (the limit is above 1000 people)
- **2**: gatherings between 100-1000 people
- **3**: gatherings between 10-100 people
- **4**: gatherings of less than 10 people

### School
School closures during the COVID-19 pandemic. The measures are classified in the range 0-3:
- **0**: No measures
- **1**: Recommended
- **2**: Required (only at some levels)
- **3**: Required (all levels)

### Workplace
Workplace closures during the COVID-19 pandemic. The measures are classified in the range 0-3:
- **0**: No measures
- **1**: Recommended
- **2**: Required for some
- **3**: Required for all but key workers

### Home
Stay-at-home requirements during the COVID-19 pandemic. The measures are classified in the range 0-3:
- **0**: No measures
- **1**: Recommended not to leave the house
- **2**: Required to not leave the house with exceptions for daily exercise, grocery shopping, and ‘essential’ trips
- **3**: Required to not leave the house with minimal exceptions (e.g. allowed to leave only once every few days, or only one person can leave at a time, etc.)


In [5]:
df_stringency = pd.read_csv('data/response_measures/covid-stringency-index.csv', parse_dates=['date'])
df_internal_movement = pd.read_csv('data/response_measures/internal-movement-covid.csv', parse_dates=['date'])
df_public_gathering = pd.read_csv('data/response_measures/public-gathering-rules-covid.csv', parse_dates=['date'])
df_school = pd.read_csv('data/response_measures/school-closures-covid.csv', parse_dates=['date'])
df_workplace = pd.read_csv('data/response_measures/workplace-closures-covid.csv', parse_dates=['date'])
df_home = pd.read_csv('data/response_measures/stay-at-home-covid.csv', parse_dates=['date'])

In [6]:
df_targets = df_stringency.merge(df_internal_movement, on=['country', 'code', 'date'], how='outer')
df_targets = df_targets.merge(df_public_gathering, on=['country', 'code', 'date'], how='outer')
df_targets = df_targets.merge(df_school, on=['country', 'code', 'date'], how='outer')
df_targets = df_targets.merge(df_workplace, on=['country', 'code', 'date'], how='outer')
df_targets = df_targets.merge(df_home, on=['country', 'code', 'date'], how='outer')

In [7]:
# Change names of some countries to equal the training data
country_mapper = {
    'Cape Verde': 'Cabo Verde',
    "Cote d'Ivoire": "Côte d'Ivoire",
    'Czechia': 'Czech Republic (Czechia)',
    'Democratic Republic of Congo': 'DR Congo',
    'Palestine': 'State of Palestine',
    'Timor': 'Timor-Leste'
}
df_targets.country = df_targets.country.replace(country_mapper)

In [8]:
df_targets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 88591 entries, 0 to 88590
Data columns (total 9 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   country                          88591 non-null  object        
 1   code                             88591 non-null  object        
 2   date                             88591 non-null  datetime64[ns]
 3   stringency_index                 84403 non-null  float64       
 4   restrictions_internal_movements  88083 non-null  float64       
 5   restriction_gatherings           88396 non-null  float64       
 6   school_closures                  84859 non-null  float64       
 7   workplace_closures               88367 non-null  float64       
 8   stay_home_requirements           88099 non-null  float64       
dtypes: datetime64[ns](1), float64(6), object(2)
memory usage: 6.8+ MB


# Training the models
### Contact matrix models
We now want to train one model for each of the weights we want to predict. For the contact matrices, these are:
- **Home**: Weights for the matrix regarding at-home contacts (Target label: *stay_home_requirements*)
- **School**: Weights for the matrix regarding school contacts (Target label: *school_closures*)
- **Work**: Weights for the matrix regarding at-work contacts (Target label: *workplace_closures*)
- **Public**: Weights for the matrix regarding school contacts (Target label: *restriction_gatherings*)

### Movement model
For the inter region movement, we want to train a model that gives scaling factors given infection/death levels. We have one factor:
- **Movement**: Factor for scaling the total movement for the population in the SEAIR-model (Target label: *restrictions_internal_movements*)

As the training data contains data on dates where control measures are not recorded, we get NaN-values for the new dataframe. We assume that for the missinng target values, we can fill these with the closest preceeding control measure. We use the pandas fillna() function with method='bfill' to obtain this.

In [9]:
# Define useful functions before training
def get_model_data(target_label):
    df_target = df_targets[['country', 'date', target_label]] # Get relevant target from targets dataframe
    df_model = df_training.merge(df_target, on=['country', 'date'], how='left').set_index(['country', 'date']) # Merge training data with target on country and date
    df_model.fillna(method='bfill', inplace=True)
    return df_model

def split_data(df, target_label):
    y = df[target_label]
    X = df.drop([target_label], axis=1)
    X_mat = X.values
    X_train, X_test, y_train, y_test = train_test_split(X_mat, y, test_size=0.2, random_state=rs)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train, y_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test, y_train, y_test, scaler

def train_model(X_train, y_train):
    model = MLPClassifier(random_state=rs, max_iter=1000)
    model.fit(X_train, y_train)
    print(model)
    return model

def score_model(model, X_train, y_train, X_test, y_test):
    print("Train accuracy:", model.score(X_train, y_train))
    print("Test accuracy:", model.score(X_test, y_test))
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))

def save_model(fpath_model, fpath_scaler):
    pkl.dump(model, open(fpath_model, 'wb'))
    pkl.dump(scaler, open(fpath_scaler, 'wb'))

## Home model

In [10]:
target_label = 'stay_home_requirements'
df_model = get_model_data(target_label)
X_train, X_test, y_train, y_test, scaler = split_data(df_model, target_label)
model = train_model(X_train, y_train)
score_model(model, X_train, y_train, X_test, y_test)

MLPClassifier(max_iter=1000, random_state=10)
Train accuracy: 0.6173083906336023
Test accuracy: 0.6119570819472426
              precision    recall  f1-score   support

         0.0       0.65      0.83      0.73      9601
         1.0       0.66      0.20      0.30      4673
         2.0       0.53      0.65      0.59      6151
         3.0       0.77      0.15      0.25       918

    accuracy                           0.61     21343
   macro avg       0.65      0.46      0.47     21343
weighted avg       0.62      0.61      0.58     21343



In [11]:
save_model(fpath_model='models/home_weight_model.sav', fpath_scaler='models/home_weight_scaler.sav')

## School model

In [12]:
target_label = 'school_closures'
df_model = get_model_data(target_label)
X_train, X_test, y_train, y_test, scaler = split_data(df_model, target_label)
model = train_model(X_train, y_train)
score_model(model, X_train, y_train, X_test, y_test)

MLPClassifier(max_iter=1000, random_state=10)
Train accuracy: 0.6016586817228736
Test accuracy: 0.5940589420418873
              precision    recall  f1-score   support

         0.0       0.78      0.52      0.62      6230
         1.0       0.47      0.63      0.53      3733
         2.0       0.51      0.27      0.35      3770
         3.0       0.60      0.80      0.69      7610

    accuracy                           0.59     21343
   macro avg       0.59      0.55      0.55     21343
weighted avg       0.61      0.59      0.58     21343



In [13]:
save_model(fpath_model='models/school_measure_model.sav', fpath_scaler='models/school_measure_scaler.sav')

## Work model

In [14]:
target_label = 'workplace_closures'
df_model = get_model_data(target_label)
X_train, X_test, y_train, y_test, scaler = split_data(df_model, target_label)
model = train_model(X_train, y_train)
score_model(model, X_train, y_train, X_test, y_test)

MLPClassifier(max_iter=1000, random_state=10)
Train accuracy: 0.599034778432452
Test accuracy: 0.5992128566743194
              precision    recall  f1-score   support

         0.0       0.62      0.80      0.70      7674
         1.0       0.56      0.13      0.21      2997
         2.0       0.58      0.70      0.63      7964
         3.0       0.59      0.24      0.34      2708

    accuracy                           0.60     21343
   macro avg       0.59      0.47      0.47     21343
weighted avg       0.59      0.60      0.56     21343



In [15]:
save_model(fpath_model='models/work_measure_model.sav', fpath_scaler='models/work_measure_scaler.sav')

## Public model

In [16]:
target_label = 'workplace_closures'
df_model = get_model_data(target_label)
X_train, X_test, y_train, y_test, scaler = split_data(df_model, target_label)
model = train_model(X_train, y_train)
score_model(model, X_train, y_train, X_test, y_test)

MLPClassifier(max_iter=1000, random_state=10)
Train accuracy: 0.599034778432452
Test accuracy: 0.5992128566743194
              precision    recall  f1-score   support

         0.0       0.62      0.80      0.70      7674
         1.0       0.56      0.13      0.21      2997
         2.0       0.58      0.70      0.63      7964
         3.0       0.59      0.24      0.34      2708

    accuracy                           0.60     21343
   macro avg       0.59      0.47      0.47     21343
weighted avg       0.59      0.60      0.56     21343



In [17]:
save_model(fpath_model='models/public_measure_model.sav', fpath_scaler='models/public_measure_scaler.sav')

## Movement model

In [18]:
target_label = 'restrictions_internal_movements'
df_model = get_model_data(target_label)
X_train, X_test, y_train, y_test, scaler = split_data(df_model, target_label)
model = train_model(X_train, y_train)
score_model(model, X_train, y_train, X_test, y_test)

MLPClassifier(max_iter=1000, random_state=10)
Train accuracy: 0.6795909522191896
Test accuracy: 0.674319448999672
              precision    recall  f1-score   support

         0.0       0.68      0.88      0.77     11275
         1.0       0.57      0.14      0.23      3015
         2.0       0.67      0.57      0.62      7053

    accuracy                           0.67     21343
   macro avg       0.64      0.53      0.54     21343
weighted avg       0.66      0.67      0.64     21343



In [19]:
save_model(fpath_model='models/movement_measure_model.sav', fpath_scaler='models/movement_measure_scaler.sav')