# Creating training datasets for SEIR(D) modeling

In [None]:
import os
import datetime

import pandas as pd
import numpy as np

In [None]:
# read the per-test records from 'clean_data_cro.csv' into the working frame
df_w = pd.read_csv(filepath_or_buffer='clean_data_cro.csv')

In [None]:
# show the frame as loaded from the CSV
df_w

In [None]:
# Label every record with an age group in a new 'age_group' column:
#   'Y' for age <= 47.0, 'O' for age > 47.0.
# Rows matching neither comparison (e.g. a missing age) are left unlabelled.
age_threshold = 47.0
group_masks = {
    'Y': df_w.age <= age_threshold,
    'O': df_w.age > age_threshold,
}
for group_label, group_mask in group_masks.items():
    df_w.loc[group_mask, 'age_group'] = group_label

In [None]:
# show the frame again, now including the 'age_group' column
df_w

## Without age grouping

In [None]:
# assemble new dataframe for SEIR(D) modeling without age grouping
# output columns, in order, with the data type each one is intended to hold
columns_dtypes_dict = {
    'testing_date': datetime.datetime,
    'total_tests': float,
    'confirmed_positives': float,
    'confirmed_deceased': float,
    'estimate_recovered': float,
}

# Aggregate once per testing date. The column is selected *before*
# aggregating so that unrelated (possibly non-numeric) columns such as
# 'age_group' are never summed, and every output column shares the same
# group index, so dates and values cannot get out of step.
tests_by_date = df_w.groupby('testing_date')
df_m = pd.DataFrame({
    # number of non-null test results recorded on the date
    'total_tests': tests_by_date['testing_result'].count(),
    # sum of 'testing_result' on the date
    'confirmed_positives': tests_by_date['testing_result'].sum(),
    # sum of 'deceased' on the date
    'confirmed_deceased': tests_by_date['deceased'].sum(),
    # placeholder column, left empty (NaN) in this cell
    'estimate_recovered': np.nan,
}).reset_index()

df_m['testing_date'] = pd.to_datetime(df_m['testing_date'])
# enforce the documented column order
df_m = df_m[list(columns_dtypes_dict)]

In [None]:
# dataframe for SEIR(D) modeling without age grouping for 1st epi wave
# NOTE(review): eff_start_date is defined here but not applied as a filter.
eff_start_date = datetime.datetime(2020, 2, 25)
eff_end_date_initial = datetime.datetime(2020, 6, 1)
# Keep every row up to and including the end date. The explicit copy makes
# df_m_initial an independent frame, so assigning columns to it later does
# not raise pandas' SettingWithCopyWarning.
df_m_initial = df_m.loc[df_m.testing_date <= eff_end_date_initial].copy()
df_m_initial

In [None]:
# dataframe for SEIR(D) modeling without age grouping for 2nd epi wave
eff_rec_date = datetime.datetime(2020, 6, 1)
# Keep every row from the start date onwards (inclusive). The explicit copy
# makes df_m_rec an independent frame, so assigning columns to it later does
# not raise pandas' SettingWithCopyWarning.
# NOTE(review): the 1st-wave cell keeps rows with testing_date <= 2020-06-01,
# so a row dated exactly 2020-06-01 appears in both waves - confirm intended.
df_m_rec = df_m.loc[df_m.testing_date >= eff_rec_date].copy()
df_m_rec

In [None]:
# obtain recovered data
import io

import requests

RECOVERED_URL = 'https://raw.githubusercontent.com/antelk/covid-19/master/data/CRO/recovered_cases.dat'
# Bound the wait and fail loudly on an HTTP error, so that an error page is
# never handed to the numeric parser below.
response = requests.get(RECOVERED_URL, timeout=30)
response.raise_for_status()
estimate_recovered_all = np.loadtxt(io.BytesIO(response.content))

# The downloaded series is taken to be aligned row-for-row with df_m
# (entry i belongs to row i of df_m) - TODO confirm against the data source.
# Each wave therefore picks the entries at the positions its own rows occupy
# in df_m. Slicing by cumulative lengths instead would shift the 2nd wave by
# one entry whenever the boundary date is present in both waves.
initial_rows = df_m.index.get_indexer(df_m_initial.index)
rec_rows = df_m.index.get_indexer(df_m_rec.index)

# assign() returns a new frame, which avoids writing into a slice of df_m
df_m_initial = df_m_initial.assign(
    estimate_recovered=estimate_recovered_all[initial_rows],
)
df_m_rec = df_m_rec.assign(
    estimate_recovered=estimate_recovered_all[rec_rows],
)

In [None]:
# show the 1st wave frame after 'estimate_recovered' has been assigned
df_m_initial

In [None]:
# show the 2nd wave frame after 'estimate_recovered' has been assigned
df_m_rec

In [None]:
# write the 1st wave dataset to CSV, leaving out the row index
df_m_initial.to_csv(path_or_buf='1st_wave_data_cro.csv', index=False)

In [None]:
# write the 2nd wave dataset to CSV, leaving out the row index
df_m_rec.to_csv(path_or_buf='2nd_wave_data_cro.csv', index=False)

## With age grouping

In [None]:
# assemble new dataframe for SEIR(D) modeling with age grouping
# Row labels are every (testing_date, age_group) combination. Both levels are
# sorted and stripped of missing values: unique() alone returns labels in
# order of first appearance and keeps NaN, whereas groupby results (used to
# fill this frame) are sorted and leave NaN keys out.
iterables = [
    df_w.testing_date.dropna().sort_values().unique(),
    sorted(df_w.age_group.dropna().unique()),
]
# columns and dtypes
columns_dtypes_dict = {
    'total_tests': float,
    'confirmed_positives': float,
    'confirmed_deceased': float,
    'estimate_recovered': float,
}
# empty (all-NaN) float frame, one row per (testing_date, age_group) pair
df_m_g = pd.DataFrame(
    index=pd.MultiIndex.from_product(iterables, names=['testing_date', 'age_group']),
    columns=list(columns_dtypes_dict.keys()),
    dtype=float,
)

In [None]:
# Fill the age-grouped frame from per-(date, age group) aggregates.
# The aggregates are matched to df_m_g by index label, not by position:
# groupby returns only the combinations that occur in df_w, in sorted order,
# which need not match the row order or row count of df_m_g. Combinations
# with no records get 0.
by_date_and_group = df_w.groupby(['testing_date', 'age_group'])
aggregates = {
    # number of non-null test results per (date, age group)
    'total_tests': by_date_and_group['testing_result'].count(),
    # sum of 'testing_result' per (date, age group)
    'confirmed_positives': by_date_and_group['testing_result'].sum(),
    # sum of 'deceased' per (date, age group)
    'confirmed_deceased': by_date_and_group['deceased'].sum(),
}
for column_name, aggregate in aggregates.items():
    df_m_g[column_name] = aggregate.reindex(df_m_g.index, fill_value=0)

In [None]:
# show the age-grouped frame (still indexed by testing_date and age_group)
df_m_g

In [None]:
# turn the (testing_date, age_group) index levels into ordinary columns,
# then parse the dates so they can be compared with datetime objects
df_m_g.reset_index(inplace=True)
df_m_g['testing_date'] = pd.to_datetime(df_m_g['testing_date'])
df_m_g

In [None]:
# dataframe for SEIR(D) modeling with age grouping for 1st epi wave
eff_start_date = datetime.datetime(year=2020, month=2, day=25)
eff_end_date_initial = datetime.datetime(year=2020, month=6, day=1)
# rows dated on or before the end of the 1st wave
in_first_wave = df_m_g['testing_date'] <= eff_end_date_initial
df_m_g_initial = df_m_g.loc[in_first_wave]
df_m_g_initial

In [None]:
# df_m_g_initial.to_csv('1st_wave_age_grouped_data_cro.csv', index=False)