# Creating training datasets for SEIR(D)

In [1]:
import os
import datetime

import pandas as pd
import numpy as np

In [2]:
DATA_DIR = 'data'
CLEANED_FILE = 'clean_data_cro.csv' 
CLEANED_DATA_PATH = os.path.join(os.pardir, DATA_DIR, CLEANED_FILE)

df_w = pd.read_csv(CLEANED_DATA_PATH)

In [3]:
df_w

Unnamed: 0,id,age,sex,location,county,testing_date,testing_result,testing_institute,hospitalization,respirator,deceased,total_comorbidities
0,000D955C52,50.0,M,RIJEKA,PRIMORSKO-GORANSKA,2020-07-04,False,MEDICINSKI FAKULTET RIJEKA,False,False,False,7
1,000FE7B77A,83.0,M,POŽEGA,POŽEŠKO-SLAVONSKA,2020-08-19,False,O.B.POŽEGA,False,False,False,12
2,0022233DC0,42.0,M,OSIJEK,OSJEČKO-BARANJSKA,2020-08-17,False,ZJZ OSJEČKO-BARANJSKE ŽUPANIJE,False,False,False,9
3,0026A3B8BB,89.0,F,KARLOVAC,KARLOVAČKA,2020-06-02,False,KL.ZA INFEKTIVNE BOLESTI,False,False,False,7
4,00317D99E3,17.0,F,OSIJEK,OSJEČKO-BARANJSKA,2020-06-30,False,K.B.C.OSIJEK,False,False,False,6
...,...,...,...,...,...,...,...,...,...,...,...,...
199742,FFB0780383,52.0,M,ZAGREB,GRADZAGREB,2020-08-29,False,ZJZ REPUBLIKE HRVATSKE,False,False,False,3
199743,FFB6289722,29.0,F,VUKOVAR,VUKOVARSKO-SRIJEMSKA,2020-08-20,False,K.B.C.OSIJEK,False,False,False,7
199744,FFD598C2D0,24.0,M,ZADAR,ZADARSKA,2020-07-14,False,ZJZ ZDRAVSTVO ZADAR,False,False,False,1
199745,FFDE31DCA8,54.0,F,PULA,,2020-08-27,False,ZJZ ISTARSKE ŽUPANIJE,False,False,False,0


In [4]:
# create additional column for age groups
# age groups: Y <= 47.0; O > 47.0
df_w.loc[df_w.age <= 47.0, 'age_group'] = 'Y'
df_w.loc[df_w.age > 47.0, 'age_group'] = 'O'

In [5]:
df_w

Unnamed: 0,id,age,sex,location,county,testing_date,testing_result,testing_institute,hospitalization,respirator,deceased,total_comorbidities,age_group
0,000D955C52,50.0,M,RIJEKA,PRIMORSKO-GORANSKA,2020-07-04,False,MEDICINSKI FAKULTET RIJEKA,False,False,False,7,O
1,000FE7B77A,83.0,M,POŽEGA,POŽEŠKO-SLAVONSKA,2020-08-19,False,O.B.POŽEGA,False,False,False,12,O
2,0022233DC0,42.0,M,OSIJEK,OSJEČKO-BARANJSKA,2020-08-17,False,ZJZ OSJEČKO-BARANJSKE ŽUPANIJE,False,False,False,9,Y
3,0026A3B8BB,89.0,F,KARLOVAC,KARLOVAČKA,2020-06-02,False,KL.ZA INFEKTIVNE BOLESTI,False,False,False,7,O
4,00317D99E3,17.0,F,OSIJEK,OSJEČKO-BARANJSKA,2020-06-30,False,K.B.C.OSIJEK,False,False,False,6,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
199742,FFB0780383,52.0,M,ZAGREB,GRADZAGREB,2020-08-29,False,ZJZ REPUBLIKE HRVATSKE,False,False,False,3,O
199743,FFB6289722,29.0,F,VUKOVAR,VUKOVARSKO-SRIJEMSKA,2020-08-20,False,K.B.C.OSIJEK,False,False,False,7,Y
199744,FFD598C2D0,24.0,M,ZADAR,ZADARSKA,2020-07-14,False,ZJZ ZDRAVSTVO ZADAR,False,False,False,1,Y
199745,FFDE31DCA8,54.0,F,PULA,,2020-08-27,False,ZJZ ISTARSKE ŽUPANIJE,False,False,False,0,O


## Without age grouping

In [6]:
# assemble new dataframe for SEIR(D) modeling without age grouping
columns_dtypes_dict = {
    'testing_date': datetime.datetime,
    'total_tests': float,
    'confirmed_positives': float,
    'confirmed_deceased': float,
    'estimate_recovered': float,
}

# create empty dataframe with predifined data types
columns_dtypes_list = [(k, v) for k, v in columns_dtypes_dict.items()]
dtypes = np.dtype(columns_dtypes_list)
empty_matrix = np.empty(0, dtype=dtypes)
df_m = pd.DataFrame(empty_matrix)

df_m.testing_date = df_w.testing_date.sort_values().unique()
df_m.testing_date = pd.to_datetime(df_m.testing_date)
df_m.total_tests = df_w.groupby('testing_date').count().testing_result.values
df_m.confirmed_positives = df_w.groupby('testing_date').sum().testing_result.values
df_m.confirmed_deceased = df_w.groupby('testing_date').sum().deceased.values

In [7]:
# dataframe for SEIR(D) modeling without age grouping for 1st epi wave
eff_start_date = datetime.datetime(2020, 2, 25)
eff_end_date_initial = datetime.datetime(2020, 6, 1)
df_m_initial = df_m.loc[df_m.testing_date <= eff_end_date_initial]
df_m_initial

Unnamed: 0,testing_date,total_tests,confirmed_positives,confirmed_deceased,estimate_recovered
0,2020-02-25,32,1.0,0.0,
1,2020-02-26,41,1.0,0.0,
2,2020-02-27,28,2.0,0.0,
3,2020-02-28,13,2.0,0.0,
4,2020-02-29,10,0.0,0.0,
...,...,...,...,...,...
93,2020-05-28,456,1.0,0.0,
94,2020-05-29,561,0.0,0.0,
95,2020-05-30,258,0.0,0.0,
96,2020-05-31,245,0.0,0.0,


In [8]:
# dataframe for SEIR(D) modeling without age grouping for 2nd epi wave
eff_rec_date = datetime.datetime(2020, 6, 1)
df_m_rec = df_m.loc[df_m.testing_date >= eff_rec_date]
df_m_rec

Unnamed: 0,testing_date,total_tests,confirmed_positives,confirmed_deceased,estimate_recovered
97,2020-06-01,471,0.0,0.0,
98,2020-06-02,460,0.0,0.0,
99,2020-06-03,318,1.0,0.0,
100,2020-06-04,290,0.0,0.0,
101,2020-06-05,321,0.0,0.0,
...,...,...,...,...,...
193,2020-09-05,3812,197.0,1.0,
194,2020-09-06,1806,67.0,0.0,
195,2020-09-07,4138,255.0,0.0,
196,2020-09-08,3324,286.0,0.0,


In [9]:
# obtain recovered data
import requests
import io

RECOVERED_URL = 'https://raw.githubusercontent.com/antelk/covid-19/master/data/CRO/recovered_cases.dat'
response = requests.get(RECOVERED_URL)
estimate_recovered_all = np.loadtxt(io.BytesIO(response.content))
df_m_initial.estimate_recovered = estimate_recovered_all[:len(df_m_initial)]
df_m_rec.estimate_recovered = estimate_recovered_all[len(df_m_initial):len(df_m_initial)+len(df_m_rec)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [10]:
df_m_initial

Unnamed: 0,testing_date,total_tests,confirmed_positives,confirmed_deceased,estimate_recovered
0,2020-02-25,32,1.0,0.0,0.0
1,2020-02-26,41,1.0,0.0,0.0
2,2020-02-27,28,2.0,0.0,0.0
3,2020-02-28,13,2.0,0.0,0.0
4,2020-02-29,10,0.0,0.0,0.0
...,...,...,...,...,...
93,2020-05-28,456,1.0,0.0,2051.0
94,2020-05-29,561,0.0,0.0,2059.0
95,2020-05-30,258,0.0,0.0,2063.0
96,2020-05-31,245,0.0,0.0,2072.0


In [11]:
df_m_rec

Unnamed: 0,testing_date,total_tests,confirmed_positives,confirmed_deceased,estimate_recovered
97,2020-06-01,471,0.0,0.0,2088.0
98,2020-06-02,460,0.0,0.0,2095.0
99,2020-06-03,318,1.0,0.0,2105.0
100,2020-06-04,290,0.0,0.0,2113.0
101,2020-06-05,321,0.0,0.0,2121.0
...,...,...,...,...,...
193,2020-09-05,3812,197.0,1.0,9008.0
194,2020-09-06,1806,67.0,0.0,9266.0
195,2020-09-07,4138,255.0,0.0,9553.0
196,2020-09-08,3324,286.0,0.0,9833.0


In [12]:
df_m_initial.to_csv(os.path.join(os.pardir, DATA_DIR, 'initial_timeseries_SEIRD.csv'), index=False)

In [13]:
df_m_rec.to_csv(os.path.join(os.pardir, DATA_DIR, '2nd_wave_timeseries_SEIRD.csv'), index=False)

## With age grouping

In [14]:
# assemble new dataframe for SEIR(D) modeling with age grouping
iterables = [df_w.testing_date.sort_values().unique(), df_w.age_group.unique()]
# columns and dtypes
columns_dtypes_dict = {
    'total_tests': float,
    'confirmed_positives': float,
    'confirmed_deceased': float,
    'estimate_recovered': float,
}
df_m_g = pd.DataFrame(
    index=pd.MultiIndex.from_product(iterables, names=['testing_date', 'age_group']), 
    columns=list(columns_dtypes_dict.keys()),
    dtype=float,
)

In [15]:
df_m_g.total_tests = df_w.groupby(['testing_date', 'age_group']).count().testing_result.values
df_m_g.confirmed_positives = df_w.groupby(['testing_date', 'age_group']).sum().testing_result.values
df_m_g.confirmed_deceased = df_w.groupby(['testing_date', 'age_group']).sum().deceased.values

In [16]:
df_m_g

Unnamed: 0_level_0,Unnamed: 1_level_0,total_tests,confirmed_positives,confirmed_deceased,estimate_recovered
testing_date,age_group,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-02-25,O,15,0.0,0.0,
2020-02-25,Y,17,1.0,0.0,
2020-02-26,O,18,1.0,0.0,
2020-02-26,Y,23,0.0,0.0,
2020-02-27,O,4,1.0,0.0,
...,...,...,...,...,...
2020-09-07,Y,2123,142.0,0.0,
2020-09-08,O,1604,116.0,0.0,
2020-09-08,Y,1720,170.0,0.0,
2020-09-09,O,48,4.0,0.0,


In [17]:
df_m_g.reset_index(inplace=True,)
df_m_g.testing_date = pd.to_datetime(df_m_g.testing_date)
df_m_g

Unnamed: 0,testing_date,age_group,total_tests,confirmed_positives,confirmed_deceased,estimate_recovered
0,2020-02-25,O,15,0.0,0.0,
1,2020-02-25,Y,17,1.0,0.0,
2,2020-02-26,O,18,1.0,0.0,
3,2020-02-26,Y,23,0.0,0.0,
4,2020-02-27,O,4,1.0,0.0,
...,...,...,...,...,...,...
391,2020-09-07,Y,2123,142.0,0.0,
392,2020-09-08,O,1604,116.0,0.0,
393,2020-09-08,Y,1720,170.0,0.0,
394,2020-09-09,O,48,4.0,0.0,


In [18]:
# dataframe for SEIR(D) modeling without age grouping for 1st epi wave
eff_start_date = datetime.datetime(2020, 2, 25)
eff_end_date_initial = datetime.datetime(2020, 6, 1)
df_m_g_initial = df_m_g.loc[df_m_g.testing_date <= eff_end_date_initial]
df_m_g_initial

Unnamed: 0,testing_date,age_group,total_tests,confirmed_positives,confirmed_deceased,estimate_recovered
0,2020-02-25,O,15,0.0,0.0,
1,2020-02-25,Y,17,1.0,0.0,
2,2020-02-26,O,18,1.0,0.0,
3,2020-02-26,Y,23,0.0,0.0,
4,2020-02-27,O,4,1.0,0.0,
...,...,...,...,...,...,...
191,2020-05-30,Y,99,0.0,0.0,
192,2020-05-31,O,128,0.0,0.0,
193,2020-05-31,Y,117,0.0,0.0,
194,2020-06-01,O,275,0.0,0.0,


In [19]:
df_m_g_initial.to_csv(os.path.join(os.pardir, DATA_DIR, 'initial_timeseries_SEIRD_age_groups.csv'), index=False)