# Simple Data

The goal of this Notebook is simplify the data to be directly readable, as done in exp001 (this is very similar).

In [1]:
INHABITANTS_GERMANY = 83.2E6 # https://www.destatis.de/DE/Themen/Gesellschaft-Umwelt/Bevoelkerung/Bevoelkerungsstand/_inhalt.html

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime

In [3]:
corona_data = pd.read_csv("../dat/CoronaData.csv", parse_dates=["date"])
weather_data = pd.read_csv("../dat/WeatherData.csv", parse_dates=["date"])

In [4]:
corona_data.head()

Unnamed: 0.1,Unnamed: 0,id_county,name_county,id_state,name_state,cases,deaths,date,cum_cases,cum_deaths
0,0,1001.0,SK Flensburg,1.0,Schleswig-Holstein,0.0,0.0,2020-01-02,0.0,0.0
1,1,1001.0,SK Flensburg,1.0,Schleswig-Holstein,0.0,0.0,2020-01-03,0.0,0.0
2,2,1001.0,SK Flensburg,1.0,Schleswig-Holstein,0.0,0.0,2020-01-04,0.0,0.0
3,3,1001.0,SK Flensburg,1.0,Schleswig-Holstein,0.0,0.0,2020-01-05,0.0,0.0
4,4,1001.0,SK Flensburg,1.0,Schleswig-Holstein,0.0,0.0,2020-01-06,0.0,0.0


In [5]:
weather_data.head()

Unnamed: 0,date,air temperature
0,2019-01-01 01:00:00,6.504
1,2019-01-01 02:00:00,6.358333
2,2019-01-01 03:00:00,6.258333
3,2019-01-01 04:00:00,6.141667
4,2019-01-01 05:00:00,6.070833


## Simplify data

For this first experiment, we will simplify the data by reducing it to daily precision and germanywide data.

### Corona Data

In [6]:
corona_data_simple = corona_data.groupby("date").sum()[["cases", "deaths"]]
corona_data_simple

Unnamed: 0_level_0,cases,deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-02,1.0,0.0
2020-01-03,0.0,0.0
2020-01-04,0.0,0.0
2020-01-05,0.0,0.0
2020-01-06,0.0,0.0
...,...,...
2022-01-16,33898.0,7.0
2022-01-17,76094.0,25.0
2022-01-18,111752.0,10.0
2022-01-19,91985.0,0.0


In order to be more useful, the data has to be smoothed over seven days

In [7]:
cases = corona_data_simple.cases.to_numpy()
cases_smoothed = np.convolve(cases, np.ones(7), mode="same")
deaths = corona_data_simple.deaths.to_numpy()
deaths_smoothed = np.convolve(deaths, np.ones(7), mode="same")

corona_data_simple["cases_smoothed"] = cases_smoothed / 7
corona_data_simple["deaths_smoothed"] = deaths_smoothed / 7

Now we can calculate the case fatality rate. Since the values are not reliable in the beginning, we will set $cfr = 0$ if there have been few (e.g. 2) new cases that day (this should only occur before the first wave).

In [8]:
offset = 14

In [9]:
cases_smoothed_modified = corona_data_simple["cases_smoothed"].apply(lambda x: x if x > 2 else np.inf)
cfr = corona_data_simple["deaths_smoothed"][offset:] / cases_smoothed_modified[:-offset]

In [10]:
corona_data_simple["cfr"] = cfr

In [11]:
corona = corona_data_simple.fillna(method="ffill").fillna(method="bfill")

In [12]:
corona

Unnamed: 0_level_0,cases,deaths,cases_smoothed,deaths_smoothed,cfr
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02,1.0,0.0,0.142857,0.000000,0.000000
2020-01-03,0.0,0.0,0.142857,0.000000,0.000000
2020-01-04,0.0,0.0,0.142857,0.000000,0.000000
2020-01-05,0.0,0.0,0.142857,0.000000,0.000000
2020-01-06,0.0,0.0,0.000000,0.000000,0.000000
...,...,...,...,...,...
2022-01-16,33898.0,7.0,75881.285714,13.571429,0.001265
2022-01-17,76094.0,25.0,63947.428571,10.714286,0.001265
2022-01-18,111752.0,10.0,52658.714286,7.714286,0.001265
2022-01-19,91985.0,0.0,44818.428571,6.000000,0.001265


### Weather data

In [13]:
weather_data

Unnamed: 0,date,air temperature
0,2019-01-01 01:00:00,6.504000
1,2019-01-01 02:00:00,6.358333
2,2019-01-01 03:00:00,6.258333
3,2019-01-01 04:00:00,6.141667
4,2019-01-01 05:00:00,6.070833
...,...,...
27018,2022-01-30 19:00:00,3.028000
27019,2022-01-30 20:00:00,2.504000
27020,2022-01-30 21:00:00,2.352000
27021,2022-01-30 22:00:00,2.096000


Filter to only daytime

In [14]:
weather_data = weather_data[(weather_data.date.apply(lambda x: x.hour) >= 6) & (weather_data.date.apply(lambda x: x.hour) <= 20)]
weather_data

Unnamed: 0,date,air temperature
5,2019-01-01 06:00:00,6.000000
6,2019-01-01 07:00:00,5.862500
7,2019-01-01 08:00:00,5.795833
8,2019-01-01 09:00:00,5.866667
9,2019-01-01 10:00:00,6.020833
...,...,...
27015,2022-01-30 16:00:00,4.640000
27016,2022-01-30 17:00:00,4.124000
27017,2022-01-30 18:00:00,3.544000
27018,2022-01-30 19:00:00,3.028000


In [15]:
weather = weather_data.set_index("date").groupby(lambda x: f"{x.year}-{x.month:02d}-{x.day:02d}").mean()
weather.index.names = ["date"]

In [16]:
num_convolve = 5

weather["air temperature"] = np.convolve(
    weather["air temperature"], np.ones(num_convolve), "same"
) / num_convolve

In [17]:
weather.head()

Unnamed: 0_level_0,air temperature
date,Unnamed: 1_level_1
2019-01-01,1.33642
2019-01-02,1.634444
2019-01-03,2.482333
2019-01-04,2.060833
2019-01-05,2.452626


## Combining the data

In [18]:
weather.head()

Unnamed: 0_level_0,air temperature
date,Unnamed: 1_level_1
2019-01-01,1.33642
2019-01-02,1.634444
2019-01-03,2.482333
2019-01-04,2.060833
2019-01-05,2.452626


In [19]:
corona.head()

Unnamed: 0_level_0,cases,deaths,cases_smoothed,deaths_smoothed,cfr
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02,1.0,0.0,0.142857,0.0,0.0
2020-01-03,0.0,0.0,0.142857,0.0,0.0
2020-01-04,0.0,0.0,0.142857,0.0,0.0
2020-01-05,0.0,0.0,0.142857,0.0,0.0
2020-01-06,0.0,0.0,0.0,0.0,0.0


In [20]:
combined = corona.merge(weather, how="inner", left_index=True, right_index=True)

In [21]:
combined.head()

Unnamed: 0_level_0,cases,deaths,cases_smoothed,deaths_smoothed,cfr,air temperature
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-02,1.0,0.0,0.142857,0.0,0.0,2.820693
2020-01-03,0.0,0.0,0.142857,0.0,0.0,2.697867
2020-01-04,0.0,0.0,0.142857,0.0,0.0,3.165813
2020-01-05,0.0,0.0,0.142857,0.0,0.0,3.853173
2020-01-06,0.0,0.0,0.0,0.0,0.0,4.002133


In [22]:
combined.to_csv("../dat/SimpleCombinedData.csv")