This file is used for downloading and providing and overview of all the data needed for the analysis. The code is concentrated into functions for lucidity and better usablity. Same functions were then copied into the Module.py to be easily accesible from different files.

# To-do list

- download and process additional data from mzcr ? 
- merge the datasets

- plot visualizations 
- implement OOP

In [1]:
#importing required packages 
import requests
import json
import pandas as pd
import numpy as np
from collections import defaultdict
from bs4 import BeautifulSoup

###### Processing the stringency data by json subscription to be then converting them to data frame

In [2]:
# API to obtain the stringency data https://covidtrackerapi.bsg.ox.ac.uk/api/v2/stringency/date-range/{YYYY-MM-DD}/{YYYY-MM-DD}

def get_stringency(start_date='2020-03-10',end_date='2021-08-18'):
    r = requests.get(f'https://covidtrackerapi.bsg.ox.ac.uk/api/v2/stringency/date-range/{start_date}/{end_date}')
    dt = r.json()
    data = {}
    for i in dt['data']:
        for j in dt['data'][i]:
            if dt['data'][i][j]['country_code'] == 'CZE':
                date = dt['data'][i][j]['date_value']
                data[date] = dt['data'][i][j]
                continue
    df = pd.DataFrame(data).T[['confirmed', 'deaths',
       'stringency_actual', 'stringency', 'stringency_legacy',
       'stringency_legacy_disp']]
    return df 

In [3]:
start_date= '2020-03-10'
end_date= '2021-08-18'

df = get_stringency(start_date,end_date) 

In [4]:
df

Unnamed: 0,confirmed,deaths,stringency_actual,stringency,stringency_legacy,stringency_legacy_disp
2020-03-10,41,0,25,25,33.57,33.57
2020-03-11,91,0,47.22,47.22,53.57,53.57
2020-03-12,94,0,50,50,53.57,53.57
2020-03-13,141,0,57.41,57.41,64.29,64.29
2020-03-14,189,0,61.11,61.11,67.86,67.86
...,...,...,...,...,...,...
2021-08-14,1676080,30373,37.04,37.04,48.81,48.81
2021-08-15,1676222,30373,,,,
2021-08-16,1676297,30375,,37.04,,48.81
2021-08-17,1676518,30376,,37.04,,48.81


In [5]:
df.describe()

Unnamed: 0,confirmed,deaths,stringency_actual,stringency,stringency_legacy,stringency_legacy_disp
count,527,527,523.0,526.0,523.0,526.0
unique,526,444,33.0,33.0,20.0,20.0
top,546833,0,73.15,73.15,50.0,50.0
freq,2,12,52.0,52.0,95.0,95.0


#### Scrapping on confirmed cases, deaths and hospitalization for Czech Republic from https://onemocneni-aktualne.mzcr.cz/covid-19

In [6]:
def get_confirmed():
    r = requests.get('https://onemocneni-aktualne.mzcr.cz/covid-19')
    soup = BeautifulSoup(r.text,'lxml')
    a = json.loads(soup.find('div', {'class':'visually-hidden', 'id' : 'js-total-persons-table-data'})['data-table'])
    df_conf = pd.DataFrame(a['body'], columns = ['Date','Confirmed']).set_index('Date')
    return df_conf

In [7]:
dt_conf = get_confirmed()
dt_conf

Unnamed: 0_level_0,Confirmed
Date,Unnamed: 1_level_1
01.03.2020,3
02.03.2020,0
03.03.2020,2
04.03.2020,1
05.03.2020,3
...,...
17.08.2021,298
18.08.2021,208
19.08.2021,166
20.08.2021,171


In [8]:
def get_deaths():
    r = requests.get('https://onemocneni-aktualne.mzcr.cz/covid-19')
    soup = BeautifulSoup(r.text,'lxml')
    a = json.loads(soup.find('div', {'class':'visually-hidden', 'id' : 'js-total-died-table-data'})['data-table'])
    df_deaths = pd.DataFrame(a['body'], columns = ['Date','Deaths', 'Deaths_cumulative']).set_index('Date')
    return df_deaths

In [9]:
df_deaths = get_deaths()
df_deaths

Unnamed: 0_level_0,Deaths,Deaths_cumulative
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
22.03.2020,1,1
23.03.2020,0,1
24.03.2020,2,3
25.03.2020,7,10
26.03.2020,0,10
...,...,...
17.08.2021,1,30382
18.08.2021,2,30384
19.08.2021,0,30384
20.08.2021,1,30385


In [10]:
def get_hospitalizations():
    r = requests.get('https://onemocneni-aktualne.mzcr.cz/covid-19')
    soup = BeautifulSoup(r.text,'lxml')
    a = json.loads(soup.find('div', {'class':'visually-hidden', 'id' : 'js-hospitalization-table-data'})['data-table'])
    df_hosp = pd.DataFrame(a['body'], columns = ['Date','Akt. hosp', 'V těžkém stavu', '% v těžkém stavu']).set_index('Date')
    return df_hosp

In [11]:
df_hosp = get_hospitalizations()
df_hosp

Unnamed: 0_level_0,Akt. hosp,V těžkém stavu,% v těžkém stavu
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
01.03.2020,0,0,0.0000
02.03.2020,0,0,0.0000
03.03.2020,0,0,0.0000
04.03.2020,0,0,0.0000
05.03.2020,0,0,0.0000
...,...,...,...
17.08.2021,59,15,0.2542
18.08.2021,62,13,0.2097
19.08.2021,59,12,0.2034
20.08.2021,67,13,0.1940


#### Scapping data on indiviual regions 

In [12]:
kraje = ['PHA', 'STC','JHC','PLK', 'KVK','ULK', 'LBK','HKK', 'PAK', 'VYS', 'JHM', 'OLK', 'ZLK', 'MSK']
df_kraje = pd.DataFrame()
for i in kraje:
    r2 = requests.get(f'https://onemocneni-aktualne.mzcr.cz/covid-19/kraje/{i}')
    soup2 = BeautifulSoup(r2.text,'lxml')
    a = json.loads(soup2.find('div', {'class':'visually-hidden', 'id' : 'js-total-persons-table-data'})['data-table'])
    d = pd.DataFrame(a['body'], columns = ['Date',f'{i} Infected']).set_index('Date')
    df_kraje = pd.concat([df_kraje, d], axis =1)

### Scapping data on individual cases - with personal characterictic such as age, gender etc. from https://onemocneni-aktualne.mzcr.cz/api/v2/covid-19/osoby.json

- having  a memory issues - the kernell might need to be cleaned before obtaining the data, however this might be solved by importing this data to separate file or grouping the data

In [13]:
def get_individual():
    r = requests.get('https://onemocneni-aktualne.mzcr.cz/api/v2/covid-19/osoby.json')
    a = r.json()
    df = pd.DataFrame(a['data']).set_index('datum')
    return df

In [14]:
df = get_individual()


In [15]:
df

Unnamed: 0_level_0,vek,pohlavi,kraj_nuts_kod,okres_lau_kod,nakaza_v_zahranici,nakaza_zeme_csu_kod
datum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-12-19,46.0,M,CZ080,CZ0805,False,
2020-12-31,69.0,Z,CZ020,CZ020B,False,
2020-11-04,41.0,M,CZ051,CZ0513,False,
2020-12-17,51.0,M,CZ051,CZ0513,False,
2020-12-16,3.0,M,CZ051,CZ0513,False,
...,...,...,...,...,...,...
2020-10-30,41.0,M,CZ072,CZ0722,False,
2021-02-09,35.0,M,CZ063,CZ0634,False,
2021-03-16,28.0,Z,CZ064,CZ0643,False,
2021-03-16,78.0,Z,CZ080,CZ0803,False,


In [16]:
def get_individual_deaths():
    r = requests.get('https://onemocneni-aktualne.mzcr.cz/api/v2/covid-19/umrti.json')
    a = r.json()
    df = pd.DataFrame(a['data']).set_index('datum')
    return df

In [17]:
df_deaths = get_individual_deaths()

In [19]:
df_deaths

Unnamed: 0_level_0,vek,pohlavi,kraj_nuts_kod,okres_lau_kod
datum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-03-22,94,M,CZ010,CZ0100
2020-03-24,73,Z,CZ010,CZ0100
2020-03-24,44,M,CZ080,CZ0802
2020-03-25,91,Z,CZ010,CZ0100
2020-03-25,93,Z,CZ010,CZ0100
...,...,...,...,...
2021-08-06,73,M,CZ032,CZ0327
2021-08-06,49,M,CZ042,CZ0425
2021-08-09,81,Z,CZ063,CZ0632
2021-08-11,86,Z,CZ010,CZ0100


In [20]:
def get_individual_hospitalizations():
    r = requests.get('https://onemocneni-aktualne.mzcr.cz/api/v2/covid-19/hospitalizace.json')
    a = r.json()
    df = pd.DataFrame(a['data']).set_index('datum')
    return df

In [21]:
df_hospit =  get_individual_hospitalizations()

In [22]:
df_hospit.describe()

Unnamed: 0,pacient_prvni_zaznam,kum_pacient_prvni_zaznam,pocet_hosp,stav_bez_priznaku,stav_lehky,stav_stredni,stav_tezky,jip,kyslik,hfno,upv,ecmo,tezky_upv_ecmo,umrti,kum_umrti
count,539.0,539.0,539.0,539.0,539.0,539.0,539.0,539.0,539.0,539.0,539.0,539.0,539.0,539.0,539.0
mean,251.569573,55950.51577,2486.949907,214.717996,686.751391,1138.495362,446.985158,455.41744,1031.890538,179.660482,225.897959,7.435993,448.289425,50.03525,10847.892393
std,318.183395,56244.032412,2996.005709,267.17775,813.5986,1416.943151,565.426664,551.926334,1277.937733,268.691296,277.043645,8.934972,566.004392,64.780532,11242.594604
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,8.0,1579.0,83.5,14.0,25.5,27.0,15.0,16.0,25.5,0.0,9.0,0.0,15.5,1.0,283.5
50%,38.0,38534.0,385.0,44.0,117.0,149.0,82.0,88.0,131.0,5.0,55.0,3.0,83.0,6.0,6763.0
75%,499.5,125611.5,5361.5,436.0,1514.5,2345.5,872.0,951.0,2282.0,281.0,447.5,15.0,875.0,105.5,24940.0
max,1176.0,135596.0,9475.0,966.0,2759.0,4748.0,2062.0,1892.0,4185.0,1054.0,973.0,36.0,2062.0,232.0,26969.0


In [23]:
def get_general_daily_stats():
    r = requests.get('https://onemocneni-aktualne.mzcr.cz/api/v2/covid-19/nakazeni-vyleceni-umrti-testy.json')
    a = r.json()
    df = pd.DataFrame(a['data']).set_index('datum')
    return df

In [24]:
df = get_general_daily_stats()
df

Unnamed: 0_level_0,kumulativni_pocet_nakazenych,kumulativni_pocet_vylecenych,kumulativni_pocet_umrti,kumulativni_pocet_testu,kumulativni_pocet_ag_testu,prirustkovy_pocet_nakazenych,prirustkovy_pocet_vylecenych,prirustkovy_pocet_umrti,prirustkovy_pocet_provedenych_testu,prirustkovy_pocet_provedenych_ag_testu
datum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-01-27,0,0,0,20,0,0,0,0,20,0
2020-01-28,0,0,0,28,0,0,0,0,8,0
2020-01-29,0,0,0,33,0,0,0,0,5,0
2020-01-30,0,0,0,34,0,0,0,0,1,0
2020-01-31,0,0,0,37,0,0,0,0,3,0
...,...,...,...,...,...,...,...,...,...,...
2021-08-17,1676834,1645231,30382,9626725,25178586,298,203,1,33930,45333
2021-08-18,1677042,1645265,30384,9658652,25226019,208,34,2,31927,47433
2021-08-19,1677208,1645293,30384,9695282,25287716,166,28,0,36630,61697
2021-08-20,1677379,1645314,30385,9742440,25385179,171,21,1,47158,97463
