In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
import subprocess
import os

# Let pandas render up to 500 rows when a DataFrame is displayed
pd.set_option('display.max_rows', 500)

![CRISP_DM](../reports/figures/CRISP_DM.png)

# Data Understanding

* RKI, webscrape (webscraping) https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html
* Johns Hopkins (GitHub) https://github.com/CSSEGISandData/COVID-19.git
* REST API services to retrieve data https://npgeo-corona-npgeo-de.hub.arcgis.com/

In [2]:
# Read the confirmed-cases time series CSV from the local raw-data folder into a DataFrame
data_path = '../data/raw/time_series_covid19_confirmed_global.csv'
pd_raw = pd.read_csv(data_path)

In [3]:
# Preview only the first rows instead of rendering the whole frame, in line with
# the .head() previews used for the other DataFrames in this notebook
pd_raw.head(10)

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,5/14/22,5/15/22,5/16/22,5/17/22,5/18/22,5/19/22,5/20/22,5/21/22,5/22/22,5/23/22
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,179242,179267,179321,179328,179477,179597,179624,179674,179716,179716
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,275574,275615,275621,275688,275732,275732,275732,275838,275864,275881
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,265816,265818,265823,265828,265834,265841,265847,265851,265854,265855
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,42156,42156,42156,42156,42572,42572,42572,42572,42572,42572
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,99287,99287,99287,99287,99287,99287,99287,99287,99287,99287
5,,Antarctica,-71.9499,23.347,0,0,0,0,0,0,...,11,11,11,11,11,11,11,11,11,11
6,,Antigua and Barbuda,17.0608,-61.7964,0,0,0,0,0,0,...,7721,7721,7795,7795,7855,7910,7910,7942,7942,7982
7,,Argentina,-38.4161,-63.6167,0,0,0,0,0,0,...,9101319,9135308,9135308,9135308,9135308,9135308,9135308,9135308,9178795,9178795
8,,Armenia,40.0691,45.0382,0,0,0,0,0,0,...,422900,422900,422917,422917,422917,422917,422917,422917,422917,422939
9,Australian Capital Territory,Australia,-35.4735,149.0124,0,0,0,0,0,0,...,118728,119548,120638,121713,122718,123655,124477,125220,125828,126633


## Web Scraping

In [4]:
# Download the RKI case-numbers page. The timeout keeps the cell from hanging
# indefinitely on an unresponsive server, and raise_for_status() surfaces HTTP
# errors here rather than as a confusing parsing failure in a later cell.
page = requests.get(
    'https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html',
    timeout=30,
)
page.raise_for_status()

In [5]:
# Parse the downloaded HTML bytes with Python's built-in 'html.parser' backend
soup = BeautifulSoup(page.content, 'html.parser')

In [6]:
# Show the text of the whole page with the markup removed, to inspect what was downloaded
# NOTE(review): this renders the complete page text as cell output, which is very long;
# consider slicing it to a short preview before sharing the notebook
soup.get_text()

'\n\n\n\n\nRKI  -  Coronavirus SARS-CoV-2 - COVID-19: Fallzahlen in Deutschland und weltweit\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nNavigation und Service\nSpringe direkt zu:\n\nInhalt\nHauptmenu\nSuche\n\n\n\n\n\n\n\nServicemenü\nKon\xadtaktIn\xadhaltHil\xadfeIm\xadpres\xads\xadumDa\xadten\xadschut\xadz\xader\xadklä\xadrungRSSEnglish\n\n\n\n\nErklärung zur Barrierefreiheit\n\n\nGebärdensprache\n\n\nLeichte Sprache\n\n\n\n\nSuche\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nKontakt\nNavigation\n\n\n\nZielgruppeneinstiege\n\n\n\n\nInfektionskrankheiten A-Z\n\n\n\nA\nB\nC\nD\nE\nF\nG\nH\nI\nJ\nK\nL\nM\nN\nO\nP\nQ\nR\nS\nT\nU\nV\nW\nX\nY\nZ\nalle\n\n\n\n\n\n\n\n\nGesundheit A-Z\n\n\n\nA\nB\nC\nD\nE\nF\nG\nH\nI\nJ\nK\nL\nM\nN\nO\nP\nQ\nR\nS\nT\nU\nV\nW\nX\nY\nZ\nalle\n\n\n\n\n\n\n\nNavigation\n\nIn\xadsti\xadtut\nGe\xadsund\xadheits\xadmo\xadni\xadto\xadring\nIn\xadfek\xadti\xadons\xadschutz\nForschung\nKom\xadmis\xadsio\xadnen\nSer\xadvice\n\n\n\n\n\n\nZiel

In [7]:
# Take the first <table> element of the page. find() returns None when there is
# no match, so fail here with a clear message instead of an AttributeError in
# the next cell.
html_table = soup.find('table')
if html_table is None:
    raise ValueError('No <table> element found in the downloaded page; the page layout may have changed.')

In [8]:
# Collect every <tr> (table row) element of that table
all_rows = html_table.find_all('tr')

In [9]:
# Build one list of cell texts per table row, using only the <td> cells.
# Rows without any <td> cell yield an empty list (presumably the header rows).
# The page text contains soft hyphens (U+00AD, visible as '\xad' in the
# get_text() output); they are removed so that values such as state names
# compare equal to their plain spelling.
final_data_list = [
    [each_col.get_text(strip=True).replace('\xad', '') for each_col in row.find_all('td')]
    for row in all_rows
]

In [10]:
# Turn the scraped rows into a DataFrame, drop every row that contains a missing
# value, and replace the positional column labels 0..5 with descriptive names.
pd_daily_status = (
    pd.DataFrame(final_data_list)
    .dropna()
    .rename(columns=dict(enumerate([
        'state',
        'cases',
        'changes',
        'cases_in_7days',
        '7_day_incidence',
        'deaths',
    ])))
)
pd_daily_status.head()

Unnamed: 0,state,cases,changes,cases_in_7days,7_day_incidence,deaths
2,Baden-Württem­berg,3.647.814,7.705,30.435,2741,16.055
3,Bayern,4.863.054,11.384,43.854,3337,23.979
4,Berlin,1.041.177,2.183,9.167,2502,4.604
5,Branden­burg,786.494,883.0,4.601,1818,5.654
6,Bremen,198.904,775.0,3.166,4655,773.0


## REST API calls

In [11]:
# Query the ArcGIS feature service for the per-federal-state case data as JSON.
# The timeout prevents the cell from blocking forever, and raise_for_status()
# stops on an HTTP error instead of passing an error page on to json.loads().
data = requests.get(
    'https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/Coronafälle_in_den_Bundesländern/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json',
    timeout=30,
)
data.raise_for_status()

In [12]:
# Decode the JSON response body into Python objects
json_object = json.loads(data.content)

In [13]:
# Check which Python type the decoded JSON has
type(json_object)

dict

In [14]:
# List the top-level keys of the decoded response
json_object.keys()

dict_keys(['objectIdFieldName', 'uniqueIdField', 'globalIdFieldName', 'geometryProperties', 'geometryType', 'spatialReference', 'fields', 'features'])

In [15]:
# Each entry of 'features' carries its record under the 'attributes' key;
# collect those records into a flat list.
full_list = [each_dict['attributes'] for each_dict in json_object['features']]

In [16]:
# Build a DataFrame from the list of attribute records and preview the first rows
pd_full_list = pd.DataFrame(full_list)
pd_full_list.head()

Unnamed: 0,OBJECTID_1,LAN_ew_AGS,LAN_ew_GEN,LAN_ew_BEZ,LAN_ew_EWZ,OBJECTID,Fallzahl,Aktualisierung,AGS_TXT,GlobalID,faelle_100000_EW,Shape__Area,Shape__Length,Death,cases7_bl_per_100k,cases7_bl,death7_bl,cases7_bl_per_100k_txt,AdmUnitId
0,1,1,Schleswig-Holstein,Land,2910875,15,724402,1653343200000,1,fc5ba936-c95c-432c-8a33-9eb2f30b660f,24886.056598,45737310000.0,2881496.0,2516,462.644394,13467,2,4626,1
1,2,2,Hamburg,Freie und Hansestadt,1852478,6,577907,1653343200000,2,0f3e860c-5181-4d3f-a421-1d51f50315ea,31196.429863,2089396000.0,418800.2,2643,332.905438,6167,1,3329,2
2,3,3,Niedersachsen,Land,8003421,9,2335583,1653343200000,3,3fd77024-c29b-4843-9be8-682ad48e60c9,29182.308415,129983600000.0,4008988.0,9232,412.736004,33033,7,4127,3
3,4,4,Bremen,Freie Hansestadt,680130,5,198904,1653343200000,4,4132268b-54de-4327-ac1e-760e915112f1,29244.99728,1119157000.0,335717.7,773,465.499243,3166,2,4655,4
4,5,5,Nordrhein-Westfalen,Land,17925570,10,5230619,1653343200000,5,561d658f-3ee5-46e3-bc95-3528c6558ab9,29179.652307,87829360000.0,2648673.0,25297,319.855938,57336,6,3199,5


## API Access via REST service, e.g. USA data

In [20]:
# US for full list
# Request the US daily data as JSON. The timeout prevents the cell from hanging
# on an unresponsive server, and raise_for_status() stops on an HTTP error
# before the body is parsed in the next cell.
response = requests.get('https://api.covidtracking.com/v2/us/daily.json', timeout=30)
response.raise_for_status()
print(response)

<Response [200]>


In [24]:
# Parse the JSON response body into a dict and list its top-level keys
US_dict=json.loads(response.content) # decode the raw response bytes as JSON
US_dict.keys()

dict_keys(['links', 'meta', 'data'])

In [None]:
# NOTE(review): this cell is a verbatim copy of the earlier cell that flattens
# json_object['features'] (the German federal-state response). It does not read
# US_dict at all, and it overwrites full_list with the same German records.
# Presumably it was meant to iterate over the US payload (US_dict['data']) --
# confirm the intended source and record structure before running.
full_list = []
for pos,each_dict in enumerate(json_object['features'][:]):
    full_list.append(each_dict['attributes'])