In [1]:
import os
import subprocess
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json

![CRISP_DM](../reports/figures/CRISP_DM.png)

# Data Understanding

* RKI, webscrape (webscraping) https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html
* Johns Hopkins (GITHUB) https://github.com/CSSEGISandData/COVID-19.git
* REST API services to retrieve data https://npgeo-corona-npgeo-de.hub.arcgis.com/

### Johns Hopkins (GITHUB)

In [2]:
# Refresh the local clone of the COVID-19 data repository.
# The command is given as an argument list, so no shell is spawned to parse it.
git_pull = subprocess.Popen(["/usr/bin/git", "pull"],
                            cwd=os.path.dirname('../data/raw/COVID-19/'),
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
(out, error) = git_pull.communicate()

# git writes fetch progress to stderr even when the pull succeeds, so the
# exit status (not the presence of stderr text) decides whether it failed.
if git_pull.returncode != 0:
    print("git pull failed with exit code " + str(git_pull.returncode))

print("Error : " + str(error))
print("out : " + str(out))

Error : b'From https://github.com/CSSEGISandData/COVID-19\n   f692c88be..8d58f35c6  web-data   -> origin/web-data\n'
out : b'Already up to date.\n'


In [3]:
# Path to the global confirmed-cases time series inside the local repository clone
data_path = '../data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'

# Load the CSV into a DataFrame for inspection
df_raw = pd.read_csv(filepath_or_buffer=data_path)

In [4]:
# Show the first 100 rows (positional slice) of the raw time series
df_raw.iloc[:100]

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,11/29/20,11/30/20,12/1/20,12/2/20,12/3/20,12/4/20,12/5/20,12/6/20,12/7/20,12/8/20
0,,Afghanistan,33.939110,67.709953,0,0,0,0,0,0,...,46116,46274,46516,46718,46837,46837,47072,47306,47516,47716
1,,Albania,41.153300,20.168300,0,0,0,0,0,0,...,37625,38182,39014,39719,40501,41302,42148,42988,43683,44436
2,,Algeria,28.033900,1.659600,0,0,0,0,0,0,...,82221,83199,84152,85084,85927,86730,87502,88252,88825,89416
3,,Andorra,42.506300,1.521800,0,0,0,0,0,0,...,6712,6745,6790,6842,6904,6955,7005,7050,7084,7127
4,,Angola,-11.202700,17.873900,0,0,0,0,0,0,...,15103,15139,15251,15319,15361,15493,15536,15591,15648,15729
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,,Costa Rica,9.748900,-83.753400,0,0,0,0,0,0,...,137093,139638,140172,141340,142505,143685,143685,143685,146421,147430
96,,Cote d'Ivoire,7.540000,-5.547100,0,0,0,0,0,0,...,21310,21331,21334,21361,21389,21412,21441,21485,21507,21513
97,,Croatia,45.100000,15.200000,0,0,0,0,0,0,...,126612,128442,131342,134881,139415,143370,147454,150353,152239,154852
98,,Cuba,21.521757,-77.781167,0,0,0,0,0,0,...,8233,8284,8381,8456,8531,8610,8714,8782,8906,8982


### RKI, webscrape (webscraping)

In [5]:
# Download the RKI case-number page. The timeout keeps the cell from hanging
# on a stalled connection, and raise_for_status() surfaces HTTP errors here
# instead of letting an error page reach the HTML parsing cells.
page = requests.get(
    'https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html',
    timeout=30,
)
page.raise_for_status()

In [6]:
# Parse the downloaded HTML with Python's built-in parser
soup = BeautifulSoup(markup=page.content, features='html.parser')

In [7]:
# Pick the first <table> element of the parsed page
html_table = soup.find(name='table')

In [8]:
# Gather every <tr> row contained in that table
all_rows = html_table.find_all(name='tr')

In [9]:
# One list of stripped cell texts per table row. Rows without any <td> cell
# produce an empty list, which becomes an all-missing DataFrame row.
data_list = [
    [col.get_text(strip=True) for col in row.find_all('td')]
    for row in all_rows
]

# dropna() discards rows with missing cells; column 0 is labelled 'state'
pd.DataFrame(data_list).dropna().rename(columns={0: 'state'})

Unnamed: 0,state,1,2,3,4,5
2,Baden-Württem­berg,170.904,2.639,17.03,153,3.17
3,Bayern,238.442,3.828,23.485,179,4.518
4,Berlin,73.430,1.348,6.471,176,732.0
5,Branden­burg,23.373,632.0,2.746,109,435.0
6,Bremen,10.973,114.0,734.0,108,142.0
7,Hamburg,27.363,282.0,1.884,102,435.0
8,Hessen,99.560,1.661,9.89,157,1.578
9,Meck­lenburg-Vor­pommern,7.058,223.0,919.0,57,86.0
10,Nieder­sachsen,79.187,891.0,5.931,74,1.299
11,Nord­rhein-West­falen,294.740,4.072,26.478,148,4.167


### REST API calls

In [10]:
# Query the RKI_COVID19 feature service (all fields, JSON output).
# The timeout prevents an indefinite hang, and raise_for_status() stops the
# notebook on an HTTP error instead of passing an error body on to json.loads.
data = requests.get(
    'https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/RKI_COVID19/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json',
    timeout=60,
)
data.raise_for_status()

In [11]:
# Decode the raw response body into Python objects
payload_bytes = data.content
json_obj = json.loads(payload_bytes)

In [12]:
# Inspect the top-level container type of the decoded response
type(json_obj)

dict

In [13]:
# List the top-level keys of the decoded response
json_obj.keys()

dict_keys(['objectIdFieldName', 'uniqueIdField', 'globalIdFieldName', 'fields', 'exceededTransferLimit', 'features'])

In [14]:
# Keep only the 'attributes' record of every returned feature.
# NOTE(review): the response also carries an 'exceededTransferLimit' key, so
# this is possibly only the first page of the service's records - confirm.
full_list = [feature['attributes'] for feature in json_obj['features']]

In [15]:
# Render the collected attribute records as a table
pd.DataFrame(data=full_list)

Unnamed: 0,IdBundesland,Bundesland,Landkreis,Altersgruppe,Geschlecht,AnzahlFall,AnzahlTodesfall,ObjectId,Meldedatum,IdLandkreis,Datenstand,NeuerFall,NeuerTodesfall,Refdatum,NeuGenesen,AnzahlGenesen,IstErkrankungsbeginn,Altersgruppe2
0,5,Nordrhein-Westfalen,SK Bottrop,A35-A59,W,1,0,158640,1577923200000,05512,"10.12.2020, 00:00 Uhr",0,-9,1603843200000,0,1,1,Nicht übermittelt
1,8,Baden-Württemberg,LK Karlsruhe,A35-A59,W,1,0,350476,1578009600000,08215,"10.12.2020, 00:00 Uhr",0,-9,1578009600000,0,1,0,Nicht übermittelt
2,5,Nordrhein-Westfalen,SK Oberhausen,A35-A59,W,1,0,102406,1578787200000,05119,"10.12.2020, 00:00 Uhr",0,-9,1578787200000,0,1,0,Nicht übermittelt
3,12,Brandenburg,LK Oberhavel,A15-A34,M,1,0,602064,1579305600000,12065,"10.12.2020, 00:00 Uhr",0,-9,1605052800000,0,1,1,Nicht übermittelt
4,5,Nordrhein-Westfalen,SK Köln,A35-A59,W,1,0,128142,1580169600000,05315,"10.12.2020, 00:00 Uhr",0,-9,1580169600000,0,1,0,Nicht übermittelt
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,8,Baden-Württemberg,LK Esslingen,A35-A59,M,6,0,322227,1584144000000,08116,"10.12.2020, 00:00 Uhr",0,-9,1583884800000,0,6,1,Nicht übermittelt
4996,8,Baden-Württemberg,LK Esslingen,A35-A59,M,1,0,322228,1584144000000,08116,"10.12.2020, 00:00 Uhr",0,-9,1583971200000,0,1,1,Nicht übermittelt
4997,8,Baden-Württemberg,LK Esslingen,A35-A59,M,1,0,322230,1584144000000,08116,"10.12.2020, 00:00 Uhr",0,-9,1584144000000,0,1,0,Nicht übermittelt
4998,8,Baden-Württemberg,LK Esslingen,A60-A79,M,1,0,322623,1584144000000,08116,"10.12.2020, 00:00 Uhr",0,-9,1583798400000,0,1,1,Nicht übermittelt


### API access via REST service - US data

https://smartable.ai

In [16]:
url = "https://coronavirus-smartable.p.rapidapi.com/stats/v1/US/"

# The RapidAPI key is a credential and must not live in the notebook source.
# It is read from the RAPIDAPI_KEY environment variable instead; the key that
# was previously committed here should be treated as leaked and revoked.
rapidapi_key = os.environ.get("RAPIDAPI_KEY")
if not rapidapi_key:
    raise RuntimeError("Set the RAPIDAPI_KEY environment variable before running this cell")

headers = {
    'x-rapidapi-key': rapidapi_key,
    'x-rapidapi-host': "coronavirus-smartable.p.rapidapi.com"
    }

# Bounded wait plus explicit HTTP status check so failures surface in this cell
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()

print(response.text)

{
  "location": {
    "long": -95.712891,
    "countryOrRegion": "United States",
    "provinceOrState": null,
    "county": null,
    "isoCode": "US",
    "lat": 37.09024
  },
  "updatedDateTime": "2020-12-10T05:00:43.2640411Z",
  "stats": {
    "totalConfirmedCases": 15244174,
    "newlyConfirmedCases": 11123,
    "totalDeaths": 297395,
    "newDeaths": 251,
    "totalRecoveredCases": 2796278,
    "newlyRecoveredCases": 0,
    "history": [
      {
        "date": "2020-01-22T00:00:00",
        "confirmed": 1,
        "deaths": 0,
        "recovered": 0
      },
      {
        "date": "2020-01-23T00:00:00",
        "confirmed": 1,
        "deaths": 0,
        "recovered": 0
      },
      {
        "date": "2020-01-24T00:00:00",
        "confirmed": 2,
        "deaths": 0,
        "recovered": 0
      },
      {
        "date": "2020-01-25T00:00:00",
        "confirmed": 2,
        "deaths": 0,
        "recovered": 0
      },
      {
        "date": "2020-01-26T00:00:00",
        "co

In [17]:
# Decode the smartable response body into Python objects
US_dict = json.loads(response.content)

# Create the target folder on demand so the dump also works on a fresh checkout
us_data_path = '../data/raw/SMARTABLE/US_data.txt'
os.makedirs(os.path.dirname(us_data_path), exist_ok=True)

# Persist the decoded data as indented JSON text
with open(us_data_path, 'w') as outfile:
    json.dump(US_dict, outfile, indent = 2)

# Business Understanding

Track the spread of the coronavirus worldwide, together with local information relevant to the individual user.

## Goals

1. understanding the data quality
2. automate as much as possible:
    how many clicks do we need to execute the full pipeline?

## Constraint

each notebook should be left clean and ready for full execution