# References 

1. COVID-19 World Survey Data API https://covidmap.umd.edu/api.html
2. 공공데이터포털 끌어오기 https://greendreamtrre.tistory.com/268

In [1]:
#!pip install beautifulsoup4
#!pip install lxml

import warnings
warnings.filterwarnings(action='ignore')
import requests, bs4
import json
import pandas as pd 
import numpy as np 
import datetime as dt 
from lxml import html
import urllib 
from urllib.request import Request, urlopen
from urllib.parse import urlencode, quote_plus, unquote
import itertools 
#
from IPython.display import clear_output

# From json 

- JSON을 반환하는 API에서 데이터를 끌어오는 방식이다. 
- 국가 레벨, 지역 레벨 등 용도별로 URL(엔드포인트)이 나뉘어 있다. 
- URL의 쿼리 문자열에 옵션을 붙여 데이터를 요청한다. 이것이 `GET` 방식이다. 
- JSON을 pd.DataFrame으로 바꾸는 것은 상대적으로 쉽다. 


In [2]:
def retrieve_df(url_level, country="South%Korea", region="all", date="20200506"):
    """Fetch one endpoint of the UMD COVID-19 World Survey API and return its ``data`` payload.

    Args:
        url_level: Which endpoint to call (an int, or a string holding one):
            0 -> ``/api/country``,
            1 -> ``/api/region``,
            2 -> ``/api/datesavail`` for ``country``,
            3 -> ``/api/resources`` (daily ``covid`` indicator) for
            ``country`` / ``region`` / ``date``.
        country: Country name, inserted verbatim into the query string
            (no URL-encoding is applied here).
        region: Region name, inserted verbatim; the default is ``"all"``.
        date: Survey date, inserted verbatim (the default looks like
            ``YYYYMMDD`` -- confirm against the API docs).

    Returns:
        The value stored under the ``"data"`` key of the JSON response.

    Raises:
        ValueError: If ``url_level`` does not select one of the endpoints above.
    """
    base = "https://covidmap.umd.edu/api"
    endpoints = {
        0: f"{base}/country",  # country-level listing
        1: f"{base}/region",  # region-level listing
        2: f"{base}/datesavail?country={country}",
        3: (
            f"{base}/resources?indicator=covid&type=daily"
            f"&country={country}&region={region}&date={date}"
        ),
    }

    # A plain lookup replaces the previous eval() on a variable name built
    # from the argument, so the argument can never be executed as code.
    try:
        this_api = endpoints[int(url_level)]
    except (KeyError, TypeError, ValueError):
        raise ValueError(
            f"url_level must be one of {sorted(endpoints)}, got {url_level!r}"
        ) from None

    # NOTE(review): verify=False turns off TLS certificate verification.
    # Kept as in the original; re-enable it if the server certificate validates.
    response = requests.get(this_api, verify=False).text
    jsonData = json.loads(response)
    return jsonData['data']
    #return pd.DataFrame.from_dict(jsonData['data'])

In [3]:
def clean_list(dirty_list):
    """Flatten a list of lists into a single DataFrame.

    Every element of every inner list becomes one row, in the order the
    inner lists appear in ``dirty_list``.
    """
    flattened = []
    for chunk in dirty_list:
        flattened.extend(chunk)
    return pd.DataFrame(flattened)

In [4]:
def gen_fb_survey(dates):
    """Download the daily survey data for every date and stack it into one DataFrame.

    For each entry of ``dates`` the level-3 endpoint is queried through
    ``retrieve_df`` (with its default country and region), a progress line
    is printed, and the collected payloads are flattened by ``clean_list``.
    """
    dates_len = len(dates)
    daily_payloads = []

    for q, date in enumerate(dates, start=1):
        daily_payloads.append(retrieve_df(url_level=3, date=date))
        print(f"Write df for {date}, and {q} of {dates_len}")
        # Clear the previous progress line so the notebook output does not
        # grow by one line per date.
        clear_output(wait=True)

    return clean_list(daily_payloads)

In [5]:
# Ask the API (endpoint 2) which survey dates exist, then download the
# daily data for each of them.
dates = [entry['survey_date'] for entry in retrieve_df(url_level=2)]
df_fb = gen_fb_survey(dates)

Write df for 20201012, and 172 of 172


In [6]:
# Keep only the survey rows whose region is Seoul.
df_fb_seoul = df_fb[df_fb['region'] == "Seoul"]

In [7]:
# Display the Seoul-only survey rows.
df_fb_seoul

Unnamed: 0,cli_se,cli_se_unw,country,gid_0,gid_1,iso_code,percent_cli,percent_cli_unw,region,sample_size,survey_date
1,0.007249,0.004662,South Korea,KOR,KOR.16_1,KOR,0.004424,0.004673,Seoul,214.0,20200503
3,0.005475,0.005218,South Korea,KOR,KOR.16_1,KOR,0.003877,0.007407,Seoul,270.0,20200504
5,0.004213,0.004962,South Korea,KOR,KOR.16_1,KOR,0.002137,0.007042,Seoul,284.0,20200505
7,0.009735,0.007092,South Korea,KOR,KOR.16_1,KOR,0.010908,0.014286,Seoul,280.0,20200506
9,0.008141,0.007067,South Korea,KOR,KOR.16_1,KOR,0.008095,0.014235,Seoul,281.0,20200507
...,...,...,...,...,...,...,...,...,...,...,...
313,0.000000,0.000000,South Korea,KOR,KOR.16_1,KOR,0.000000,0.000000,Seoul,112.0,20201005
315,0.009917,0.008583,South Korea,KOR,KOR.16_1,KOR,0.005394,0.008621,Seoul,116.0,20201006
317,0.009835,0.008439,South Korea,KOR,KOR.16_1,KOR,0.004660,0.008475,Seoul,118.0,20201007
318,0.015772,0.012296,South Korea,KOR,KOR.16_1,KOR,0.014884,0.017544,Seoul,114.0,20201009


# 서울시 api에서 끌어오기 

- 일단 친절하지 않다. 
- 요청 URL에 시작 행 번호와 끝 행 번호를 넣어 구간(페이지) 단위로 데이터를 끌어온다. 
- 역시 제이슨이기 때문에 쉽게 바꿀 수 있다. 

In [8]:
def make_datestring(input_str, year="2020"):
    """Convert a ``month.day.`` string such as ``"10.15."`` into ``YYYYMMDD`` form.

    Month and day are zero-padded separately, so single-digit parts
    (``"5.6."``) and a missing trailing dot (``"10.15"``) both work.

    Args:
        input_str: Dotted month/day string, e.g. ``"5.06."`` or ``"10.15."``.
        year: Year prefix to use; defaults to ``"2020"``.

    Returns:
        The date as a string of digits, e.g. ``"20200506"``. Input that is not
        exactly two numeric dot-separated parts is formatted the legacy way
        (left-pad the whole string to six characters, then drop the dots).
    """
    pieces = [piece.strip() for piece in input_str.split(".")]
    pieces = [piece for piece in pieces if piece]

    if len(pieces) == 2 and all(piece.isdigit() for piece in pieces):
        month, day = pieces
        return f"{year}{month.zfill(2)}{day.zfill(2)}"

    # Unrecognised shape: keep the previous best-effort formatting rather
    # than raising, so existing callers see no new exceptions.
    res1 = input_str.rjust(6, '0')
    return f"{year}.{res1}".replace(".", "")

def generate_covid_seoul(start_page):
    """Download one 1,000-row slice of Seoul's ``Corona19Status`` API and count rows per date.

    Args:
        start_page: Index of the first row to request; rows ``start_page``
            through ``start_page + 999`` are requested.

    Returns:
        DataFrame with the columns ``date_reported`` (``YYYYMMDD`` string)
        and ``count`` (number of rows in this slice with that date). The
        counts cover this slice only: a date that continues into the
        next slice is counted again, separately, there.

    Raises:
        KeyError: If the response has no ``Corona19Status`` / ``row`` entry.
    """
    end_page = start_page + 999
    # NOTE(review): the API key is embedded in the URL; consider loading it
    # from configuration instead of keeping it in the notebook.
    url = f"http://openapi.seoul.go.kr:8088/7067764353616e6137394f68524844/json/Corona19Status/{start_page}/{end_page}"

    # The context manager closes the HTTP response even if reading or
    # decoding fails.
    with urllib.request.urlopen(url) as response:
        json_str = response.read().decode("utf-8")

    # The body is text; parse it and pull out the list of row records.
    json_object = json.loads(json_str)
    records = json_object['Corona19Status']['row']

    tdf = pd.DataFrame(records)
    tdf['date_reported'] = tdf['CORONA19_DATE'].apply(make_datestring)

    return tdf.groupby(['date_reported']).size().to_frame(name='count').reset_index()

In [9]:
# Fetch the case list slice by slice (rows 1-1000, 1001-2000, ..., 5001-6000).
page_counts = [generate_covid_seoul(start_row) for start_row in range(1, 6001, 1000)]

# Each slice is aggregated on its own, so a date that straddles a slice
# boundary arrives as two partial rows. Concatenate once and sum the partial
# counts so every date appears exactly once with its full count.
df_covid_seoul = (
    pd.concat(page_counts)
    .groupby('date_reported', as_index=False)['count']
    .sum()
)

In [10]:
# Left-join the per-date case counts onto the Seoul survey rows; survey
# dates without a count keep their row with missing count columns.
df = df_fb_seoul.merge(
    df_covid_seoul,
    how='left',
    left_on='survey_date',
    right_on='date_reported',
)

In [53]:
# Keep only the rows that found a match in the left merge, i.e. those
# whose date_reported is not missing.
df_gc = df[df['date_reported'].notna()]
#df_gc.columns
#df_gc['cli_raw'] = df_gc['sample_size'] * df_gc['percent_cli_unw']

# xml에서 끌어오기 

- 살짝 복잡하지만 그렇게 어렵지는 않다. 
- 역시 get방식이다. 
- query parameter를 정형화해두어서 보기에 편하다. 
- 나머지는 다소 기계적으로 수행이 가능하다. 

In [8]:
# 1. Assemble the request URL from its parts.
# Service endpoint
xmlUrl = 'http://openapi.data.go.kr/openapi/service/rest/Covid19/getCovid19SidoInfStateJson'
# The key string is percent-encoded, so decode it once here; urlencode()
# below encodes it again when the query string is built.
My_API_Key = unquote('M3jBsz%2FCY6CB9tMYmAf9SQ8AMW6AtjQDjshDbBlpsTadmBOfyBTVoYSqAvMVX5HH2GNgM%2FzOHv150PFIyQbkig%3D%3D')    # class demo key, used because the personally issued key below did not work
# My_API_Key = unquote('Agq7hySmyMi1FFU9kYibP%2BEnxYepQ%2FB6Dn%2Bw9lsYKVSCDjTwIdvpjmuhJrtyQrhipg3F3a4jbSq%2FLxHi%2FdUIoQ%3D%3D')    # personally issued key
# The leading '?' separates the query string from the path in a GET request.
queryParams = '?' + urlencode(
    {
        'ServiceKey': My_API_Key,        # service key
        'startCreateDt': '20200101',     # start of the requested range, YYYYMMDD (presumably the record creation date -- confirm against the API docs)
        'endCreateDt': '20201015',       # end of the requested range, YYYYMMDD
    }
)

# Fetch the response body and parse it into a tag tree.
response = requests.get(f"{xmlUrl}{queryParams}").text.encode('utf-8')
xmlobj = bs4.BeautifulSoup(response, "html.parser")

In [9]:
# Turn every <item> element of the response into one DataFrame row.
rows = xmlobj.find_all('item')

# One dict per item: tag name -> tag text. Keying by tag name (instead of
# positional lists with the header taken from the first item only) keeps each
# value under the right column even when an item lacks a tag the others have;
# pandas fills such gaps with NaN instead of misaligning or failing.
rowList = [
    {field.name: field.text for field in item.find_all()}
    for item in rows
]

result = pd.DataFrame(rowList)
nameList = list(result.columns)    # column names, kept under the original variable name
result.head()

Unnamed: 0,createdt,deathcnt,defcnt,gubun,gubuncn,gubunen,incdec,isolclearcnt,isolingcnt,localocccnt,overflowcnt,qurrate,seq,stdday,updatedt
0,2020-10-15 09:41:57.298,0,1616,검역,隔離區,Lazaretto,6,1424,192,0,6,-,5042,2020년 10월 15일 00시,
1,2020-10-15 09:41:57.298,0,59,제주,济州,Jeju,0,59,0,0,0,8.80,5041,2020년 10월 15일 00시,
2,2020-10-15 09:41:57.298,0,297,경남,庆南,Gyeongsangnam-do,0,286,11,0,0,8.84,5040,2020년 10월 15일 00시,
3,2020-10-15 09:41:57.298,56,1571,경북,庆北,Gyeongsangbuk-do,1,1497,18,0,1,59.00,5039,2020년 10월 15일 00시,
4,2020-10-15 09:41:57.298,2,176,전남,全南,Jeollanam-do,0,166,8,0,0,9.44,5038,2020년 10월 15일 00시,


In [12]:
# Keep Seoul's rows (gubun == "서울") and only the creation timestamp and
# incdec columns. .copy() makes this an independent frame, so the column
# assignments in the following cells do not write into a slice of `result`
# (which triggers pandas' SettingWithCopyWarning).
df_covid_seoul = result.query('gubun == "서울"')[['createdt', 'incdec']].copy()

In [30]:
# Parse the creation timestamp strings into datetimes.
# NOTE(review): the following cell assigns the 'date' column again with a
# YYYYMMDD string, so this value does not survive.
df_covid_seoul['date'] = df_covid_seoul['createdt'].pipe(pd.to_datetime)

In [46]:
# Build a YYYYMMDD key from the creation timestamp: take the part before
# the first space (the calendar date) and drop its dashes.
df_covid_seoul['date'] = (
    df_covid_seoul['createdt']
    .astype(str)
    .str.split(" ")
    .str[0]
    .str.replace("-", "", regex=False)
)

In [47]:
# Display the Seoul rows with the derived YYYYMMDD 'date' column.
df_covid_seoul

Unnamed: 0,createdt,incdec,date
17,2020-10-15 09:41:57.296,25,20201015
36,2020-10-14 09:38:25.145,23,20201014
55,2020-10-13 09:39:01.485,20,20201013
74,2020-10-12 15:06:20.577,31,20201012
93,2020-10-12 09:40:31.378,31,20201012
...,...,...,...
4303,2020-03-08 14:56:02.02,12,20200308
4321,2020-03-07 15:29:59.59,3,20200307
4339,2020-03-06 15:09:04.04,2,20200306
4357,2020-03-05 15:29:39.39,4,20200305


In [48]:
# Display the Seoul survey rows again, to compare their date range with the case data.
df_fb_seoul

Unnamed: 0,cli_se,cli_se_unw,country,gid_0,gid_1,iso_code,percent_cli,percent_cli_unw,region,sample_size,survey_date
1,0.007249,0.004662,South Korea,KOR,KOR.16_1,KOR,0.004424,0.004673,Seoul,214.0,20200503
3,0.005475,0.005218,South Korea,KOR,KOR.16_1,KOR,0.003877,0.007407,Seoul,270.0,20200504
5,0.004213,0.004962,South Korea,KOR,KOR.16_1,KOR,0.002137,0.007042,Seoul,284.0,20200505
7,0.009735,0.007092,South Korea,KOR,KOR.16_1,KOR,0.010908,0.014286,Seoul,280.0,20200506
9,0.008141,0.007067,South Korea,KOR,KOR.16_1,KOR,0.008095,0.014235,Seoul,281.0,20200507
...,...,...,...,...,...,...,...,...,...,...,...
313,0.000000,0.000000,South Korea,KOR,KOR.16_1,KOR,0.000000,0.000000,Seoul,112.0,20201005
315,0.009917,0.008583,South Korea,KOR,KOR.16_1,KOR,0.005394,0.008621,Seoul,116.0,20201006
317,0.009835,0.008439,South Korea,KOR,KOR.16_1,KOR,0.004660,0.008475,Seoul,118.0,20201007
318,0.015772,0.012296,South Korea,KOR,KOR.16_1,KOR,0.014884,0.017544,Seoul,114.0,20201009


In [67]:
# The case feed can hold more than one record for the same day (20201012
# appears twice in the listing above), which would duplicate survey rows in
# the left merge. Keep a single record per date: the first one encountered.
# NOTE(review): in the displayed ordering that is the later createdt -- confirm
# this is the record you want.
seoul_daily = df_covid_seoul[['date', 'incdec']].drop_duplicates(subset='date', keep='first')

# Attach the daily count to each survey date.
df = df_fb_seoul[['survey_date', 'percent_cli']].merge(
    seoul_daily, how='left', left_on="survey_date", right_on='date'
)

# incdec is read from the XML as text; convert it to integers. As before,
# a survey date without a matching count makes this raise.
df['incdec'] = df['incdec'].astype(int)

In [54]:
>>> import statsmodels.api as sm
>>> from statsmodels.tsa.stattools import grangercausalitytests
>>> import numpy as np

In [105]:
# Columns used by the analysis below.
data = df.loc[:, ['incdec', 'percent_cli', 'date']]
#data['d_percent_cli'] = df['percent_cli'].diff()
#data['pct_incdec'] = df['incdec'].pct_change()
#data['pct_incdec'] = data['pct_incdec'].apply(lambda x: np.nan_to_num(x))
#data = data.applymap(lambda x: np.nan_to_num(x))
#gdata = data.loc[1:,['pct_incdec','d_percent_cli']]
# Two-column input for the Granger test, in this exact order.
gdata = data.loc[:, ['percent_cli', 'incdec']]

In [106]:
# Run the Granger causality tests for lags 1 through 7.
# NOTE(review): per the statsmodels docs the test asks whether the second
# column helps predict the first, i.e. here incdec -> percent_cli; confirm
# that is the intended direction.
gc_res = grangercausalitytests(gdata, maxlag=7)


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.3502  , p=0.5549  , df_denom=161, df_num=1
ssr based chi2 test:   chi2=0.3567  , p=0.5504  , df=1
likelihood ratio test: chi2=0.3563  , p=0.5506  , df=1
parameter F test:         F=0.3502  , p=0.5549  , df_denom=161, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.3285  , p=0.7205  , df_denom=158, df_num=2
ssr based chi2 test:   chi2=0.6777  , p=0.7126  , df=2
likelihood ratio test: chi2=0.6763  , p=0.7131  , df=2
parameter F test:         F=0.3285  , p=0.7205  , df_denom=158, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=0.2089  , p=0.8901  , df_denom=155, df_num=3
ssr based chi2 test:   chi2=0.6549  , p=0.8838  , df=3
likelihood ratio test: chi2=0.6536  , p=0.8841  , df=3
parameter F test:         F=0.2089  , p=0.8901  , df_denom=155, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=0.1687  , p=0.9540  