# 埼玉県発表データの Web Scraping

Note (2020/04/04): サイトに掲載されているデータが HTML から PDF に変更されたため、このコードではデータを抽出できません。

In [23]:
#export
import bs4
import re
import csv
import datetime
import requests

In [24]:
#export
# Address of the page that get_src() downloads and parses.
URL = 'https://www.pref.saitama.lg.jp/a0701/covid19/jokyo.html'

In [25]:
#export
def get_src():
    """Download the page at ``URL`` and return it as a parsed document.

    Returns:
        bs4.BeautifulSoup: the response body parsed with ``html.parser``.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server does not respond within the timeout.
    """
    # Without a timeout requests waits indefinitely on an unresponsive server.
    response = requests.get(URL, timeout=30)
    # Fail here on an error status instead of parsing an error page and
    # failing later with an unrelated exception while looking for the table.
    response.raise_for_status()
    # Use the encoding detected from the body rather than the one guessed
    # from the HTTP headers before decoding ``response.text``.
    response.encoding = response.apparent_encoding
    return bs4.BeautifulSoup(response.text, 'html.parser')

In [26]:
src = get_src()
#src

In [27]:
#export
def _get_table(src):
    """Return the ``<tr>`` rows of the third ``<tbody>`` found in *src*.

    Args:
        src: parsed document exposing ``find_all``.

    Returns:
        The result of ``find_all('tr')`` on the third ``<tbody>``.

    Raises:
        IndexError: if *src* contains fewer than three ``<tbody>`` elements.
    """
    bodies = src.find_all("tbody")
    # NOTE(review): index 2 presumably selects the patient table on the
    # page layout this was written against - confirm if the page changes.
    target_body = bodies[2]
    rows = target_body.find_all('tr')
    return rows

In [28]:
table = _get_table(src)
#table

In [29]:
#export
def _header(table):
    """Return the column names taken from the first row of *table*.

    Args:
        table: sequence of row elements; only ``table[0]`` is read.

    Returns:
        list: the whitespace-stripped text of every ``<th>`` in the first row.
    """
    header_cells = table[0].find_all('th')
    return [cell.text.strip() for cell in header_cells]

In [30]:
col = _header(table)
col

['No.', '判明日', '年代', '性別', '居住地', '現状', '周囲の患者の発生', '濃厚接触者の状況']

In [31]:
#export
def _parse_no(td):
    """Extract the value and the link target from the first cell of a row.

    Args:
        td: cell element exposing ``a`` (its first anchor, if any) and
            ``text``.

    Returns:
        tuple: ``(anchor text, anchor href)`` when the cell contains a link,
        otherwise ``(stripped cell text, 'NA')``.
    """
    anchor = td.a
    if not anchor:
        # No link in the cell: fall back to the plain text and a placeholder.
        return td.text.strip(), 'NA'
    return anchor.get_text(), anchor.get('href')

In [32]:
'。'.encode('unicode-escape')

b'\\u3002'

In [33]:
#export
def _trim(td):
    """Return the text of a cell collapsed onto one line.

    Non-breaking spaces (U+00A0) and ideographic spaces (U+3000) are removed
    first. Then every CRLF that is followed by one or more whitespace
    characters is replaced, together with that whitespace, by an ideographic
    full stop (U+3002).

    Args:
        td: cell element exposing its content as a ``text`` attribute.

    Returns:
        str: the cleaned-up cell text.
    """
    without_padding = re.sub('[\xa0\u3000]', '', td.text)
    # Raw string: '\s' inside a normal literal is an invalid escape sequence
    # and triggers a DeprecationWarning/SyntaxWarning. The regex engine
    # itself understands \r, \n and \s, so the pattern matches the same text.
    return re.sub(r'\r\n\s+', '\u3002', without_padding)

In [34]:
#export
def _parse(table, colum_num):
    """Convert the data rows of *table* into lists of cell values.

    The first element of *table* is skipped. For every remaining row the
    first cell goes through ``_parse_no`` and the other cells through
    ``_trim``. Rows shorter than *colum_num* are padded with empty strings,
    and the link returned by ``_parse_no`` is appended as the last value.

    Args:
        table: sequence of row elements exposing ``find_all``.
        colum_num: number of columns each row is padded to before the link
            is appended.

    Returns:
        list: one list of values per data row, in the order of *table*.
    """
    records = []
    for row in table[1:]:
        cells = row.find_all('td')
        number, href = _parse_no(cells[0])
        values = [number, *(_trim(cell) for cell in cells[1:])]
        # A non-positive difference yields an empty list, so rows that are
        # already long enough are left untouched.
        values.extend([''] * (colum_num - len(values)))
        values.append(href)
        records.append(values)
    return records

In [35]:
patients = _parse(table, len(col))
#patients

In [51]:
#export
def get_patients():
    """Scrape the page and return the patient table ready for CSV output.

    Returns:
        list: the header row (column names plus ``'link'``) followed by the
        data rows in the reverse of the order in which they appear on the
        page.
    """
    rows = _get_table(get_src())
    header = _header(rows)
    # The column count is taken before 'link' is added to the header,
    # because _parse appends the link value itself after padding.
    records = _parse(rows, len(header))
    header.append('link')
    return [header, *reversed(records)]

In [53]:
patients = get_patients()
#patients

In [38]:
#export
def create_fname(base):
    """Build a CSV file name from *base* and the current local time.

    Args:
        base (str): prefix of the file name; may contain a directory part.

    Returns:
        str: ``<base>_<YYYYmmddTHHMM>.csv``.
    """
    stamp = datetime.datetime.now().strftime('%Y%m%dT%H%M')
    return '_'.join([base, stamp]) + ".csv"

In [39]:
create_fname("11saitama")

'11saitama_20200403T2035.csv'

In [40]:
#export
def write_csv(patients, fname):
    """Write *patients* to *fname* as CSV, replacing any existing file.

    Args:
        patients: iterable of rows, each row an iterable of cell values.
        fname: path of the file to create.

    Raises:
        OSError: if the file cannot be opened for writing.
    """
    # The csv module requires newline='' so that it controls line endings
    # itself; without it every row ends in '\r\r\n' on Windows and newlines
    # embedded in quoted fields are not preserved correctly.
    # The encoding is left at the platform default, as before.
    with open(fname, 'w', newline='') as f:
        csv.writer(f).writerows(patients)

In [41]:
#write_csv(patients, create_fname("data/11saitama"))

In [46]:
#export
def main():
    """Scrape the patient table and save it as a timestamped CSV file.

    The output name is built from the prefix ``data/11saitama``.
    """
    rows = get_patients()
    # The name is created after scraping, so the timestamp reflects the
    # moment the data was obtained.
    out_name = create_fname("data/11saitama")
    write_csv(rows, out_name)

In [54]:
#export
# Run the scraper only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

In [44]:
# See: https://github.com/fastai/course-v3/blob/master/nbs/dl2/notebook2script.py
!python notebook2script.py 11saitama_dev.ipynb

Converted 11saitama_dev.ipynb to exp/nb_11saitama.py
