In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime
import unicodedata

In [2]:
datetime.now()

datetime.datetime(2020, 3, 15, 2, 9, 14, 436125)

In [3]:
url = 'https://www.health.gov.au/news/health-alerts/novel-coronavirus-2019-ncov-health-alert'
content = requests.get(url).content

In [4]:
soup = BeautifulSoup(content)
data = soup.findAll(text=re.compile('As .+')) + \
        soup.findAll(text=re.compile('.*(?:(Australian Capital Territory)|(Queensland)|(Victoria)|(New South Wales)|(South Australia)|(Western Australia)|(Northern Territory)|(Tasmania)|(Diamond Princess))')) 
data = ' '.join(data)
data

'As at 06:30\xa0hrs on 14\xa0March 2020, there were 197\xa0confirmed cases of coronavirus (COVID-19), including 3\xa0deaths, in Australia. As part of our ongoing strategy of containment and minimising risk to the Australian community, we are implementing additional screening of passengers at Australian airports. 1 in the Australian Capital Territory 91 in New South Wales 0 in the Northern Territory 35 in Queensland 16 in South Australia 4 in Tasmania 36 in Victoria 14 in Western Australia # Includes Diamond Princess repatriation cases: Qld (3), SA (1), Vic (4), WA (2, including 1 death). 10 cases, including 1 death, are associated with the Diamond Princess cruise ship repatriation flight from Japan'

In [5]:
cases_str = unicodedata.normalize("NFKD", data)

In [6]:
cases_str

'As at 06:30 hrs on 14 March 2020, there were 197 confirmed cases of coronavirus (COVID-19), including 3 deaths, in Australia. As part of our ongoing strategy of containment and minimising risk to the Australian community, we are implementing additional screening of passengers at Australian airports. 1 in the Australian Capital Territory 91 in New South Wales 0 in the Northern Territory 35 in Queensland 16 in South Australia 4 in Tasmania 36 in Victoria 14 in Western Australia # Includes Diamond Princess repatriation cases: Qld (3), SA (1), Vic (4), WA (2, including 1 death). 10 cases, including 1 death, are associated with the Diamond Princess cruise ship repatriation flight from Japan'

In [7]:
date_time_str = re.search('As .+ (\d+ \w+ 202\d)' ,cases_str).group(1)
date_time_str

'14 March 2020'

In [8]:
data_date_time = datetime.strptime(date_time_str, '%d %B %Y').strftime('%Y-%m-%d')
data_date_time

'2020-03-14'

In [9]:
m = re.search('there were (\d+) confirmed cases', cases_str)
australia = int(m.group(1))
australia

197

In [10]:
m = re.search('(\d+) in New South Wales', cases_str)
nsw = int(m.group(1))


In [11]:
m = re.search('(\d+) in Victoria', cases_str)
victoria = int(m.group(1))

In [12]:
m = re.search('(\d+) in South Australia', cases_str)
sa = int(m.group(1))

In [13]:
m = re.search('(\d+) in Queensland', cases_str)
qld = int(m.group(1))

In [14]:
m = re.search('(\d+) in Tasmania', cases_str)
tas = int(m.group(1))


In [15]:
m = re.search('(\d+) in .* Australian Capital Territory', cases_str)
act = int(m.group(1))



In [16]:
# m = re.search('(\d+) in Northern Territory', cases_str)
# nt = int(m.group(1))


In [17]:
m = re.search('(\d+) .* Diamond Princess', cases_str)
wa = int(m.group(1))


### Dataframe

In [18]:
df = pd.DataFrame({'date': data_date_time, 
                   'australia': australia, 
                   'nsw': nsw, 
                   'victoria': victoria, 
                   'sa': sa, 
                   'qld': qld, 
                   'wa': wa,
                   'act':act,
#                   'nt': nt,
                  'tas': tas }, index=[0])
# df = pd.DataFrame({'date': data_date_time, 'australia': australia}, index=[0])

In [19]:
df

Unnamed: 0,date,australia,nsw,victoria,sa,qld,wa,act,tas
0,2020-03-14,197,91,36,16,35,30,1,4


### Save to file

In [20]:
from pathlib import Path
csv_file = '../data/australia-cases.csv'

if Path(csv_file).exists():
    # read out the old data
    old_df = pd.read_csv(csv_file)
    df = df.append(old_df)
df.drop_duplicates(subset=['date'], inplace=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


In [21]:
df.sort_values(by=['date'], inplace=True)
df.to_csv(csv_file, index=False)
df

Unnamed: 0,act,australia,date,nsw,nt,qld,sa,tas,victoria,wa
0,,12,2020-02-04,4.0,,2.0,2.0,,4.0,
1,,13,2020-02-05,4.0,,3.0,2.0,,4.0,
2,,14,2020-02-06,4.0,,4.0,2.0,,4.0,
3,,15,2020-02-07,4.0,,5.0,2.0,,4.0,
4,,15,2020-02-08,4.0,,5.0,2.0,,4.0,
5,,15,2020-02-09,4.0,,5.0,2.0,,4.0,
6,,15,2020-02-10,4.0,,5.0,2.0,,4.0,
7,,15,2020-02-11,4.0,,5.0,2.0,,4.0,
8,,15,2020-02-12,4.0,,5.0,2.0,,4.0,
9,,15,2020-02-13,4.0,,5.0,2.0,,4.0,
