In [1]:
import re
from datetime import datetime
from io import StringIO
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


In [2]:
# Configure a headless Chrome session for scraping.
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")

# Do not pass the driver path positionally: on Selenium 4.10+ the first
# positional parameter is `options`, so 'chromedriver' would clash with the
# `options=` keyword. Selenium 3 already defaults to "chromedriver", so
# omitting it keeps the same lookup there.
driver = webdriver.Chrome(options=chrome_options)

### Extract date



In [3]:
url = 'https://www.moh.gov.sg/covid-19'

# Only the rendered HTML is needed from the browser, so close it as soon as the
# page source has been captured (or the load failed) instead of leaving a
# headless Chrome process running for the lifetime of the kernel.
try:
    driver.get(url)
    content = driver.page_source
finally:
    driver.quit()


In [4]:
# Parse the rendered page with the lxml parser.
soup = BeautifulSoup(markup=content, features='lxml')

In [5]:
# Find the text node(s) holding the "Case Summary in Singapore (as of ...)" heading.
# Raw string: `\(` must reach the regex engine as an escaped parenthesis and is
# not a valid Python string escape.
data = soup.find_all(string=re.compile(r"Case Summary in Singapore \(as of .+"))


In [6]:
# Show the matched heading text(s); the first entry is what the date is parsed from.
data

['Case Summary in Singapore (as of 17 Mar 2020, 1200h)']

In [7]:
# Extract the "as of" timestamp from the first matched heading and normalise it
# to an ISO date string (YYYY-MM-DD), which is used as the row key later on.
if not data:
    raise ValueError("Could not find the 'Case Summary in Singapore (as of ...)' heading on the page")

# e.g. "17 Mar 2020, 1200h"
m = re.search(r"(\d+ \w+ 202\d, \d+\d+h)", data[0])
if m is None:
    raise ValueError("Unrecognised 'as of' timestamp in heading: %r" % str(data[0]))

date_time_str = m.group(1)
# The time of day is parsed but deliberately dropped; only the date is kept.
data_date_time = datetime.strptime(date_time_str, '%d %b %Y, %H%Mh').strftime('%Y-%m-%d')
data_date_time


'2020-03-17'

### Extract Dataframes

In [8]:
# Parse every <table> in the rendered page into a DataFrame.
# The HTML text is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in newer pandas; a file-like object works on old and
# new versions alike.
dfs = pd.read_html(StringIO(content))

In [9]:
# Inspect all tables parsed from the page (a list of DataFrames).
dfs

[    0   1
 0 NaN NaN
 1 NaN NaN,
     0   1
 0 NaN NaN
 1 NaN NaN,
                        0                        1
 0           ACTIVE CASES             ACTIVE CASES
 1                    152                      152
 2  Hospitalised (Stable)  Hospitalised (Critical)
 3                    138                       14
 4                  Death               Discharged
 5                      0                      114,
                0       1
 0  DORSCON Level  Orange,
               0                                                  1
 0          Date                                              Title
 1   18 Mar 2020  [Updated] MOH Health Advisory for Persons Issu...
 2   17 Mar 2020  Five More Cases Discharged; 23 New Cases of CO...
 3   17 Mar 2020  Speech by Mr Lawrence Wong, Minister for Natio...
 4   17 Mar 2020  Advisory for Singaporean Students Studying Ove...
 5   17 Mar 2020  Accommodating Workers Affected by Lockdown in ...
 6   16 Mar 2020  Health Clearance Requiremen

In [10]:
# Pick out the case-summary table from everything read_html returned: it is the
# one whose column 1 mentions both "active cases" and "discharged".

sing_cases_df_temp = None  # reset first so a stale value from an earlier run cannot be reused
for table in dfs:
    if 1 not in table.columns:
        # No column labelled 1, so this cannot be the two-column summary grid.
        continue
    combined_fields = table[1].to_string().lower()
    if "active cases" in combined_fields and "discharged" in combined_fields:
        # No break on purpose: if several tables match, the last one on the page wins.
        sing_cases_df_temp = table

if sing_cases_df_temp is None:
    raise ValueError(
        "No table containing 'active cases' and 'discharged' was found; "
        "the MoH page layout may have changed"
    )

sing_cases_df_temp

Unnamed: 0,0,1
0,ACTIVE CASES,ACTIVE CASES
1,152,152
2,Hospitalised (Stable),Hospitalised (Critical)
3,138,14
4,Death,Discharged
5,0,114


In [11]:
# Hacky positional scrape to cope with the MoH site change: in the summary grid
# each label row is followed by a row of values, so every figure is read by its
# (column, row) position. This breaks if the grid layout changes.
# NOTE(review): the key below is 'discharge' while the saved CSV also carries a
# 'discharged' column - confirm which name is intended.
cell_positions = {
    'active_cases': (0, 1),
    'hospitalised_stable': (0, 3),
    'hospitalised_critical': (1, 3),
    'death': (0, 5),
    'discharge': (1, 5),
}

record = {
    name: [int(sing_cases_df_temp[col][row])]
    for name, (col, row) in cell_positions.items()
}
# Scalar date string; pandas broadcasts it across the single row.
record['datetime'] = data_date_time

sing_cases_df = pd.DataFrame(record)
sing_cases_df

Unnamed: 0,active_cases,hospitalised_stable,hospitalised_critical,death,discharge,datetime
0,152,138,14,0,114,2020-03-17


In [12]:
# Work on a copy so that later clean-up steps applied to `df` can never alter
# sing_cases_df through a shared reference.
df = sing_cases_df.copy()
df

Unnamed: 0,active_cases,hospitalised_stable,hospitalised_critical,death,discharge,datetime
0,152,138,14,0,114,2020-03-17


### Save to file

In [13]:
csv_file = '../data/singapore-cases.csv'

if Path(csv_file).exists():
    # Merge today's scrape with the history already on disk.
    old_df = pd.read_csv(csv_file)
    # The fresh row goes first: the later drop_duplicates(subset=['datetime'])
    # keeps the first occurrence, so a new scrape replaces an older row for the
    # same date.
    # pd.concat replaces DataFrame.append, which is deprecated and removed in
    # pandas 2.0. sort=True states the column ordering explicitly (alphabetical,
    # as in the saved file) and silences the "sort" FutureWarning.
    df = pd.concat(
        [df.set_index('datetime'), old_df.set_index('datetime')],
        sort=True,
    )



of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


In [14]:
# Bring 'datetime' back as a regular column, but only when it is the index.
# If the CSV did not exist yet, df still has its default integer index, and an
# unconditional reset_index() would add a spurious 'index' column that ends up
# in the saved file. The guard also makes this cell safe to re-run.
if df.index.name == 'datetime':
    df = df.reset_index()

# Keep one row per date; the first occurrence is retained.
df = df.drop_duplicates(subset=['datetime'])

In [15]:
# Order rows chronologically; the ISO 'YYYY-MM-DD' strings sort in date order.
df = df.sort_values(by=['datetime'])
df

Unnamed: 0,datetime,active_cases,confirmed,death,discharge,discharged,hospitalised,hospitalised_critical,hospitalised_stable,negative,pending
1,2020-02-02,,18.0,,,,,,,240.0,43.0
2,2020-02-03,,24.0,,,,,,,262.0,32.0
3,2020-02-04,,24.0,,,,,,,289.0,20.0
4,2020-02-05,,28.0,,,,,,,295.0,62.0
5,2020-02-06,,33.0,,,,,,,310.0,147.0
6,2020-02-07,,33.0,,,,,,,363.0,181.0
7,2020-02-08,,40.0,,,,,,,438.0,181.0
8,2020-02-09,,45.0,,,7.0,,,,581.0,39.0
9,2020-02-10,,45.0,,,7.0,,,,581.0,39.0
10,2020-02-11,,47.0,,,9.0,,,,608.0,43.0


In [16]:
# Persist the merged history back to the CSV, without writing the row index.
df.to_csv(path_or_buf=csv_file, index=False)
