In [1]:
import re
from datetime import datetime
from io import StringIO
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


In [2]:
# Run Chrome without a visible window and with its sandbox disabled.
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")

# The driver path is deliberately not passed positionally: on Selenium >= 4.10
# the first positional parameter is `options`, so `Chrome('chromedriver', options=...)`
# raises TypeError. Older releases default the executable path to 'chromedriver',
# so omitting it behaves the same there.
driver = webdriver.Chrome(options=chrome_options)

### Extract date



In [3]:
url = 'https://www.moh.gov.sg/covid-19'

# Load the page in the headless browser and keep its HTML.
try:
    driver.get(url)
    content = driver.page_source
finally:
    # Only `content` is used by the cells below, so shut the browser down here;
    # otherwise every run leaves a headless Chrome process behind.
    # Re-running this cell requires re-creating `driver` first.
    driver.quit()


In [4]:
# Parse the captured HTML with the lxml parser.
soup = BeautifulSoup(content, features='lxml')

In [5]:
# Collect the text nodes that carry the "as of <date>" heading.
# The pattern is a raw string: in a normal literal `\(` is an invalid escape
# sequence. `find_all(string=...)` is the current spelling of `findAll(text=...)`.
data = soup.find_all(string=re.compile(r"Case Summary in Singapore \(as of .+"))


In [6]:
# Inspect the search result.
data

['Case Summary in Singapore (as of 25 Mar 2020, 1200h)']

In [7]:
# Extract the "25 Mar 2020, 1200h"-style timestamp from the first matched
# heading and convert it to a 'YYYY-MM-DD' string.
if not data:
    raise ValueError("No 'Case Summary in Singapore (as of ...)' heading found on the page")

# Raw string so that `\d` / `\w` are not treated as (invalid) string escapes.
m = re.search(r"(\d+ \w+ 202\d, \d+\d+h)", data[0])
if m is None:
    raise ValueError("Could not find a date in heading: {!r}".format(str(data[0])))

date_time_str = m.group(1)
data_date_time = datetime.strptime(date_time_str, '%d %b %Y, %H%Mh').strftime('%Y-%m-%d')
data_date_time


'2020-03-25'

### Extract Dataframes

In [8]:
# Parse every <table> in the page into a DataFrame.
# The HTML is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated since pandas 2.1; file-like input works on older versions too.
dfs = pd.read_html(StringIO(content))

In [9]:
# Inspect the list of tables returned by pd.read_html.
dfs

[    0   1
 0 NaN NaN
 1 NaN NaN,
     0   1
 0 NaN NaN
 1 NaN NaN,
                                               0  \
 0                                      IMPORTED   
 1                                     364 (+38)   
 2  Singapore Residents & Long Term Pass Holders   
 3                                     321 (+37)   
 
                                               1          2  
 0                                      IMPORTED   IMPORTED  
 1                                     364 (+38)  364 (+38)  
 2  Singapore Residents & Long Term Pass Holders   Visitors  
 3                                     321 (+37)    43 (+1)  ,
               0
 0  Active Cases
 1           404,
             0
 0  Discharged
 1         160,
                          0
 0  Discharge to Isolation*
 1                       65,
                        0
 0  Hospitalised (Stable)
 1                    387,
                          0
 0  Hospitalised (Critical)
 1                       17,
         0
 

In [41]:
# Position of the "Active Cases" table in `dfs`; the assert stops the run if
# the table at that position does not start with the expected label.
active = 3
assert dfs[active].loc[0, 0] == 'Active Cases'
dfs[active]


Unnamed: 0,0
0,Active Cases
1,404


In [42]:
# Position of the "Discharged" table in `dfs`; the assert stops the run if
# the table at that position does not start with the expected label.
discharge = 4
assert dfs[discharge].loc[0, 0] == 'Discharged'
dfs[discharge]

Unnamed: 0,0
0,Discharged
1,160


In [54]:
# Position of the "Hospitalised (Stable)" table in `dfs`; the assert stops the
# run if the table at that position does not start with the expected label.
hospitalised_stable = 6
assert dfs[hospitalised_stable].loc[0, 0] == 'Hospitalised (Stable)'

dfs[hospitalised_stable]

Unnamed: 0,0
0,Hospitalised (Stable)
1,387


In [56]:
# Position of the "Hospitalised (Critical)" table in `dfs`; the assert stops the
# run if the table at that position does not start with the expected label.
hospitalised_critical = 7
assert dfs[hospitalised_critical].loc[0, 0] == 'Hospitalised (Critical)'

dfs[hospitalised_critical]

Unnamed: 0,0
0,Hospitalised (Critical)
1,17


In [60]:
# Position of the "Deaths" table in `dfs`; the assert stops the run if the
# table at that position does not start with the expected label.
deaths = 8
assert dfs[deaths].loc[0, 0] == 'Deaths'

dfs[deaths]

Unnamed: 0,0
0,Deaths
1,2^


In [29]:
# # Look for df with confirmed cases, tested negative, ...

# for df in dfs:
#     combined_fields = df[3].to_string().lower()
#     if "active cases" in combined_fields:
#         sing_cases_df_temp = df
        
# sing_cases_df_temp

In [62]:
def remove_non_numeric(s):
    """Return ``s`` with every non-digit character removed.

    Used to clean scraped counts that carry annotations, e.g. ``'2^'`` -> ``'2'``.
    Digits from separate groups are simply concatenated
    (``'364 (+38)'`` -> ``'36438'``), so pass only cells holding a single number.

    Args:
        s: The text to clean (must be a ``str``).

    Returns:
        A string made of the digit characters of ``s``, in their original order.
    """
    # Raw string: '\D' in a normal string literal is an invalid escape sequence.
    return re.sub(r'\D', '', s)


In [63]:
# Hacky workaround for the change on the MoH site: each headline figure is read
# from its own single-column table (label in row 0, value in row 1), selected
# by its position in `dfs`.
def _headline_count(table_index):
    """Return the value cell (row 1, column 0) of ``dfs[table_index]`` as an int.

    Every non-digit character is stripped first, so annotated values such as
    ``'2^'`` still parse.
    """
    return int(remove_non_numeric(dfs[table_index][0][1]))


sing_cases_df = pd.DataFrame({
    'active_cases': [_headline_count(active)],
    'hospitalised_stable': [_headline_count(hospitalised_stable)],
    'hospitalised_critical': [_headline_count(hospitalised_critical)],
    'death': [_headline_count(deaths)],
    # Parsed like the other fields; a bare int() would fail on an annotated value.
    'discharge': [_headline_count(discharge)],
    'datetime': [data_date_time],
})
sing_cases_df

Unnamed: 0,active_cases,hospitalised_stable,hospitalised_critical,death,discharge,datetime
0,404,387,17,2,160,2020-03-25


In [64]:
# Bind the freshly scraped one-row frame to the shorter name `df` for the cells below.
df = sing_cases_df
df

Unnamed: 0,active_cases,hospitalised_stable,hospitalised_critical,death,discharge,datetime
0,404,387,17,2,160,2020-03-25


### Save to file

In [65]:
csv_file = '../data/singapore-cases.csv'

# Index by date unconditionally so that the reset_index() in the next cell
# always restores a 'datetime' column, including on a first run with no CSV
# (previously that case produced a stray 'index' column in the saved file).
df = df.set_index('datetime')

if Path(csv_file).exists():
    # read out the old data
    old_df = pd.read_csv(csv_file)
    # New rows go first so they are the ones kept when duplicates are dropped.
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df = pd.concat([df, old_df.set_index('datetime')])



In [66]:
# Turn the index back into a column, then keep only the first row for each
# 'datetime' value.
df = df.reset_index().drop_duplicates(subset=['datetime'])

In [67]:
# Order rows by the 'datetime' column (ascending) and show the result.
df = df.sort_values(by=['datetime'])
df

Unnamed: 0,datetime,active_cases,hospitalised_stable,hospitalised_critical,death,discharge,confirmed,discharged,hospitalised,negative,pending
1,2020-02-02,,,,,,18.0,,,240.0,43.0
2,2020-02-03,,,,,,24.0,,,262.0,32.0
3,2020-02-04,,,,,,24.0,,,289.0,20.0
4,2020-02-05,,,,,,28.0,,,295.0,62.0
5,2020-02-06,,,,,,33.0,,,310.0,147.0
6,2020-02-07,,,,,,33.0,,,363.0,181.0
7,2020-02-08,,,,,,40.0,,,438.0,181.0
8,2020-02-09,,,,,,45.0,7.0,,581.0,39.0
9,2020-02-10,,,,,,45.0,7.0,,581.0,39.0
10,2020-02-11,,,,,,47.0,9.0,,608.0,43.0


In [68]:
# Write `df` to `csv_file`; index=False leaves the row index out of the file.
df.to_csv(csv_file, index=False)
