In [1]:
import re
from datetime import datetime
from io import StringIO
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


In [2]:
# Build the Chrome option set, passing each command-line switch in turn.
chrome_options = Options()
for chrome_flag in ("--headless", "--no-sandbox"):
    chrome_options.add_argument(chrome_flag)

# Start the browser session that the following cells use to fetch the page.
driver = webdriver.Chrome('chromedriver', options=chrome_options)

### Extract date



In [3]:
# Load the page in the browser session and keep the HTML the browser reports
# as the page source.
url = 'https://www.moh.gov.sg/covid-19'

driver.get(url)
content = driver.page_source
# NOTE(review): `driver` is never quit in the cells shown here; consider
# calling driver.quit() once the notebook no longer needs the browser.


In [4]:
# Parse the downloaded HTML using the 'lxml' parser.
soup =BeautifulSoup(content, 'lxml')

In [5]:
# Collect every text node that carries the "Case Summary in Singapore (as of ...)"
# heading. The pattern is a raw string so that `\(` reaches the regex engine as
# an escaped parenthesis instead of being an invalid string escape sequence.
# `find_all(string=...)` is the current spelling of the legacy `findAll(text=...)`.
data = soup.find_all(string=re.compile(r"Case Summary in Singapore \(as of .+"))


In [6]:
# Show the heading text(s) that matched.
data

['Case Summary in Singapore (as of 1 May 2020, 1200h)']

In [7]:
# Pull the "<day> <month> <year>, <hhmm>h" stamp out of the heading and turn it
# into an ISO date string (YYYY-MM-DD) used as the key for today's row.
if not data:
    raise ValueError("No 'Case Summary in Singapore (as of ...)' heading was found on the page")

# Raw string avoids invalid-escape warnings; the year is no longer pinned to 202x.
m = re.search(r"(\d+ \w+ \d{4}, \d{2,}h)", data[0])
if m is None:
    raise ValueError("Unrecognised case summary heading: {!r}".format(data[0]))
date_time_str = m.group(1)

# The month may be abbreviated ("Apr") or spelled out ("April"); try both forms.
for date_format in ('%d %b %Y, %H%Mh', '%d %B %Y, %H%Mh'):
    try:
        data_date_time = datetime.strptime(date_time_str, date_format).strftime('%Y-%m-%d')
        break
    except ValueError:
        continue
else:
    raise ValueError("Could not parse the case summary date {!r}".format(date_time_str))
data_date_time


'2020-05-01'

### Extract Dataframes

In [8]:
# Parse every HTML table on the page into a list of DataFrames.
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas releases; file-like input works on
# older releases too.
dfs = pd.read_html(StringIO(content))

In [9]:
# Inspect the full list of parsed tables.
dfs

[    0   1
 0 NaN NaN
 1 NaN NaN,
     0   1
 0 NaN NaN
 1 NaN NaN,
                0       1
 0  DORSCON Level  Orange,
           0         1         2
 0  IMPORTED  IMPORTED  IMPORTED
 1  571 (+0)  571 (+0)  571 (+0),
                0
 0  Active Cases#
 1           1764,
                0
 0  Active Cases#
 1           1764,
                           0
 0  In Isolation Facilities*
 1                     14053,
                        0
 0  Hospitalised (Stable)
 1                   1741,
                          0
 0  Hospitalised (Critical)
 1                       23,
         0
 0  Deaths
 1      16,
         0
 0  143919,
        0
 0  99929,
          0
 0  ~25,200,
          0
 0  ~17,500,
               0                                                  1
 0          Date                                              Title
 1   02 May 2020  Additional COVID-19 Support Measures for all H...
 2   02 May 2020  Home-Based Food Businesses Allowed to Resume O...
 3   02 May 2020 

In [17]:
def find_idx(starts_with, tables=None):
    """Return the position of the first table whose top-left cell contains ``starts_with``.

    Parameters
    ----------
    starts_with : str
        Text to look for in the cell at column label 0, row label 0.
        Despite the name, this is a substring match, not a prefix match.
    tables : list of pandas.DataFrame, optional
        Tables to search. Defaults to the notebook-level ``dfs`` list.

    Returns
    -------
    int
        Index into ``tables`` of the first matching table.

    Raises
    ------
    ValueError
        If no table matches. Previously a miss silently returned index 0,
        which made callers read numbers from an unrelated table.
    """
    print('looking for idx that starts with', starts_with)
    if tables is None:
        tables = dfs
    for idx, table in enumerate(tables):
        # Tables without a (0, 0) label (e.g. ones parsed with a header row)
        # cannot match and would otherwise raise KeyError on the lookup below.
        if 0 not in table.columns or 0 not in table.index:
            continue
        if starts_with in str(table[0][0]):
            print("found starting df")
            print(idx)
            return idx
    raise ValueError("no table whose first cell contains {!r}".format(starts_with))


In [18]:
# Locate the table whose first cell mentions "Active Cases" and display it.
active=find_idx('Active Cases')
dfs[active]


looking for idx that starts with Active Cases
found starting df
4


Unnamed: 0,0
0,Active Cases#
1,1764


In [21]:
# discharge = find_idx('Discharged')
# dfs[discharge]

In [22]:
# Locate the table whose first cell mentions "Hospitalised (Stable)" and display it.
hospitalised_stable = find_idx('Hospitalised (Stable)')

dfs[hospitalised_stable]

looking for idx that starts with Hospitalised (Stable)
found starting df
7


Unnamed: 0,0
0,Hospitalised (Stable)
1,1741


In [23]:
# Locate the table whose first cell mentions "Hospitalised (Critical)" and display it.
hospitalised_critical = find_idx('Hospitalised (Critical)')
dfs[hospitalised_critical]

looking for idx that starts with Hospitalised (Critical)
found starting df
8


Unnamed: 0,0
0,Hospitalised (Critical)
1,23


In [24]:
# Locate the table whose first cell mentions "Deaths" and display it.
deaths = find_idx('Deaths')

dfs[deaths]

looking for idx that starts with Deaths
found starting df
9


Unnamed: 0,0
0,Deaths
1,16


In [17]:
# # Look for df with confirmed cases, tested negative, ...

# for df in dfs:
#     combined_fields = df[3].to_string().lower()
#     if "active cases" in combined_fields:
#         sing_cases_df_temp = df
        
# sing_cases_df_temp

In [26]:
def remove_non_numeric(s):
    """Return ``s`` with every non-digit character removed.

    Parameters
    ----------
    s : str
        Text such as ``'~25,200'``.

    Returns
    -------
    str
        Only the digit characters of ``s``, in their original order; an empty
        string if ``s`` contains no digits.

    Notes
    -----
    All digits are concatenated, so ``'571 (+0)'`` becomes ``'5710'``.
    """
    # Raw string: '\D' in a plain literal is an invalid escape sequence.
    return re.sub(r'\D', '', s)


In [27]:
# Hacky workaround for the changed MoH page layout: each figure now sits in its
# own small table, with the value in the cell at column 0, row 1.
case_table_indices = {
    'active_cases': active,
    'hospitalised_stable': hospitalised_stable,
    'hospitalised_critical': hospitalised_critical,
    'death': deaths,
    # 'discharge' is intentionally left out while its lookup is disabled.
}

# One single-element list per metric, so the resulting frame has exactly one row.
latest_counts = {
    column: [int(remove_non_numeric(dfs[table_idx][0][1]))]
    for column, table_idx in case_table_indices.items()
}
latest_counts['datetime'] = data_date_time

sing_cases_df = pd.DataFrame(latest_counts)
sing_cases_df

Unnamed: 0,active_cases,hospitalised_stable,hospitalised_critical,death,datetime
0,1764,1741,23,16,2020-05-01


In [28]:
# Use `df` as the working name for today's single-row frame.
df = sing_cases_df
df

Unnamed: 0,active_cases,hospitalised_stable,hospitalised_critical,death,datetime
0,1764,1741,23,16,2020-05-01


### Save to file

In [29]:
csv_file = '../data/singapore-cases.csv'

# Index today's row by date whether or not a history file exists, so that the
# later reset_index() restores a `datetime` column instead of adding a spurious
# `index` column on the very first run. Starting from `sing_cases_df` (which
# `df` aliases at this point) also keeps this cell safe to re-run.
df = sing_cases_df.set_index('datetime')

if Path(csv_file).exists():
    # read out the old data
    old_df = pd.read_csv(csv_file)
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    # The fresh row stays first so it is the one kept when duplicates are dropped.
    df = pd.concat([df, old_df.set_index('datetime')])



In [30]:
# Turn the index back into a regular column, then keep only the first row seen
# for each date.
df = df.reset_index().drop_duplicates(subset=['datetime'])

In [31]:
# Order the rows by the `datetime` column and display the combined history.
df = df.sort_values(by=['datetime'])
df

Unnamed: 0,datetime,active_cases,hospitalised_stable,hospitalised_critical,death,confirmed,discharge,discharged,hospitalised,negative,pending
1,2020-02-02,,,,,18.0,,,,240.0,43.0
2,2020-02-03,,,,,24.0,,,,262.0,32.0
3,2020-02-04,,,,,24.0,,,,289.0,20.0
4,2020-02-05,,,,,28.0,,,,295.0,62.0
5,2020-02-06,,,,,33.0,,,,310.0,147.0
...,...,...,...,...,...,...,...,...,...,...,...
86,2020-04-27,1451.0,1431.0,20.0,142.0,,1095.0,,,,
87,2020-04-28,1689.0,1668.0,21.0,14.0,,1128.0,,,,
88,2020-04-29,1714.0,1692.0,22.0,14.0,,1188.0,,,,
89,2020-04-30,1708.0,1686.0,22.0,15.0,,1244.0,,,,


In [24]:
# Write the combined history back to the CSV, without the positional index.
df.to_csv(csv_file, index=False)
