In [1]:
import re
from datetime import datetime
from io import StringIO
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


In [2]:
# Run Chrome without a display and without the sandbox.
chrome_options = Options()
for flag in ("--headless", "--no-sandbox"):
    chrome_options.add_argument(flag)

# 'chromedriver' is resolved through PATH.
driver = webdriver.Chrome('chromedriver', options=chrome_options)

### Extract date



In [3]:
url = 'https://www.moh.gov.sg/covid-19'

# Load the page in the headless browser and keep the rendered HTML.
try:
    driver.get(url)
    content = driver.page_source
finally:
    # Nothing below needs the browser again, so shut it down here; otherwise
    # every run leaves a headless Chrome process behind. Re-running this cell
    # requires re-running the cell that creates `driver` first.
    driver.quit()


In [4]:
# Parse the rendered page with the lxml backend.
soup = BeautifulSoup(markup=content, features='lxml')

In [5]:
# Collect every text node that carries the "as of <date>" heading.
# The pattern is a raw string so that `\(` reaches the regex engine as an
# escaped parenthesis instead of being an invalid Python string escape.
data = soup.find_all(string=re.compile(r"Case Summary in Singapore \(as of .+"))


In [6]:
# Show the matched heading text(s) so the scraped timestamp can be eyeballed.
data

['Case Summary in Singapore (as of 24 Jun 2020, 1200h)']

In [7]:
# The heading looks like "Case Summary in Singapore (as of 24 Jun 2020, 1200h)".
# Fail with a clear message if the page layout changed and nothing matched.
if not data:
    raise ValueError("No 'Case Summary in Singapore (as of ...)' heading found on the page")

m = re.search(r"(\d+ \w+ 202\d, \d+\d+h)", data[0])
if m is None:
    raise ValueError("Could not find a timestamp in heading: {!r}".format(str(data[0])))

date_time_str = m.group(1)
# Only the calendar date is kept; the time-of-day part is parsed and dropped.
data_date_time = datetime.strptime(date_time_str, '%d %b %Y, %H%Mh').strftime('%Y-%m-%d')
data_date_time


'2020-06-24'

### Extract Dataframes

In [8]:
# Parse every <table> on the page into a DataFrame.
# The HTML is wrapped in StringIO because newer pandas deprecates passing
# literal HTML strings to read_html; a file-like object works on all versions.
dfs = pd.read_html(StringIO(content))

In [9]:
# Dump all parsed tables to see which one holds which headline figure.
dfs

[    0   1
 0 NaN NaN
 1 NaN NaN,
     0   1
 0 NaN NaN
 1 NaN NaN,
                0       1
 0  DORSCON Level  Orange,
             0           1           2
 0  IMPORTED**  IMPORTED**  IMPORTED**
 1    581 (+0)    581 (+0)    581 (+0),
                0
 0  Active Cases#
 1           6298,
             0
 0  Discharged
 1       36299,
                           0
 0  In Community Facilities*
 1                      6109,
                        0
 0  Hospitalised (Stable)
 1                    188,
                          0
 0  Hospitalised (Critical)
 1                        1,
          0
 0  Deaths^
 1       26,
         0
 0  684359,
         0
 0  376749,
           0
 0  ~120,100,
          0
 0  ~66,100,
               0                                                  1
 0          Date                                              Title
 1   24 Jun 2020  304 More Cases Discharged, 191 New Cases of CO...
 2   24 Jun 2020  Contact–free Temperature Self-check Kiosks to ...
 

In [10]:
def find_idx(starts_with, tables=None):
    """Return the position of the first table whose top-left cell contains ``starts_with``.

    Despite the parameter name this is a substring test (``in``), not a
    prefix test, on the string form of the cell at column label 0, row label 0.

    Parameters
    ----------
    starts_with : str
        Text to look for in the top-left cell, e.g. ``'Active Cases'``.
    tables : list of pandas.DataFrame, optional
        Tables to search. Defaults to the notebook-level ``dfs`` list.

    Returns
    -------
    int
        Index into ``tables`` of the first matching table.

    Raises
    ------
    LookupError
        If no table matches. Previously index 0 was returned silently, which
        made callers read numbers from an unrelated table.
    """
    if tables is None:
        tables = dfs

    print('looking for idx that starts with', starts_with)
    for idx, table in enumerate(tables):
        # A table without a (0, 0) label cannot match; skip it instead of
        # letting the label lookup raise KeyError.
        if 0 not in table.columns or 0 not in table.index:
            continue
        if starts_with in str(table[0][0]):
            print("found starting df")
            print(idx)
            return idx

    raise LookupError(
        "no table whose first cell contains {!r}".format(starts_with)
    )


In [11]:
# Position in `dfs` of the table whose first cell mentions "Active Cases".
active=find_idx('Active Cases')
dfs[active]


looking for idx that starts with Active Cases
found starting df
4


Unnamed: 0,0
0,Active Cases#
1,6298


In [12]:
# discharge = find_idx('Discharged')
# dfs[discharge]

In [13]:
# Position in `dfs` of the "Hospitalised (Stable)" table.
hospitalised_stable = find_idx('Hospitalised (Stable)')

dfs[hospitalised_stable]

looking for idx that starts with Hospitalised (Stable)
found starting df
7


Unnamed: 0,0
0,Hospitalised (Stable)
1,188


In [14]:
# Position in `dfs` of the "Hospitalised (Critical)" table.
hospitalised_critical = find_idx('Hospitalised (Critical)')
dfs[hospitalised_critical]

looking for idx that starts with Hospitalised (Critical)
found starting df
8


Unnamed: 0,0
0,Hospitalised (Critical)
1,1


In [15]:
# Position in `dfs` of the "Deaths" table.
deaths = find_idx('Deaths')

dfs[deaths]

looking for idx that starts with Deaths
found starting df
9


Unnamed: 0,0
0,Deaths^
1,26


In [16]:
# # Look for df with confirmed cases, tested negative, ...

# for df in dfs:
#     combined_fields = df[3].to_string().lower()
#     if "active cases" in combined_fields:
#         sing_cases_df_temp = df
        
# sing_cases_df_temp

In [17]:
def remove_non_numeric(s):
    """Return ``s`` with every non-digit character removed.

    Used to clean scraped figures such as ``'~120,100'`` -> ``'120100'``.
    All remaining digits are concatenated, so a cell like ``'581 (+0)'``
    becomes ``'5810'``; only pass strings that hold a single number.

    Parameters
    ----------
    s : str
        Text to clean.

    Returns
    -------
    str
        The digits of ``s`` in their original order (possibly empty).
    """
    # Raw string: '\D' in a normal literal is an invalid escape sequence.
    return re.sub(r'\D', '', s)


In [18]:
# very hacky to workaround with the change from MoH site
# Each headline figure sits in its own small table: label in row 0, value in
# row 1 of column 0. Map output column -> position of that table in `dfs`.
tile_positions = {
    'active_cases': active,
    'hospitalised_stable': hospitalised_stable,
    'hospitalised_critical': hospitalised_critical,
    'death': deaths,
    # 'discharge': discharge,  # disabled along with its lookup cell
}

# Strip decorations from each value cell and convert it to an int.
record = {
    column: [int(remove_non_numeric(dfs[position][0][1]))]
    for column, position in tile_positions.items()
}
record['datetime'] = data_date_time

sing_cases_df = pd.DataFrame(record)
sing_cases_df

Unnamed: 0,active_cases,hospitalised_stable,hospitalised_critical,death,datetime
0,6298,188,1,26,2020-06-24


In [19]:
# `df` is the working name used by the save-to-file cells below.
df = sing_cases_df
df

Unnamed: 0,active_cases,hospitalised_stable,hospitalised_critical,death,datetime
0,6298,188,1,26,2020-06-24


### Save to file

In [20]:
csv_file = '../data/singapore-cases.csv'

# Always start from the freshly scraped frame so this cell can be re-run, and
# index it by date in both branches so the following reset_index() restores
# the 'datetime' column instead of adding a stray 'index' column on first run.
df = sing_cases_df.set_index('datetime')

if Path(csv_file).exists():
    # Read the history and put today's row first, so that the later
    # drop_duplicates keeps the fresh figures when the date already exists.
    old_df = pd.read_csv(csv_file)
    # pd.concat replaces DataFrame.append (removed in pandas 2.0). sort=True
    # keeps the alphabetical column order the CSV already uses.
    df = pd.concat([df, old_df.set_index('datetime')], sort=True)



of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


In [21]:
# Bring 'datetime' back as a column and keep only the first row per date.
df = df.reset_index().drop_duplicates(subset=['datetime'])

In [22]:
# Order rows chronologically; ISO dates sort correctly as strings.
df = df.sort_values(by=['datetime'])
df

Unnamed: 0,datetime,active_cases,confirmed,death,discharge,discharged,hospitalised,hospitalised_critical,hospitalised_stable,negative,pending
1,2020-02-02,,18.0,,,,,,,240.0,43.0
2,2020-02-03,,24.0,,,,,,,262.0,32.0
3,2020-02-04,,24.0,,,,,,,289.0,20.0
4,2020-02-05,,28.0,,,,,,,295.0,62.0
5,2020-02-06,,33.0,,,,,,,310.0,147.0
...,...,...,...,...,...,...,...,...,...,...,...
140,2020-06-20,7583.0,,26.0,,,,1.0,184.0,,
141,2020-06-21,7127.0,,26.0,,,,1.0,178.0,,
142,2020-06-22,6697.0,,26.0,,,,1.0,199.0,,
143,2020-06-23,6411.0,,26.0,,,,1.0,191.0,,


In [23]:
# Overwrite the history file with the merged data, without the row index.
df.to_csv(csv_file, index=False)
