# Scrape Singapore Hansard

## Import Libraries

In [2]:
from bs4 import BeautifulSoup
import re
import os
import datetime
import numpy as np
from time import sleep, process_time
import pandas
import random
from datetime import date 
import requests


from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException,ElementNotInteractableException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from json import JSONDecodeError

from db.models import Link, Article, mp_record, MpResponse, \
    MemParliament, ApptRecord, CommitteeRecord, \
    ConstituencyRecord, postgres_engine
from db.db_utils import get_or_create

from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry


# Shared HTTP session for the Hansard JSON endpoint, used by the parser
# functions in this file.
session = requests.Session()
# Retry policy handed to the transport adapter: connect=3, backoff_factor=0.5.
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
# Apply the same retrying adapter to both plain and TLS URLs.
session.mount('http://', adapter)
session.mount('https://', adapter)

from multiprocessing import Pool, TimeoutError

In [3]:
# Sample record (title, topic URL, sitting date as day-month-year) used for
# ad-hoc experiments in the cells below.
title = "Protection from Online Falsehoods and Manipulation Bill"
url = "https://sprs.parl.gov.sg/search/sprs3topic?reportid=bill-367" 
sitting_date = "8-5-2019"



In [4]:
def parse_sittingdate(subtext):
    """Extract the sitting date from a search-result subtext line.

    The date is the ``day-month-year`` text that sits after a colon and
    before the next comma, e.g. ``"Sitting Date: 8-5-2019 , Vol:94"``.

    Args:
        subtext: the italic subtext shown under a search result.

    Returns:
        A ``datetime.date`` built from the year, month and day found.
    """
    date_text = re.search(r"(?<=\:)(.*?)(?=\,)", subtext).group(0)
    parts = list(map(int, date_text.strip().split('-')))
    # The text is ordered day-month-year, while date() wants year, month, day.
    return date(parts[2], parts[1], parts[0])

# Sanity check: the sample subtext should parse to 8 May 2019.
parse_sittingdate("Sitting Date: 8-5-2019 , Vol:94")

datetime.date(2019, 5, 8)

In [5]:
# Demonstrate pulling the report id (everything after "sprs3topic?reportid=")
# out of a topic URL.
url = "https://sprs.parl.gov.sg/search/sprs3topic?reportid=oral-answer-1987"
re.findall(r'(?<=sprs3topic\?reportid=).*', url)[0]

'oral-answer-1987'

In [None]:
# Sample topic URLs in the legacy "topic?reportid=" form, used to exercise
# parse_res_old below.
url_old_1 = "https://sprs.parl.gov.sg/search/topic?reportid=030_20110214_S0007_T0002"
url = "https://sprs.parl.gov.sg/search/topic?reportid=034_20120116_S0015_T0004"
url_old = "https://sprs.parl.gov.sg/search/topic?reportid=020_20110411_S0004_T0002" #with changes to API
# Report id of url_old: everything after "?reportid=".
res_t = re.findall(r'(?<=\?reportid=).*', url_old)[0]

def parse_res_old(res):
    """Fetch and parse a legacy Hansard topic (``topic?reportid=...`` URLs).

    Posts once to the ``getHansardTopic`` endpoint, reads the ``htmlContent``
    field of the JSON reply and extracts the sitting metadata from it. Two
    legacy layouts are handled:

    * "old v1": the HTML carries ``<meta name="Parl_No">`` style tags.
    * "old v2": the metadata is only present as labelled text
      (``Parliament No:``, ``Session No:`` ...).

    Args:
        res: report id, i.e. the value of the ``reportid`` query parameter.

    Returns:
        ``(session_type, sitting_num, session_num, parliament_num,
        volume_num, article_text)``. In the v1 layout only ``sitting_num``
        is an int and the session/parliament/volume numbers are the raw
        ``content`` strings of the meta tags; in the v2 layout all four
        numbers are ints.

    Raises:
        ValueError: the endpoint did not answer with HTTP 200, the body was
            not valid JSON, or a numeric field could not be converted.
        IndexError: an expected labelled field was not found in the text.
    """
    uri = "https://sprs.parl.gov.sg/search/getHansardTopic/?id=" + res

    # Post exactly once and reuse the response: the status check and the body
    # must come from the same request.
    response = session.post(uri)
    if response.status_code != 200:
        raise ValueError(
            f"Unexpected HTTP status {response.status_code} for {uri}"
        )

    b = BeautifulSoup(response.json()['htmlContent'], 'lxml')
    raw = b.get_text()

    def meta_content(name):
        # Value of the ``content`` attribute of <meta name=...>.
        return b.find('meta', {'name': name})['content']

    def int_field(label, following):
        # Integer that sits between "<label>:" and the next "<following>" label.
        pattern = rf'(?<={label}:)(.*?)(?={following})'
        return int(re.findall(pattern, raw)[0].strip())

    # The presence of a Parl_No meta tag tells the two layouts apart.
    if b.find_all('meta', {'name': "Parl_No"}):
        # Deprecated API (old v1).
        session_type = meta_content("Sect_Name")
        # The lookahead used to read ``\nSitting Date|\Sitting Date``; the
        # stray ``\S`` only matched by accident. An optional newline is what
        # was meant.
        sitting_num = int(
            re.findall(r'(?<=Sitting No:)(\d+?)(?=\n?Sitting Date)', raw)[0]
        )
        session_num = meta_content("Sess_No")
        parliament_num = meta_content("Parl_No")
        volume_num = meta_content("Vol_No")
        article_text = re.findall(
            r'(?s)(?<=<div align="left">).*?(?=<\/html>)', str(b)
        )[0]
        return (session_type, sitting_num, session_num,
                parliament_num, volume_num, article_text)

    # More recent API (old v2): everything is read from the labelled text.
    title = re.findall(r'(?s)(?<=Title:).*?(?=MPs)', raw)[0].strip().replace('\r\n', ' ')
    exclude = ('Assent to Bills Passed', 'Administration of Oaths')

    if title in exclude:
        # For these titles the title itself is stored as the session type.
        session_type = title
    else:
        session_type = str(
            re.findall(r'(?<=Section Name:)(.*?)(?=Title)', raw)[0].strip()
        )

    sitting_num = int_field('Sitting No', 'Sitting')
    session_num = int_field('Session No', 'Volume')
    parliament_num = int_field('Parliament No', 'Session')
    volume_num = int_field('Volume No', 'Sitting')
    article_text = str(b.find("div", class_="body hansardBaseBody hansardContenteBody"))

    return (session_type, sitting_num, session_num,
            parliament_num, volume_num, article_text)
    
# Smoke test of the legacy parser on the sample report id
# (issues a live HTTP request through the shared session).
parse_res_old(res_t)

In [None]:
# Sample topic URL in the current "sprs3topic?reportid=" form and its report id,
# used to exercise parse_res below.
url = "https://sprs.parl.gov.sg/search/sprs3topic?reportid=written-answer-na-962"
res_t = re.findall(r'(?<=\?reportid=).*', url)[0]

def parse_res(res):
    """Fetch and unpack a topic served by the current ``sprs3topic`` API.

    Args:
        res: report id, i.e. the value of the ``reportid`` query parameter.

    Returns:
        ``(session_type, sitting_num, session_num, parliament_num,
        volume_num, article_text)`` taken from the JSON reply. Replies of
        type ``'atbp'`` keep their data under ``resultData`` and yield the
        stringified ``atbpList`` as text; every other type is read from
        ``resultHTML``.
    """
    payload = session.post(
        "https://sprs.parl.gov.sg/search/getHansardTopic/?id=" + res
    ).json()
    kind = payload['type']
    is_atbp = kind == 'atbp'

    record = payload['resultData'] if is_atbp else payload['resultHTML']
    # 'atbp' replies carry no reportType, so the reply type stands in for it.
    session_type = kind if is_atbp else record['reportType']
    numbers = tuple(
        record[key] for key in ('sittingNo', 'sessionNo', 'parlNo', 'volumeNo')
    )
    article_text = str(record['atbpList']) if is_atbp else record['content']

    return (session_type, *numbers, article_text)
    

# Smoke test of the current-API parser on the sample report id
# (issues a live HTTP request through the shared session).
parse_res(res_t)

In [8]:
def assign_parser(url):
    """Parse the topic behind ``url`` with the parser matching its URL scheme.

    URLs containing ``sprs3topic`` go to ``parse_res``; everything else is
    treated as a legacy topic and goes to ``parse_res_old``.

    Args:
        url: full topic URL containing a ``?reportid=`` query parameter.

    Returns:
        Whatever the selected parser returns.
    """
    report_id = re.findall(r'(?<=\?reportid=).*', url)[0]
    parser = parse_res if 'sprs3topic' in url else parse_res_old
    return parser(report_id)
        

In [9]:
# Smoke test: a legacy "topic?reportid=" URL should be routed to parse_res_old.
assign_parser('https://sprs.parl.gov.sg/search/topic?reportid=030_20110214_S0007_T0002')

('ORAL ANSWERS TO QUESTIONS NOT REACHED',
 16,
 '2',
 '11',
 '87',
 ' <p><span style="FONT-SIZE: 13pt; FONT-FAMILY: \'Times New Roman\'">\xa0\xa0\xa0\xa0\xa022.</span>\xa0\xa0\t        <!--MP_NAME:Assoc. Prof. Paulin Tay Straughan--><span style="FONT-SIZE: 13pt; FONT-FAMILY: \'Times New Roman\'"><b>Assoc. Prof. Paulin Tay Straughan </b>asked the Minister for Manpower if he can provide data on (a) the proportion of employers aside from the Civil Service that offer paternity leave and unrecorded childcare leave as part of employee benefits; and (b) the proportion of unrecorded childcare leave that is taken by fathers.</span></p> <span style="FONT-SIZE: 13pt; FONT-FAMILY: \'Times New Roman\'"><!--MP_NAME:Mr Gan Kim Yong--><p align="left">\xa0\xa0\xa0\xa0\xa0<b>Mr Gan Kim Yong:</b></p><p>\xa0\xa0\xa0\xa0\xa0\xa0 Even though paternity leave is not specifically legislated, MOM’s survey showed that about 48% of private sector establishments with at least 25 employees provided paternity leave 

In [13]:
# Landing page of the Hansard search form that fetch_links drives.
HANSARD_URL = "https://sprs.parl.gov.sg/search/home"

# Firefox session


# Today's date and its ISO "YYYY-MM-DD" rendering.
# NOTE(review): parse_date and timeout are not referenced anywhere in the
# visible code — confirm whether they are still needed.
today_date = datetime.datetime.today()
parse_date = today_date.strftime("%Y-%m-%d")
timeout=5

def _close_result_tab(driver):
    """Close the focused topic tab and hand focus back to the last open window."""
    driver.close()
    sleep(random.randint(1, 3))
    driver.switch_to_window(driver.window_handles[-1])


def _scrape_search_results(driver, max_json_retries):
    """Run the date-range search and store every result that can be parsed.

    Returns the page source of the most recently opened topic when the run is
    interrupted with Ctrl-C (``None`` if no topic had been opened yet), and
    ``None`` in every other case.
    """
    # Bound before the try so the except/finally clauses can always read them.
    source = None
    processed = 0
    try:
        date_range = driver.find_element_by_class_name("date-range")
        Select(date_range.find_element_by_class_name("form-control")).select_by_visible_text('Specified range')
        sleep(1)

        # Specify Time Range
        Select(driver.find_element_by_id("fromday")).select_by_visible_text('1')
        Select(driver.find_element_by_id("frommonth")).select_by_visible_text('1')
        Select(driver.find_element_by_id("fromyear")).select_by_visible_text('1995')
        Select(driver.find_element_by_id("today")).select_by_visible_text('12')
        Select(driver.find_element_by_id("tomonth")).select_by_visible_text('3')
        Select(driver.find_element_by_id("toyear")).select_by_visible_text('2004')

        # Go to search results
        driver.find_element_by_css_selector(".btn-black[label='Search']").click()
        driver.switch_to_window(driver.window_handles[-1])
        sleep(5)
        print(f"Current URL : {driver.current_url}")
        query_result = driver.find_element_by_class_name("showingResults").text

        # The trailing word of the "showing results" banner is read as the total.
        num_results = int(re.search(r"\w+$", query_result).group(0))
        page_cnt = 1
        print("\n ////////////////// NEW PAGE //////////////////////// \n")
        print("Number of results : {}".format(query_result))
        while processed < num_results:
            if page_cnt > 1:
                sleep(1)
                print("\n ////////////////// NEW PAGE //////////////////////// \n")
                print("Number of results so far : {}".format(processed))

            print("Current page count: {}".format(page_cnt))
            results = driver.find_elements_by_tag_name("tbody")
            result_idx = 0
            json_retries = 0
            while result_idx < len(results):
                res_url = None  # keeps the error messages below safe to format
                try:
                    sleep(3)
                    print(f"Current ID = {result_idx}")
                    print("Processing")
                    result = results[result_idx]
                    title = result.find_element_by_tag_name("a").text  # get report name
                    sub_text = result.find_element_by_tag_name("i").text  # get subtext

                    result.find_element_by_tag_name("a").click()  # opens a new tab

                    driver.switch_to_window(driver.window_handles[-1])
                    sleep(3)
                    current_url = driver.current_url
                    while current_url == 'about:blank':
                        print("blank url, retrying")
                        sleep(2)
                        current_url = driver.current_url

                    source = driver.page_source
                    res_url = re.findall(r'(?<=\?reportid=).*', current_url)[0]
                    print(f"Current URL : {driver.current_url}, Resource = {res_url}")

                    (session_type, sitting_num, session_num,
                     parliament_num, volume_num, article_text) = assign_parser(current_url)

                    link_entry = get_or_create(Link, title=title, res_url=res_url,
                                               src_url=current_url,
                                               sitting_date=parse_sittingdate(sub_text))

                    get_or_create(Article, link_article=link_entry,
                                  parliament_num=parliament_num,
                                  volume_num=volume_num,
                                  sitting_num=sitting_num,
                                  session_num=session_num,
                                  session_type=session_type,
                                  article_text=article_text)
                except (ElementNotInteractableException, NoSuchElementException):
                    # The row had no usable link, so no tab was opened and
                    # there is nothing to close.
                    print(f"No Element Exists at {result_idx}, go to next")
                    sleep(5)
                except JSONDecodeError:
                    # Must precede ValueError: JSONDecodeError is a subclass of it.
                    json_retries += 1
                    if max_json_retries is None or json_retries <= max_json_retries:
                        print(f"JSON Error on:{res_url} , retrying")
                        _close_result_tab(driver)
                        continue  # same result_idx again
                    print(f"JSON Error on:{res_url} , skipping")
                    _close_result_tab(driver)
                except IndexError:
                    print("Index of JSON not found,skipping")
                    sleep(1)
                    _close_result_tab(driver)
                except AttributeError:
                    print(f"JSON not found at {result_idx}, go to next")
                    _close_result_tab(driver)
                    sleep(5)
                except ValueError:
                    print(f"Value Error on:{res_url} , skipping")
                    _close_result_tab(driver)
                else:
                    _close_result_tab(driver)

                # Reached for stored and skipped results alike (not for retries).
                result_idx += 1
                processed += 1
                json_retries = 0

            page_cnt += 1
            sleep(4)
            try:
                next_btn = driver.find_element_by_class_name("fa-angle-right")
            except NoSuchElementException:
                # find_element_* raises when the element is missing instead of
                # returning a falsy value, so this is the real "no next page" case.
                break
            next_btn.click()
            sleep(random.randint(1, 3))

    except Exception as e:
        driver.save_screenshot('error.png')
        print(str(e))
    except KeyboardInterrupt:
        return source
    finally:
        print("Total of {} links processed".format(processed))


def fetch_links(max_json_retries=3):
    """Scrape Hansard search results into the database with a Firefox session.

    Opens ``HANSARD_URL``, searches the hard-coded range 1-1-1995 to
    12-3-2004, then walks the result pages. Each result is opened in a new
    tab, parsed with ``assign_parser`` and stored through ``get_or_create``
    as a ``Link`` plus an ``Article``. Results that cannot be parsed are
    reported on stdout and skipped. Any other exception ends the run after
    saving ``error.png``.

    Args:
        max_json_retries: how many times a result whose reply is not valid
            JSON is re-opened before it is skipped. ``None`` retries without
            limit.

    Returns:
        The page source of the most recently opened topic if the run is
        interrupted with Ctrl-C, otherwise ``None``.

    Raises:
        AssertionError: the landing page title does not contain "Search".
    """
    # NOTE(review): executable_path, find_element_by_* and switch_to_window
    # belong to the older Selenium API — confirm the pinned Selenium version
    # still provides them.
    driver = webdriver.Firefox(executable_path='./geckodriver-osx')
    try:
        driver.get(HANSARD_URL)
        # Raised explicitly so the check also runs under ``python -O``.
        if "Search" not in driver.title:
            raise AssertionError(f"Unexpected page title: {driver.title!r}")
        sleep(2)
        return _scrape_search_results(driver, max_json_retries)
    finally:
        # Always release the browser, whatever happened above.
        driver.quit()

In [None]:
%%timeit
src = fetch_links()