## Linking of Entities to DB

In [27]:
from bs4 import BeautifulSoup
import re

from tqdm import tqdm
import numpy as np
import pandas as pd

import json

## Get Data from Mongo DB

In [28]:
import os
import yaml
import pymongo

with open("config.yaml", "r") as ymlfile:
    settings = yaml.safe_load(ymlfile)


isRemote = False


try:
    if (isRemote):
        client = pymongo.MongoClient(host = settings['mongo-remote']['host'],
                                     port = settings['mongo-remote']['port'],
                                     username = settings['mongo-remote']['user'],
                                     password = settings['mongo-remote']['pw'],
                                     authMechanism= settings['mongo-remote']['authMechanism'])
        client.server_info()  # force connection on a request as the
        # connect=True parameter of MongoClient seems
        # to be useless here
    else:
        client = pymongo.MongoClient("mongodb://localhost:27017/")
        client.server_info()  # force connection on a request as the
    # connect=True parameter of MongoClient seems
    # to be useless here
except pymongo.errors.ServerSelectionTimeoutError as err:
    # do whatever you need
    print(err)


## Check DB Contents

In [29]:
db = client["parliament"]
articles = db["articles"]

In [30]:
mongo_df = pd.DataFrame.from_records(articles.find())
mongo_df.head()

Unnamed: 0,_id,article_text,chunks,cleaned_join,dominant_topic,html_clean,parliament_num,parsed_convo,persons_involved,session_num,session_type,sitting_date,sitting_num,src_url,title,volume_num
0,5d27eca6172d9aa762d4802f,<p>[(proc text) Debate resumed. (proc text)]</...,"{""0"": {""entity"": ""NA"", ""content"": ""[(proc text...",[(proc text) Debate resumed. (proc text)]<br/>...,Society,"[[(proc text) Debate resumed. (proc text)], Mr...",13,[{'content': '[(proc text) Debate resumed. (pr...,"[Prof Lim Sun Sun, Miss Cheng Li Hui, An hon M...",2,SECOND READING BILLS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,PROTECTION FROM ONLINE FALSEHOODS AND MANIPULA...,94
1,5d27eca6172d9aa762d48030,"<p class=""ql-align-justify"">4 <strong>Mr Vikra...","{""0"": {""entity"": ""NA"", ""content"": ""Mr Vikram N...",Mr Vikram Nair asked the Minister for Foreign ...,Society,[Mr Vikram Nair asked the Minister for Foreign...,13,[{'content': 'Mr Vikram Nair asked the Ministe...,"[Mr Speaker, Assoc Prof Walter Theseira, Mr Vi...",2,ORAL ANSWERS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,STATE OF BILATERAL RELATIONS WITH MALAYSIA FOL...,94
2,5d27eca6172d9aa762d48031,"<p class=""ql-align-justify"">8 <strong>Assoc Pr...","{""0"": {""entity"": ""NA"", ""content"": ""Assoc Prof ...",Assoc Prof Walter Theseira asked the Minister ...,Internal Security,[Assoc Prof Walter Theseira asked the Minister...,13,[{'content': 'Assoc Prof Walter Theseira asked...,"[Ms Anthea Ong, Assoc Prof Walter Theseira, Ms...",2,ORAL ANSWERS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,COMPANIES WITH MEASURES TO DEAL WITH WORKPLACE...,94
3,5d27eca6172d9aa762d48032,<p>5 <strong>Ms Irene Quay Siew Ching</strong>...,"{""0"": {""entity"": ""NA"", ""content"": ""Ms Irene Qu...",Ms Irene Quay Siew Ching asked the Minister fo...,Environment,[Ms Irene Quay Siew Ching asked the Minister f...,13,[{'content': 'Ms Irene Quay Siew Ching asked t...,"[Ms Irene Quay Siew Ching, Dr Lam Pin Min, The...",2,ORAL ANSWERS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,REVIEW OF DRUG TESTING STANDARDS IN SINGAPORE ...,94
4,5d27eca6172d9aa762d48033,"<p class=""ql-align-justify"">2 <strong>Mr Lim B...","{""0"": {""entity"": ""NA"", ""content"": ""Mr Lim Biow...",Mr Lim Biow Chuan asked the Deputy Prime Minis...,Employment,[Mr Lim Biow Chuan asked the Deputy Prime Mini...,13,[{'content': 'Mr Lim Biow Chuan asked the Depu...,"[The Second Minister for Finance, Mr Lim Biow ...",2,ORAL ANSWERS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,LIVING IN PRIVATE PROPERTIES BUT WITH NO DECLA...,94


## Get Entity-Content Dataframe

Here, we create a template dataframe for us to go through each document. Then, as we iterate through each document, an entity-conent mapping is formed.

In [32]:
entities_content = pd.DataFrame(columns=['entity','content','docid'])

In [45]:
import json

for row in tqdm(mongo_df.itertuples()):
    chunks = json.loads(row.chunks)
    for content in chunks.values():
        if content['entity'] != 'NA':
            entities_content = entities_content.append({
            'entity' :  content['entity'],
            'content' : content['content'],
            'docid' : row._1
        }, ignore_index = True)

9531it [02:44, 58.10it/s]


In [46]:
entities_content.head()

Unnamed: 0,entity,content,docid
0,[(proc text) Debate resumed. (proc text)],[(proc text) Debate resumed. (proc text)],5d27eca6172d9aa762d4802f
1,Mr Deputy Speaker,Mr Deputy Speaker: Mr Png Eng Huat.6.57 pm,5d27eca6172d9aa762d4802f
2,Mr Png Eng Huat (Hougang),"Mr Png Eng Huat (Hougang): Sir, if this Bill i...",5d27eca6172d9aa762d4802f
3,Mr Deputy Speaker,Mr Deputy Speaker: Minister Iswaran.7.08 pm,5d27eca6172d9aa762d4802f
4,The Minister for Communications and Informatio...,The Minister for Communications and Informatio...,5d27eca6172d9aa762d4802f


We then map entity and content to party and designation using the `fuzzywuzzy` library.

In [47]:
consolidated_names = pd.read_csv('consolidated.csv')
consolidated_names.drop(columns='Unnamed: 0',inplace=True)


In [48]:
consolidated_names.head()

Unnamed: 0,clean_designation,constituency,designation,name,party
0,Nominated Member of Parliament,,Nominated Member of Parliament,Maurice Choo,Nominated Member of Parliament
1,Nominated Member of Parliament,,Nominated Member of Parliament,Leong Chee Whye,Nominated Member of Parliament
2,Nominated Member of Parliament,,Nominated Member of Parliament,Chia Shi Teck,Nominated Member of Parliament
3,Nominated Member of Parliament,,Nominated Member of Parliament,Robert Chua Teck Chew,Nominated Member of Parliament
4,Nominated Member of Parliament,,Nominated Member of Parliament,Kanwaljit Soin,Nominated Member of Parliament


In [49]:
from fuzzywuzzy import process, fuzz

In [50]:
def return_names(entity):
    if entity != '':
        try:
            matched_idx = process.extractOne(entity,consolidated_names.name,scorer=fuzz.token_set_ratio, score_cutoff=80) [2]
            name = consolidated_names.iloc[matched_idx]['name']
        except TypeError:
            name = 'NA'
            pass
        return name
    else:
        pass


In [None]:
%time entities_content['lookup_name'] = entities_content.entity.map(return_names)
                                                               

In [52]:
entities_content[entities_content['lookup_name'] != "NA"]
entities_content = entities_content[entities_content['lookup_name'] != "NA"]

In [53]:
entities_content['words_len'] = entities_content.content.map(lambda x : len(x.split()))
entities_content['speech_mins'] = entities_content.content.map(lambda x : round(len(x.split())/265,2))

A reference table is created that contains all relevant metadata associated with the content of the speech.

In [54]:
## merge with master dataframe for additional meta deta
reference_table = entities_content.merge(mongo_df,how="inner",right_on="_id",left_on="docid")

In [55]:
filter_columns= ['entity','content','docid','lookup_name','words_len','speech_mins',
                 'dominant_topic','parliament_num','session_num','sitting_date','sitting_num']

reference_table = reference_table[filter_columns]

In [56]:
reference_table['year'] = reference_table.sitting_date.map(lambda x : x.year)

## Trend of speech time against years

In [57]:
reference_table.head()

Unnamed: 0,entity,content,docid,lookup_name,words_len,speech_mins,dominant_topic,parliament_num,session_num,sitting_date,sitting_num,year
0,Mr Png Eng Huat (Hougang),"Mr Png Eng Huat (Hougang): Sir, if this Bill i...",5d27eca6172d9aa762d4802f,PNG ENG HUAT,1802,6.8,Society,13,2,2019-05-08,105,2019
1,The Minister for Communications and Informatio...,The Minister for Communications and Informatio...,5d27eca6172d9aa762d4802f,S ISWARAN,3903,14.73,Society,13,2,2019-05-08,105,2019
2,Dr Chia Shi-Lu (Tanjong Pagar),Dr Chia Shi-Lu (Tanjong Pagar): Mr Deputy Spea...,5d27eca6172d9aa762d4802f,CHIA SHI-LU,1771,6.68,Society,13,2,2019-05-08,105,2019
3,Mr Darryl David (Ang Mo Kio),Mr Darryl David (Ang Mo Kio): Mr Deputy Speake...,5d27eca6172d9aa762d4802f,DARRYL DAVID,802,3.03,Society,13,2,2019-05-08,105,2019
4,Mr Louis Ng Kok Kwang (Nee Soon),"Mr Louis Ng Kok Kwang (Nee Soon): Sir, online ...",5d27eca6172d9aa762d4802f,LOUIS NG KOK KWANG,704,2.66,Society,13,2,2019-05-08,105,2019


In [85]:
reference_table.groupby(by="year")['speech_mins'].sum()

year
2009    4697.23
2010    4517.98
2011    4443.71
2012    7008.87
2013    6421.03
2014    7157.26
2015    5125.50
2016    7931.82
2017    8085.80
2018    8920.30
2019    5693.41
Name: speech_mins, dtype: float64

In [59]:
reference_table = reference_table.merge(consolidated_names,how="inner",left_on="lookup_name",right_on="name")

In [60]:
reference_table.head()

Unnamed: 0,entity,content,docid,lookup_name,words_len,speech_mins,dominant_topic,parliament_num,session_num,sitting_date,sitting_num,year,clean_designation,constituency,designation,name,party
0,Mr Png Eng Huat (Hougang),"Mr Png Eng Huat (Hougang): Sir, if this Bill i...",5d27eca6172d9aa762d4802f,PNG ENG HUAT,1802,6.8,Society,13,2,2019-05-08,105,2019,Member of Parliament,Hougang,Member of Parliament,PNG ENG HUAT,Workers' Party
1,Mr Png Eng Huat (Hougang),"Mr Png Eng Huat (Hougang): Sir, if this Bill i...",5d27eca6172d9aa762d4802f,PNG ENG HUAT,1802,6.8,Society,13,2,2019-05-08,105,2019,,,,PNG ENG HUAT,Workers' Party
2,Mr Png Eng Huat,"Mr Png Eng Huat: Thank you, Deputy Speaker. Ju...",5d27eca6172d9aa762d4802f,PNG ENG HUAT,65,0.25,Society,13,2,2019-05-08,105,2019,Member of Parliament,Hougang,Member of Parliament,PNG ENG HUAT,Workers' Party
3,Mr Png Eng Huat,"Mr Png Eng Huat: Thank you, Deputy Speaker. Ju...",5d27eca6172d9aa762d4802f,PNG ENG HUAT,65,0.25,Society,13,2,2019-05-08,105,2019,,,,PNG ENG HUAT,Workers' Party
4,Mr Png Eng Huat,Mr Png Eng Huat: — but POFMA is supposed to ad...,5d27eca6172d9aa762d4802f,PNG ENG HUAT,14,0.05,Society,13,2,2019-05-08,105,2019,Member of Parliament,Hougang,Member of Parliament,PNG ENG HUAT,Workers' Party


In [61]:
reference_table.party.value_counts()

People's Action Party             35468
Workers' Party                     3256
Nominated Member of Parliament     2537
Singapore People's Party             35
Presidential Office                   3
Barisan Sosialis                      1
Name: party, dtype: int64

In [62]:
reference_table[reference_table['party']=='Pertubuhan Kebangsaan Melayu Singapura']

Unnamed: 0,entity,content,docid,lookup_name,words_len,speech_mins,dominant_topic,parliament_num,session_num,sitting_date,sitting_num,year,clean_designation,constituency,designation,name,party


In [63]:
reference_table[reference_table['party']=='Singapore People\'s Party'].head(6)

Unnamed: 0,entity,content,docid,lookup_name,words_len,speech_mins,dominant_topic,parliament_num,session_num,sitting_date,sitting_num,year,clean_designation,constituency,designation,name,party
40062,Mr Chiam See Tong (Potong Pasir),Thank you for giving me the opportunity to sp...,5d27eca6172d9aa762d49929,CHIAM SEE TONG,193,0.73,Law,11,2,2011-03-09,25,2011,,,,CHIAM SEE TONG,Singapore People's Party
40063,Mr Chiam See Tong,Children and adults with special needs deserv...,5d27eca6172d9aa762d49973,CHIAM SEE TONG,104,0.39,Healthcare,11,2,2011-03-04,22,2011,,,,CHIAM SEE TONG,Singapore People's Party
40064,Mr Chiam See Tong (Potong Pasir),"In November 2009, it was reported that IRAS d...",5d27eca6172d9aa762d49adf,CHIAM SEE TONG,117,0.44,Business,11,2,2010-03-09,22,2010,,,,CHIAM SEE TONG,Singapore People's Party
40065,Mr Chiam See Tong (Potong Pasir),"Sir, at the moment, about 40% of places in on...",5d27eca6172d9aa762d4a033,CHIAM SEE TONG,115,0.43,Education,11,1,2009-02-10,14,2009,,,,CHIAM SEE TONG,Singapore People's Party
40066,Mr Chiam See Tong (Potong Pasir),"Thank you, Sir, for allowing me to join in th...",5d27eca6172d9aa762d4a0da,CHIAM SEE TONG,177,0.67,Business,11,2,2011-03-10,26,2011,,,,CHIAM SEE TONG,Singapore People's Party
40067,Mr Chiam See Tong,"Sir, Singapore's sovereign wealth fund compan...",5d27eca6172d9aa762d4a0e6,CHIAM SEE TONG,106,0.4,Employment,11,2,2011-03-09,25,2011,,,,CHIAM SEE TONG,Singapore People's Party


In [64]:
reference_table['lookup_name'] = reference_table.lookup_name.map(lambda x : 'Faizah binte Haji Ahmad Jamal'if x == 'JAMAL BIN IDRIS'else x)
reference_table['name'] = reference_table.name.map(lambda x: 'Faizah binte Haji Ahmad Jamal' if x == 'JAMAL BIN IDRIS' else x) 
reference_table['party']= reference_table.party.map(lambda x: 'Nominated Member of Parliament' if x == 'Pertubuhan Kebangsaan Melayu Singapura'else x)

In [65]:
party_speeches = reference_table.groupby(by=['party','year']).sum().reset_index()

In [66]:
count_mp_year = reference_table[['name','party','year']].drop_duplicates().groupby(by=['party','year']).count()

In [67]:
count_mp_year.reset_index(inplace=True)

In [68]:
avg_speech_length = party_speeches[party_speeches['speech_mins']>10][['party','year','speech_mins']]

In [69]:
avg_speech_length.head()

Unnamed: 0,party,year,speech_mins
1,Nominated Member of Parliament,2009,408.99
2,Nominated Member of Parliament,2010,188.4
3,Nominated Member of Parliament,2011,145.06
4,Nominated Member of Parliament,2012,301.66
5,Nominated Member of Parliament,2013,346.85


In [70]:
avg_speech_length['key'] =  avg_speech_length[['party','year']].apply(lambda x : str(x.party) + str(x.year),axis=1)

In [71]:
avg_speech_length.head()

Unnamed: 0,party,year,speech_mins,key
1,Nominated Member of Parliament,2009,408.99,Nominated Member of Parliament2009
2,Nominated Member of Parliament,2010,188.4,Nominated Member of Parliament2010
3,Nominated Member of Parliament,2011,145.06,Nominated Member of Parliament2011
4,Nominated Member of Parliament,2012,301.66,Nominated Member of Parliament2012
5,Nominated Member of Parliament,2013,346.85,Nominated Member of Parliament2013


In [72]:
count_mp_year['key'] = count_mp_year[['party','year']].apply(lambda x : str(x.party) + str(x.year),axis=1)

In [73]:
count_mp_year.head()

Unnamed: 0,party,year,name,key
0,Barisan Sosialis,2009,1,Barisan Sosialis2009
1,Nominated Member of Parliament,2009,13,Nominated Member of Parliament2009
2,Nominated Member of Parliament,2010,7,Nominated Member of Parliament2010
3,Nominated Member of Parliament,2011,9,Nominated Member of Parliament2011
4,Nominated Member of Parliament,2012,11,Nominated Member of Parliament2012


In [74]:
merged_counts = avg_speech_length.merge(count_mp_year,on='key',how='left')[['party_x','year_x','speech_mins','name']]

In [75]:
merged_counts.columns

Index(['party_x', 'year_x', 'speech_mins', 'name'], dtype='object')

In [76]:
merged_counts.rename(columns = {'party_x':'party','year_x':'year', 'name':'member_count'},inplace=True)

In [88]:
merged_counts.head(3)

Unnamed: 0,party,year,speech_mins,member_count
0,Nominated Member of Parliament,2009,408.99,13
1,Nominated Member of Parliament,2010,188.4,7
2,Nominated Member of Parliament,2011,145.06,9


In [89]:
merged_counts.to_csv('dumps/party_speechtime.csv')

## Append Speech meta-data with party information

In [78]:
sanctioned_list = ['National Solidarity Party','Nominated Member of Parliament',
                  "People's Action Party","Presidential Office","Singapore People's Party",
                  "Workers' Party"]


def add_party(chunks):
    for val in chunks:
        try:
            if val['entity'] != '' or val['entity'] != [')','.']:
                try:
                    matched_idx = process.extractOne(val['entity'],consolidated_names.name,scorer=fuzz.token_set_ratio, score_cutoff=80)[2]
                    party = consolidated_names.iloc[matched_idx].party
                    if party in sanctioned_list:
                        val['party'] = party
                    else:
                        val['party'] = "NA"
                except TypeError:
                    val['party'] = "NA"
                    pass
            else:
                pass
        except KeyError:
            pass
    return chunks

In [79]:
add_party(mongo_df.iloc[8300].parsed_convo)

[{'content': 'The Leader of the House (Dr Ng Eng Hen): Mr Deputy Speaker, Sir, can I seek your consent to move that the debate be now adjourned?',
  'type': 'response',
  'entity': 'The Leader of the House (Dr Ng Eng Hen)',
  'party': "People's Action Party"},
 {'content': 'Mr Deputy Speaker: You have my consent.Resolved,That the debate be now adjourned. – [Dr Ng Eng Hen].',
  'type': 'response',
  'entity': 'Mr Deputy Speaker',
  'party': 'NA'},
 {'content': 'Mr Deputy Speaker: Dr Lam Pin Min, resumption of debate, what day?',
  'type': 'response',
  'entity': 'Mr Deputy Speaker',
  'party': 'NA'},
 {'content': 'Dr Lam Pin Min (Sengkang West): Tomorrow, Sir.',
  'type': 'response',
  'entity': 'Dr Lam Pin Min (Sengkang West)',
  'party': "People's Action Party"},
 {'content': 'Mr Deputy Speaker: So be it.',
  'type': 'response',
  'entity': 'Mr Deputy Speaker',
  'party': 'NA'},
 {'type': 'compiled_responses',
  'content': '\n\nMr Deputy Speaker: You have my consent.Resolved,That the 

In [None]:
%time mongo_df.chunks = mongo_df.parsed_convo.map(add_party)

In [83]:
mongo_df.iloc[30].parsed_convo

[{'content': 'Ms Anthea Ong asked the Minister for Health (a) what is the percentage of persons over 65 years of age who want to age in place; and (b) what is the percentage of care-givers of persons over the age of 65 years who want to see their care recipients age in place.',
  'type': 'question',
  'entity': 'Ms Anthea Ong',
  'party': 'Nominated Member of Parliament'},
 {'content': 'Mr Gan Kim Yong: Most seniors prefer to age in place for as long as they can, within communities which they are familiar with, and supported by their loved ones. A study by the Duke-NUS Centre for Ageing Research and Education (CARE) conducted from 2011 to 2012 found that 97.9% of persons aged 65 and above prefer to continue staying at home if they cannot live independently, supported by family, foreign domestic workers, or home care services. Only 2.1% of persons aged 65 and above preferred to move to a nursing home if they cannot live independently. We are not aware of corresponding data on the prefer

## Updating our document DB with the information

In [82]:
from tqdm import tqdm

for row in tqdm(mongo_df.itertuples()):
    query = { "_id" : row._1 }
    value = { "$set": { "chunks": row.chunks,
                      "parsed_convo": row.parsed_convo } }
    articles.update_one(query,value)

9531it [00:52, 180.82it/s]
