# Data Cleaning

In this section, various methods of data cleaning are demonstrated on our dataset. Namely, we will be mapping the HTML content into a text friendly version.

## Import libraries

In [6]:
from bs4 import BeautifulSoup
import re

import datetime
import numpy as np
import pandas as pd
from datetime import date 
import spacy

from tqdm import tqdm

import en_core_web_lg
from text_utils.cleaning import parse_hansard_text, seperate_content
from text_utils.metrics import get_chunks_info
from db.models import postgres_engine

from bson import json_util

## Get Data from Postgres DB

In [6]:
df = pd.read_sql_query('select * from Article INNER JOIN Link on Link.id=Article.article_id',con=postgres_engine)

## Check DB Contents

In [7]:
df.tail()

Unnamed: 0,article_id,link_id,parliament_num,volume_num,sitting_num,session_num,session_type,article_text,created_on,updated_on,id,title,res_url,src_url,sitting_date,created_on.1,updated_on.1
12003,12018,12018,10,77,8,1,BUDGET,"<span style=""FONT-SIZE: 13pt; FONT-FAMILY: 'Ti...",2019-07-09 05:16:44.595440,,12018,HEAD N - MINISTRY OF FOREIGN AFFAIRS,007_20040311_S0003_T0004,https://sprs.parl.gov.sg/search/topic?reportid...,2004-03-11,2019-07-09 05:16:44.584810,
12004,12019,12019,10,77,8,1,BUDGET,"<span style=""FONT-SIZE: 13pt; FONT-FAMILY: 'Ti...",2019-07-09 05:16:53.701711,,12019,HEAD M - MINISTRY OF FINANCE,006_20040311_S0003_T0003,https://sprs.parl.gov.sg/search/topic?reportid...,2004-03-11,2019-07-09 05:16:53.695288,
12005,12020,12020,10,77,8,1,BUDGET,"<span style=""FONT-SIZE: 13pt; FONT-FAMILY: 'Ti...",2019-07-09 05:17:01.895183,,12020,HEAD U - PRIME MINISTER'S OFFICE,005_20040311_S0003_T0006,https://sprs.parl.gov.sg/search/topic?reportid...,2004-03-11,2019-07-09 05:17:01.888632,
12006,12021,12021,10,77,8,1,BUDGET,"<span style=""FONT-SIZE: 13pt; FONT-FAMILY: 'Ti...",2019-07-09 05:17:11.742969,,12021,HEAD P - MINISTRY OF HOME AFFAIRS,008_20040311_S0003_T0005,https://sprs.parl.gov.sg/search/topic?reportid...,2004-03-11,2019-07-09 05:17:11.736344,
12007,12022,12022,10,77,8,1,BUDGET,"<span style=""FONT-SIZE: 13pt; FONT-FAMILY: 'Ti...",2019-07-09 05:17:19.439789,,12022,HEAD D - CABINET OFFICE,004_20040311_S0003_T0002,https://sprs.parl.gov.sg/search/topic?reportid...,2004-03-11,2019-07-09 05:17:19.431688,


In [8]:
df.session_type.value_counts()

Written Answers to Questions for Oral Answer Not Answered by 3.00 pm    2322
Written Answers to Questions                                            2271
Oral Answers to Questions                                               1815
ORAL ANSWERS TO QUESTIONS                                               1173
Motions                                                                  785
ORAL ANSWERS TO QUESTIONS NOT REACHED                                    615
BILLS                                                                    611
WRITTEN ANSWERS TO QUESTIONS                                             469
Second Reading Bills                                                     356
MOTIONS                                                                  356
Bills Introduced                                                         299
BUDGET                                                                   250
President's Address                                                      113

In [9]:
df.session_type.isna().sum()

17

## Clean up session types

We can see that there are various session types, so we will attempt to streamline the session types.

In [10]:
df.session_type = df[['session_type','title']].apply(lambda x : x.session_type if x.session_type is not None else x.title,axis=1  )


In [11]:
df.session_type.isna().sum()

0

In [12]:
df.session_type.value_counts()

Written Answers to Questions for Oral Answer Not Answered by 3.00 pm                 2322
Written Answers to Questions                                                         2271
Oral Answers to Questions                                                            1815
ORAL ANSWERS TO QUESTIONS                                                            1173
Motions                                                                               785
ORAL ANSWERS TO QUESTIONS NOT REACHED                                                 615
BILLS                                                                                 611
WRITTEN ANSWERS TO QUESTIONS                                                          469
Second Reading Bills                                                                  356
MOTIONS                                                                               356
Bills Introduced                                                                      299
BUDGET    

In [13]:
# Convert to all upper case
df.session_type = df.session_type.map(lambda x : x.upper())
# Shorten any clarification XXXX to CLARIFICATION
df.session_type = df.session_type.map(lambda x : 'CLARIFICATION' if re.search('CLARIFICATION',x) else x)
# Convert ATBP to 'ASSENTS TO BILLS PASSED'
df.session_type = df.session_type.map(lambda x : 'ASSENTS TO BILLS PASSED' if re.search('ATBP',x) else x)
# remove \t\r\n
df.session_type = df.session_type.map(lambda x : re.sub('\t|\r\n','',x))
# Convert all variations of written answers to WRITTEN ANSWERS
df.session_type = df.session_type.map(lambda x : 'WRITTEN ANSWERS' if re.search('WRITTEN ANSWER',x) else x)
# Convert all variations of oral answers to ORAL ANSWERS
df.session_type = df.session_type.map(lambda x : 'ORAL ANSWERS' if re.search('ORAL ANSWER',x) else x)
# Clean 'PRESIDENT'S ADDRESS"
df.session_type = df.session_type.map(lambda x : "PRESIDENT'S ADDRESS" if re.search("PRESIDENT'S ADDRESS",x) else x)
# Clean MINISTERIAL STATEMENT
df.session_type = df.session_type.map(lambda x : "MINISTERIAL STATEMENTS" if re.search("MINISTERIAL STATEMENT",x) else x)
# Clean BILLS INTRODUCED
df.session_type = df.session_type.map(lambda x : "(BILL INTRODUCED" if re.search("(BILL|BILL'S) INTRODUCED",x) else x)
# Clean MOTION
df.session_type = df.session_type.map(lambda x : "MOTIONS" if re.search("MOTION",x) else x)

In [14]:
df.session_type.value_counts()

WRITTEN ANSWERS            5132
ORAL ANSWERS               3621
MOTIONS                    1148
BILLS                       625
SECOND READING BILLS        356
BUDGET                      356
BILLS INTRODUCED            299
PRESIDENT'S ADDRESS         181
ASSENTS TO BILLS PASSED     106
MINISTERIAL STATEMENTS       64
ANNOUNCEMENT BY SPEAKER      26
SPEAKER                      22
MISCELLANEOUS                21
CLARIFICATION                14
ADMINISTRATION OF OATHS      12
TRIBUTES                      8
ASSENT TO BILLS PASSED        6
(BILL INTRODUCED              3
PETITIONS                     3
DEPUTY SPEAKER                2
ADMINISTRATION OF OATH        2
OBITUARY SPEECHES             1
Name: session_type, dtype: int64

In [15]:
df.columns

Index(['article_id', 'link_id', 'parliament_num', 'volume_num', 'sitting_num',
       'session_num', 'session_type', 'article_text', 'created_on',
       'updated_on', 'id', 'title', 'res_url', 'src_url', 'sitting_date',
       'created_on', 'updated_on'],
      dtype='object')

In [16]:
# select only articles between 2019 and 2009 (10 years)
year_mask = (df.sitting_date.dt.year >= 2009) & (df.sitting_date.dt.year <= 2019)
df_year = df[year_mask].copy()

# select only Answers
df_year.sort_values(by='session_type',na_position='first').head()

# drop None session type and only get "answers"
columns = ['parliament_num','volume_num',
           'sitting_num','session_num',
           'session_type','article_text',
          'title','sitting_date','src_url']
df = df_year[columns].copy()

In [17]:
df.iloc[1].article_text[:400] # new API

'<p class="ql-align-justify">4 <strong>Mr Vikram Nair</strong> asked&nbsp;the Minister for Foreign Affairs whether he can provide an update on the state of bilateral relations with Malaysia following the 9th Malaysia-Singapore Leaders\' Retreat.&nbsp;</p><p class="ql-align-justify"><strong>\tThe Minister for Foreign Affairs (Dr Vivian Balakrishnan)</strong>: Mr Speaker, Malaysia hosted the 9<sup>th</'

In [18]:
df.iloc[8400].article_text[:400] # old API v1

'<span style="FONT-SIZE: 13pt; FONT-FAMILY: \'Times New Roman\'"><p>\xa0\xa0\xa0\xa0 15.\xa0<b>Dr Lim Wee Kiak</b> asked the Minister for Transport following the recent errors in calculation of distances travelled by buses (a) how many commuters or EZ-</p><p align="left">Column: 2215</p><p>\r\nLink card holders were affected; (b) whether there is a more convenient alternative for less literate commuters such as senio'

In [19]:
df.iloc[8290].article_text[:800] # old API v2

'<div class="body hansardBaseBody hansardContenteBody" id="id-d2f0fd7a-ff91-410d-82b7-f6a1e5a30855"><a name="id-d2f0fd7a-ff91-410d-82b7-f6a1e5a30855"><!--a--></a><div class="section hcSection hcQuestion" id="id-23e4d288-a0e0-4d63-8218-78508b6bbf99"><a name="id-16cdf09d-8c4f-46dc-89aa-293369475309__id-23e4d288-a0e0-4d63-8218-78508b6bbf99"><!--a--></a><p class="p hcParagraph" id="id-52dd1191-0966-489e-8f18-fabc034dda98"><a name="id-16cdf09d-8c4f-46dc-89aa-293369475309__id-52dd1191-0966-489e-8f18-fabc034dda98"><!--a--></a><span class="ph hcInline hcQuestionNumber">3 </span><span class="ph hcInline hcMember"><strong class="ph b">Ms Sylvia Lim</strong></span> asked\r\nthe Minister for Trade and Industry if he would provide an interim\r\nassessment on the impact of the Shell oil refinery fire and fac'

## Cleaning HTML

Here we do the following:
1. Remove HTML Markup
2. Create a list in which text are seperated into chunks from one speaker to the next

A function `parse_hansard_text` was created. An attempt was made to run the parser on the newer API.

In [16]:
import unicodedata

# display(df.iloc[8270].article_text)
test = df.iloc[8270].article_text
display(df.iloc[8270].src_url)
print("\n")
# display(test)
# # print("\n")
bs = BeautifulSoup(test, 'lxml')
# print(bs.original_encoding)
if (bs.find('div',{'align':"left"})):
    print("old API v1")
    t = [unicodedata.normalize("NFKD",re.sub('((Page|Column): \d+)' ,' ',BeautifulSoup(t, 'html5lib').get_text(strip=True))).replace('. ','.') for t in test.split('<b>')]
    display(t[1:])
    p = t[2].split(':', 1)  # only select 2 terms in split
    if len(p) > 1:
        entity = p[0]
        content = " ".join([e.strip() for e in p[1].split('.')])
elif (bs.find('div',{'class':"body hansardBaseBody hansardContenteBody"})):
    print("old API v2")
    t = [unicodedata.normalize("NFKD",re.sub('((Page|Column): \d+)' ,' ',BeautifulSoup(t, 'html5lib').get_text(strip=True).replace('\n',' '))) for t in test.split('<strong class="ph b">')]
    display(t[1:])
    p = t[2].split(':', 1)  # only select 2 terms in split
    if len(p) > 1:
        entity = p[0]
        content = " ".join([e.strip() for e in p[1].split('.')])

'https://sprs.parl.gov.sg/search/topic?reportid=025_20111021_S0008_T0007'



old API v2


['Mr Patrick Tay Teck Guan asked the Deputy Prime Minister and Minister for Manpower what is the breakdown of Singaporean, permanent resident and foreign Professionals, Managers and Executives in Singapore from 2006 to 2010 according to the various industries/sectors. ',
 'Mr Tharman Shanmugaratnam: The breakdown of Singaporean, permanent resident and foreign Professionals, Managers and Executives in Singapore from 2006 to 2010 according to the various industries/sectors are provided in Table 1 below.']

In [17]:
parse_hansard_text(df.iloc[8270].article_text)

['Mr Patrick Tay Teck Guan asked the Deputy Prime Minister and Minister for Manpower what is the breakdown of Singaporean, permanent resident and foreign Professionals, Managers and Executives in Singapore from 2006 to 2010 according to the various industries/sectors. ',
 'Mr Tharman Shanmugaratnam: The breakdown of Singaporean, permanent resident and foreign Professionals, Managers and Executives in Singapore from 2006 to 2010 according to the various industries/sectors are provided in Table 1 below.']

In [18]:
df.iloc[8400].article_text

'<span style="FONT-SIZE: 13pt; FONT-FAMILY: \'Times New Roman\'"><p>\xa0\xa0\xa0\xa0 15.\xa0<b>Dr Lim Wee Kiak</b> asked the Minister for Transport following the recent errors in calculation of distances travelled by buses (a) how many commuters or EZ-</p><p align="left">Column: 2215</p><p>\r\nLink card holders were affected; (b) whether there is a more convenient alternative for less literate commuters such as senior citizens, who\xa0may\xa0not\xa0 know how to use the ticketing\xa0machine to receive their refunds; and (c) what are the lessons learnt from this incident and how can we prevent it from occurring again.</p>\n<p>\xa0</p>\n</span></div>\n<!--MP_NAME:The Minister for Transport (Mr Raymond Lim Siang Keat)-->\n<div align="left"><span style="FONT-SIZE: 13pt; FONT-FAMILY: \'Times New Roman\'"><p>\xa0\xa0\xa0\xa0 <b>The Minister for Transport (Mr Raymond Lim Siang Keat)</b>: Sir, the LTA and the public transport operators have completed a thorough review of all 6,600 bus stop pair

In [19]:
parse_hansard_text(df.iloc[8400].article_text)

['Dr Lim Wee Kiak asked the Minister for Transport following the recent errors in calculation of distances travelled by buses (a) how many commuters or EZ- Link card holders were affected; (b) whether there is a more convenient alternative for less literate commuters such as senior citizens, who may not  know how to use the ticketing machine to receive their refunds; and (c) what are the lessons learnt from this incident and how can we prevent it from occurring again.',
 "The Minister for Transport (Mr Raymond Lim Siang Keat): Sir, the LTA and the public transport operators have completed a thorough review of all 6,600 bus stop pairs in the system. This review was undertaken after LTA found distance discrepancies in 13 bus stop pairs after the launch of distance fares on 3rd July 2010.In this review, distance discrepancies were found in 68 bus stop pairs (about 1% of total bus stop pairs) which led to overcharging, and 30 bus stop pairs (about 0.45%) which led to undercharging. On a we

In [20]:
get_chunks_info(df.iloc[8400].article_text,returns=False,verbose=True)

Number of Speech Chunks : 6
Word Count : 643 words
Estmated reading time : 2 minutes
Number of Individuals :  6
Speakers :  {'The Minister for Transport (Mr Raymond Lim Siang Keat)', 'Mr Raymond Lim Siang Keat', 'NA', 'Dr Lim Wee Kiak'}


## Parse our cleaning function

In [77]:
%time df['html_clean'] = df.article_text.map(parse_hansard_text)

CPU times: user 3min 2s, sys: 7.55 s, total: 3min 9s
Wall time: 4min 56s


In [27]:
df.iloc[8398].html_clean[1:2]

['The Second Minister for Finance (Mrs Lim Hwee Hua) (for the Minister for Finance): Sir, the two Integrated Resorts (IRs) collect casino entry levies on behalf of the Totalisator Board.They also pay betting taxes and GST on their business receipts to the Government. However, the entry of the IRs has also led to a restructuring of the gaming industry as a whole.Hence, while the IRs have brought in new revenues, collections from other gaming activities such as lotteries, horse and sports betting, and fruit machines operated by clubs have fallen.Taken together, the net increase in collections by the Totalisator Board with the entry of the IRs was about $130 million in the eight months between April and November 2010.Likewise, the net increase in revenues to the Government with the entry of the IRs was about $420 million over the same period.The Totalisator Board will pool collections from the casino entry levies with their recurrent surpluses, which go towards funding activities benefiti

## Joining our various chunks

In [78]:
%time df['cleaned_join'] = df.html_clean.map(lambda x : "<br/>".join(x))

CPU times: user 113 ms, sys: 494 ms, total: 608 ms
Wall time: 1.21 s


In [32]:
df.iloc[8398].cleaned_join[:400]

'Ms Denise Phua Lay Peng asked the Minister for Finance (a) how much additional revenue will the Government be receiving as a result of the good financial performance of the two Integrated Resorts; (b) what is the breakdown of the revenue attributable to the casinos and other operational units; and (c) what are the likely ways in which this additional revenue will be utilised.<br/>The Second Minist'

In [33]:
df.head()

Unnamed: 0,parliament_num,volume_num,sitting_num,session_num,session_type,article_text,title,sitting_date,src_url,html_clean,cleaned_join
0,13,94,105,2,SECOND READING BILLS,<p>[(proc text) Debate resumed. (proc text)]</...,PROTECTION FROM ONLINE FALSEHOODS AND MANIPULA...,2019-05-08,https://sprs.parl.gov.sg/search/sprs3topic?rep...,"[[(proc text) Debate resumed. (proc text)], Mr...",[(proc text) Debate resumed. (proc text)]<br/>...
1,13,94,105,2,ORAL ANSWERS,"<p class=""ql-align-justify"">4 <strong>Mr Vikra...",STATE OF BILATERAL RELATIONS WITH MALAYSIA FOL...,2019-05-08,https://sprs.parl.gov.sg/search/sprs3topic?rep...,[Mr Vikram Nair asked the Minister for Foreign...,Mr Vikram Nair asked the Minister for Foreign ...
2,13,94,105,2,ORAL ANSWERS,"<p class=""ql-align-justify"">8 <strong>Assoc Pr...",COMPANIES WITH MEASURES TO DEAL WITH WORKPLACE...,2019-05-08,https://sprs.parl.gov.sg/search/sprs3topic?rep...,[Assoc Prof Walter Theseira asked the Minister...,Assoc Prof Walter Theseira asked the Minister ...
3,13,94,105,2,ORAL ANSWERS,<p>5 <strong>Ms Irene Quay Siew Ching</strong>...,REVIEW OF DRUG TESTING STANDARDS IN SINGAPORE ...,2019-05-08,https://sprs.parl.gov.sg/search/sprs3topic?rep...,[Ms Irene Quay Siew Ching asked the Minister f...,Ms Irene Quay Siew Ching asked the Minister fo...
4,13,94,105,2,ORAL ANSWERS,"<p class=""ql-align-justify"">2 <strong>Mr Lim B...",LIVING IN PRIVATE PROPERTIES BUT WITH NO DECLA...,2019-05-08,https://sprs.parl.gov.sg/search/sprs3topic?rep...,[Mr Lim Biow Chuan asked the Deputy Prime Mini...,Mr Lim Biow Chuan asked the Deputy Prime Minis...


## Import into DocDB (Mongo)

We use MongoDB for production as it will allow for quicker document retrieval in the future.

In [105]:
import pymongo 

In [13]:
client = pymongo.MongoClient("mongodb://localhost:27017/")

db = client["parliament"]
articles = db["articles"]

In [34]:
x = articles.insert_many(df.to_dict('records'))
print(client.list_database_names())

['admin', 'config', 'local', 'parliament']


In [35]:
print(db.list_collection_names())

['articles']


In [107]:
articles.find_one().keys()

dict_keys(['_id', 'parliament_num', 'volume_num', 'sitting_num', 'session_num', 'session_type', 'article_text', 'title', 'sitting_date', 'src_url', 'html_clean', 'cleaned_join'])

In [37]:
print(articles.find_one()['cleaned_join'][:400])

[(proc text) Debate resumed. (proc text)]<br/>Mr Deputy Speaker: Mr Png Eng Huat.6.57 pm<br/>Mr Png Eng Huat (Hougang): Sir, if this Bill is drafted solely to target the perpetrators of hate, violence, intolerance and disdain towards another race or religion, by all means, let us do it. We should never allow such people to propagate their vile ideologies in any form or manner. This world has no pl


In [7]:
mongo_df = pd.DataFrame.from_records(articles.find())

In [8]:
mongo_df.head()

Unnamed: 0,_id,article_text,chunks,cleaned_join,dominant_topic,html_clean,parliament_num,parsed_convo,persons_involved,session_num,session_type,sitting_date,sitting_num,src_url,title,volume_num
0,5d27eca6172d9aa762d4802f,<p>[(proc text) Debate resumed. (proc text)]</...,"{""0"": {""entity"": ""NA"", ""content"": ""[(proc text...",[(proc text) Debate resumed. (proc text)]<br/>...,Society,"[[(proc text) Debate resumed. (proc text)], Mr...",13,[{'content': '[(proc text) Debate resumed. (pr...,"[Mr Leon Perera, Mr K Shanmugam, Assoc Prof Wa...",2,SECOND READING BILLS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,PROTECTION FROM ONLINE FALSEHOODS AND MANIPULA...,94
1,5d27eca6172d9aa762d48030,"<p class=""ql-align-justify"">4 <strong>Mr Vikra...","{""0"": {""entity"": ""NA"", ""content"": ""Mr Vikram N...",Mr Vikram Nair asked the Minister for Foreign ...,Society,[Mr Vikram Nair asked the Minister for Foreign...,13,[{'content': 'Mr Vikram Nair asked the Ministe...,"[Dr Vivian Balakrishnan, The Minister for Fore...",2,ORAL ANSWERS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,STATE OF BILATERAL RELATIONS WITH MALAYSIA FOL...,94
2,5d27eca6172d9aa762d48031,"<p class=""ql-align-justify"">8 <strong>Assoc Pr...","{""0"": {""entity"": ""NA"", ""content"": ""Assoc Prof ...",Assoc Prof Walter Theseira asked the Minister ...,Internal Security,[Assoc Prof Walter Theseira asked the Minister...,13,[{'content': 'Assoc Prof Walter Theseira asked...,"[Ms Low Yen Ling, Ms Anthea Ong, Assoc Prof Wa...",2,ORAL ANSWERS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,COMPANIES WITH MEASURES TO DEAL WITH WORKPLACE...,94
3,5d27eca6172d9aa762d48032,<p>5 <strong>Ms Irene Quay Siew Ching</strong>...,"{""0"": {""entity"": ""NA"", ""content"": ""Ms Irene Qu...",Ms Irene Quay Siew Ching asked the Minister fo...,Environment,[Ms Irene Quay Siew Ching asked the Minister f...,13,[{'content': 'Ms Irene Quay Siew Ching asked t...,"[The Senior Minister of State for Health, Dr L...",2,ORAL ANSWERS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,REVIEW OF DRUG TESTING STANDARDS IN SINGAPORE ...,94
4,5d27eca6172d9aa762d48033,"<p class=""ql-align-justify"">2 <strong>Mr Lim B...","{""0"": {""entity"": ""NA"", ""content"": ""Mr Lim Biow...",Mr Lim Biow Chuan asked the Deputy Prime Minis...,Employment,[Mr Lim Biow Chuan asked the Deputy Prime Mini...,13,[{'content': 'Mr Lim Biow Chuan asked the Depu...,"[Ms Indranee Rajah, The Second Minister for Fi...",2,ORAL ANSWERS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,LIVING IN PRIVATE PROPERTIES BUT WITH NO DECLA...,94


## Adding Entities

In [48]:
from text_utils import cleaning 
from text_utils import metrics
from bson import json_util

In [83]:
sample = metrics.get_chunks_info(mongo_df['article_text'][8400],returns=True,verbose=False)
sample[0]

{'entity': 'NA',
 'content': 'Dr Lim Wee Kiak asked the Minister for Transport following the recent errors in calculation of distances travelled by buses (a) how many commuters or EZ- Link card holders were affected; (b) whether there is a more convenient alternative for less literate commuters such as senior citizens, who may not  know how to use the ticketing machine to receive their refunds; and (c) what are the lessons learnt from this incident and how can we prevent it from occurring again.'}

In [36]:
list(set([re.sub(r'(\(.*?\))','',entity['entity']).strip() for entity in sample.values()\
                     if entity['entity'] != 'NA']))

['The Minister for Transport', 'Mr Raymond Lim Siang Keat', 'Dr Lim Wee Kiak']

In [38]:
entity = cleaning.get_entities(sample)
entity

['The Minister for Transport', 'Mr Raymond Lim Siang Keat', 'Dr Lim Wee Kiak']

In [87]:
%time mongo_df['chunks'] = mongo_df.article_text.map( lambda x : get_chunks_info(x,verbose=False))

CPU times: user 2min 38s, sys: 4.06 s, total: 2min 42s
Wall time: 3min 28s


In [88]:
%time mongo_df['parsed_convo'] = mongo_df.chunks.map(lambda x : cleaning.parse_topics(x))

CPU times: user 233 ms, sys: 356 ms, total: 589 ms
Wall time: 604 ms


In [89]:
%time mongo_df['persons_involved'] = mongo_df.chunks.map(lambda x : cleaning.get_entities(x))

CPU times: user 87.6 ms, sys: 4.15 ms, total: 91.7 ms
Wall time: 91.7 ms


In [90]:
%time mongo_df['persons_involved'] = mongo_df.persons_involved.map(lambda x : [item for item in x if len(item)<40])

CPU times: user 17.1 ms, sys: 1.58 ms, total: 18.7 ms
Wall time: 19.8 ms


In [93]:
mongo_df.head()

Unnamed: 0,_id,article_text,cleaned_join,html_clean,parliament_num,session_num,session_type,sitting_date,sitting_num,src_url,title,volume_num,chunks,parsed_convo,persons_involved
0,5d27eca6172d9aa762d4802f,<p>[(proc text) Debate resumed. (proc text)]</...,[(proc text) Debate resumed. (proc text)]<br/>...,"[[(proc text) Debate resumed. (proc text)], Mr...",13,2,SECOND READING BILLS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,PROTECTION FROM ONLINE FALSEHOODS AND MANIPULA...,94,"{0: {'entity': 'NA', 'content': '[(proc text) ...",[{'content': '[(proc text) Debate resumed. (pr...,"[Mr Leon Perera, Mr K Shanmugam, Assoc Prof Wa..."
1,5d27eca6172d9aa762d48030,"<p class=""ql-align-justify"">4 <strong>Mr Vikra...",Mr Vikram Nair asked the Minister for Foreign ...,[Mr Vikram Nair asked the Minister for Foreign...,13,2,ORAL ANSWERS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,STATE OF BILATERAL RELATIONS WITH MALAYSIA FOL...,94,"{0: {'entity': 'NA', 'content': 'Mr Vikram Nai...",[{'content': 'Mr Vikram Nair asked the Ministe...,"[Dr Vivian Balakrishnan, The Minister for Fore..."
2,5d27eca6172d9aa762d48031,"<p class=""ql-align-justify"">8 <strong>Assoc Pr...",Assoc Prof Walter Theseira asked the Minister ...,[Assoc Prof Walter Theseira asked the Minister...,13,2,ORAL ANSWERS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,COMPANIES WITH MEASURES TO DEAL WITH WORKPLACE...,94,"{0: {'entity': 'NA', 'content': 'Assoc Prof Wa...",[{'content': 'Assoc Prof Walter Theseira asked...,"[Ms Low Yen Ling, Ms Anthea Ong, Assoc Prof Wa..."
3,5d27eca6172d9aa762d48032,<p>5 <strong>Ms Irene Quay Siew Ching</strong>...,Ms Irene Quay Siew Ching asked the Minister fo...,[Ms Irene Quay Siew Ching asked the Minister f...,13,2,ORAL ANSWERS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,REVIEW OF DRUG TESTING STANDARDS IN SINGAPORE ...,94,"{0: {'entity': 'NA', 'content': 'Ms Irene Quay...",[{'content': 'Ms Irene Quay Siew Ching asked t...,"[The Senior Minister of State for Health, Dr L..."
4,5d27eca6172d9aa762d48033,"<p class=""ql-align-justify"">2 <strong>Mr Lim B...",Mr Lim Biow Chuan asked the Deputy Prime Minis...,[Mr Lim Biow Chuan asked the Deputy Prime Mini...,13,2,ORAL ANSWERS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,LIVING IN PRIVATE PROPERTIES BUT WITH NO DECLA...,94,"{0: {'entity': 'NA', 'content': 'Mr Lim Biow C...",[{'content': 'Mr Lim Biow Chuan asked the Depu...,"[Ms Indranee Rajah, The Second Minister for Fi..."


In [108]:
mongo_df['_id'][0]

ObjectId('5d27eca6172d9aa762d4802f')

In [113]:
query = { "_id": mongo_df['_id'][0] }
value = { "$set": {"chunks": json_util.dumps(mongo_df['chunks'][0]),
                   "parsed_convo" : mongo_df['parsed_convo'][0],
                   "persons_involved" : mongo_df['persons_involved'][0]}}


articles.update_one(query,value)

<pymongo.results.UpdateResult at 0x228867208>

In [127]:
for row in tqdm(mongo_df.itertuples()):
    query = { "_id" : row._1}
    value = { "$set": { "chunks": json_util.dumps(row.chunks),
                       "parsed_convo" : row.parsed_convo,
                       "persons_involved" : row.persons_involved}}
    articles.update_one(query,value)

9531it [00:24, 386.30it/s]


In [128]:
pd.DataFrame.from_records(articles.find()).head()

Unnamed: 0,_id,article_text,chunks,cleaned_join,html_clean,parliament_num,parsed_convo,persons_involved,session_num,session_type,sitting_date,sitting_num,src_url,title,volume_num
0,5d27eca6172d9aa762d4802f,<p>[(proc text) Debate resumed. (proc text)]</...,"{""0"": {""entity"": ""NA"", ""content"": ""[(proc text...",[(proc text) Debate resumed. (proc text)]<br/>...,"[[(proc text) Debate resumed. (proc text)], Mr...",13,[{'content': '[(proc text) Debate resumed. (pr...,"[Mr Leon Perera, Mr K Shanmugam, Assoc Prof Wa...",2,SECOND READING BILLS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,PROTECTION FROM ONLINE FALSEHOODS AND MANIPULA...,94
1,5d27eca6172d9aa762d48030,"<p class=""ql-align-justify"">4 <strong>Mr Vikra...","{""0"": {""entity"": ""NA"", ""content"": ""Mr Vikram N...",Mr Vikram Nair asked the Minister for Foreign ...,[Mr Vikram Nair asked the Minister for Foreign...,13,[{'content': 'Mr Vikram Nair asked the Ministe...,"[Dr Vivian Balakrishnan, The Minister for Fore...",2,ORAL ANSWERS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,STATE OF BILATERAL RELATIONS WITH MALAYSIA FOL...,94
2,5d27eca6172d9aa762d48031,"<p class=""ql-align-justify"">8 <strong>Assoc Pr...","{""0"": {""entity"": ""NA"", ""content"": ""Assoc Prof ...",Assoc Prof Walter Theseira asked the Minister ...,[Assoc Prof Walter Theseira asked the Minister...,13,[{'content': 'Assoc Prof Walter Theseira asked...,"[Ms Low Yen Ling, Ms Anthea Ong, Assoc Prof Wa...",2,ORAL ANSWERS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,COMPANIES WITH MEASURES TO DEAL WITH WORKPLACE...,94
3,5d27eca6172d9aa762d48032,<p>5 <strong>Ms Irene Quay Siew Ching</strong>...,"{""0"": {""entity"": ""NA"", ""content"": ""Ms Irene Qu...",Ms Irene Quay Siew Ching asked the Minister fo...,[Ms Irene Quay Siew Ching asked the Minister f...,13,[{'content': 'Ms Irene Quay Siew Ching asked t...,"[The Senior Minister of State for Health, Dr L...",2,ORAL ANSWERS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,REVIEW OF DRUG TESTING STANDARDS IN SINGAPORE ...,94
4,5d27eca6172d9aa762d48033,"<p class=""ql-align-justify"">2 <strong>Mr Lim B...","{""0"": {""entity"": ""NA"", ""content"": ""Mr Lim Biow...",Mr Lim Biow Chuan asked the Deputy Prime Minis...,[Mr Lim Biow Chuan asked the Deputy Prime Mini...,13,[{'content': 'Mr Lim Biow Chuan asked the Depu...,"[Ms Indranee Rajah, The Second Minister for Fi...",2,ORAL ANSWERS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,LIVING IN PRIVATE PROPERTIES BUT WITH NO DECLA...,94


In [115]:
new_df = pd.DataFrame.from_records(articles.find()).head()

In [126]:
new_df.iloc[0].chunks[:200]

'{"0": {"entity": "NA", "content": "[(proc text) Debate resumed. (proc text)]"}, "1": {"entity": "Mr Deputy Speaker", "content": " Mr Png Eng Huat.6.57 pm"}, "2": {"entity": "Mr Png Eng Huat (Hougang)"'