## Named Entity Recognition

SpaCy named entity recognition

In [45]:
import datetime
import numpy as np
import pandas as pd
from datetime import date 

from text_utils import metrics
import re

import pymongo 
import spacy
import en_core_web_lg

In [46]:
# Connect to the MongoDB server on localhost and select the `articles`
# collection of the `parliament` database.
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client.get_database("parliament")
articles = db.get_collection("articles")

In [47]:
# Fetch a single document and list the fields it carries.
sample_article = articles.find_one()
sample_article.keys()

dict_keys(['_id', 'parliament_num', 'volume_num', 'sitting_num', 'session_num', 'session_type', 'article_text', 'title', 'sitting_date', 'src_url', 'html_clean', 'cleaned_join'])

In [48]:
# Show the first 400 characters of the `cleaned_join` field of one document.
cleaned_join_text = articles.find_one()['cleaned_join']
cleaned_join_text[:400]

'[(proc text) Debate resumed. (proc text)]<br/>Mr Deputy Speaker: Mr Png Eng Huat.6.57 pm<br/>Mr Png Eng Huat (Hougang): Sir, if this Bill is drafted solely to target the perpetrators of hate, violence, intolerance and disdain towards another race or religion, by all means, let us do it. We should never allow such people to propagate their vile ideologies in any form or manner. This world has no pl'

In [49]:
# Read every document returned by the collection query into a DataFrame.
article_cursor = articles.find()
mongo_df = pd.DataFrame.from_records(article_cursor)

In [50]:
# Display the first five rows as a sanity check on the loaded frame.
mongo_df.head(n=5)

Unnamed: 0,_id,article_text,cleaned_join,html_clean,parliament_num,session_num,session_type,sitting_date,sitting_num,src_url,title,volume_num
0,5d25dd11c63e4c829673ceac,<p>[(proc text) Debate resumed. (proc text)]</...,[(proc text) Debate resumed. (proc text)]<br/>...,"[[(proc text) Debate resumed. (proc text)], Mr...",13,2,SECOND READING BILLS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,PROTECTION FROM ONLINE FALSEHOODS AND MANIPULA...,94
1,5d25dd11c63e4c829673cead,"<p class=""ql-align-justify"">4 <strong>Mr Vikra...",Mr Vikram Nair asked the Minister for Foreign ...,[Mr Vikram Nair asked the Minister for Foreign...,13,2,ORAL ANSWERS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,STATE OF BILATERAL RELATIONS WITH MALAYSIA FOL...,94
2,5d25dd11c63e4c829673ceae,"<p class=""ql-align-justify"">8 <strong>Assoc Pr...",Assoc Prof Walter Theseira asked the Minister ...,[Assoc Prof Walter Theseira asked the Minister...,13,2,ORAL ANSWERS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,COMPANIES WITH MEASURES TO DEAL WITH WORKPLACE...,94
3,5d25dd11c63e4c829673ceaf,<p>5 <strong>Ms Irene Quay Siew Ching</strong>...,Ms Irene Quay Siew Ching asked the Minister fo...,[Ms Irene Quay Siew Ching asked the Minister f...,13,2,ORAL ANSWERS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,REVIEW OF DRUG TESTING STANDARDS IN SINGAPORE ...,94
4,5d25dd11c63e4c829673ceb0,"<p class=""ql-align-justify"">2 <strong>Mr Lim B...",Mr Lim Biow Chuan asked the Deputy Prime Minis...,[Mr Lim Biow Chuan asked the Deputy Prime Mini...,13,2,ORAL ANSWERS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,LIVING IN PRIVATE PROPERTIES BUT WITH NO DECLA...,94


## SpaCy NER Implementation

In [12]:
# Run the large English spaCy pipeline over one article (row 8) and list
# every entity it finds, then render the entities inline with displaCy.
spacy_nlp = en_core_web_lg.load()
sample_text = mongo_df.loc[8, 'cleaned_join']
document = spacy_nlp(sample_text)

print('Original Sentence: \n%s' % document)
print('\nIdentified Entities:')
entity_rows = [(ent.label_, ent) for ent in document.ents]
for entity_row in entity_rows:
    print('Type: %s, Value: %s' % entity_row)

spacy.displacy.render(document, style="ent")

Original Sentence: 
Ms Irene Quay Siew Ching asked the Minister for Education with regard to the post-Secondary admission system to be implemented in 2028 (a) how does the new subject banding determine subsequent admission to post-Secondary institutions of choice; and (b) how can it be ensured that a student chooses subject bands based on his ability and talent and not influenced primarily by entry criteria of his favoured post-Secondary institutions.<br/>The Second Minister for Education (Ms Indranee Rajah) (for the Minister for Education): Mr Speaker, as announced earlier during the COS debate, MOE will study how the post-Secondary posting system should be adjusted to complement the roll-out of Full SBB.We have some time to work out the details, and will announce them in good time. But the general thrust is to build upon ongoing efforts to recognise the strengths and talents of students.For example, we have the Polytechnic Foundation Programme (PFP) today, where students who have don

In [13]:
# Same article (row 8), this time through the small English spaCy pipeline,
# so the entity output can be compared with the large model's.
nlp = spacy.load('en_core_web_sm')
sample_text = mongo_df.loc[8, 'cleaned_join']
document = nlp(sample_text)

print('Original Sentence: \n%s' % document)
print('\nIdentified Entities:')
entity_rows = [(ent.label_, ent) for ent in document.ents]
for entity_row in entity_rows:
    print('Type: %s, Value: %s' % entity_row)

spacy.displacy.render(document, style="ent")

Original Sentence: 
Ms Irene Quay Siew Ching asked the Minister for Education with regard to the post-Secondary admission system to be implemented in 2028 (a) how does the new subject banding determine subsequent admission to post-Secondary institutions of choice; and (b) how can it be ensured that a student chooses subject bands based on his ability and talent and not influenced primarily by entry criteria of his favoured post-Secondary institutions.<br/>The Second Minister for Education (Ms Indranee Rajah) (for the Minister for Education): Mr Speaker, as announced earlier during the COS debate, MOE will study how the post-Secondary posting system should be adjusted to complement the roll-out of Full SBB.We have some time to work out the details, and will announce them in good time. But the general thrust is to build upon ongoing efforts to recognise the strengths and talents of students.For example, we have the Polytechnic Foundation Programme (PFP) today, where students who have don

### Stanford NER

In [14]:
import os

from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

# Directory holding the local Stanford NER download. Set the STANFORD_NER_DIR
# environment variable to point somewhere else; when it is unset the original
# location is used, so existing setups keep working unchanged.
stanford_ner_dir = os.environ.get('STANFORD_NER_DIR',
                                  '/users/syamil/Downloads/stanford-ner')

jar = '%s/stanford-ner.jar' % stanford_ner_dir
model = '%s/classifiers/english.all.3class.distsim.crf.ser.gz' % stanford_ner_dir

st = StanfordNERTagger(model, jar, encoding='utf-8')

# Tag the same article (row 8) that the spaCy cells use, for comparison.
text = mongo_df['cleaned_join'][8]

tokenized_text = word_tokenize(text)
classified_text = st.tag(tokenized_text)

print(classified_text)

[('Ms', 'O'), ('Irene', 'PERSON'), ('Quay', 'PERSON'), ('Siew', 'PERSON'), ('Ching', 'PERSON'), ('asked', 'O'), ('the', 'O'), ('Minister', 'ORGANIZATION'), ('for', 'ORGANIZATION'), ('Education', 'ORGANIZATION'), ('with', 'O'), ('regard', 'O'), ('to', 'O'), ('the', 'O'), ('post-Secondary', 'O'), ('admission', 'O'), ('system', 'O'), ('to', 'O'), ('be', 'O'), ('implemented', 'O'), ('in', 'O'), ('2028', 'O'), ('(', 'O'), ('a', 'O'), (')', 'O'), ('how', 'O'), ('does', 'O'), ('the', 'O'), ('new', 'O'), ('subject', 'O'), ('banding', 'O'), ('determine', 'O'), ('subsequent', 'O'), ('admission', 'O'), ('to', 'O'), ('post-Secondary', 'O'), ('institutions', 'O'), ('of', 'O'), ('choice', 'O'), (';', 'O'), ('and', 'O'), ('(', 'O'), ('b', 'O'), (')', 'O'), ('how', 'O'), ('can', 'O'), ('it', 'O'), ('be', 'O'), ('ensured', 'O'), ('that', 'O'), ('a', 'O'), ('student', 'O'), ('chooses', 'O'), ('subject', 'O'), ('bands', 'O'), ('based', 'O'), ('on', 'O'), ('his', 'O'), ('ability', 'O'), ('and', 'O'), ('ta

We observe that spaCy's performance is much better than Stanford NER's. The next step is to improve the existing spaCy model so that it detects keywords better.

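`get_chunks_info`: takes an article's speech chunks and reports summary information about them — the number of chunks, the word count, the estimated reading time, and the speakers involved.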

`ReadTime` : https://help.medium.com/hc/en-us/articles/214991667-Read-time

> Read time is based on the average reading speed of an adult (roughly 265 WPM). We take the total word count of a post and translate it into minutes, with an adjustment made for images. For posts in Chinese, Japanese and Korean, it's a function of number of characters (500 characters/min) with an adjustment made for images.

In [7]:
from text_utils.cleaning import parse_hansard_text

In [8]:
# Show the `html_clean` value for row 8.
mongo_df.loc[8, 'html_clean']

['Ms Irene Quay Siew Ching asked the Minister for Education with regard to the post-Secondary admission system to be implemented in 2028 (a) how does the new subject banding determine subsequent admission to post-Secondary institutions of choice; and (b) how can it be ensured that a student chooses subject bands based on his ability and talent and not influenced primarily by entry criteria of his favoured post-Secondary institutions.',
 'The Second Minister for Education (Ms Indranee Rajah) (for the Minister for Education): Mr Speaker, as announced earlier during the COS debate, MOE will study how the post-Secondary posting system should be adjusted to complement the roll-out of Full SBB.We have some time to work out the details, and will announce them in good time. But the general thrust is to build upon ongoing efforts to recognise the strengths and talents of students.For example, we have the Polytechnic Foundation Programme (PFP) today, where students who have done well in the Norm

## Implement Entity recognition via rules

In [9]:
# Show the raw `article_text` (HTML) stored for row 10.
mongo_df.loc[10, 'article_text']

"<p>11 <strong>Mr Lim Biow Chuan</strong> asked&nbsp;the Minister for National Development (a) in the past three years, how many summons have been issued to offenders who feed pigeons; and (b) what other measures can the Ministry introduce to deter the feeding of pigeons.</p><p><strong>\tThe Senior Parliamentary Secretary to the Minister for National Development (Ms Sun Xueling) (for the Minister for National Development)</strong>:&nbsp;Over the past three years, 682 enforcement notices were issued for pigeon feeding offences.&nbsp;NParks takes measures to deter the feeding of pigeons, including installing cameras and conducting surveillance at identified feeding hotspots.&nbsp;NParks also works with Town Councils to put up notices to ask for information about feeding activities, so as to carry out enforcement operations in a more targeted manner.</p><p>Public education is also an important strategy.&nbsp;NParks, NEA and the Town Councils work together to educate residents about the en

In [10]:
# Show the source URL recorded for row 10.
mongo_df.loc[10, 'src_url']

'https://sprs.parl.gov.sg/search/sprs3topic?reportid=oral-answer-1994'

In [8]:
# Compute chunk-level info for the raw text of row 10; the helper also
# prints a summary of what it found.
article_html = mongo_df.loc[10, 'article_text']
t = metrics.get_chunks_info(article_html)

Number of Speech Chunks : 8
Word Count : 1303 words
Estmated reading time : 4 minutes
Number of Individuals :  8
Speakers :  {'NA', 'Mr Lim Biow Chuan (Mountbatten)', 'Ms Sun Xueling', 'The Senior Parliamentary Secretary to the Minister for National Development (Ms Sun Xueling) (for the Minister for National Development)', 'Mr Louis Ng Kok Kwang (Nee Soon)'}


In [23]:
# Flatten the per-chunk info in `t` into a list of {'content', 'type'} segments,
# then append one extra segment that concatenates everything after the first.
data = t
question = []  # not used in this cell; kept in case other cells expect the name — TODO confirm

segment = []
# Iterate under a separate name so `data` keeps referring to the whole mapping
# instead of being rebound to each chunk in turn.
for chunk_info in data.values():
    if chunk_info['entity'] == 'NA':
        # Chunks whose entity is 'NA' are labelled as the question
        # (presumably no speaker was identified for them — TODO confirm).
        chunk = {'content': chunk_info['content'], 'type': 'question'}
    else:
        # Prefix the entity so the text stays attributable once flattened.
        chunk = {
            'content': chunk_info['entity'] + ':' + chunk_info['content'],
            'type': 'response',
        }
    segment.append(chunk)

# NOTE(review): the first segment is skipped by position, not by type, so any
# later 'question' segments are included here as well — confirm this is intended.
responses = ''.join('\n\n' + part['content'] for part in segment[1:])

compiled_responses = {
    'type': 'compiled_responses',
    'content': responses,
}

segment.append(compiled_responses)

In [26]:
segment[:3]

[{'content': 'Mr Lim Biow Chuan asked the Minister for National Development (a) in the past three years, how many summons have been issued to offenders who feed pigeons; and (b) what other measures can the Ministry introduce to deter the feeding of pigeons.',
  'type': 'question'},
 {'content': 'The Senior Parliamentary Secretary to the Minister for National Development (Ms Sun Xueling) (for the Minister for National Development): Over the past three years, 682 enforcement notices were issued for pigeon feeding offences. NParks takes measures to deter the feeding of pigeons, including installing cameras and conducting surveillance at identified feeding hotspots. NParks also works with Town Councils to put up notices to ask for information about feeding activities, so as to carry out enforcement operations in a more targeted manner.Public education is also an important strategy. NParks, NEA and the Town Councils work together to educate residents about the environmental health and hygie

In [28]:
from text_utils.cleaning import get_entities

In [51]:
# Show the `html_clean` value for row 10.
mongo_df.loc[10, 'html_clean']

['Mr Lim Biow Chuan asked the Minister for National Development (a) in the past three years, how many summons have been issued to offenders who feed pigeons; and (b) what other measures can the Ministry introduce to deter the feeding of pigeons.',
 'The Senior Parliamentary Secretary to the Minister for National Development (Ms Sun Xueling) (for the Minister for National Development): Over the past three years, 682 enforcement notices were issued for pigeon feeding offences. NParks takes measures to deter the feeding of pigeons, including installing cameras and conducting surveillance at identified feeding hotspots. NParks also works with Town Councils to put up notices to ask for information about feeding activities, so as to carry out enforcement operations in a more targeted manner.Public education is also an important strategy. NParks, NEA and the Town Councils work together to educate residents about the environmental health and hygiene issues caused by pigeon feeding. This is don