In [1]:
import pymongo
import pandas as pd
import numpy as np
import json

from tqdm import tqdm
from bson import json_util 

from bs4 import BeautifulSoup
import unicodedata
import re
from text_utils import cleaning
from text_utils.metrics import get_chunks_info

## Load Data from DB

In [2]:
import os
import yaml

# Read the connection settings used below from the local YAML config file.
with open("config.yaml", "r") as ymlfile:
    settings = yaml.safe_load(ymlfile)

# Set to True to connect with the 'mongo-remote' entry of config.yaml;
# False connects to the MongoDB instance on localhost.
isRemote = False

if isRemote:
    remote = settings['mongo-remote']
    client = pymongo.MongoClient(host=remote['host'],
                                 port=remote['port'],
                                 username=remote['user'],
                                 password=remote['pw'],
                                 authMechanism=remote['authMechanism'])
else:
    client = pymongo.MongoClient("mongodb://localhost:27017/")

try:
    # Force a request to the server: constructing MongoClient alone does not
    # surface an unreachable server (connect=True seems to be useless here).
    client.server_info()
except pymongo.errors.ServerSelectionTimeoutError as err:
    print(err)
    # Stop here instead of continuing with an unusable client; otherwise the
    # following cells would stall on the same timeout and fail again.
    raise


In [3]:
# Handles for the "parliament" database and its "articles" collection
# (attribute-style access, equivalent to subscripting by name).
db = client.parliament
articles = db.articles

In [4]:
# Fetch every document of the collection and load them into a DataFrame.
article_cursor = articles.find()
mongo_df = pd.DataFrame.from_records(article_cursor)

# Preview the first rows.
mongo_df.head()

Unnamed: 0,_id,article_text,chunks,cleaned_join,dominant_topic,html_clean,parliament_num,parsed_convo,persons_involved,session_num,session_type,sitting_date,sitting_num,src_url,title,volume_num
0,5d27eca6172d9aa762d4802f,<p>[(proc text) Debate resumed. (proc text)]</...,[{'content': '[(proc text) Debate resumed. (pr...,[(proc text) Debate resumed. (proc text)]<br/>...,Society,"[[(proc text) Debate resumed. (proc text)], Mr...",13,[{'content': '[(proc text) Debate resumed. (pr...,"[The Minister for Law, Mr Darryl David, Mr Lou...",2,SECOND READING BILLS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,PROTECTION FROM ONLINE FALSEHOODS AND MANIPULA...,94
1,5d27eca6172d9aa762d48030,"<p class=""ql-align-justify"">4 <strong>Mr Vikra...",[{'content': 'Mr Vikram Nair asked the Ministe...,Mr Vikram Nair asked the Minister for Foreign ...,Society,[Mr Vikram Nair asked the Minister for Foreign...,13,[{'content': 'Mr Vikram Nair asked the Ministe...,"[The Minister for Foreign Affairs, Mr Vikram N...",2,ORAL ANSWERS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,STATE OF BILATERAL RELATIONS WITH MALAYSIA FOL...,94
2,5d27eca6172d9aa762d48031,"<p class=""ql-align-justify"">8 <strong>Assoc Pr...",[{'content': 'Assoc Prof Walter Theseira asked...,Assoc Prof Walter Theseira asked the Minister ...,Internal Security,[Assoc Prof Walter Theseira asked the Minister...,13,[{'content': 'Assoc Prof Walter Theseira asked...,"[Ms Low Yen Ling, Assoc Prof Walter Theseira, ...",2,ORAL ANSWERS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,COMPANIES WITH MEASURES TO DEAL WITH WORKPLACE...,94
3,5d27eca6172d9aa762d48032,<p>5 <strong>Ms Irene Quay Siew Ching</strong>...,[{'content': 'Ms Irene Quay Siew Ching asked t...,Ms Irene Quay Siew Ching asked the Minister fo...,Environment,[Ms Irene Quay Siew Ching asked the Minister f...,13,[{'content': 'Ms Irene Quay Siew Ching asked t...,"[The Senior Minister of State for Health, Ms I...",2,ORAL ANSWERS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,REVIEW OF DRUG TESTING STANDARDS IN SINGAPORE ...,94
4,5d27eca6172d9aa762d48033,"<p class=""ql-align-justify"">2 <strong>Mr Lim B...",[{'content': 'Mr Lim Biow Chuan asked the Depu...,Mr Lim Biow Chuan asked the Deputy Prime Minis...,Employment,[Mr Lim Biow Chuan asked the Deputy Prime Mini...,13,[{'content': 'Mr Lim Biow Chuan asked the Depu...,"[Ms Indranee Rajah, Mr Lim Biow Chuan, The Sec...",2,ORAL ANSWERS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,LIVING IN PRIVATE PROPERTIES BUT WITH NO DECLA...,94


In [None]:
%time mongo_df['chunks'] = mongo_df.article_text.map( lambda x : get_chunks_info(x,verbose=False))

In [None]:
%time mongo_df['parsed_convo'] = mongo_df.chunks.map(lambda x : cleaning.parse_topics(x))

In [None]:
%time mongo_df['html_clean'] = mongo_df.article_text.map(cleaning.parse_hansard_text)

In [None]:
%time mongo_df['cleaned_join'] = mongo_df.html_clean.map(lambda x : "<br/>".join(x))

In [None]:
%time mongo_df['persons_involved'] = mongo_df.chunks.map(lambda x : cleaning.get_entities(x))

In [None]:
# Write the derived columns back onto each MongoDB document, matched by _id.
# Columns are selected by name: itertuples() on the full frame exposes `_id`
# only under a positional alias (`_1`), which would silently refer to a
# different column if `_id` were not the first column.
derived_columns = ["_id", "chunks", "parsed_convo", "persons_involved",
                   "html_clean", "cleaned_join"]
records = mongo_df[derived_columns].itertuples(index=False, name=None)

# total= is passed explicitly because tqdm cannot take a length from an
# iterator, and without it no percentage/ETA is shown.
for doc_id, chunks, parsed_convo, persons_involved, html_clean, cleaned_join in tqdm(
        records, total=len(mongo_df)):
    query = {"_id": doc_id}
    value = {"$set": {
        # 'chunks' is serialized to a JSON string with bson.json_util;
        # the remaining fields are passed to MongoDB as-is.
        "chunks": json_util.dumps(chunks),
        "parsed_convo": parsed_convo,
        "persons_involved": persons_involved,
        "html_clean": html_clean,
        "cleaned_join": cleaned_join,
    }}
    articles.update_one(query, value)

In [None]:
# Re-read the collection into the DataFrame and preview the first rows.
mongo_df = pd.DataFrame.from_records(data=articles.find())
mongo_df.head(5)