# Doc2Vec Implementation on Corpus

## Import Libraries

In [44]:
import gensim
from gensim.utils import simple_preprocess
import pymongo
import pandas as pd
import numpy as np

from bson import ObjectId
from sklearn.model_selection import train_test_split

## Read in Data from MongoDB

In [6]:
client = pymongo.MongoClient("mongodb://localhost:27017/")

db = client["parliament"]
articles = db["articles"]

In [66]:
try:
    remote_client = pymongo.MongoClient('192.168.1.42',
                                 username='syamil',
                                 password='Rskfazw9..',
                                 authMechanism='SCRAM-SHA-256')
    remote_client.server_info() # force connection on a request as the
                         # connect=True parameter of MongoClient seems
                         # to be useless here 
except pymongo.errors.ServerSelectionTimeoutError as err:
    # do whatever you need
    print(err)


In [68]:
remote_db = remote_client["parliament"]
remote_articles = remote_db["articles"]

In [63]:
client = pymongo.MongoClient("mongodb://localhost:27017/")

db = client["parliament"]
articles = db["articles"]

In [70]:
mongo_df = pd.DataFrame.from_records(remote_articles.find())
mongo_df.head()

Unnamed: 0,_id,article_text,chunks,cleaned_join,dominant_topic,html_clean,parliament_num,parsed_convo,persons_involved,session_num,session_type,sitting_date,sitting_num,src_url,title,volume_num
0,5d27eca6172d9aa762d4802f,<p>[(proc text) Debate resumed. (proc text)]</...,"{""0"": {""entity"": ""NA"", ""content"": ""[(proc text...",[(proc text) Debate resumed. (proc text)]<br/>...,Society,"[[(proc text) Debate resumed. (proc text)], Mr...",13,[{'content': '[(proc text) Debate resumed. (pr...,"[Mr Leon Perera, Mr K Shanmugam, Assoc Prof Wa...",2,SECOND READING BILLS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,PROTECTION FROM ONLINE FALSEHOODS AND MANIPULA...,94
1,5d27eca6172d9aa762d48030,"<p class=""ql-align-justify"">4 <strong>Mr Vikra...","{""0"": {""entity"": ""NA"", ""content"": ""Mr Vikram N...",Mr Vikram Nair asked the Minister for Foreign ...,Society,[Mr Vikram Nair asked the Minister for Foreign...,13,[{'content': 'Mr Vikram Nair asked the Ministe...,"[Dr Vivian Balakrishnan, The Minister for Fore...",2,ORAL ANSWERS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,STATE OF BILATERAL RELATIONS WITH MALAYSIA FOL...,94
2,5d27eca6172d9aa762d48031,"<p class=""ql-align-justify"">8 <strong>Assoc Pr...","{""0"": {""entity"": ""NA"", ""content"": ""Assoc Prof ...",Assoc Prof Walter Theseira asked the Minister ...,Internal Security,[Assoc Prof Walter Theseira asked the Minister...,13,[{'content': 'Assoc Prof Walter Theseira asked...,"[Ms Low Yen Ling, Ms Anthea Ong, Assoc Prof Wa...",2,ORAL ANSWERS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,COMPANIES WITH MEASURES TO DEAL WITH WORKPLACE...,94
3,5d27eca6172d9aa762d48032,<p>5 <strong>Ms Irene Quay Siew Ching</strong>...,"{""0"": {""entity"": ""NA"", ""content"": ""Ms Irene Qu...",Ms Irene Quay Siew Ching asked the Minister fo...,Environment,[Ms Irene Quay Siew Ching asked the Minister f...,13,[{'content': 'Ms Irene Quay Siew Ching asked t...,"[The Senior Minister of State for Health, Dr L...",2,ORAL ANSWERS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,REVIEW OF DRUG TESTING STANDARDS IN SINGAPORE ...,94
4,5d27eca6172d9aa762d48033,"<p class=""ql-align-justify"">2 <strong>Mr Lim B...","{""0"": {""entity"": ""NA"", ""content"": ""Mr Lim Biow...",Mr Lim Biow Chuan asked the Deputy Prime Minis...,Employment,[Mr Lim Biow Chuan asked the Deputy Prime Mini...,13,[{'content': 'Mr Lim Biow Chuan asked the Depu...,"[Ms Indranee Rajah, The Second Minister for Fi...",2,ORAL ANSWERS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,LIVING IN PRIVATE PROPERTIES BUT WITH NO DECLA...,94


## Text Preprocessing

In [14]:
def read_corpus(series_docs, tokens_only=False):
    for line in series_docs.itertuples():
        if tokens_only:
            yield simple_preprocess(line.cleaned_join)
        else:
            # For training data, add tags
            yield gensim.models.doc2vec.TaggedDocument(simple_preprocess(line.cleaned_join), tags=[str(line._1)])


In [16]:
corpus = list(read_corpus(mongo_df,tokens_only=False))

In [17]:
corpus[1].tags

['5d27eca6172d9aa762d48030']

## Instantiating a Doc2Vec GenSim Instance

In [18]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=3, epochs=100, workers=4)

### Build a Vocabulary

In [19]:
model.build_vocab(corpus)

In [20]:
%time model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

CPU times: user 20min 22s, sys: 6.98 s, total: 20min 29s
Wall time: 6min 42s


### Save Model

In [21]:
model.save('doc2vec')

In [20]:
model.delete_temporary_training_data()

## Load Model

In [122]:
model = gensim.models.doc2vec.Doc2Vec.load('doc2vec')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


## Inference

In [53]:
inference_1 = model.infer_vector(corpus[291].words)
model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

In [73]:
results = model.docvecs.most_similar([inference_1])
display(results)

[('5d27eca6172d9aa762d48152', 0.992645263671875),
 ('5d27eca6172d9aa762d49ef0', 0.9119903445243835),
 ('5d27eca6172d9aa762d48b2d', 0.907822847366333),
 ('5d27eca6172d9aa762d48fc3', 0.8947123289108276),
 ('5d27eca6172d9aa762d492f7', 0.8905215859413147),
 ('5d27eca6172d9aa762d49304', 0.8838604688644409),
 ('5d27eca6172d9aa762d48639', 0.8791929483413696),
 ('5d27eca6172d9aa762d49673', 0.8630969524383545),
 ('5d27eca6172d9aa762d49aac', 0.8347380757331848),
 ('5d27eca6172d9aa762d48b5c', 0.8256769180297852)]

We can see that `most_similar` returns a list of tuples with the document ID and its respective probability.

In [100]:
mongo_df[mongo_df['_id'] == ObjectId('5d27eca6172d9aa762d48152')]

Unnamed: 0,_id,article_text,chunks,cleaned_join,dominant_topic,html_clean,parliament_num,parsed_convo,persons_involved,session_num,session_type,sitting_date,sitting_num,src_url,title,volume_num
291,5d27eca6172d9aa762d48152,"<p><strong>The Chairman</strong>: Head P, Mini...","{""0"": {""entity"": ""The Chairman"", ""content"": "" ...","The Chairman: Head P, Ministry of Home Affairs...",Internal Security,"[The Chairman: Head P, Ministry of Home Affair...",13,"[{'content': 'The Chairman: Head P, Ministry o...","[Mrs Josephine Teo, Mr Yee Chia Hsing, Ms Jess...",2,BUDGET,2019-03-01,96,https://sprs.parl.gov.sg/search/sprs3topic?rep...,COMMITTEE OF SUPPLY - HEAD P (MINISTRY OF HOME...,94


In [101]:
def fetch_recommended_document(document_id,mongo_conn,model,n_results=5):
    """
    Fetch documents from mongoDB based on inference
    
    """
    document = mongo_conn.parliament.articles.find_one({'_id': ObjectId(document_id)})
    inference = model.infer_vector(document['cleaned_join'].split())
    results = model.docvecs.most_similar([inference])
    ids = []
    for item in results[:n_results]:
        ids.append(ObjectId(item[0]))
    
    recommends = []
    for recommend in mongo_conn.parliament.articles.find({"_id" : {"$in" : ids }}):
        recommends.append({
            "_id" : recommend["_id"],
            "title" : recommend["title"],
            "sitting_date" : recommend["sitting_date"],
            "session_type" : recommend["session_type"]
        })
    return recommends
    

In [102]:
fetch_recommended_document('5d27eca6172d9aa762d48b2d',remote_client,model,6)

[{'_id': ObjectId('5d27eca6172d9aa762d48152'),
  'title': 'COMMITTEE OF SUPPLY - HEAD P (MINISTRY OF HOME AFFAIRS)',
  'sitting_date': datetime.datetime(2019, 3, 1, 0, 0),
  'session_type': 'BUDGET'},
 {'_id': ObjectId('5d27eca6172d9aa762d48639'),
  'title': 'COMMITTEE OF SUPPLY – HEAD P (MINISTRY OF HOME AFFAIRS)',
  'sitting_date': datetime.datetime(2018, 3, 2, 0, 0),
  'session_type': 'MOTIONS'},
 {'_id': ObjectId('5d27eca6172d9aa762d48b2d'),
  'title': 'COMMITTEE OF SUPPLY − HEAD P (MINISTRY OF HOME AFFAIRS)',
  'sitting_date': datetime.datetime(2017, 3, 3, 0, 0),
  'session_type': 'MOTIONS'},
 {'_id': ObjectId('5d27eca6172d9aa762d48fc3'),
  'title': 'COMMITTEE OF SUPPLY – HEAD P (MINISTRY OF HOME AFFAIRS)',
  'sitting_date': datetime.datetime(2016, 4, 6, 0, 0),
  'session_type': 'MOTIONS'},
 {'_id': ObjectId('5d27eca6172d9aa762d492f7'),
  'title': 'HEAD P – MINISTRY OF HOME AFFAIRS (COMMITTEE OF SUPPLY)',
  'sitting_date': datetime.datetime(2015, 3, 6, 0, 0),
  'session_type': 'MO