# Doc2Vec Implementation on Corpus

## Import Libraries

In [27]:
import gensim
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
import pymongo
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

stop_words = stopwords.words('english')


## custome stop_words
stop_words.extend(['br','mr'])

## Read in Data from MongoDB

In [4]:
client = pymongo.MongoClient("mongodb://localhost:27017/")

db = client["parliament"]
articles = db["articles"]

In [5]:
mongo_df = pd.DataFrame.from_records(articles.find())
mongo_df.head()

Unnamed: 0,_id,article_text,chunks,cleaned_join,dominant_topic,html_clean,parliament_num,parsed_convo,persons_involved,session_num,session_type,sitting_date,sitting_num,src_url,title,volume_num
0,5d27eca6172d9aa762d4802f,<p>[(proc text) Debate resumed. (proc text)]</...,"{""0"": {""entity"": ""NA"", ""content"": ""[(proc text...",[(proc text) Debate resumed. (proc text)]<br/>...,Society,"[[(proc text) Debate resumed. (proc text)], Mr...",13,[{'content': '[(proc text) Debate resumed. (pr...,"[Mr Leon Perera, Mr K Shanmugam, Assoc Prof Wa...",2,SECOND READING BILLS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,PROTECTION FROM ONLINE FALSEHOODS AND MANIPULA...,94
1,5d27eca6172d9aa762d48030,"<p class=""ql-align-justify"">4 <strong>Mr Vikra...","{""0"": {""entity"": ""NA"", ""content"": ""Mr Vikram N...",Mr Vikram Nair asked the Minister for Foreign ...,Society,[Mr Vikram Nair asked the Minister for Foreign...,13,[{'content': 'Mr Vikram Nair asked the Ministe...,"[Dr Vivian Balakrishnan, The Minister for Fore...",2,ORAL ANSWERS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,STATE OF BILATERAL RELATIONS WITH MALAYSIA FOL...,94
2,5d27eca6172d9aa762d48031,"<p class=""ql-align-justify"">8 <strong>Assoc Pr...","{""0"": {""entity"": ""NA"", ""content"": ""Assoc Prof ...",Assoc Prof Walter Theseira asked the Minister ...,Internal Security,[Assoc Prof Walter Theseira asked the Minister...,13,[{'content': 'Assoc Prof Walter Theseira asked...,"[Ms Low Yen Ling, Ms Anthea Ong, Assoc Prof Wa...",2,ORAL ANSWERS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,COMPANIES WITH MEASURES TO DEAL WITH WORKPLACE...,94
3,5d27eca6172d9aa762d48032,<p>5 <strong>Ms Irene Quay Siew Ching</strong>...,"{""0"": {""entity"": ""NA"", ""content"": ""Ms Irene Qu...",Ms Irene Quay Siew Ching asked the Minister fo...,Environment,[Ms Irene Quay Siew Ching asked the Minister f...,13,[{'content': 'Ms Irene Quay Siew Ching asked t...,"[The Senior Minister of State for Health, Dr L...",2,ORAL ANSWERS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,REVIEW OF DRUG TESTING STANDARDS IN SINGAPORE ...,94
4,5d27eca6172d9aa762d48033,"<p class=""ql-align-justify"">2 <strong>Mr Lim B...","{""0"": {""entity"": ""NA"", ""content"": ""Mr Lim Biow...",Mr Lim Biow Chuan asked the Deputy Prime Minis...,Employment,[Mr Lim Biow Chuan asked the Deputy Prime Mini...,13,[{'content': 'Mr Lim Biow Chuan asked the Depu...,"[Ms Indranee Rajah, The Second Minister for Fi...",2,ORAL ANSWERS,2019-05-08,105,https://sprs.parl.gov.sg/search/sprs3topic?rep...,LIVING IN PRIVATE PROPERTIES BUT WITH NO DECLA...,94


## Text Preprocessing

In [5]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def bigrams(words, bi_min=15, tri_min=10):
    bigram = gensim.models.Phrases(words, min_count = bi_min)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return bigram_mod
    
def get_corpus(df):
    words = list(sent_to_words(df))
    words = remove_stopwords(words)
    bigram_mod = bigrams(words)
    bigram = [bigram_mod[review] for review in words]
    id2word = gensim.corpora.Dictionary(bigram)
    id2word.filter_extremes(no_below=10, no_above=0.35)
    id2word.compactify()
    corpus = [id2word.doc2bow(text) for text in bigram]
    return corpus, id2word, bigram

def process_doc(doc):
    words = sent_to_words(doc)
    words = remove_stopwords(words)
    bigram_mod = bigrams(words)
    
    return bigram_mod

In [16]:
train_doc, test_doc = train_test_split(mongo_df)

In [9]:
process_doc(train_doc)

<gensim.models.phrases.Phraser at 0x1708bcc88>

In [10]:
[word for word in simple_preprocess(str(train_doc[2])) if word not in stop_words][:4]

['assoc', 'prof', 'walter', 'theseira']

In [65]:
def read_corpus(series_docs, tokens_only=False):
    for line in series_docs.itertuples():
        if tokens_only:
            yield simple_preprocess(line.cleaned_join)
        else:
            # For training data, add tags
            yield gensim.models.doc2vec.TaggedDocument(simple_preprocess(line.cleaned_join), tags=str(line._1))


In [25]:
train_corpus = list(read_corpus(train_doc, tokens_only=False))
test_corpus = list(read_corpus(test_doc, tokens_only=True))

In [66]:
corpus = list(read_corpus(mongo_df,tokens_only=False))

In [108]:
corpus[1].tags

'5d27eca6172d9aa762d48030'

## Instantiating a Doc2Vec GenSim Instance

In [None]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=5, epochs=100)

### Build a Vocabulary

In [69]:
model.build_vocab(corpus)

In [73]:
%time model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

CPU times: user 17h 33min 52s, sys: 9min 40s, total: 17h 43min 32s
Wall time: 6h 48min 22s


### Save Model

In [74]:
model.save('doc2vec')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [20]:
model.delete_temporary_training_data()

## Load Model

In [122]:
model = gensim.models.doc2vec.Doc2Vec.load('doc2vec')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


## Inference

In [104]:
corpus[291].words[:10]

['the',
 'chairman',
 'head',
 'ministry',
 'of',
 'home',
 'affairs',
 'mr',
 'christopher',
 'de']

In [93]:
inference_1 = model.infer_vector(corpus[291].words)

In [100]:
corpus[17].tags

'5d27eca6172d9aa762d48040'

In [109]:
model.docvecs.most_similar(str(corpus[291].tags))

TypeError: '<' not supported between instances of 'str' and 'int'

In [88]:
" ".join(train_corpus[].words)

TypeError: list indices must be integers or slices, not str

In [78]:
corpus[9][1]

'5d27eca6172d9aa762d48038'