### Imports

In [None]:
# Import the libraries
import gensim
from glob import glob
import pandas as pd
from tqdm import tqdm, tqdm_notebook
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

# The `pd.np` alias was deprecated in pandas 1.0 and removed in pandas 2.0,
# so import NumPy directly; the name `np` stays available exactly as before.
import numpy as np

In [None]:
# Load data for text processing

#For removing punctuation: translation table that deletes every character
#listed in string.punctuation
table = str.maketrans('', '', string.punctuation)

# Fetch the NLTK resources this notebook relies on:
#   - 'stopwords' backs the stop-word set built below;
#   - 'punkt' / 'punkt_tab' back word_tokenize() (older NLTK releases load
#     'punkt', newer ones look for 'punkt_tab'). Without the tokenizer data
#     word_tokenize() raises LookupError on a machine that never fetched it.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)
stop_words = set(stopwords.words('english'))

#For Displaying progress: registers progress_apply() on pandas objects.
# NOTE(review): the bar instance is created with disable=True, presumably so
# that the registration itself does not render an empty progress bar.
tqdm_notebook(disable = True).pandas()

In [None]:
def read_articles(path, show_progress = True):
    '''
    Reads every CSV file matched by a glob pattern into a single DataFrame.

    Each file is loaded with its first column as the index and receives a
    "date" column taken from the file name (the part of the base name that
    precedes the first '.'). After concatenation the "date" column is parsed
    with pd.to_datetime.

    Params:
    path : glob pattern of the files to read, e.g. ./*.csv reads all CSVs
           of the current directory
    show_progress : Shows the progress using tqdm if True else nothing displayed

    Returns:
    DataFrame holding the rows of all files, re-indexed from 0, with a
    datetime "date" column.

    Raises:
    ValueError : if no file matches path
    '''
    # Local import: os is not among the imports at the top of this file.
    import os

    file_names = glob(path)
    if not file_names:
        # pd.concat([]) would fail with a message that never mentions the
        # pattern, so fail early with the same exception type and a clear text.
        raise ValueError("No files found matching pattern: {!r}".format(path))

    # Only build a progress bar when it is actually going to be displayed.
    iterator = tqdm_notebook(file_names) if show_progress else file_names

    df_list = []
    for file_name in iterator:
        temp_df = pd.read_csv(file_name, index_col=0)
        # basename() honours the platform's path separator; splitting on "/"
        # fails on Windows, where glob returns back-slashed paths.
        temp_df["date"] = os.path.basename(file_name).split('.')[0]
        df_list.append(temp_df)

    df = pd.concat(df_list, ignore_index=True)
    df["date"] = pd.to_datetime(df["date"])
    return df
def generate_document_vocabulary(text):
    '''
    Builds the list of vocabulary words that represents a piece of text.

    The text is tokenized with word_tokenize; every token has its
    punctuation stripped and is lower-cased. Tokens that are then not purely
    alphabetic, or that are English stopwords, are left out.

    Params:
    text : Text for which the vocabulary has to be generated

    Returns:
    List of the remaining words, in their order of appearance (repeats kept).
    '''
    normalised = (token.translate(table).lower() for token in word_tokenize(text))
    return [w for w in normalised if w.isalpha() and w not in stop_words]

### All the CSVs contain the following data
date, title (headline), location, text (full article)

### Data pre-processing for doc2vec

In [None]:
# Read articles
df = read_articles("../data/TOI/*.csv")
# Take training data (articles dated strictly before 1-Jan-2019).
# The index is renumbered after filtering because the row index is used
# further down as the integer doc2vec tag: gensim allocates a vector for
# every integer up to the largest tag, so the gaps left by the filter would
# produce vectors (and possible most_similar hits) that match no article.
df = df[df["date"] < pd.to_datetime("1-Jan-2019")].reset_index(drop=True)

In [None]:
# Get the vocabulary from the given text: run generate_document_vocabulary on
# every article's 'text' and store the resulting word list in a new
# 'vocabulary' column. progress_apply is the pandas hook registered by the
# tqdm .pandas() call in the setup cell of this notebook.
df['vocabulary'] = df['text'].progress_apply(generate_document_vocabulary)

In [None]:
# Wrap every article's word list in a TaggedDocument for the doc2vec model;
# the single tag of each document is the article's row label in df.
documents = [
    TaggedDocument(words, [row_label])
    for row_label, words in zip(df.index, df['vocabulary'])
]

### Training the model

In [None]:
# Training hyper-parameters
max_epochs = 100    # consumed by the training cell further down
vec_size = 50       # passed to Doc2Vec as vector_size
alpha = 0.025       # passed to Doc2Vec as the initial learning rate

# Distributed memory model (dm=1); the settings are collected in one mapping
# and unpacked into the constructor.
doc2vec_settings = {
    "vector_size": vec_size,
    "alpha": alpha,
    "min_alpha": 0.00025,
    "min_count": 1,
    "dm": 1,
    "workers": 8,
}
model = Doc2Vec(**doc2vec_settings)

In [None]:
# Initialize the model: build its vocabulary from the tagged training
# documents (must run before the training cell)
model.build_vocab(documents)

In [None]:
# Train the model
# One train() call runs all max_epochs passes and lets gensim decay the
# learning rate from model.alpha down to model.min_alpha on its own.
# Do not call train() repeatedly in a loop while editing model.alpha /
# model.min_alpha by hand: with epochs=model.epochs that performs
# max_epochs * model.epochs passes in total and overrides gensim's own
# learning-rate schedule.
model.train(documents,
            total_examples=model.corpus_count,
            epochs=max_epochs)

In [None]:
# Save the model to "article.d2v" (relative path, so it lands in the current
# working directory)
model.save("article.d2v")

### Generate the document dictionary that can be used to access a document by tag

In [None]:
# Lookup table from a document's first tag to its word list, so that a
# document can be fetched by the tag the model reports.
document_dic = {tags[0]: words for words, tags in documents}

### Test the model on random data from ACLED dated after 1-Jan-2019

In [None]:
# Sample article text used to query the trained model
article = "On July 15, a long protest march by farmers, from Mandsaur in Madhya Pradesh to New Delhi, demanding loan waiver and fair price for their produce, reached Jaipur."
# Apply the same pre-processing as for the training articles and re-join the
# remaining words with single spaces
article = ' '.join(generate_document_vocabulary(article))

### Find the closest Document


In [None]:
# Reference : https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-lee.ipynb

# find the vector
# infer_vector expects a list of word tokens; `article` holds the
# pre-processed words joined by single spaces, so split it back into words.
vec = model.infer_vector(article.split())

#find 10 closest documents (most_similar returns the top 10 by default)
# NOTE(review): gensim 4 renames model.docvecs to model.dv - confirm against
# the installed gensim version.
sims = model.docvecs.most_similar([vec])

In [None]:
# Print the document
# document_dic maps a tag to the document's list of words, so the words are
# joined back into one string before the trailing blank lines are appended.
for doc_tag, score in sims:
    print("Document has score : ", score,
          "\nContent : ", " ".join(document_dic[doc_tag]) + "\n\n")