In [1]:
import os
import string
from glob import glob

import gensim
import nltk
import numpy as np
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import common_texts
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm, tqdm_notebook

In [2]:
# Translation table that deletes every ASCII punctuation character from a token.
table = str.maketrans('', '', string.punctuation)

# `np` comes from the top-level `import numpy as np`; the `pd.np` alias was
# deprecated in pandas 1.0 and removed in pandas 2.0.

# Fetch the NLTK data this notebook needs: the stop-word list used below and the
# Punkt models that `word_tokenize` relies on.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of
# 'punkt' -- confirm against the installed version.
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

# Registers `progress_apply` on pandas objects (used when tokenizing articles).
tqdm_notebook(disable = True).pandas()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/aashish_jain/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def read_articles(path, show_progress = True):
    """Load every CSV matching ``path`` into a single DataFrame.

    Each file is read with its first column as the index; that per-file index
    is discarded when the frames are concatenated. A ``date`` column is added
    from the file's base name (the part before the first ``.``) and parsed
    with ``pd.to_datetime``.

    Parameters
    ----------
    path : str
        Glob pattern selecting the CSV files to load.
    show_progress : bool, default True
        Whether to display a notebook progress bar while reading.

    Returns
    -------
    pandas.DataFrame
        All rows from all files, with a fresh 0..n-1 index and a datetime
        ``date`` column.

    Raises
    ------
    ValueError
        If no file matches ``path``.
    """
    # glob() returns files in arbitrary (filesystem) order; sorting keeps the
    # row order, and therefore the integer index, reproducible across runs.
    file_names = sorted(glob(path))
    if not file_names:
        raise ValueError("No files match pattern: {!r}".format(path))

    df_list = []
    for file_name in tqdm_notebook(file_names, disable = not show_progress):
        temp_df = pd.read_csv(file_name, index_col=0)
        # basename() copes with the platform's path separator.
        temp_df["date"] = os.path.basename(file_name).split('.')[0]
        df_list.append(temp_df)
    df = pd.concat(df_list, ignore_index=True)
    df["date"] = pd.to_datetime(df["date"])
    return df

In [4]:
# Load all Times of India CSVs and keep only articles dated before 2019.
df = read_articles("../data/TOI/*.csv")
# .copy() makes the filtered frame independent of the original, so adding
# columns to it later does not trigger pandas' SettingWithCopyWarning.
# NOTE: the index labels of the unfiltered frame are kept (not reset).
df = df[df["date"] < pd.to_datetime("1-Jan-2019")].copy()

HBox(children=(IntProgress(value=0, max=1179), HTML(value='')))




In [5]:
def generate_document_vocabulary(text):
    """Turn raw article text into a list of cleaned tokens.

    The text is tokenized with ``word_tokenize``; each token has punctuation
    removed (via ``table``) and is lower-cased. Tokens are kept only if they
    are purely alphabetic and not in ``stop_words``.

    Parameters
    ----------
    text : str
        Raw article text. Non-string values (e.g. the NaN pandas uses for a
        missing CSV cell, or None) are treated as an empty document.

    Returns
    -------
    list of str
        Cleaned tokens in their original order (duplicates preserved).
    """
    # Missing text would otherwise crash the tokenizer.
    if not isinstance(text, str):
        return []
    cleaned = (word.translate(table).lower() for word in word_tokenize(text))
    return [w for w in cleaned if w.isalpha() and w not in stop_words]

In [6]:
# Tokenize every article (with a progress bar) and store the token lists
# alongside the text.
vocabulary_per_article = df['text'].progress_apply(generate_document_vocabulary)
df['vocabulary'] = vocabulary_per_article

HBox(children=(IntProgress(value=0, max=45406), HTML(value='')))




In [7]:
# One TaggedDocument per article, tagged with the article's DataFrame index
# label so a trained vector can be traced back to its row. Iterating the index
# and the column directly avoids iterrows(), which builds a Series per row.
documents = [
    TaggedDocument(words, [tag])
    for tag, words in zip(df.index, df['vocabulary'])
]

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [8]:
# Training hyper-parameters.
max_epochs = 100   # number of training epochs
vec_size = 50      # dimensionality of the document vectors
alpha = 0.025      # initial learning rate

# dm=1 selects the distributed-memory variant rather than distributed
# bag of words; min_count=1 keeps every word, however rare.
doc2vec_params = dict(
    vector_size=vec_size,
    alpha=alpha,
    min_alpha=0.00025,
    min_count=1,
    dm=1,
    workers=8,
)
model = Doc2Vec(**doc2vec_params)

In [9]:
# Build the model's vocabulary from the tagged documents; this must run before
# training (the training call reads `model.corpus_count`).
model.build_vocab(documents)

In [11]:
# Train with a single call and let gensim run all `max_epochs` passes itself,
# decaying the learning rate from the model's `alpha` down to its `min_alpha`.
# Calling train() repeatedly in a loop with `epochs=model.epochs` would run
# max_epochs * model.epochs passes, and hand-editing alpha/min_alpha between
# calls defeats the built-in decay schedule.
model.train(documents,
            total_examples=model.corpus_count,
            epochs=max_epochs)

HBox(children=(IntProgress(value=0), HTML(value='')))




In [12]:
# Persist the trained model to disk (path is relative to the working directory).
model_path = "article.d2v"
model.save(model_path)