In [None]:
import json
import spacy
import nltk
import numpy as np
from tqdm import tqdm, tqdm_notebook
from collections import Counter
import matplotlib.pyplot as plt
import pandas as pd
import gensim
import seaborn as sns
from xgboost.sklearn import XGBClassifier 
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import random
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from gensim.models.callbacks import CallbackAny2Vec
import warnings
warnings.filterwarnings('ignore')


In [None]:
def read_corpus(corpus, tokens_only=False):
    '''
    Lazily tokenizes every document of a corpus with gensim's simple_preprocess.

    Input:
    corpus: iterable of documents, each document stored in a single string
    tokens_only: if 'True', yields plain token lists (used for the test data);
                 if 'False', yields TaggedDocument objects tagged with the
                 position of the document in the corpus (used for training)

    Output:
    Generator over the pre-processed documents, in the order of the corpus

    '''
    # Generators and other unsized iterables have no len(); the progress bar
    # then simply runs without a known total instead of raising TypeError.
    try:
        total = len(corpus)
    except TypeError:
        total = None

    for i, line in tqdm_notebook(enumerate(corpus), total=total):
        tokens = gensim.utils.simple_preprocess(line)
        if tokens_only:
            yield tokens
        else:
            # For training data, add tags
            yield gensim.models.doc2vec.TaggedDocument(tokens, [i])
            

class EpochLogger(CallbackAny2Vec):
    '''
    Training callback that prints a message when an epoch starts and when it
    ends, numbering the epochs from 0.
    '''

    def __init__(self):
        # Number of the epoch that the next message refers to
        self.epoch = 0

    def on_epoch_begin(self, model):
        '''Prints the start message for the current epoch.'''
        message = "Epoch #{} start".format(self.epoch)
        print(message)

    def on_epoch_end(self, model):
        '''Prints the end message for the current epoch and advances the counter.'''
        message = "Epoch #{} end".format(self.epoch)
        print(message)
        self.epoch = self.epoch + 1
            
            
def Doc2vec(train_corpus, test_corpus,vector_size=300, min_count=2, epochs=10,
            workers=4, output_dir='../../data/feature'):
    '''
    Trains a Doc2Vec model on the train corpus and saves the train/test embeddings.

    Input:
    train_corpus: sequence of TaggedDocument objects; the embeddings are looked
                  up by the integer positions 0..len(train_corpus)-1, so each
                  document is expected to be tagged with its position
    test_corpus: iterable of un-tagged token lists (one list per document)
    vector_size: dimension of doc2vec embedding
    min_count: value passed to Doc2Vec as 'min_count' (minimum word frequency
               according to the gensim documentation, not a window size)
    epochs: No: of epochs to train the model
    workers: No: of worker threads passed to Doc2Vec
    output_dir: directory in which the two '.npy' embedding files are written;
                it is created if it does not exist

    Output:
    Saves the train/test embeddings as 'Doc2vec_train_embeddings.npy' and
    'Doc2vec_test_embeddings.npy' inside 'output_dir'
    '''
    import os

    # Create the target directory up front so that a missing directory does
    # not make np.save fail only after the (expensive) training has finished.
    os.makedirs(output_dir, exist_ok=True)

    #Train Doc2vec model
    model = gensim.models.doc2vec.Doc2Vec(vector_size=vector_size, min_count=min_count,
                                          epochs=epochs, workers=workers, compute_loss=True)
    model.build_vocab(train_corpus)
    epoch_logger = EpochLogger()

    model.train(train_corpus, total_examples=model.corpus_count,
                epochs=model.epochs, callbacks=[epoch_logger])

    #Obtain Doc2vec vectors of the train corpus
    # gensim 4 renamed 'docvecs' to 'dv'; prefer the new name when it exists
    # and fall back to the old one for older gensim releases.
    doc_vectors = model.dv if hasattr(model, 'dv') else model.docvecs
    X_train_vector = np.array([doc_vectors[i] for i in range(len(train_corpus))])

    #Infer test corpus embeddings from the trained model
    X_test_vector = np.array([model.infer_vector(tokens) for tokens in test_corpus])

    # np.save appends the '.npy' extension to these names
    np.save(os.path.join(output_dir, 'Doc2vec_train_embeddings'), X_train_vector)
    np.save(os.path.join(output_dir, 'Doc2vec_test_embeddings'), X_test_vector)
    
    


def main(X,Y,split=0.2):
    '''
    Splits the data into train/test parts, pre-processes both parts and hands
    them to Doc2vec, which trains the model and saves the embeddings.

    Input:
    X: Data to be used for doc2vec features
    Y: corresponding labels (split together with X, but not used afterwards)
    split: passed to train_test_split as 'test_size'

    Output:
    Nothing is returned; the embeddings are saved by Doc2vec
    '''

    # Fixed random_state so that the shuffled split is reproducible
    parts = train_test_split(X, Y, test_size=split, random_state=42, shuffle = True)
    train_docs = parts[0]
    test_docs = parts[1]

    # Training documents are tagged, test documents are plain token lists
    tagged_train = list(read_corpus(train_docs))
    tokenized_test = list(read_corpus(test_docs, tokens_only=True))

    # Train Doc2Vec model
    # Pass 'vector_size', 'min_count' or 'epochs' here to modify doc2vec training
    Doc2vec(tagged_train, tokenized_test)
    
    


if __name__=="__main__":

    #Relevant news articles
    rdata = pd.read_json('../../data/clean/relevant_news_10K.json')
    #Irrelevant news articles
    irdata = pd.read_json('../../data/clean/irrelevant_news_10K.json')

    rdata['relevance']=1  #Relevant
    irdata['relevance']=0 #Irrelevant
    #Combine the data
    # pd.concat is used because DataFrame.append was removed in pandas 2.0;
    # the second frame is 'irdata' (the previous name 'irdata_json' was undefined).
    data = pd.concat([rdata, irdata], ignore_index=True)

    # Let's use content as our data
    X = data['content']
    Y = data['relevance']

    #specify parameter 'split' for modifying test set size, default is 20%
    main(X,Y)
    
    

   