In [21]:
# import libraries for pre-processing
import numpy as np
import pandas as pd
from sklearn import svm
import random
import math
import pickle
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.linear_model import LogisticRegression 
from hazem_doc2vec.helper_functions import in_pickle, out_pickle, shuffle_corpus_labels
from sklearn.metrics import confusion_matrix
#import data_analysis.preprocessor_end as pre
import os

In [None]:
def divide_corpus(p_corpus, n_corpus, factor):
    """Split the positive and negative corpora into train and test portions.

    The first ``ceil(factor * len(corpus))`` documents of each corpus go to the
    training side, the remainder to the test side. Training documents are
    wrapped in ``TaggedDocument`` with a single integer tag; test documents are
    kept as they are.

    Tags are assigned so that they equal the document's index in the returned
    training lists: positive documents get ``0 .. n_pos - 1`` and negative
    documents continue from ``n_pos``.

    Args:
        p_corpus: indexable collection of positive documents (label ``1``).
        n_corpus: indexable collection of negative documents (label ``0``).
        factor: share of each corpus to use for training.

    Returns:
        ``(train_corpus, train_labels, test_corpus, test_labels)``. Both label
        lists hold one single-element list (``[1]`` or ``[0]``) per document.
    """
    train_corpus, train_labels = [], []
    test_corpus, test_labels = [], []

    def _distribute(documents, label, n_train, first_tag):
        # Send each document either to the tagged training set or to the raw
        # test set, depending on its position in its own corpus.
        for position in range(len(documents)):
            document = documents[position]
            if position >= n_train:
                test_corpus.append(document)
                test_labels.append([label])
                continue
            train_corpus.append(TaggedDocument(document, [first_tag + position]))
            train_labels.append([label])

    # Positive documents are tagged starting from 0.
    pos_train_size = int(math.ceil(factor * len(p_corpus)))
    _distribute(p_corpus, 1, pos_train_size, 0)

    # Negative documents continue the tag sequence after the positive ones.
    neg_train_size = int(math.ceil(factor * len(n_corpus)))
    _distribute(n_corpus, 0, neg_train_size, pos_train_size)

    return train_corpus, train_labels, test_corpus, test_labels


def prepare_classifier_data(model, labels_arr):
    """Build the classifier's training matrix and label vector from a model.

    Args:
        model: trained Doc2Vec model; ``model.docvecs.vectors_docs`` and
            ``model.docvecs.count`` are read.
        labels_arr: sequence of single-element sequences; ``labels_arr[i][0]``
            is used as the label of row ``i`` of the document vectors.
            NOTE(review): this only lines up if ``labels_arr`` is in the same
            order as the document vectors (presumably integer-tag order) --
            pass the unshuffled labels and confirm against the gensim version
            in use.

    Returns:
        ``(x, y)``: ``x`` is a NumPy array copy of the document vectors, ``y``
        an integer array with one label per document vector.

    Raises:
        IndexError: if ``labels_arr`` has fewer entries than the model has
            document vectors.
    """
    doc_count = model.docvecs.count

    x = np.array(model.docvecs.vectors_docs)
    # `np.int` was a deprecated alias of the builtin `int` and was removed in
    # NumPy 1.24; the builtin gives the same dtype on every NumPy version.
    y = np.zeros(doc_count, dtype=int)

    for i in range(doc_count):
        y[i] = labels_arr[i][0]
    return x, y


def prepare_classifier_test_arrays(model, test_corpus, labels_arr):
    """Infer vectors for unseen test documents and collect their labels.

    Args:
        model: trained Doc2Vec model; ``model.vector_size`` and
            ``model.infer_vector`` are used.
        test_corpus: sequence of test documents, each in the form
            ``model.infer_vector`` accepts.
        labels_arr: sequence of single-element sequences; ``labels_arr[i][0]``
            is the label of ``test_corpus[i]``.

    Returns:
        ``(test_arrays, test_labels_arrays)``: a float array of shape
        ``(len(test_corpus), model.vector_size)`` holding the inferred
        vectors, and an integer array with the matching labels. Rows follow
        the order returned by ``shuffle_corpus_labels``, not the input order.
    """
    test_arrays = np.zeros([len(test_corpus), model.vector_size])
    # `np.int` was a deprecated alias of the builtin `int` and was removed in
    # NumPy 1.24; the builtin gives the same dtype on every NumPy version.
    test_labels_arrays = np.zeros(len(test_corpus), dtype=int)

    # Shuffle test data.
    # NOTE(review): relies on shuffle_corpus_labels keeping documents and
    # labels paired -- confirm in hazem_doc2vec.helper_functions.
    test_corpus, labels_arr = shuffle_corpus_labels(test_corpus, labels_arr)

    for i in range(len(test_corpus)):
        test_arrays[i] = model.infer_vector(test_corpus[i])
        test_labels_arrays[i] = labels_arr[i][0]
    return test_arrays, test_labels_arrays

In [14]:
%%time
pos_corpus = in_pickle('data/pos_corpus')
neg_corpus = in_pickle('data/neg_corpus')



CPU times: user 3.8 s, sys: 795 ms, total: 4.59 s
Wall time: 4.66 s


In [15]:
%%time
corpus, labels, temp_x, temp_y = divide_corpus(pos_corpus, neg_corpus, 1)
out_pickle("data/corpus", corpus)
out_pickle("data/labels", labels)

CPU times: user 7.15 s, sys: 672 ms, total: 7.82 s
Wall time: 8.75 s


In [16]:
%%time
corpus = in_pickle('data/corpus')
labels = in_pickle('data/labels')

6294 6294
CPU times: user 4.13 s, sys: 678 ms, total: 4.8 s
Wall time: 4.87 s


In [22]:
%%time
# Model's parameter
max_epochs = 5
vec_size = 300
alpha = 0.025

# Note: defining 'dm=1' is important here. It means that we have selected 
# distributed memory’ (PV-DM) over ‘distributed bag of words’ (PV-DBOW) 'dm =0'
# Which doesn't preserve the order of the words.
model = Doc2Vec(min_count=1, dm=1, workers=16, window=10, vector_size=vec_size, 
                alpha=alpha, min_alpha=0.00025)

# Setting up the vocabulary 
model.build_vocab(corpus)

for epoch in range(max_epochs):
    new_corpus, new_labels = shuffle_corpus_labels(corpus, labels)
    corpus = new_corpus
    labels = new_labels

    print('iteration_{0}'.format(epoch), end='\t')

    model.train(corpus, total_examples=len(corpus), epochs=model.epochs)
        
    # decrease the learning rate
    model.alpha -= 0.0002
    
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save("d2v.model")
print("\nModel Saved\n")


iteration_0	

iteration_1	

iteration_2	

iteration_3	

iteration_4	


Model Saved

CPU times: user 1h 13min 23s, sys: 28.6 s, total: 1h 13min 52s
Wall time: 23min 35s


In [35]:
%%time
# Loading the saved doc2vec model
model = Doc2Vec.load('d2v.model')

# X: numpy array, holds the corpus, each doc in the corpus is represented in 300-vector
# Y: numpy array, holds the labels of each doc in the corpus
X, Y = prepare_classifier_data(model, labels)

out_pickle('data/X', X)
out_pickle('data/Y', Y)

CPU times: user 1.33 s, sys: 116 ms, total: 1.44 s
Wall time: 1.44 s


In [37]:
%%time
X = in_pickle('data/X')
Y = in_pickle('data/Y')

CPU times: user 1.08 ms, sys: 4.01 ms, total: 5.09 ms
Wall time: 4.25 ms
