In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import mpld3
mpld3.enable_notebook()

In [3]:
import sys
sys.path.append('../../Utils/')

In [4]:
from pylab import rcParams
rcParams['figure.figsize'] = 10, 10

In [35]:
import tensorflow as tf 
import numpy as np 
import pandas as pd 
import nltk
import math
import random
from sklearn.utils import shuffle
from load_imdb_data import load_imdb_data
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score 
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np 
import matplotlib.pyplot as plt 
np.random.seed(0)

In [6]:
imdb_data = load_imdb_data()

In [7]:
imdb_data_1 = imdb_data[imdb_data['sentiment'] == 1].sample(5000)
imdb_data_0 = imdb_data[imdb_data['sentiment'] == -1].sample(5000)

In [8]:
imdb_data = imdb_data_1.append(imdb_data_0)

In [9]:
imdb_data = shuffle(imdb_data)
imdb_data.reset_index(drop=True, inplace=True)

In [10]:
len(imdb_data)

10000

In [11]:
imdb_data.head()

Unnamed: 0,review,sentiment
0,i must admit out of the eros movie collection ...,-1
1,in my opinion of this movie the entire video p...,-1
2,this is my first cg animated film that i ve ev...,-1
3,avoid this movie if you are expecting the pose...,-1
4,i voted this a out of simply because it is the...,1


In [12]:
def processDocs(documents, vocab_size=10000):
    """
    This functions takes in a collection of documents and generates a vocabulary based on the size given in input. 
    It returns a representation for each document in the list of input documents. 
    """
    vocab = {} 
    doc_id = 0 
    doc_ids = []
    
    for doc in documents:
        doc_ids.append(doc_id)                          # Give an ID to each document 
        doc_id += 1
        
        for word in nltk.word_tokenize(doc):            # Generate a vocabulary while iterating threw the documents 
            if word not in vocab:
                vocab[word] = 1 
            else:
                vocab[word] += 1
    
    # Extract the most frequent words based on the vocabulary size 
    freq_words_list = sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:vocab_size]
    freq_words_set = set([item[0] for item in freq_words_list])
    
    # Give an index to each word in vocabulary 
    word2idx = {}         
    index_word = 0
    for word in freq_words_set:
        word2idx[word] = index_word
        index_word += 1
    word2idx['UNK'] = index_word
    
    doc_repr = []                          # Represent each document with representation based on the vocabulary  
    for doc in documents:
        temp = []
        for w in doc:
            if w in word2idx:
                temp.append(word2idx[w])
            else:
                temp.append(word2idx['UNK'])
        doc_repr.append(temp)
        
    return documents, doc_ids, word2idx, doc_repr

In [13]:
docs, doc_ids, word2ids, doc_repr = processDocs(imdb_data['review'])

In [14]:
print(len(docs), len(doc_ids), len(word2ids), len(doc_repr))

(10000, 10000, 10001, 10000)


In [38]:
def performanceTest(X, y, method='tf-idf'):
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y, 
                                                        test_size=0.2,  
                                                        random_state=42)
    
    if method == "tf-idf":
        vectorizer = TfidfVectorizer(min_df=1)
        X_train = vectorizer.fit_transform(X_train)
        X_test = vectorizer.transform(X_test)
        clf = LogisticRegression()
        clf.fit(X_train, y_train)
        train_pred = clf.predict(X_train)
        test_pred = clf.predict(X_test)
        print("Train accuracy score : ", accuracy_score(y_train, train_pred))
        print("Test accuracy score : ", accuracy_score(y_test, test_pred))
        print("Train F1-score : ", f1_score(y_train, train_pred))
        print("Test F1-score : ", f1_score(y_test, test_pred))
    else:
        clf = LogisticRegression()
        clf.fit(X_train, y_train)
        train_pred = clf.predict(X_train)
        test_pred = clf.predict(X_test)
        print("Train accuracy score : ", accuracy_score(y_train, train_pred))
        print("Test accuracy score : ", accuracy_score(y_test, test_pred))
        print("Train F1-score : ", f1_score(y_train, train_pred))
        print("Test F1-score : ", f1_score(y_test, test_pred))

In [39]:
performanceTest(imdb_data['review'], imdb_data['sentiment'])

('Train accuracy score : ', 0.83287500000000003)
('Test accuracy score : ', 0.66200000000000003)
('Train F1-score : ', 0.84126795678499344)
('Test F1-score : ', 0.68352059925093633)


## Architecture - PV-DM Distributed Memory version of Paragraph Vector

### Adding batchsizes for speedup

In [17]:
bucket_list = []

def generate_batch_pvdm(doc_ids, doc_repr, sample_size=5, batch_size=1000, window_size=10):
    global bucket_list

    docs_ids_to_select = list(set(doc_ids) - set(bucket_list))
    
    
    if len(docs_ids_to_select) < batch_size//sample_size:
        bucket_list = []
        docs_ids_to_select = doc_ids
        
    index = 0 
    train_wX = np.ndarray(shape=(batch_size, window_size), dtype=np.int32)
    train_dX = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    train_label = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    random_docs = random.sample(docs_ids_to_select, batch_size//sample_size)    # Choose set of random documents 

    bucket_list += random_docs
    
    for id_ in random_docs:
        for j in range(sample_size):                                 # Generating a dataset of sample size 
            random_index = random.randint(0, len(doc_repr[id_]) - window_size - 1)
            sample_window = doc_repr[id_][random_index: random_index + window_size + 1]
            train_wX[index] = sample_window[:-1]
            train_dX[index] = id_
            train_label[index] = sample_window[-1]  
            index += 1
    return train_wX, train_dX, train_label 

In [18]:
doc_size = len(docs)
embedding_size_w = 100
embedding_size_d = 100
vocab_size = len(word2ids)
window_size = 10
n_neg_samples = 10
learning_rate = 10e-5
epochs = 10001
batch_size=1000
mu=0.99
combined_embed_vector_length = embedding_size_d + embedding_size_w

In [19]:
# Define placeholders for training 
train_wX = tf.placeholder(tf.int32, shape=[batch_size, window_size])
train_dX = tf.placeholder(tf.int32, shape=[batch_size, 1])
train_label = tf.placeholder(tf.int32, shape=[batch_size, 1])

In [20]:
# Define matrix for doc_embedding and word_embedding 
doc_embedding = tf.Variable(tf.random_uniform([doc_size, embedding_size_d],
                                              -np.sqrt(6/(doc_size + embedding_size_d)), 
                                              np.sqrt(6/(doc_size + embedding_size_d))), 
                                              name="doc_embedding")
word_embedding = tf.Variable(tf.random_uniform([vocab_size, embedding_size_w],
                                               -np.sqrt(6/(doc_size + embedding_size_d)),
                                               np.sqrt(6/(doc_size + embedding_size_d)))
                                               ,name="word_embedding")

In [21]:
# Define weights for the output unit 
weights = tf.Variable(tf.truncated_normal([vocab_size, combined_embed_vector_length], 
                                       stddev=1.0 / math.sqrt(combined_embed_vector_length)))
biases = tf.Variable(tf.zeros(vocab_size))

In [22]:
embed = []

# generating a vector of size embedding_size_d
embed_w = tf.zeros([1, embedding_size_w], dtype=tf.float32)

# add all the word vecs in window_size
for j in range(window_size):
    embed_w += tf.nn.embedding_lookup(word_embedding, train_wX[:, j])
embed.append(embed_w)

# Add the doc2vec from the doc_embedding 
embed_d = tf.nn.embedding_lookup(doc_embedding, train_dX[:, 0])
embed.append(embed_d)

print embed_w, embed_d

embed = tf.concat(embed, 1)

Tensor("add_9:0", shape=(1000, 100), dtype=float32) Tensor("embedding_lookup_10:0", shape=(1000, 100), dtype=float32)


In [23]:
loss = tf.nn.sampled_softmax_loss(weights=weights, \
                                  biases=biases, \
                                  labels=train_label, \
                                  inputs=embed, \
                                  num_sampled=n_neg_samples, \
                                  num_classes=vocab_size)

In [24]:
loss = tf.reduce_mean(loss)

In [25]:
optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=mu).minimize(loss)
#optimizer = tf.train.AdagradOptimizer(learning_rate).minimize(loss)

"""
global_step = tf.Variable(0, trainable=False)
starter_learning_rate = 0.01
learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
                                           1000, 0.96, staircase=True)
# Passing global_step to minimize() will increment it at each step.
optimizer = (
    tf.train.MomentumOptimizer(learning_rate, momentum=mu).minimize(loss, global_step=global_step)
)
"""

'\nglobal_step = tf.Variable(0, trainable=False)\nstarter_learning_rate = 0.01\nlearning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,\n                                           1000, 0.96, staircase=True)\n# Passing global_step to minimize() will increment it at each step.\noptimizer = (\n    tf.train.MomentumOptimizer(learning_rate, momentum=mu).minimize(loss, global_step=global_step)\n)\n'

In [45]:
optimizer = tf.train.AdagradOptimizer(learning_rate).minimize(loss)

In [None]:
saver = tf.train.Saver()

with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)
    average_loss = 0
    
    for step in range(epochs):
        epoch_error = 0.0
        temp_wX , temp_dX, temp_labels = generate_batch_pvdm(doc_ids=doc_ids, doc_repr=doc_repr)
        feed_dict = {train_wX : temp_wX, train_dX : temp_dX,train_label : temp_labels}
        op, l = sess.run([optimizer, loss], 
                                    feed_dict=feed_dict)
        
        epoch_error += l
                
        if step % 1000 == 0:
            print "Error at epoch : ", step, " = ", epoch_error
            
    save_path = saver.save(sess, "/dev/shm/tensorflow_models/model_pvdm_batch_training.ckpt")
    print("Model saved in file: %s" % save_path)

Error at epoch :  0  =  0.633739113808
Error at epoch :  1000  =  1.93807947636
Error at epoch :  2000  =  1.01855492592


### Evaluation of the representation 

In [41]:
doc_pvdm = None

with tf.Session() as sess:
    saver = tf.train.Saver()
    saver.restore(sess, "/dev/shm/tensorflow_models/model_pvdm_batch_training.ckpt")
    print("Model restored.")
    doc2vec = doc_embedding.eval()
    performanceTest(doc2vec, list(imdb_data['sentiment']), method=None)

Model restored.
('Train accuracy score : ', 0.501)
('Test accuracy score : ', 0.496)
('Train F1-score : ', 0.0)
('Test F1-score : ', 0.0)
