In [2]:
%reload_ext autoreload
%matplotlib inline

In [3]:
import sys
sys.path.append("../Utils/")

In [4]:
import tensorflow as tf 
import numpy as np 
import pandas as pd 
import nltk
import math
import random
from sklearn.utils import shuffle
from load_imdb_data import load_imdb_data
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score 
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np 
np.random.seed(0)

In [5]:
imdb_data = load_imdb_data()

In [6]:
imdb_data = shuffle(imdb_data)
imdb_data.reset_index(drop=True, inplace=True)

In [7]:
len(imdb_data)

50000

In [8]:
imdb_data.head()

Unnamed: 0,review,sentiment
0,often tagged as a comedy the man in the white ...,1
1,it s easy to make really general comments abou...,-1
2,i think the movie was one sided i watched it r...,-1
3,i have fond memories of watching this visually...,1
4,this episode had potential the basic premise o...,-1


In [9]:
def processDocs(documents, vocab_size=1000):
    """
    This functions takes in a collection of documents and generates a vocabulary based on the size given in input. 
    It returns a representation for each document in the list of input documents. 
    """
    vocab = {} 
    doc_id = 0 
    doc_ids = []
    
    for doc in documents:
        doc_ids.append(doc_id)                          # Give an ID to each document 
        doc_id += 1
        
        for word in nltk.word_tokenize(doc):            # Generate a vocabulary while iterating threw the documents 
            if word not in vocab:
                vocab[word] = 1 
            else:
                vocab[word] += 1
    
    # Extract the most frequent words based on the vocabulary size 
    freq_words_list = sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:vocab_size]
    freq_words_set = set([item[0] for item in freq_words_list])
    
    # Give an index to each word in vocabulary 
    word2idx = {}         
    index_word = 0
    for word in freq_words_set:
        word2idx[word] = index_word
        index_word += 1
    word2idx['UNK'] = index_word
    
    doc_repr = []                          # Represent each document with representation based on the vocabulary  
    for doc in documents:
        temp = []
        for w in doc:
            if w in word2idx:
                temp.append(word2idx[w])
            else:
                temp.append(word2idx['UNK'])
        doc_repr.append(temp)
        
    return documents, doc_ids, word2idx, doc_repr

In [10]:
docs, doc_ids, word2ids, doc_repr = processDocs(imdb_data['review'])

In [11]:
print(len(docs), len(doc_ids), len(word2ids), len(doc_repr))

(50000, 50000, 1001, 50000)


In [12]:
def performanceTest(X, y, method='tf-idf'):
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y, 
                                                        test_size=0.2,  
                                                        random_state=42)
    
    if method == "tf-idf":
        vectorizer = TfidfVectorizer(min_df=1)
        X_train = vectorizer.fit_transform(X_train)
        X_test = vectorizer.transform(X_test)
        clf = LogisticRegression()
        clf.fit(X_train, y_train)
        train_pred = clf.predict(X_train)
        test_pred = clf.predict(X_test)
        print("Train F1-score : ", f1_score(y_train, train_pred))
        print("Test F1-score : ", f1_score(y_test, test_pred))
    else:
        clf = LogisticRegression()
        clf.fit(X_train, y_train)
        train_pred = clf.predict(X_train)
        test_pred = clf.predict(X_test)
        print("Train F1-score : ", f1_score(y_train, train_pred))
        print("Test F1-score : ", f1_score(y_test, test_pred))

In [13]:
performanceTest(imdb_data['review'], imdb_data['sentiment'])

('Train F1-score : ', 0.75810542821565297)
('Test F1-score : ', 0.65317266051591039)


## Architecture - PV-DM Distributed Memory version of Paragraph Vector

### Basic implementation

##### Initialize tensorflow matrix and matrices for word2vecs

In [610]:
doc_size = len(docs)
embedding_size_w = 100
embedding_size_d = 200
vocab_size = len(word2ids)
window_size = 3
n_neg_samples = 10
learning_rate = 0.001
epochs = 201
combined_embed_vector_length = embedding_size_d + embedding_size_w

In [611]:
# Define placeholders for training 
train_wX = tf.placeholder(tf.int32, shape=[window_size])
train_dX = tf.placeholder(tf.int32, shape=[1])
train_label = tf.placeholder(tf.int32, shape=[None, 1])

In [612]:
# Define matrix for doc_embedding and word_embedding 
doc_embedding = tf.Variable(tf.random_uniform([doc_size, embedding_size_d], -1.0, 1.0), name="doc_embedding")
word_embedding = tf.Variable(tf.random_uniform([vocab_size, embedding_size_w], -1.0, 1.0), name="word_embedding")

In [414]:
# Define weights for the output unit 
weights = tf.Variable(tf.truncated_normal([vocab_size, combined_embed_vector_length], 
                                       stddev=1.0 / math.sqrt(combined_embed_vector_length)))
biases = tf.Variable(tf.zeros(vocab_size))

In [415]:
embed = []

# generating a vector of size embedding_size_d
embed_w = tf.zeros([1, embedding_size_w], dtype=tf.float32)

# add all the word vecs in window_size
for j in range(window_size):
    embed_w += tf.nn.embedding_lookup(word_embedding, train_wX[j])
embed.append(embed_w)

# Add the doc2vec from the doc_embedding 
embed_d = tf.nn.embedding_lookup(doc_embedding, train_dX)
embed.append(embed_d)

embed = tf.concat(1, embed)
embed = tf.reshape(embed, [1, 30])

In [416]:
loss = tf.nn.sampled_softmax_loss(weights=weights, \
                                  biases=biases, \
                                  labels=train_label, \
                                  inputs=embed, \
                                  num_sampled=n_neg_samples, \
                                  num_classes=vocab_size)

In [417]:
loss = tf.reduce_mean(loss)

In [418]:
optimizer = tf.train.AdagradOptimizer(learning_rate).minimize(loss)

In [419]:
saver = tf.train.Saver()

with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)
    average_loss = 0
    
    for step in range(epochs):
        epoch_error = 0.0
        for id_, repr_ in zip(doc_ids, doc_repr):
            if len(repr_) < window_size + 1:
                continue
            count = random.randint(1, 5)
            for cn in range(count):
                sample_window = random.sample(repr_, window_size + 1)

                feed_dict = {train_wX : sample_window[:-1],\
                             train_dX : [id_], \
                             train_label : np.array([sample_window[-1]]).reshape(-1, 1)}
                                
                #print feed_dict
                op, l = sess.run([optimizer, loss], 
                                    feed_dict=feed_dict)
                
                epoch_error += l
                
        if step % 10 == 0:
            print "Error at epoch : ", step, " = ", epoch_error
            
    save_path = saver.save(sess, "./models/model_pvdm.ckpt")
    print("Model saved in file: %s" % save_path)

Error at epoch :  0  =  2520.80032051
Error at epoch :  10  =  886.39744515
Error at epoch :  20  =  544.11700458
Error at epoch :  30  =  395.083735029
Error at epoch :  40  =  346.862789014
Error at epoch :  50  =  309.993764842
Error at epoch :  60  =  284.729109015
Error at epoch :  70  =  204.43311302
Error at epoch :  80  =  211.302371941
Error at epoch :  90  =  180.024087923
Error at epoch :  100  =  154.81982552
Error at epoch :  110  =  146.338269572
Error at epoch :  120  =  154.849999735
Error at epoch :  130  =  136.067183527
Error at epoch :  140  =  144.886522917
Error at epoch :  150  =  118.898776268
Error at epoch :  160  =  108.835737216
Error at epoch :  170  =  124.956207986
Error at epoch :  180  =  105.258146477
Error at epoch :  190  =  86.2899091992
Error at epoch :  200  =  112.497654662
Model saved in file: ./models/model.ckpt


In [420]:
doc_pvdm = None

with tf.Session() as sess:
    # Restore variables from disk.
    saver.restore(sess, "./models/model.ckpt")
    print("Model restored.")
    
    # Normalize word2vec 
    norm_w = tf.sqrt(tf.reduce_sum(tf.square(word_embedding), 1, keep_dims=True))
    normalized_word_embeddings = word_embedding / norm_w
    word2vec = normalized_word_embeddings.eval()
    
    # Normalize doc2vec 
    norm_d = tf.sqrt(tf.reduce_sum(tf.square(doc_embedding), 1, keep_dims=True))
    normalized_doc_embeddings = doc_embedding / norm_d
    
    # Find performance 
    performanceTest(normalized_doc_embeddings.eval(), imdb_data['sentiment'][:1000], method=None)
    doc_pvdm = normalized_doc_embeddings.eval()

Model restored.
('Train F1-score : ', 0.60350877192982455)
('Test F1-score : ', 0.49038461538461536)


### Adding batchsizes for speedup

In [14]:
bucket_list = []

def generate_batch_pvdm(doc_ids, doc_repr, sample_size=5, batch_size=100, window_size=3):
    global bucket_list

    docs_ids_to_select = list(set(doc_ids) - set(bucket_list))
    
    
    if len(docs_ids_to_select) < batch_size//sample_size:
        bucket_list = []
        docs_ids_to_select = doc_ids
        
    index = 0 
    train_wX = np.ndarray(shape=(batch_size, window_size), dtype=np.int32)
    train_dX = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    train_label = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    random_docs = random.sample(docs_ids_to_select, batch_size//sample_size)    # Choose set of random documents 

    bucket_list += random_docs
    
    for id_ in random_docs:
        for j in range(sample_size):                                 # Generating a dataset of sample size 
            sample_window = random.sample(doc_repr[id_], window_size + 1)
            train_wX[index] = sample_window[:-1]
            train_dX[index] = id_
            train_label[index] = sample_window[-1]  
            index += 1
    return train_wX, train_dX, train_label 

In [15]:
doc_size = len(docs)
embedding_size_w = 50
embedding_size_d = 50
vocab_size = len(word2ids)
window_size = 3
n_neg_samples = 10
learning_rate = 0.001
epochs = 10001
batch_size=100
combined_embed_vector_length = embedding_size_d + embedding_size_w

In [16]:
# Define placeholders for training 
train_wX = tf.placeholder(tf.int32, shape=[batch_size, window_size])
train_dX = tf.placeholder(tf.int32, shape=[batch_size, 1])
train_label = tf.placeholder(tf.int32, shape=[batch_size, 1])

In [17]:
# Define matrix for doc_embedding and word_embedding 
doc_embedding = tf.Variable(tf.random_uniform([doc_size, embedding_size_d], -1.0, 1.0), name="doc_embedding")
word_embedding = tf.Variable(tf.random_uniform([vocab_size, embedding_size_w], -1.0, 1.0), name="word_embedding")

In [18]:
# Define weights for the output unit 
weights = tf.Variable(tf.truncated_normal([vocab_size, combined_embed_vector_length], 
                                       stddev=1.0 / math.sqrt(combined_embed_vector_length)))
biases = tf.Variable(tf.zeros(vocab_size))

In [19]:
embed = []

# generating a vector of size embedding_size_d
embed_w = tf.zeros([1, embedding_size_w], dtype=tf.float32)

# add all the word vecs in window_size
for j in range(window_size):
    embed_w += tf.nn.embedding_lookup(word_embedding, train_wX[:, j])
embed.append(embed_w)

# Add the doc2vec from the doc_embedding 
embed_d = tf.nn.embedding_lookup(doc_embedding, train_dX[:, 0])
embed.append(embed_d)

print embed_w, embed_d

embed = tf.concat(1, embed)

Tensor("add_2:0", shape=(100, 50), dtype=float32) Tensor("embedding_lookup_3:0", shape=(100, 50), dtype=float32)


In [20]:
loss = tf.nn.sampled_softmax_loss(weights=weights, \
                                  biases=biases, \
                                  labels=train_label, \
                                  inputs=embed, \
                                  num_sampled=n_neg_samples, \
                                  num_classes=vocab_size)

In [21]:
loss = tf.reduce_mean(loss)

In [22]:
optimizer = tf.train.AdagradOptimizer(learning_rate).minimize(loss)

In [24]:
saver = tf.train.Saver()

with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)
    average_loss = 0
    
    for step in range(epochs):
        epoch_error = 0.0
        temp_wX , temp_dX, temp_labels = generate_batch_pvdm(doc_ids=doc_ids, doc_repr=doc_repr)
        feed_dict = {train_wX : temp_wX, train_dX : temp_dX,train_label : temp_labels}
        op, l = sess.run([optimizer, loss], 
                                    feed_dict=feed_dict)
        
        epoch_error += l
                
        if step % 100 == 0:
            print "Error at epoch : ", step, " = ", epoch_error
            
    save_path = saver.save(sess, "../models/model_pvdm_batch_training.ckpt")
    print("Model saved in file: %s" % save_path)

Error at epoch :  0  =  1.45035922527
Error at epoch :  100  =  0.861364662647
Error at epoch :  200  =  0.534089505672
Error at epoch :  300  =  0.224895864725
Error at epoch :  400  =  0.449635922909
Error at epoch :  500  =  0.843355894089
Error at epoch :  600  =  0.572692930698
Error at epoch :  700  =  0.493505209684
Error at epoch :  800  =  0.313163936138
Error at epoch :  900  =  0.19425176084
Error at epoch :  1000  =  0.163071140647
Error at epoch :  1100  =  0.411459475756
Error at epoch :  1200  =  0.381399065256
Error at epoch :  1300  =  0.184928506613
Error at epoch :  1400  =  0.30418291688
Error at epoch :  1500  =  0.266025871038
Error at epoch :  1600  =  0.139435440302
Error at epoch :  1700  =  0.360074162483
Error at epoch :  1800  =  0.204637780786
Error at epoch :  1900  =  0.142543405294
Error at epoch :  2000  =  0.180082321167
Error at epoch :  2100  =  0.183718562126
Error at epoch :  2200  =  0.15387262404
Error at epoch :  2300  =  0.0413739867508
Error a

### Evaluation of the representation 

In [643]:
doc_pvdm = None

with tf.Session() as sess:
    saver = tf.train.Saver()
    # Restore variables from disk.
    saver.restore(sess, "./models/model_pvdm_batch_training.ckpt")
    print("Model restored.")
    
    # Normalize word2vec 
    norm_w = tf.sqrt(tf.reduce_sum(tf.square(word_embedding), 1, keep_dims=True))
    normalized_word_embeddings = word_embedding / norm_w
    word2vec = normalized_word_embeddings.eval()
    
    # Normalize doc2vec 
    norm_d = tf.sqrt(tf.reduce_sum(tf.square(doc_embedding), 1, keep_dims=True))
    normalized_doc_embeddings = doc_embedding / norm_d
    
    # Find performance 
    performanceTest(normalized_doc_embeddings.eval(), imdb_data['sentiment'], method=None)
    
    doc_pvdm = normalized_doc_embeddings.eval()

Model restored.
('Train F1-score : ', 0.50706870732759712)
('Test F1-score : ', 0.4936835773009825)
