In [1]:
%reload_ext autoreload
%matplotlib inline

In [4]:
import sys 
sys.path.append("../Utils/")

In [6]:
import tensorflow as tf 
import numpy as np 
import pandas as pd 
import nltk
import math
import random
from sklearn.utils import shuffle
from load_imdb_data import load_imdb_data
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score 
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np 
np.random.seed(0)

In [7]:
imdb_data = load_imdb_data()

In [8]:
imdb_data = shuffle(imdb_data)
imdb_data.reset_index(drop=True, inplace=True)

In [9]:
len(imdb_data)

50000

In [10]:
imdb_data.head()

Unnamed: 0,review,sentiment
0,often tagged as a comedy the man in the white ...,1
1,it s easy to make really general comments abou...,-1
2,i think the movie was one sided i watched it r...,-1
3,i have fond memories of watching this visually...,1
4,this episode had potential the basic premise o...,-1


In [11]:
def processDocs(documents, vocab_size=1000):
    """
    This functions takes in a collection of documents and generates a vocabulary based on the size given in input. 
    It returns a representation for each document in the list of input documents. 
    """
    vocab = {} 
    doc_id = 0 
    doc_ids = []
    
    for doc in documents:
        doc_ids.append(doc_id)                          # Give an ID to each document 
        doc_id += 1
        
        for word in nltk.word_tokenize(doc):            # Generate a vocabulary while iterating threw the documents 
            if word not in vocab:
                vocab[word] = 1 
            else:
                vocab[word] += 1
    
    # Extract the most frequent words based on the vocabulary size 
    freq_words_list = sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:vocab_size]
    freq_words_set = set([item[0] for item in freq_words_list])
    
    # Give an index to each word in vocabulary 
    word2idx = {}         
    index_word = 0
    for word in freq_words_set:
        word2idx[word] = index_word
        index_word += 1
    word2idx['UNK'] = index_word
    
    doc_repr = []                          # Represent each document with representation based on the vocabulary  
    for doc in documents:
        temp = []
        for w in doc:
            if w in word2idx:
                temp.append(word2idx[w])
            else:
                temp.append(word2idx['UNK'])
        doc_repr.append(temp)
        
    return documents, doc_ids, word2idx, doc_repr

In [12]:
docs, doc_ids, word2ids, doc_repr = processDocs(imdb_data['review'])

In [13]:
print(len(docs), len(doc_ids), len(word2ids), len(doc_repr))

(50000, 50000, 1001, 50000)


In [27]:
def performanceTest(X, y, method='tf-idf'):
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y, 
                                                        test_size=0.2,  
                                                        random_state=42)
    
    if method == "tf-idf":
        vectorizer = TfidfVectorizer(min_df=1)
        X_train = vectorizer.fit_transform(X_train)
        X_test = vectorizer.transform(X_test)
        clf = LogisticRegression()
        clf.fit(X_train, y_train)
        train_pred = clf.predict(X_train)
        test_pred = clf.predict(X_test)
        print("Train F1-score : ", f1_score(y_train, train_pred))
        print("Test F1-score : ", f1_score(y_test, test_pred))
    else:
        clf = LogisticRegression()
        clf.fit(X_train, y_train)
        train_pred = clf.predict(X_train)
        test_pred = clf.predict(X_test)
        print("Train F1-score : ", f1_score(y_train, train_pred))
        print("Test F1-score : ", f1_score(y_test, test_pred))

In [28]:
performanceTest(imdb_data['review'], imdb_data['sentiment'])

('Train F1-score : ', 0.75810542821565297)
('Test F1-score : ', 0.65317266051591039)


## Architecture - PV-DBOW Distributed Bag of Words version of Paragraph Vector

In [390]:
doc_size = len(docs)
embedding_size_d = 20
vocab_size = len(word2ids)
window_size = 4
n_neg_samples = 10
learning_rate = 0.001
epochs = 201

In [391]:
# Define placeholders for training 
train_dX = tf.placeholder(tf.int32, shape=[1])
train_label = tf.placeholder(tf.int32, shape=[None, window_size])

In [392]:
# Define matrix for doc_embedding and word_embedding 
doc_embedding = tf.Variable(tf.random_uniform([doc_size, embedding_size_d], -1.0, 1.0), name="doc_embedding")
word_embedding = tf.Variable(tf.random_uniform([embedding_size_d, vocab_size], -1.0, 1.0), name="word_embedding")

In [397]:
# Define weights for the output unit 
weights = tf.Variable(tf.truncated_normal([vocab_size, embedding_size_d], 
                                       stddev=1.0 / math.sqrt(vocab_size)))
biases = tf.Variable(tf.zeros(vocab_size))

In [398]:
print weights.get_shape()

(501, 20)


In [399]:
# Add the doc2vec from the doc_embedding 
embed = tf.nn.embedding_lookup(doc_embedding, train_dX)

In [400]:
loss = tf.nn.sampled_softmax_loss(weights=weights, \
                                  biases=biases, \
                                  labels=train_label, \
                                  inputs=embed, \
                                  num_sampled=n_neg_samples, \
                                  num_classes=vocab_size, \
                                  num_true=window_size)

In [401]:
loss = tf.reduce_mean(loss)

In [402]:
optimizer = tf.train.AdagradOptimizer(learning_rate).minimize(loss)

In [406]:
saver = tf.train.Saver()

with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)
    average_loss = 0
    
    for step in range(epochs):
        epoch_error = 0.0
        for id_, repr_ in zip(doc_ids, doc_repr):
            if len(repr_) < window_size + 1:
                continue
            count = random.randint(1, 5)
            for cn in range(count):
                sample_window = random.sample(repr_, window_size)

                feed_dict = {train_dX : [id_], \
                             train_label : np.array([sample_window]).reshape(1, -1)}
                                
                #print feed_dict
                op, l = sess.run([optimizer, loss], 
                                    feed_dict=feed_dict)
                
                epoch_error += l
                
        if step % 10 == 0:
            print "Error at epoch : ", step, " = ", epoch_error
            
    save_path = saver.save(sess, "./models/model_pvdbow.ckpt")
    print("Model saved in file: %s" % save_path)

Error at epoch :  0  =  5411.48152983
Error at epoch :  10  =  5289.10377419
Error at epoch :  20  =  5224.79287863
Error at epoch :  30  =  5015.29414165
Error at epoch :  40  =  5136.94008923
Error at epoch :  50  =  5098.15606439
Error at epoch :  60  =  4996.61689186
Error at epoch :  70  =  5001.05284905
Error at epoch :  80  =  4906.31645226
Error at epoch :  90  =  4929.5450964
Error at epoch :  100  =  5039.89336562
Error at epoch :  110  =  4948.42556465
Error at epoch :  120  =  4944.35927701
Error at epoch :  130  =  4878.96978402
Error at epoch :  140  =  4964.93613696
Error at epoch :  150  =  4927.08172655
Error at epoch :  160  =  5061.91432416
Error at epoch :  170  =  4858.45620561
Error at epoch :  180  =  4813.0437789
Error at epoch :  190  =  4652.80651069
Error at epoch :  200  =  4870.85287452
Model saved in file: ./models/model_pvdbow.ckpt


In [408]:
doc_pvdbow = None

with tf.Session() as sess:
    # Restore variables from disk.
    saver.restore(sess, "./models/model_pvdbow.ckpt")
    print("Model restored.")
    
    # Normalize word2vec 
    norm_w = tf.sqrt(tf.reduce_sum(tf.square(word_embedding), 1, keep_dims=True))
    normalized_word_embeddings = word_embedding / norm_w
    word2vec = normalized_word_embeddings.eval()
    
    # Normalize doc2vec 
    norm_d = tf.sqrt(tf.reduce_sum(tf.square(doc_embedding), 1, keep_dims=True))
    normalized_doc_embeddings = doc_embedding / norm_d
    
    # Find performance 
    performanceTest(normalized_doc_embeddings.eval(), imdb_data['sentiment'][:1000], method=None)
    
    doc_pvdbow = normalized_doc_embeddings.eval()

Model restored.
('Train F1-score : ', 0.59320046893317702)
('Test F1-score : ', 0.58333333333333337)


### Adding batchsizes for speedup

In [14]:
bucket_list = []

def generate_batch_pvdbow(doc_ids, doc_repr, sample_size=5, batch_size=100, window_size=3):
    global bucket_list

    docs_ids_to_select = list(set(doc_ids) - set(bucket_list))
    
    
    if len(docs_ids_to_select) < batch_size//sample_size:
        bucket_list = []
        docs_ids_to_select = doc_ids
        
    index = 0 
    train_dX = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    train_label = np.ndarray(shape=(batch_size, window_size), dtype=np.int32)
    random_docs = random.sample(docs_ids_to_select, batch_size//sample_size)    # Choose set of random documents 

    bucket_list += random_docs
    
    for id_ in random_docs:
        for j in range(sample_size):                                 # Generating a dataset of sample size 
            sample_window = random.sample(doc_repr[id_], window_size)
            train_dX[index] = id_
            train_label[index] = sample_window
            index += 1
    return train_dX, train_label 

In [15]:
doc_size = len(docs)
embedding_size_d = 50
vocab_size = len(word2ids)
window_size = 3
n_neg_samples = 10
learning_rate = 0.001
epochs = 10001
batch_size=100

In [16]:
# Define placeholders for training 
train_dX = tf.placeholder(tf.int32, shape=[batch_size, 1])
train_label = tf.placeholder(tf.int32, shape=[batch_size, window_size])

In [17]:
# Define matrix for doc_embedding and word_embedding 
doc_embedding = tf.Variable(tf.random_uniform([doc_size, embedding_size_d], -1.0, 1.0), name="doc_embedding")
word_embedding = tf.Variable(tf.random_uniform([embedding_size_d, vocab_size], -1.0, 1.0), name="word_embedding")

In [18]:
# Define weights for the output unit 
weights = tf.Variable(tf.truncated_normal([vocab_size, embedding_size_d], 
                                       stddev=1.0 / math.sqrt(vocab_size)))
biases = tf.Variable(tf.zeros(vocab_size))

In [19]:
print weights.get_shape()

(1001, 50)


In [20]:
# Add the doc2vec from the doc_embedding 
embed = tf.nn.embedding_lookup(doc_embedding, train_dX[:, 0])

In [21]:
loss = tf.nn.sampled_softmax_loss(weights=weights, \
                                  biases=biases, \
                                  labels=train_label, \
                                  inputs=embed, \
                                  num_sampled=n_neg_samples, \
                                  num_classes=vocab_size, \
                                  num_true=window_size)

In [22]:
loss = tf.reduce_mean(loss)

In [23]:
optimizer = tf.train.AdagradOptimizer(learning_rate).minimize(loss)

In [25]:
saver = tf.train.Saver()

with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)
    average_loss = 0
    
    for step in range(epochs):
        epoch_error = 0.0
        temp_dX, temp_labels = generate_batch_pvdbow(doc_ids=doc_ids, doc_repr=doc_repr)
        feed_dict = {train_dX : temp_dX,train_label : temp_labels}
        op, l = sess.run([optimizer, loss], 
                                    feed_dict=feed_dict)
        
        epoch_error += l
                
        if step % 100 == 0:
            print "Error at epoch : ", step, " = ", epoch_error
            
    save_path = saver.save(sess, "../models/model_pvdbow_batch_training.ckpt")
    print("Model saved in file: %s" % save_path)

Error at epoch :  0  =  1.2712199688
Error at epoch :  100  =  1.9072239399
Error at epoch :  200  =  1.66182100773
Error at epoch :  300  =  1.56451749802
Error at epoch :  400  =  1.6622427702
Error at epoch :  500  =  1.84112548828
Error at epoch :  600  =  1.66757190228
Error at epoch :  700  =  1.43212127686
Error at epoch :  800  =  1.76986169815
Error at epoch :  900  =  1.53099024296
Error at epoch :  1000  =  1.31069195271
Error at epoch :  1100  =  1.74574828148
Error at epoch :  1200  =  1.53338778019
Error at epoch :  1300  =  1.65493130684
Error at epoch :  1400  =  1.67177212238
Error at epoch :  1500  =  1.72042775154
Error at epoch :  1600  =  1.70859539509
Error at epoch :  1700  =  1.35504710674
Error at epoch :  1800  =  1.73298084736
Error at epoch :  1900  =  1.69476103783
Error at epoch :  2000  =  1.61641597748
Error at epoch :  2100  =  1.48029506207
Error at epoch :  2200  =  1.30745637417
Error at epoch :  2300  =  1.55004942417
Error at epoch :  2400  =  1.56

### Evaluation of the representation 

In [29]:
doc_pvdbow = None

with tf.Session() as sess:
    saver = tf.train.Saver()
    # Restore variables from disk.
    saver.restore(sess, "../models/model_pvdbow_batch_training.ckpt")
    print("Model restored.")
    
    # Normalize word2vec 
    norm_w = tf.sqrt(tf.reduce_sum(tf.square(word_embedding), 1, keep_dims=True))
    normalized_word_embeddings = word_embedding / norm_w
    word2vec = normalized_word_embeddings.eval()
    
    # Normalize doc2vec 
    norm_d = tf.sqrt(tf.reduce_sum(tf.square(doc_embedding), 1, keep_dims=True))
    normalized_doc_embeddings = doc_embedding / norm_d
    
    # Find performance 
    performanceTest(normalized_doc_embeddings.eval(), imdb_data['sentiment'], method=None)
    
    doc_pvdbow = normalized_doc_embeddings.eval()

Model restored.
('Train F1-score : ', 0.52465608984576151)
('Test F1-score : ', 0.50605255388249193)
