### Connect to Kaggle

We will be using data available on the Kaggle platform for this exercise. The data is available at https://www.kaggle.com/c/word2vec-nlp-tutorial/data. We will first connect Colab to Kaggle. Instructions for downloading Kaggle data to Colab can be found [in this post](https://towardsdatascience.com/setting-up-kaggle-in-google-colab-ebb281b61463).

In [None]:
!pip install kaggle --quiet

In [None]:
#Make a directory for Kaggle
!mkdir .kaggle

In [None]:
#Connect Google drive to colab: mount Drive at /gdrive so its files can be read as local paths
from google.colab import drive
drive.mount('/gdrive')

In [None]:
#Copy the kaggle.json API token from Google Drive into /content/.kaggle.
#Change the gdrive folder based on where you have saved your json file from Kaggle.
!cp '/gdrive/My Drive/AI-ML/Machine-Learning/Code/Utilities/kaggle.json' /content/.kaggle/kaggle.json

In [None]:
#Check if json file is there: list the contents of /content/.kaggle
!ls -l /content/.kaggle

In [None]:
!mkdir ~/.kaggle
!cp /content/.kaggle/kaggle.json ~/.kaggle/kaggle.json
!kaggle config set -n path -v{/content}
!chmod 600 /root/.kaggle/kaggle.json

Verify Kaggle connection

In [None]:
#List datasets through the Kaggle CLI; a successful listing shows the credentials are accepted
!kaggle datasets list

#### Download Movie Reviews data

In [None]:
#Download the word2vec-nlp-tutorial competition files into /content
!kaggle competitions download -c word2vec-nlp-tutorial -p /content

In [None]:
#Confirm data has been downloaded by listing the current working directory
#(assumes the notebook is running from /content, the download target)
!ls -l

Import the dataset as pandas dataframe

In [None]:
import numpy as np
import pandas as pd

In [None]:
#Load the labeled reviews straight from the downloaded zip archive.
#The file is tab-separated with a header row; quoting=3 (csv.QUOTE_NONE) makes
#pandas treat quote characters inside reviews as ordinary text.
df = pd.read_csv(
    'labeledTrainData.tsv.zip',
    sep="\t",
    header=0,
    quoting=3,
)

In [None]:
#Number of rows and columns in the loaded dataset
df.shape

In [None]:
#Preview the first few rows
df.head()

In [None]:
#Class balance: count of non-null entries in each column, per sentiment label
df.groupby('sentiment').count()

In [None]:
#Look at the raw text of one sample review (row label 100)
df.loc[100, 'review']

Split Data into Training and Test Data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
#Hold out 20% of the rows as test data; the fixed random_state makes the split reproducible
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [None]:
#Shapes of the training and test splits
train.shape, test.shape

In [None]:
#Renumber both splits from 0; drop=True discards the old index instead of keeping it as a column
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [None]:
#Training features (review text) and labels (sentiment)
X_train, y_train = train['review'], train['sentiment']

In [None]:
#Test features (review text) and labels (sentiment)
X_test, y_test = test['review'], test['sentiment']

# Build the Tokenizer

In [None]:
import tensorflow as tf

In [None]:
#Create a Keras tokenizer limited to the desired vocabulary size
desired_vocab_size = 10000 #Vocabulary size
t = tf.keras.preprocessing.text.Tokenizer(num_words=desired_vocab_size) # num_words -> Vocabulary size

In [None]:
#Fit tokenizer with actual training data (the training reviews only, not the test reviews)
t.fit_on_texts(X_train.tolist())

In [None]:
#Number of entries in the tokenizer's word -> index mapping
len(t.word_index)

In [None]:
#Vocabulary: the word -> integer index mapping built by the tokenizer
print(t.word_index)

# Prepare Training and Test Data

Get the word index for each word in the review

In [None]:
#First training review as raw text (before conversion to word indices)
X_train[0]

In [None]:
#Replace each training review with its sequence of integer word indices
X_train = t.texts_to_sequences(X_train.tolist())

In [None]:
#First training review, now as a sequence of word indices
print(X_train[0])

In [None]:
#Map the index sequence back to words to sanity-check the encoding
t.sequences_to_texts([X_train[0]])

In [None]:
#Encode the test reviews with the tokenizer that was fitted on the training reviews
X_test = t.texts_to_sequences(X_test.tolist())

How many words in each review?

In [None]:
#Number of word indices in one encoded review (review 100)
len(X_train[100])

# Pad Sequences - Important

In [None]:
#Define maximum number of words to consider in each review.
#This value is used as the fixed sequence length when padding and as the Embedding input length.
max_review_length = 300

In [None]:
#Pad training and test reviews to a fixed length of max_review_length.
#padding='pre' adds zeros at the start of short reviews; truncating='post'
#cuts words off the end of reviews that are too long.
_pad_options = dict(maxlen=max_review_length, padding='pre', truncating='post')
X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, **_pad_options)
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, **_pad_options)

In [None]:
#Shape of the padded training data
X_train.shape

In [None]:
#Shape of the padded test data
X_test.shape

In [None]:
#Inspect one padded review (row 1000)
X_train[1000]

# Build the Graph

In [None]:
#Initialize model: clear any previous Keras session state, then start an empty Sequential model
tf.keras.backend.clear_session()
model = tf.keras.Sequential()

Add Embedding layer
 - Embedding Layer Input = Batch_Size * Length of each review

In [None]:
#Embedding layer: maps each word index to a learned 50-dimensional vector
model.add(
    tf.keras.layers.Embedding(
        input_dim=desired_vocab_size + 1,  #Vocabulary size
        output_dim=50,  #Embedding size
        input_length=max_review_length,  #Number of words in each review
    )
)

In [None]:
#Output tensor of the model so far (i.e. of the Embedding layer just added)
model.output

Embedding Layer Output - 
[Batch_Size , Review Length , Embedding_Size]

Add LSTM Layer with 128 as RNN state size

In [None]:
#Apply 40% dropout to the embeddings, then feed them to an LSTM layer
model.add(tf.keras.layers.Dropout(rate=0.4))
model.add(tf.keras.layers.LSTM(units=128))  #RNN State - size of cell state and hidden state

In [None]:
#Output tensor of the model so far (i.e. of the LSTM layer just added)
model.output

In [None]:
#Apply 40% dropout to the LSTM output before the output layer
model.add(tf.keras.layers.Dropout(rate=0.4))

Use Dense layer for output layer

In [None]:
#Single sigmoid unit as the output layer for the binary sentiment label
model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

In [None]:
#Compile the model: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
#Print the layer-by-layer summary of the model
model.summary()

# Execute the graph

In [None]:
#Train for 5 epochs in batches of 32, evaluating on the test split after each epoch
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=5,
    validation_data=(X_test, y_test),
)

In [None]:
#Continue training the same model: epoch numbering resumes at initial_epoch=5
#and training stops once epoch 10 is reached
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    initial_epoch=5,
    epochs=10,
    validation_data=(X_test, y_test),
)

#### Pre-Trained Embeddings

In [None]:
import gensim.downloader as api
import numpy as np

In [None]:
#Load Glove model (similar to Word2Vec) through the gensim downloader
glove_model = api.load('glove-wiki-gigaword-50')

In [None]:
#Size of the model: shape of its array of word vectors
glove_model.vectors.shape

In [None]:
#Embedding for the word 'and'
glove_model['and']

In [None]:
#Start from an all-zero embedding matrix for our dataset:
#10000+1 rows (1 for padding word) and 50 columns (as embedding size is 50)
embedding_matrix = np.zeros(shape=(desired_vocab_size + 1, 50))

In [None]:
#Copy the pre-trained Glove vector of each tokenizer word into its row of embedding_matrix.
#Words are visited in ascending index order (sorted by index), so the loop can stop
#at the first index that no longer fits in the matrix.
for word, i in sorted(t.word_index.items(), key=lambda x: x[1]):
    if i > desired_vocab_size:
        #embedding_matrix has desired_vocab_size + 1 rows, so the last valid row is desired_vocab_size
        break
    try:
        embedding_vector = glove_model[word] #Reading word's embedding from Glove model for a given word
    except KeyError:
        #Word is not in the Glove model: leave its row as all zeros
        continue
    embedding_matrix[i] = embedding_vector

In [None]:
#Spot-check one row of the embedding matrix (word index 200)
embedding_matrix[200]

Build a Model with Pretrained Embedding

In [None]:
#Initialize model: clear the previous Keras session state and start a fresh Sequential model
tf.keras.backend.clear_session()
model = tf.keras.Sequential()

In [None]:
#Embedding layer initialized from embedding_matrix and frozen (trainable=False)
model.add(
    tf.keras.layers.Embedding(
        input_dim=desired_vocab_size + 1,  #Vocabulary size
        output_dim=50,  #Embedding size
        weights=[embedding_matrix],
        trainable=False,
        input_length=max_review_length,  #Number of words in each review
    )
)

In [None]:
#Dropout (30%) -> LSTM -> Dropout (30%) -> single sigmoid output unit
model.add(tf.keras.layers.Dropout(rate=0.3))
model.add(tf.keras.layers.LSTM(units=128))  #RNN State - size of cell state and hidden state
model.add(tf.keras.layers.Dropout(rate=0.3))
model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

In [None]:
#Compile the model: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
#Print the layer-by-layer summary of the model
model.summary()

In [None]:
#Train for 5 epochs in batches of 32, evaluating on the test split after each epoch
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=5,
    validation_data=(X_test, y_test),
)

In [None]:
#Continue training the same model: epoch numbering resumes at initial_epoch=5
#and training stops once epoch 10 is reached
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    initial_epoch=5,
    epochs=10,
    validation_data=(X_test, y_test),
)