## Word Embeddings

Use the IMDB movie review dataset to create word embeddings, which are essential for any task involving text data.

In [0]:
import os
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Embedding,Flatten,Dense  
from keras import preprocessing
import numpy as np
import matplotlib.pyplot as plt

Using TensorFlow backend.


In [0]:
# Mount Google Drive so files stored there can be read from the notebook
from google.colab import drive

drive_mount_point = '/content/gdrive'
drive.mount(drive_mount_point)

In [0]:
# Limits applied to the integer-encoded IMDB reviews
max_features = 10000  # number of words to consider as features
maxlen = 20           # max length of a sample

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

# pad_sequences turns each list of integer sequences into a 2D integer tensor
# of shape (samples, maxlen); the same call is applied to both splits.
x_train, x_test = [
    preprocessing.sequence.pad_sequences(split, maxlen=maxlen)
    for split in (x_train, x_test)
]

Using an Embedding layer and classifier on the IMDB data

In [0]:
# Baseline classifier: an Embedding layer with 8-dimensional vectors, flattened
# and fed to a single sigmoid unit.
model = Sequential()
# The vocabulary size comes from max_features so the layer always matches the
# num_words limit used when the data was loaded (previously a hard-coded 10000).
model.add(Embedding(max_features, 8, input_length=maxlen))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.summary()

# validation_split=0.2 holds out 20% of the training samples for validation
history = model.fit(x_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

By looking at only the first 20 words and training a single Dense layer on top, we still get a validation accuracy of ~76%.  
We can improve this further by using pretrained word embeddings along with recurrent layers or 1D conv layers to learn features that take into account each sequence as a whole.

In [0]:
# Load the raw IMDB training reviews: every '.txt' file found in the 'neg' and
# 'pos' sub-directories of the train folder is read as one review.
imdb_dir = './gdrive/My Drive/Colab Notebooks/data/aclImdb'
train_dir = os.path.join(imdb_dir, 'train')

labels, texts = [], []

for label_type in ['neg', 'pos']:
  dir_name = os.path.join(train_dir, label_type)
  label = 0 if label_type == 'neg' else 1  # 0 for 'neg' reviews, 1 for 'pos' reviews
  # sorted() gives a deterministic file order (os.listdir order is arbitrary)
  for fname in sorted(os.listdir(dir_name)):
    if not fname.endswith('.txt'):
      continue
    # The context manager closes the file even if reading fails, and the
    # explicit encoding keeps decoding independent of the platform default.
    with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
      texts.append(f.read())
    labels.append(label)

len(texts)

Tokenizing the data

In [0]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

maxlen = 100  # cut off reviews after 100 words
training_samples, validation_samples = 200, 1000  # small training set since we use pretrained embeddings
max_words = 10000  # consider top 10k words in dataset
split_seed = 42  # fixed seed so the train/validation split is the same on every run

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)

sequences = tokenizer.texts_to_sequences(texts)  # convert from words to integer vectors
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=maxlen)
labels = np.asarray(labels)

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# The samples are ordered by label, so the split is drawn from a shuffled index
# array. data and labels themselves are left in file order: overwriting them
# with their shuffled versions made this cell non-idempotent (a second run
# rebuilt data from texts in file order but kept the already-shuffled labels,
# silently misaligning samples and labels).
indices = np.random.RandomState(split_seed).permutation(data.shape[0])
train_indices = indices[:training_samples]
val_indices = indices[training_samples : training_samples + validation_samples]

x_train, y_train = data[train_indices], labels[train_indices]
x_val, y_val = data[val_indices], labels[val_indices]

Parse the GloVe word embeddings

In [0]:
glove_dir = './gdrive/My Drive/Colab Notebooks/data/glove.6B'

# Map each word to its coefficient vector. Every line of the file is split on
# whitespace: the first field is the word, the remaining fields are the floats.
embeddings_index = {}

# The context manager closes the file even if parsing raises, and the explicit
# encoding keeps decoding independent of the platform default.
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='utf-8') as f:
  for line in f:
    values = line.split()
    if not values:
      continue  # skip blank lines instead of failing on values[0]
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Build an embedding matrix to load into an Embedding layer. Matrix shape (max_words, embedding_dim)

In [0]:
embedding_dim = 100

# One row per word index below max_words; rows start as zeros and are only
# overwritten for words that have an entry in embeddings_index.
embedding_matrix = np.zeros((max_words, embedding_dim))

for token, row in word_index.items():
  if row >= max_words:
    continue  # index falls outside the matrix
  vector = embeddings_index.get(token)
  if vector is None:
    continue  # no pretrained vector for this word: keep the zero row
  embedding_matrix[row] = vector

Define the model - similar to the one we used before

In [0]:
# Embedding -> Flatten -> Dense(32, relu) -> Dense(1, sigmoid); the embedding
# layer is sized (max_words, embedding_dim) to match embedding_matrix.
embedding_layer = Embedding(max_words, embedding_dim, input_length=maxlen)

model = Sequential([
    embedding_layer,
    Flatten(),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Load the pretrained embeddings into the embedding layer (the first layer of
# the model) and freeze it so its weights are not updated during training.
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False

model.summary()

In [0]:
model_path = 'gdrive/My Drive/Colab Notebooks/models/pretrained_glove_model.h5'
# Create the output directory before training so a missing folder cannot make
# model.save fail after the whole training run has already completed.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])

# Train on the small training split and validate on the separate validation split
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_data=(x_val, y_val))

model.save(model_path)

In [0]:
# Per-epoch metrics recorded in the history object returned by model.fit
metrics = history.history
acc, val_acc = metrics['acc'], metrics['val_acc']
loss, val_loss = metrics['loss'], metrics['val_loss']

epochs = range(1, len(loss) + 1)

# One entry per figure:
# (training series, training label, validation series, validation label, title)
panels = [
    (acc, 'Training accuracy', val_acc, 'Validation accuracy', 'Training and validation accuracy'),
    (loss, 'Training loss', val_loss, 'Validation loss', 'Training and validation loss'),
]

for panel_number, (train_series, train_label, val_series, val_label, title) in enumerate(panels):
  if panel_number > 0:
    plt.figure()  # every panel after the first is drawn on a new figure
  plt.plot(epochs, train_series, 'bo', label=train_label)
  plt.plot(epochs, val_series, 'b', label=val_label)
  plt.title(title)
  plt.legend()

plt.show()

The model quickly overfits since we used limited training samples.  
To improve model performance:
1) Skip the pretrained embeddings and increase the training sample size to learn task-specific embeddings

Tokenize test data 

In [0]:
# Read the raw test reviews the same way as the training ones: every '.txt'
# file in the 'pos' and 'neg' sub-directories of the test folder is one review.
test_dir = os.path.join(imdb_dir, 'test')

labels_test, texts_test = [], []

for label_type in ['pos', 'neg']:
  dir_name = os.path.join(test_dir, label_type)
  label = 0 if label_type == 'neg' else 1  # 0 for 'neg' reviews, 1 for 'pos' reviews
  for fname in sorted(os.listdir(dir_name)):
    if not fname.endswith('.txt'):
      continue
    # The context manager closes the file even if reading fails, and the
    # explicit encoding keeps decoding independent of the platform default.
    with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
      texts_test.append(f.read())
    labels_test.append(label)

# Reuse the tokenizer that was fitted on the training texts so the test
# reviews are mapped to the same word indices.
sequences = tokenizer.texts_to_sequences(texts_test)
x_test = pad_sequences(sequences, maxlen=maxlen)
y_test = np.asarray(labels_test)

model.evaluate(x_test, y_test)