# Case Study: Learning Embeddings from Scratch vs. Pretrained Word Embeddings

In [1]:
#importing libraries
import pandas as pd
import numpy as np

# Load the pre-split training and validation sets from disk.
train = pd.read_csv('sentiment-analysis/Train.csv')
valid = pd.read_csv('sentiment-analysis/Valid.csv')

# Separate each frame into its inputs (the 'text' column) and its targets
# (the 'label' column), exposed as NumPy arrays for the cells below.
x_tr = train['text'].values
y_tr = train['label'].values
x_val = valid['text'].values
y_val = valid['label'].values

In [2]:
# Peek at the first rows of the training frame to check the loaded columns.
train.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [3]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Build the vocabulary from the training texts only; the validation texts
# are encoded with this same fitted tokenizer.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(x_tr))

# Encode each document as a list of integer word ids, then bring every list
# to a fixed length of 100 ids so the inputs form a rectangular array.
x_tr_seq = pad_sequences(tokenizer.texts_to_sequences(x_tr), maxlen=100)
x_val_seq = pad_sequences(tokenizer.texts_to_sequences(x_val), maxlen=100)

Using TensorFlow backend.


In [4]:
# Number of distinct words the tokenizer saw in the training texts.
vocab_entries = len(tokenizer.word_index)

# One extra slot on top of the word ids, reserved for padding.
size_of_vocabulary = vocab_entries + 1
print(size_of_vocabulary)

112204


We will build two different NLP models of the same architecture. The first model learns embeddings from scratch and the second model uses pretrained word embeddings.

In [9]:
# Defining the architecture – Learning Embeddings from scratch:
#deep learning library
import tensorflow as tf
from keras.models import *
from keras.layers import *
from keras.callbacks import *

# Model 1: no pretrained weights are supplied, so the embedding table is
# learned together with the rest of the network.
model = Sequential()

# Map each of the 100 token ids in a review to a trainable 300-d vector.
model.add(
    Embedding(
        input_dim=size_of_vocabulary,
        output_dim=300,
        input_length=100,
        trainable=True,
    )
)

# return_sequences=True keeps the LSTM output at every timestep, so the
# pooling layer below receives a (100, 128) sequence to reduce.
model.add(LSTM(128, return_sequences=True, dropout=0.2))

# Collapse the time axis by taking the maximum of each LSTM feature.
model.add(GlobalMaxPooling1D())

# Classification head: one hidden layer, then a single sigmoid unit.
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Loss, optimizer and the metric reported during training.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=["acc"])

# Callbacks: stop once val_loss has not improved for 3 epochs, and keep on
# disk only the model from the epoch with the highest val_acc.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', save_best_only=True, verbose=1)

# summary() prints the table itself and returns None, so it must not be
# wrapped in print() -- doing so appends a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 300)          33661200  
_________________________________________________________________
lstm_2 (LSTM)                (None, 100, 128)          219648    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 65        
Total params: 33,889,169
Trainable params: 33,889,169
Non-trainable params: 0
_________________________________________________________________
None


The total number of trainable parameters in the model is 33,889,169. Of these, the Embedding layer alone contributes 33,661,200 parameters. That’s huge!

In [10]:
# Train for at most 10 epochs; the early-stopping and checkpoint callbacks
# are driven by the validation split passed in here.
history = model.fit(
    np.array(x_tr_seq),
    np.array(y_tr),
    batch_size=128,
    epochs=10,
    validation_data=(np.array(x_val_seq), np.array(y_val)),
    verbose=1,
    callbacks=[es, mc],
)

Train on 40000 samples, validate on 5000 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.86520, saving model to best_model.h5
Epoch 2/10

Epoch 00002: val_acc did not improve from 0.86520
Epoch 3/10

Epoch 00003: val_acc did not improve from 0.86520
Epoch 4/10

Epoch 00004: val_acc did not improve from 0.86520
Epoch 00004: early stopping


In [11]:
# Evaluating the performance of the model:

#loading best model
from keras.models import load_model
# Restore the checkpoint written during training (the best-val_acc epoch)
# instead of using the weights from the last epoch that ran.
model = load_model('best_model.h5')

# evaluate() returns the loss followed by the compiled metric. Bind the loss
# to a real name: assigning to `_` in IPython/Jupyter shadows the built-in
# "last output" variable.
# Note: this is the same validation split that selected the checkpoint, so
# the score is not an independent test-set estimate.
val_loss, val_acc = model.evaluate(x_val_seq, y_val, batch_size=128)
print(val_acc)

0.8652


In [18]:
# build version II using GloVe pretrained word embeddings

# Load the whole GloVe table into memory as {word: float32 vector}.
embeddings_index = dict()

# Pass the encoding explicitly so loading does not depend on the platform's
# default codec (GloVe text files are distributed as UTF-8), and use a
# context manager so the handle is closed even if a line fails to parse.
with open('glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ..." separated by whitespace.
        values = line.split()
        if not values:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [19]:
# One 300-d row per tokenizer id. Rows start as zeros and stay zero for any
# word that has no entry in the loaded GloVe table.
embedding_matrix = np.zeros((size_of_vocabulary, 300))

for token, row in tokenizer.word_index.items():
    if token in embeddings_index:
        embedding_matrix[row] = embeddings_index[token]

In [20]:
# Defining the Architecture – Pretrained embeddings:

# Model 2: same architecture as the first model, but the embedding table is
# initialised from the GloVe matrix and frozen.
model = Sequential()

# trainable=False keeps the pretrained vectors fixed during training, so only
# the layers after the embedding are updated.
model.add(
    Embedding(
        size_of_vocabulary,
        300,
        weights=[embedding_matrix],
        input_length=100,
        trainable=False,
    )
)

# return_sequences=True keeps the LSTM output at every timestep, so the
# pooling layer below receives a (100, 128) sequence to reduce.
model.add(LSTM(128, return_sequences=True, dropout=0.2))

# Collapse the time axis by taking the maximum of each LSTM feature.
model.add(GlobalMaxPooling1D())

# Classification head: one hidden layer, then a single sigmoid unit.
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Loss, optimizer and the metric reported during training.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=["acc"])

# Callbacks: stop once val_loss has not improved for 3 epochs, and keep on
# disk only the model from the epoch with the highest val_acc.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', save_best_only=True, verbose=1)

# summary() prints the table itself and returns None, so it must not be
# wrapped in print() -- doing so appends a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 100, 300)          33661200  
_________________________________________________________________
lstm_4 (LSTM)                (None, 100, 128)          219648    
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 128)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 65        
Total params: 33,889,169
Trainable params: 227,969
Non-trainable params: 33,661,200
_________________________________________________________________
None


The number of trainable parameters is now just 227,969. That’s a huge drop compared to the 33,889,169 trainable parameters of the first model, because the 33,661,200 embedding weights are frozen.

In [21]:
# Same training setup as the first model: at most 10 epochs, batches of 128,
# with the validation split driving early stopping and checkpointing.
history = model.fit(
    np.array(x_tr_seq),
    np.array(y_tr),
    batch_size=128,
    epochs=10,
    validation_data=(np.array(x_val_seq), np.array(y_val)),
    verbose=1,
    callbacks=[es, mc],
)

Train on 40000 samples, validate on 5000 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.83940, saving model to best_model.h5
Epoch 2/10

Epoch 00002: val_acc improved from 0.83940 to 0.86240, saving model to best_model.h5
Epoch 3/10

Epoch 00003: val_acc did not improve from 0.86240
Epoch 4/10

Epoch 00004: val_acc improved from 0.86240 to 0.86720, saving model to best_model.h5
Epoch 5/10

Epoch 00005: val_acc improved from 0.86720 to 0.87360, saving model to best_model.h5
Epoch 6/10

Epoch 00006: val_acc improved from 0.87360 to 0.87660, saving model to best_model.h5
Epoch 7/10

Epoch 00007: val_acc did not improve from 0.87660
Epoch 8/10

Epoch 00008: val_acc improved from 0.87660 to 0.87740, saving model to best_model.h5
Epoch 9/10

Epoch 00009: val_acc did not improve from 0.87740
Epoch 00009: early stopping


In [22]:
#loading best model
from keras.models import load_model
# Restore the checkpoint written during training (the best-val_acc epoch)
# instead of using the weights from the last epoch that ran.
model = load_model('best_model.h5')

# evaluate() returns the loss followed by the compiled metric. Bind the loss
# to a real name: assigning to `_` in IPython/Jupyter shadows the built-in
# "last output" variable.
# Note: this is the same validation split that selected the checkpoint, so
# the score is not an independent test-set estimate.
val_loss, val_acc = model.evaluate(x_val_seq, y_val, batch_size=128)
print(val_acc)

0.8774
