<a href="https://colab.research.google.com/github/bclee232/DLwP/blob/master/8_embed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Word-level one-hot encoding with the Keras Tokenizer
from keras.preprocessing.text import Tokenizer

samples = ['The quick brown fox.', 'The cat ate the homework.']

# num_words=1000 caps the vocabulary used when encoding texts.
tokenizer = Tokenizer(num_words=1000)
# Scan the samples once to build the word -> integer index mapping.
tokenizer.fit_on_texts(samples)
word_ind = tokenizer.word_index

# Each sample string becomes a list of integer word indices.
seq = tokenizer.texts_to_sequences(samples)
# Binary bag-of-words matrix with one row per sample.
one_hot_res = tokenizer.texts_to_matrix(samples, mode='binary')

print('Found %s unique tokens.' % len(word_ind))
print('word_ind:', word_ind, 'one_hot_res', one_hot_res, 'seq:', seq, 'tokenizer:', tokenizer)
# "The" and "the" share a single index: the Tokenizer lower-cases text
# (and strips punctuation) by default before building the vocabulary.

Found 7 unique tokens.
word_ind: {'the': 1, 'quick': 2, 'brown': 3, 'fox': 4, 'cat': 5, 'ate': 6, 'homework': 7} one_hot_res [[0. 1. 1. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]] seq: [[1, 2, 3, 4], [1, 5, 6, 1, 7]] tokenizer: <keras_preprocessing.text.Tokenizer object at 0x7fa486fc1cc0>


In [7]:
# One-hot hashing trick: map each word straight to a column index with a hash
# function instead of keeping an explicit word -> index dictionary.
import hashlib

import numpy as np

# Number of hash buckets. It should be much larger than the number of distinct
# words being encoded, otherwise different words collide in the same bucket.
dimensionality = 100
max_len = 10  # only the first max_len whitespace-separated tokens are encoded

res = np.zeros((len(samples), max_len, dimensionality))
for i, sample in enumerate(samples):
  for j, word in enumerate(sample.split()[:max_len]):
    # The built-in hash() of a str is salted per interpreter process, so it
    # yields different buckets after every kernel restart. A digest from
    # hashlib is stable, which keeps the encoding reproducible.
    digest = hashlib.md5(word.encode('utf-8')).hexdigest()
    ind = int(digest, 16) % dimensionality
    res[i, j, ind] = 1

print(res)

[[[0. 0. 0. ... 0. 0. 0.]
  [0. 1. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 1. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]


In [8]:
# Load the IMDB reviews and turn them into fixed-length integer sequences
# suitable as input to an Embedding layer.
from keras.datasets import imdb
from keras import preprocessing

max_features = 10000  # vocabulary cap passed to load_data as num_words
max_len = 20          # length of every review after padding/truncation

train_split, test_split = imdb.load_data(num_words=max_features)
train_data, train_targets = train_split
test_data, test_targets = test_split

# Lists of variable-length index lists -> 2D int tensors of shape
# (samples, max_len).
pad = preprocessing.sequence.pad_sequences
x_train = pad(train_data, maxlen=max_len)
x_test = pad(test_data, maxlen=max_len)

Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz


In [9]:
# Embedding layer followed by a single Dense classifier
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

dim = 8  # size of each learned word vector
model = Sequential([
    # (samples, max_len) integer indices -> (samples, max_len, dim) floats
    Embedding(max_features, dim, input_length=max_len),
    # (samples, max_len, dim) -> (samples, max_len * dim)
    Flatten(),
    # one sigmoid unit producing the binary sentiment prediction
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 8)             80000     
_________________________________________________________________
flatten_1 (Flatten)          (None, 160)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 161       
Total params: 80,161
Trainable params: 80,161
Non-trainable params: 0
_________________________________________________________________


In [10]:
# Train on the padded IMDB reviews, holding out 20% of the training data
# for validation.
hist = model.fit(
    x_train,
    train_targets,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
)
# peak 76% val_acc

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [12]:
# Mount Google Drive into the Colab filesystem; the raw review files and the
# GloVe vectors used by the cells below are read from under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [15]:
# Read the raw review files into parallel lists: texts[k] is the review body
# and labels[k] is its sentiment (0 = negative, 1 = positive).
import os

base_dir = '/content/drive/My Drive/imdb_small'
train_dir = os.path.join(base_dir, 'train')
labels = []
texts = []
for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    # os.listdir returns entries in arbitrary order; sorting makes the order
    # of texts/labels the same on every run.
    for fname in sorted(os.listdir(dir_name)):
        if not fname.endswith('.txt'):
            continue
        # The context manager closes the file even if reading fails, and the
        # explicit encoding avoids depending on the platform default.
        with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
            texts.append(f.read())
        labels.append(0 if label_type == 'neg' else 1)

print(len(labels), len(texts))

202 202


In [18]:
# Tokenise the raw review texts
maxlen = 102         # every review is padded/truncated to 102 tokens
training_samp = 100  # number of reviews used for training
valid_samp = 100     # number of reviews used for validation
max_words = 10002    # num_words cap handed to the Tokenizer

t = Tokenizer(num_words=max_words)
t.fit_on_texts(texts)
word_ind = t.word_index
# Only integer index sequences are produced here; no one-hot matrix is built
# because the Embedding layers below consume integer indices.
seq = t.texts_to_sequences(texts)
print(len(word_ind))

6664


In [19]:
# Labels as a 1D array; reviews as a 2D int array with maxlen columns
# (shorter reviews are padded, longer ones truncated).
lab = np.array(labels)
data = preprocessing.sequence.pad_sequences(sequences=seq, maxlen=maxlen)
print(data.shape, lab.shape)

(202, 102) (202,)


In [21]:
# Shuffle and split into training and validation sets.
# The reviews were loaded with all negatives first and all positives after,
# so they must be shuffled before slicing or each split would be one-sided.
SEED = 42  # fixed seed so the split is the same on every run
rng = np.random.RandomState(SEED)
perm = rng.permutation(data.shape[0])

# Index through the permutation instead of overwriting data/lab in place:
# re-running this cell then yields exactly the same split.
train_idx = perm[:training_samp]
val_idx = perm[training_samp:training_samp + valid_samp]

x_train, y_train = data[train_idx], lab[train_idx]
x_val, y_val = data[val_idx], lab[val_idx]

In [22]:
# Baseline: task-specific embedding learned from the small training set
dim = 8  # size of each learned word vector
m = Sequential([
    # (samples, maxlen) integer indices -> (samples, maxlen, dim) floats
    Embedding(max_words, dim, input_length=maxlen),
    # (samples, maxlen, dim) -> (samples, maxlen * dim)
    Flatten(),
    # one sigmoid unit producing the binary sentiment prediction
    Dense(1, activation='sigmoid'),
])
m.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])

In [25]:
# Fit the baseline model; batch size and epoch count are spelled out as
# keywords so they cannot be confused with each other.
history = m.fit(
    x_train,
    y_train,
    batch_size=30,
    epochs=10,
    validation_data=(x_val, y_val),
)
# val_acc of 53%

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 100 samples, validate on 100 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [26]:
# Parse the GloVe embedding file into a dict mapping word -> float32 vector.
# Each line holds a token followed by its whitespace-separated coefficients.
path = '/content/drive/My Drive/glove/glove.6B.100d.txt'
embeddings_index = {}
# The context manager closes the file even if parsing fails part-way, and the
# explicit encoding avoids depending on the platform default.
with open(path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # tolerate blank lines (e.g. a trailing newline at end of file)
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [28]:
# Build the weight matrix for the Embedding layer: row k holds the GloVe
# vector of the word whose tokenizer index is k.
embedding_dim = 100  # must match the width of the vectors in embeddings_index
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, idx in word_ind.items():
    if idx >= max_words:
        # index falls outside the matrix; skip it
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # word has no GloVe vector; its row stays all zeros
        continue
    embedding_matrix[idx] = embedding_vector

In [29]:
# Network that will use the pre-trained embeddings: an Embedding layer sized
# to match embedding_matrix, then a small fully connected classifier.
net = Sequential([
    # (samples, maxlen) integer indices -> (samples, maxlen, embedding_dim)
    Embedding(max_words, embedding_dim, input_length=maxlen),
    # (samples, maxlen, embedding_dim) -> (samples, maxlen * embedding_dim)
    Flatten(),
    Dense(32, activation='relu'),
    # one sigmoid unit producing the binary sentiment prediction
    Dense(1, activation='sigmoid'),
])
net.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 102, 100)          1000200   
_________________________________________________________________
flatten_3 (Flatten)          (None, 10200)             0         
_________________________________________________________________
dense_3 (Dense)              (None, 32)                326432    
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 33        
Total params: 1,326,665
Trainable params: 1,326,665
Non-trainable params: 0
_________________________________________________________________


In [30]:
# Put the GloVe vectors into the first (Embedding) layer and freeze it so
# training does not update the pre-trained weights.
first = net.layers[0]
first.trainable = False
first.set_weights([embedding_matrix])

In [31]:
# Compile, train and evaluate the GloVe-initialised network, then save its
# weights to disk.
net.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])
net.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(x_val, y_val),
)
net.save_weights('pre_trained_glove_model.h5')
# peak accuracy of 54% (slightly better than task-specific?)
# test on actual test data (but takes time to load)

Train on 100 samples, validate on 100 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
