# Homework 5 for the BME Deep Learning course

Based on: https://github.com/BME-SmartLab-Education/vitmav45/blob/master/11/11-02-Word-Embeddings-Keras.ipynb
& https://github.com/BME-SmartLab-Education/vitmav45-2016-Epochalypse/blob/master/Final_Neural_Networks/1D_CNN_LSTM.ipynb

Written by Moró Anna

In [1]:
from __future__ import print_function
import os
import numpy as np
np.random.seed(1337)

import keras
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten, Activation, Dropout, LSTM
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model,Sequential
from keras.utils.data_utils import get_file
import random
import sys

# Notebook-wide configuration. All paths are relative to BASE_DIR.
BASE_DIR = '.'
# Folder that holds the pre-trained GloVe vector file (glove.6B.100d.txt is read from here).
GLOVE_DIR = BASE_DIR + '/GloVe-1.2/glove.6B/'
# Root folder of the corpus; each sub-folder is treated as one author and its files as that author's books.
TEXT_DATA_DIR = BASE_DIR + '/books'
MAX_NB_WORDS = 20000      # Vocabulary cap passed to the tokenizer and used when sizing the embedding matrix
EMBEDDING_DIM = 100       # Width of one embedding vector; must match the loaded GloVe file (100d)
LENGTH_OF_CHUNKS = 100    # Number of word indices per sample fed to the network (100 words/sample)

Using TensorFlow backend.


# Import GloVe embeddings

In [2]:
# Parse the pre-trained GloVe file into a dict: word -> float32 vector.
# Every non-empty line is "<word> <coef_1> <coef_2> ...", separated by whitespace.
embeddings_index = {}
glove_path = os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')

# NOTE(review): the file is opened with the platform default encoding, as before;
# consider an explicit UTF-8 decoding if this is run on Python 3 — confirm.
# The context manager guarantees the handle is closed even if parsing fails half-way.
with open(glove_path) as f:
    for line in f:
        values = line.split()
        if not values:
            # A blank line carries no word/vector pair; skip it instead of
            # failing with IndexError on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Number of loaded embeddings:', len(embeddings_index))

Number of loaded embeddings: 400000


# Import the books

I will use 9 books, 3 books from each of 3 authors:
    Jane Austen: Pride and Prejudice, Sense and Sensibility, Mansfield Park;
    Edgar Allan Poe: The Masque of the Red Death, The Cask of Amontillado, The Fall of the House of Usher;
    H. G. Wells: The Time Machine, The Island of Doctor Moreau, The Red Room

In [3]:
texts = []         # full text of every book, one string per book
labels_index = {}  # author folder name -> numeric author ID
labels = []        # author ID of each book, parallel to `texts`

# Every sub-folder of TEXT_DATA_DIR is one author; every file in it is one book.
# Both listings are sorted so that the IDs and the book order are deterministic.
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    print(name)
    path = os.path.join(TEXT_DATA_DIR, name)
    if os.path.isdir(path):
        label_id = len(labels_index)   # next free ID for a new author
        print(label_id)
        labels_index[name] = label_id
        for fname in sorted(os.listdir(path)):
            fpath = os.path.join(path, fname)
            if not os.path.isfile(fpath):
                # Nested folders cannot be read as a book; skip them
                # rather than crash inside open().
                continue
            print(fpath)
            # The context manager closes the file even if read() raises.
            with open(fpath) as f:
                texts.append(f.read())
            labels.append(label_id)

print('Number of books: ', len(texts))

austen
0
./books/austen/mansfield.txt
./books/austen/pride.txt
./books/austen/sense.txt
poe
1
./books/poe/cask.txt
./books/poe/fall.txt
./books/poe/masque.txt
wells
2
./books/wells/island.txt
./books/wells/redRoom.txt
./books/wells/timeMachine.txt
Number of books:  9


# Tokenization of the texts

In [4]:
# One-hot encode the per-book author IDs (one row per book).
# NOTE: this rebinds `labels`, so the cell is not idempotent — re-run the
# book-loading cell before running this one a second time.
labels = to_categorical(np.asarray(labels))

# Fit the vocabulary (capped by MAX_NB_WORDS) on all books, then turn every
# book into a sequence of integer word indices.
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(texts)

print('The number of different words found in all of the books: ', len(word_index))

The number of different words found in all of the books:  19909


# Function to cut up the "texts" into smaller parts

(The length of the pieces can be set with the variable LENGTH_OF_CHUNKS)

In [6]:
def cutUp(sequence, chunk_length=None):
    """Split a token sequence into consecutive chunks of equal length.

    Only complete chunks are returned: a trailing remainder shorter than
    ``chunk_length`` is discarded, while a final chunk that is exactly
    full-length is kept.

    Parameters
    ----------
    sequence : sequence
        Sliceable sequence of tokens (e.g. a list of word indices).
    chunk_length : int, optional
        Number of tokens per chunk. When omitted, the notebook-level
        constant LENGTH_OF_CHUNKS is used.

    Returns
    -------
    data : numpy.ndarray
        1-D object array; element ``k`` is the slice
        ``sequence[k * chunk_length:(k + 1) * chunk_length]``.
    count : int
        Number of chunks in ``data``.

    Raises
    ------
    ValueError
        If ``chunk_length`` is not a positive number.
    """
    if chunk_length is None:
        chunk_length = LENGTH_OF_CHUNKS
    if chunk_length <= 0:
        raise ValueError('chunk_length must be positive, got %r' % (chunk_length,))

    # Floor division gives the number of *complete* chunks, so the incomplete
    # tail (if any) is never produced and nothing has to be deleted afterwards.
    nb_chunks = len(sequence) // chunk_length

    # A 1-D object array keeps one chunk per element regardless of how many
    # chunks there are, so callers can concatenate results chunk-wise.
    data = np.empty(nb_chunks, dtype=object)
    for k in range(nb_chunks):
        start = k * chunk_length
        data[k] = sequence[start:start + chunk_length]

    return data, nb_chunks

# Transforming the dataset and the labels

In [7]:
chunk_list = []   # every chunk of every book, in book order
y = []            # one-hot author label per chunk, parallel to chunk_list

# Iterate over however many books were tokenized instead of a hard-coded count.
for i in range(len(sequences)):
    data_raw, length = cutUp(sequences[i])   # cut the book into fixed-size pieces
    chunk_list.extend(data_raw)               # one list entry per chunk
    y.extend([labels[i]] * length)            # same author label for every chunk of this book

# Store the chunks in a 1-D object array (one chunk per element). Filling it
# element by element prevents NumPy from flattening the chunks into a single
# long vector of word indices, which the later cells could not reshape.
x_data = np.empty(len(chunk_list), dtype=object)
for k, chunk in enumerate(chunk_list):
    x_data[k] = chunk

# Built once after the loop. No squeeze is applied, so the array keeps one
# row per sample even if there is only a single sample.
y_data = np.asarray(y)

assert len(x_data) == len(y_data), "every chunk needs exactly one label"

print("Input data: ", x_data.shape)
print("Labels: ", y_data.shape)

Input data:  (5459,)
Labels:  (5459, 3)


# Divide the dataset into train, validation and test data

Train data: 70%, validation data: 15%, test data: the remaining ~15%

In [8]:
# To shuffle the data, shuffle a vector which contains the available indices
# Shuffle samples and labels together by drawing one random permutation of
# the sample positions and indexing both arrays with it.
indices = np.arange(x_data.shape[0])
np.random.shuffle(indices)

# NOTE: `labels` is rebound here (it previously held the per-book labels), so
# the earlier cells must be re-run before the chunking cell is executed again.
data, labels = x_data[indices], y_data[indices]

# Nominal split sizes. The test set actually receives everything after the
# validation slice, so it may be slightly larger than nb_test_samples.
nb_train_samples = int(len(data)*0.7)
nb_val_samples = nb_test_samples = int(len(data)*0.15)

# Slice boundaries inside the shuffled arrays
train_end = nb_train_samples
val_end = train_end + nb_val_samples

x_tr, x_v, x_t = data[:train_end], data[train_end:val_end], data[val_end:]
y_train, y_val, y_test = labels[:train_end], labels[train_end:val_end], labels[val_end:]

# Reshape the data matrices

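In its current shape (a 1-D array whose elements are the individual chunks) the data cannot be processed by a neural network, so it has to be rearranged into a 2-D matrix with one chunk per row.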

In [9]:
def _chunks_to_matrix(chunks):
    """Copy a 1-D array of chunks into a (n_chunks, LENGTH_OF_CHUNKS) float matrix."""
    matrix = np.zeros((chunks.size, LENGTH_OF_CHUNKS))
    for row in range(chunks.size):
        matrix[row] = chunks[row]   # each chunk becomes one row
    return matrix


# Same conversion for all three splits
x_train = _chunks_to_matrix(x_tr)
x_val = _chunks_to_matrix(x_v)
x_test = _chunks_to_matrix(x_t)

print("Train data shape: ", x_train.shape)
print("Validation data shape: ", x_val.shape)
print("Test data shape: ", x_test.shape)

Train data shape:  (3821, 100)
Validation data shape:  (818, 100)
Test data shape:  (820, 100)


# Use embedding on the texts

In [10]:
# The matrix has one row per word index from 0 to nb_words inclusive.
# Presumably index 0 is never assigned to a word by the tokenizer — TODO confirm.
nb_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))

# Copy the GloVe vector of every in-range word that has one; rows of words
# without a pre-trained vector stay all-zero.
for word, i in word_index.items():
    if i <= MAX_NB_WORDS and word in embeddings_index:
        embedding_matrix[i] = embeddings_index[word]

print ('Number of lines of the embedding matrix:', len(embedding_matrix))

Number of lines of the embedding matrix: 19910


# Building the RNN

The model consists of an embedding layer, two blocks of 1D convolution → max-pooling → dropout, an LSTM layer after these, and finally a fully connected layer with softmax activation. Early stopping is also used during training.

In [11]:
# Callback monitoring the validation loss with a patience of 10 epochs
earlyStopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, verbose=0, mode='auto')


def _conv_pool_dropout(pool_length):
    """Return one convolution -> max-pooling -> dropout group as a list of layers."""
    return [
        Conv1D(nb_filter=128, filter_length=5, border_mode='valid', activation='relu', subsample_length=2),
        MaxPooling1D(pool_length=pool_length),
        Dropout(0.5),
    ]


# The embedding layer is initialised with the GloVe matrix and frozen for the
# first training stage (trainable=False).
embedding_layer = Embedding(nb_words + 1, EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=LENGTH_OF_CHUNKS,
                            trainable=False)

# Layers are created and added in the same order as they appear in the network.
layer_stack = ([embedding_layer]
               + _conv_pool_dropout(5)
               + _conv_pool_dropout(3)
               + [LSTM(50), Dense(3, activation='softmax')])   # 3 outputs, one per author

model = Sequential()
for layer in layer_stack:
    model.add(layer)

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['acc'])

In [12]:
# Stage 1: train with the embedding layer frozen, validating after every epoch.
model.fit(
    x_train, y_train,
    batch_size=50,
    nb_epoch=15,
    validation_data=(x_val, y_val),
    callbacks=[earlyStopping],
)

# Final expression of the cell: evaluation on the held-out test split
# (the model was compiled with a loss and the 'acc' metric).
model.evaluate(x_test, y_test)

Train on 3821 samples, validate on 818 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


[0.21692888189743204, 0.92682926800192855]

# Next, the embedding layer is trained as well (fine-tuning)

In [13]:
# Stage 2: unfreeze the first layer (the embedding) so its weights are updated too.
first_layer = model.layers[0]
first_layer.trainable = True

# The model is compiled again after the flag change — presumably required for
# the new `trainable` setting to take effect; confirm against the Keras docs.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['acc'])

model.fit(
    x_train, y_train,
    batch_size=50,
    nb_epoch=10,
    validation_data=(x_val, y_val),
    callbacks=[earlyStopping],
)

# Final expression of the cell: evaluation on the held-out test split
model.evaluate(x_test, y_test)

Train on 3821 samples, validate on 818 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.23516770859317082, 0.92926829268292688]