In [16]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import text_to_word_sequence, Tokenizer, hashing_trick, one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
import os
import collections

# Load Data

In [17]:
dataPath = '/Users/Benjy/Documents/Modelling/CBTest/data'

In [18]:
def readWords(filename):
    #split file into separate words
    with tf.gfile.GFile(filename,"rb") as f:
        return f.read().decode("utf-8").replace("\n", "<eos>").split()

In [19]:
def buildVocab(filename):
    #identify each unique word and give it a unique integer
    data = readWords(filename)
    
    counter = collections.Counter(data) #create counter object around words in file
    
    countPairs = sorted(counter.items(),key=lambda x: (-x[1], x[0])) #creates tuples of each word and its frequency
                                                                       #and then sorts it in descending order of
                                                                            #frequency
    
    words, _ = list(zip(*countPairs)) #create two lists: first list is list of words, second list is their frequencies
    wordToId = dict(zip(words,range(len(words)))) # create dictionary that has words as keys and individual values
                                                    #for each word as its value
                                                        #most frequent words have lowest ID numbers
    return wordToId
    

In [20]:
def fileToWordIds(filename, wordToId):
    #Given a word, return the value of that word
    data = readWords(filename)
    return [wordToId[word] for word in data if word in wordToId]
    

In [21]:
def loadData():
    #get the data paths
    trainPath = os.path.join(dataPath, 'cbt_train.txt')
    valPath = os.path.join(dataPath, 'cbt_valid.txt')
    testPath = os.path.join(dataPath, 'cbt_test.txt')
    
    wordToId = buildVocab(trainPath)#get dictionary of each word and unique integer
    trainData = fileToWordIds(trainPath,wordToId) #get unique integer of each word in training data
    valData = fileToWordIds(valPath,wordToId) #get unique integer of each word in training data
    testData = fileToWordIds(testPath,wordToId) #get unique integer of each word in training data
    vocabSize = len(wordToId) # get length of vocab
    reverseDict = dict(zip(wordToId.values(),wordToId.keys()))#get reverse dictionary so can translate integer to word                     
    
    print(trainData[:5])
    #print(wordToId,'wordToId')
    print(vocabSize,'vocabSize')
    print(" ".join([reverseDict[x] for x in trainData[:10]]))
    
    return trainData, valData, testData, vocabSize, reverseDict





In [22]:
train_data, valid_data, test_data, vocabulary, reversed_dictionary = loadData()


[52167, 119, 45977, 2089, 14287]
68032 vocabSize
_BOOK_TITLE_ : Andrew_Lang___Prince_Prigio.txt.out<eos>CHAPTER I. -LCB- Chapter heading picture : p1.jpg


In [23]:
print(" ".join([reversed_dictionary[x] for x in train_data[100:110]])) #test out reverse dictionary

glad enough to be the mother of a little prince


# Create Data Generator for Mini-Batch Gradient Descent

In [24]:
class KerasBatchGenerator():
    
    def __init__(self,data,numSteps,batchSize,vocabulary,skipSteps):
        self.data = data #input data
        self.numSteps = numSteps #number of words inputting to model
        self.batchSize = batchSize #size of batch putting into model
        self.vocabulary = vocabulary #size of vocab
        self.skipSteps = skipSteps #number of words to skip between each batch
        self.current_idx = 0 #index to keep track of location in input data
        
    def generate(self):
        #create a generator function to generate the batches of input data
        x = np.zeros((self.batchSize,self.numSteps)) #input data will have dimensions [batch size, number of words]
        y = np.zeros((self.batchSize,self.numSteps,self.vocabulary))#training labels will have 
                                                                    #[batch Size, number of words, vocab size]
                                                                        #as the labels will have to be one-hot encoded
        while True:
            for i in range(self.batchSize): #loop through number of samples we want in the batch
                if self.current_idx + self.numSteps >= len(self.data): #if the current index is larger than length
                                                                            #of data we need to reset it to 0
                    self.current_idx = 0
                x[i,:] = self.data[self.current_idx:self.current_idx+self.numSteps]
                #x array will be filled from input data from the current index to current index + number of steps
                tempY = self.data[self.current_idx+1:self.current_idx+self.numSteps+1]
                #a temporary y array will just be the x array but one word ahead, as we are predicting next
                #words in the sequence
                y[i,:,:] = keras.utils.to_categorical(tempY,num_classes=self.vocabulary)
                #turn the tempY array into one-hot encoding 
                self.current_idx += self.skipSteps
                #update the current index according to how large our skip step parameter is
            yield x, y

In [25]:
numSteps = 50
batchSize = 20
skipSteps = numSteps

In [26]:
trainDataGen = KerasBatchGenerator(train_data,numSteps,batchSize,vocabulary,skipSteps)
valDataGen =KerasBatchGenerator(valid_data,numSteps,batchSize,vocabulary,skipSteps)

# Define Model

In [27]:
hiddenSize = 250
useDropout = True
model = keras.Sequential() #create model using sequential constructor
model.add(keras.layers.Embedding(vocabulary,hiddenSize,input_length =numSteps)) #add embedding layer, as its first layer
                                                                    #we have to describe the size of input 
                                                                        #i.e. number of words
model.add(keras.layers.LSTM(hiddenSize,return_sequences=True))
model.add(keras.layers.LSTM(hiddenSize,return_sequences=True))
if useDropout: #drop out is a tool to prevent overfitting, randomly turns off nodes during training
    model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.TimeDistributed(keras.layers.Dense(vocabulary))) #add dense layer for each of the time steps in numSteps
model.add(keras.layers.Activation('softmax'))#use softmax activation
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 250)           17008000  
_________________________________________________________________
lstm_2 (LSTM)                (None, 50, 250)           501000    
_________________________________________________________________
lstm_3 (LSTM)                (None, 50, 250)           501000    
_________________________________________________________________
dropout_1 (Dropout)          (None, 50, 250)           0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 50, 68032)         17076032  
_________________________________________________________________
activation_1 (Activation)    (None, 50, 68032)         0         
Total params: 35,086,032
Trainable params: 35,086,032
Non-trainable params: 0
________________________________________________________________

# Run Model

In [None]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])
#checkpoint saves the model after each epoch
checkpointer = keras.callbacks.ModelCheckpoint(filepath=dataPath + '/model-{epoch:02d}.hdf5', verbose=1)
numEpoch = 40
model.fit_generator(trainDataGen.generate(), len(train_data)//(batchSize*numSteps), numEpoch,
                        validation_data=valDataGen.generate(),
                        validation_steps=len(valid_data)//(batchSize*numSteps), callbacks=[checkpointer])

#this final line runs the model on the training data for a certain number of epochs
#the formula len(train_data//batch_size*num_steps) ensures that the entirety of the data is passed over
#during each epoch. This formula is for the number of iterations to run for each epoch.
#It also runs the validation data on the model after each epoch, and accuracy from this data will be returned

In [None]:
model.save('CBModel.h5')