In [41]:
from __future__ import print_function
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
from sklearn.cross_validation import train_test_split
import numpy as np
import random
import sys

<h4>Load Data:</h4>
<p>We first create a function called load_data() to load our english and french datasets. Here we pass in the filepath to the data and then open it, read it, and lowercase it. We do so as follows:</p>

In [8]:
def load_data(filepath): 
    text = open(filepath).read().lower()
    return text

eng_text = load_data('data/eng.txt')
frn_text = load_data('data/frn.txt')

<p>These data files will be our corpus, let us take a look at the lengths:</p>

In [9]:
print('english corpus length:', len(eng_text))
print('french corpus length:', len(frn_text))

english corpus length: 10746
french corpus length: 12009


<p>The above lengths are for each character in our document. So, the english document has 10746 characters and the french corpus has 12009 characters in the document:</p>

<h4>Character Dictionary:</h4>
<p>Let us now create a function to create a dictonary from our respective datasets. We write a function called character_dict(). This function will take in our text and first create a set of all characters in the text. We will then create two dictionaries, one where the characters are the key and the other where the values are the key. We do so as follows:</p>

In [13]:
def character_dict(text):
    chars = sorted(list(set(text)))
    char_indices = dict((c, i) for i, c in enumerate(chars))
    indices_char = dict((i, c) for i, c in enumerate(chars))
    return chars, char_indices, indices_char

In [18]:
eng_chars, eng_char_indices, eng_indices_char = character_dict(eng_text)
frn_chars, frn_char_indices, frn_indices_char = character_dict(frn_text)

<h4>Generate Sentences:</h4>
<p>Let us now generate our sentences for our texts. To do so we will slide a window of maxlength 40 across each character and then slide over 3 characters and do the same. We do so in our generate_sentences function below:</p>

In [33]:
maxlen = 40
step = 3

In [34]:
def generate_sentences(text):
    sentences = []
    next_chars = []
    for i in range(0, len(text) - maxlen, step):
        sentences.append(text[i: i + maxlen])
        next_chars.append(text[i + maxlen])
    return sentences, next_chars


eng_sentences, eng_next_chars = generate_sentences(eng_text)
frn_sentences, frn_next_chars = generate_sentences(frn_text)

<h4>Features and Labels:</h4>

In [78]:
def vectorize(sentences,chars,char_indices,next_chars):
    X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
    y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence):
            X[i, t, char_indices[char]] = 1
        y[i, char_indices[next_chars[i]]] = 1
    return X,y

eng_X, eng_Y = vectorize(eng_sentences,eng_chars,eng_char_indices,eng_next_chars)
frn_X, frn_Y = vectorize(frn_sentences,frn_chars,frn_char_indices,frn_next_chars)

<h4>Train/Test Split:</h4>

In [100]:
eng_X_train, eng_X_test, eng_Y_train, eng_Y_test = train_test_split(eng_X,eng_Y,test_size=0.2,random_state=20)
frn_X_train, frn_X_test, frn_Y_train, frn_Y_train = train_test_split(frn_X,frn_Y,test_size=0.2,random_state=20)

<h4>Build LSTM Model:</h4>

In [163]:
def Build_Model(shape):
    model = Sequential()
    model.add(LSTM(128, input_shape=(maxlen, shape)))
    model.add(Dense(shape))
    model.add(Activation('softmax'))
    optimizer = RMSprop(lr=0.01)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)
    return model 

eng_shape = eng_train_X.shape[2]   #43
eng_LSTM = Build_Model(eng_shape)

frn_shape = frn_train_X.shape[2]
print(frn_shape)                 #41
frn_LSTM = Build_Model(frn_shape)


41


<h4>Train Model:</h4>

In [159]:
eng_history = eng_LSTM.fit(eng_X_train, eng_Y_train,batch_size=128,epochs=1)

Epoch 1/1


In [162]:
frn_history = frn_LSTM.fit(frn_X_train, frn_Y_train,batch_size=128,epochs=1)

ValueError: Input arrays should have the same number of samples as target arrays. Found 3192 input samples and 798 target samples.