# Imports & Helpers

In [31]:
import sys;
import re;
import operator;
import numpy as np;
from os import listdir;
from keras.utils import np_utils;
from keras.preprocessing.text import Tokenizer;
from keras.preprocessing.sequence import pad_sequences;
from keras.callbacks import ModelCheckpoint;
from keras.models import Sequential;
from keras.layers import Embedding, LSTM, Dense, Dropout, TimeDistributed, Input;
from keras.optimizers import Adam;

def cleanText(text):
    modifiedString = re.sub("\d", "", text);
    modifiedString = modifiedString.lower();
    modifiedString = modifiedString.replace("\t", " ");
    modifiedString = modifiedString.replace("\n", " ");
    modifiedString = modifiedString.replace('!', " . ");
    modifiedString = modifiedString.replace('"', " ");
    modifiedString = modifiedString.replace('#', " "); 
    modifiedString = modifiedString.replace("'", "'"); 
    modifiedString = modifiedString.replace('(', " "); 
    modifiedString = modifiedString.replace(')', " "); 
    modifiedString = modifiedString.replace(',', " ");
    modifiedString = modifiedString.replace('-', " "); 
    modifiedString = modifiedString.replace('.', " . "); 
    modifiedString = modifiedString.replace('/', " ");
    modifiedString = modifiedString.replace(':', " ");
    modifiedString = modifiedString.replace(';', " . "); 
    modifiedString = modifiedString.replace('?', " . ");
    modifiedString = modifiedString.replace('–', " ");
    modifiedString = modifiedString.replace('—', " ");
    modifiedString = modifiedString.replace('‘', "'"); 
    modifiedString = modifiedString.replace('…', " . ");
    modifiedString = modifiedString.replace('ç', "c");
    modifiedString = modifiedString.replace('é', "e");
    
    return modifiedString;

def generateData(batchSize):
    while True:
        randomStart = np.random.randint(len(dataX) - batchSize);
        X = tokenizer.texts_to_sequences(dataX[randomStart:randomStart+batchSize]);
        Y = tokenizer.texts_to_sequences(dataY[randomStart:randomStart+batchSize]);
        X = pad_sequences(X, maxlen=timeSeriesLength);
        print(Y);
        Y = np_utils.to_categorical(Y, nb_classes=len(tokenizer.word_index) + 1);
        yield (X, Y);

# Load & Prepare Data

In [32]:
timeSeriesLength = 100;
minNumberOfWords = 10;
path = "Stories/";
print("Reading tingle stories");
files = [f for f in listdir(path) if ("DS_Store" not in f) and ("ipynb" not in f)];
rawData = [];
rawSentences = [];
for f in files:
    text = open(path + f).read();
    text = cleanText(text);
    sent = text.split(".");
    for s in sent:
        if len(s.strip().split()) > minNumberOfWords:
            rawSentences.append(s);
    rawData.append(text);
print("%d sentences has been loaded"%len(rawSentences));
print("%d stories has been loaded"%len(files));

print("\nConverting input data to sequences...");
dataX = [];
dataY = [];
for t in rawData:
    words = t.split();
    for i in range(0, timeSeriesLength):
        dataX.append(" ".join(words[0:i]));
        dataY.append(words[i]);
    for i in range(timeSeriesLength, len(words) - timeSeriesLength, 1):
        finalIndex = i+timeSeriesLength;
        if i + timeSeriesLength >= len(words):
            finalIndex = len(words) - 1;
        dataX.append(" ".join(words[i:finalIndex]));
        dataY.append(words[finalIndex]);
print("Number of patterns: %d"%len(dataX));

print("\nReshaping the data");
minNumberOfRepeated = 5;
tokenizer = Tokenizer();
tokenizer.fit_on_texts(rawData);
wordCountDictionary = tokenizer.word_counts;
wordCountDictionary = sorted(wordCountDictionary.items(), key=operator.itemgetter(1), reverse=True);
listOfWords = [];
nonCommonWords = [];
for w,c in wordCountDictionary:
    if c > minNumberOfRepeated:
        listOfWords.append(w);
    else:
        nonCommonWords.append(w);
print("Number of words repeated more than %d times are %d while %d non common"%(minNumberOfRepeated, len(listOfWords), len(nonCommonWords)));

print("\nLoading GloVe");
wordDimensions = 100;
glovePath = "GloveData/";
embeddingDic = {};
f = open(glovePath + "tingle-vectors-" + str(wordDimensions) + ".txt");
for line in f:
    values = line.split();
    currrentWord = values[0];
    currentVector = np.asarray(values[1:], dtype='float32');
    embeddingDic[currrentWord] = currentVector;
f.close();
print('Found %d word vectors.' % len(embeddingDic));

print("\nConstructing Embedding Matrix");
embeddingMatrix = np.zeros((len(tokenizer.word_index) + 1, wordDimensions));
wordsNotInWiki = [];
reverseDic = {};
for w, i in tokenizer.word_index.items():
    reverseDic[i] = w;
    if w in embeddingDic:
        embeddingMatrix[i] = embeddingDic[w];
    else:
        wordsNotInWiki.append(w);
print("Embedding Matrix Loaded of Dimensions: " + str(embeddingMatrix.shape));
print("%d out of %d is not in corpus"%(len(wordsNotInWiki), len(tokenizer.word_index)));

Reading tingle stories
20856 sentences has been loaded
134 stories has been loaded

Converting input data to sequences...
Number of patterns: 586453

Reshaping the data
Number of words repeated more than 5 times are 5427 while 7012 non common

Loading GloVe
Found 12450 word vectors.

Constructing Embedding Matrix
Embedding Matrix Loaded of Dimensions: (12440, 100)
0 out of 12439 is not in corpus


# Training Deep Tingle

- Input dimension = 100
- Trained Embedding Layer of input 100 timeseries and output of 100 dimensions
- Not Stateful LSTM
- using all the stories with no limit on the max number of words
- 2 Hidden LSTM Layers each of 1000 nodes
- Adam with learning rate 0.0001 and other are defaults
- Batchsize 100
- 100 Epochs
- 20,000 samples per epoch
- categorical crossentropy as loss function

In [33]:
dropOutPercentage = 0.2;

model = Sequential();

model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=wordDimensions, batch_input_shape=(100, timeSeriesLength),  weights=[embeddingMatrix], trainable=True));
# model.add(Dropout(dropOutPercentage));
model.add(LSTM(10 * wordDimensions, return_sequences=True, stateful=False));
# model.add(Dropout(dropOutPercentage));
model.add(LSTM(10 * wordDimensions, stateful=False));
# model.add(Dropout(dropOutPercentage));
model.add(Dense(len(tokenizer.word_index)+1, activation='softmax'));
model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.0001));

filepath="WordWeights/fulltingle-input-1-100-stateful-hidden-1000-1000-{epoch:02d}-{loss:.4f}.hdf5";
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min');
callbacks_list = [checkpoint];

model.fit_generator(generateData(100), nb_epoch=100, samples_per_epoch=20000, callbacks=callbacks_list);

Epoch 1/100
[[28], [3231], [8], [21], [1], [111], [1], [1649], [147], [9950], [1357], [9], [2], [134], [], [1443], [407], [5], [31], [202], [3], [487], [242], [20], [33], [10], [1], [66], [9], [2], [815], [], [322], [126], [22], [], [2], [498], [207], [30], [46], [4], [562], [], [15], [918], [8], [14], [76], [1585], [1], [104], [580], [70], [15], [918], [8], [14], [76], [98], [], [2], [1030], [79], [5], [31], [275], [800], [3], [310], [4], [686], [5993], [276], [1993], [54], [17], [4386], [211], [843], [], [21], [89], [17], [293], [2], [118], [144], [88], [1168], [7], [125], [1834], [371], [11], [77], [161], [5], [106], [652]]


Exception in thread Thread-6:
Traceback (most recent call last):
  File "/home/jupyter/anaconda3/envs/py35/lib/python3.5/threading.py", line 914, in _bootstrap_inner
    self.run()
  File "/home/jupyter/anaconda3/envs/py35/lib/python3.5/threading.py", line 862, in run
    self._target(*self._args, **self._kwargs)
  File "/home/jupyter/anaconda3/envs/py35/lib/python3.5/site-packages/keras/engine/training.py", line 429, in data_generator_task
    generator_output = next(self._generator)
  File "<ipython-input-31-e177f122b0c8>", line 48, in generateData
    Y = np_utils.to_categorical(Y, nb_classes=len(tokenizer.word_index) + 1);
  File "/home/jupyter/anaconda3/envs/py35/lib/python3.5/site-packages/keras/utils/np_utils.py", line 23, in to_categorical
    y = np.array(y, dtype='int').ravel()
ValueError: setting an array element with a sequence.



ValueError: output of generator should be a tuple (x, y, sample_weight) or (x, y). Found: None

# Generate New Story

In [25]:
numOfWords = 200;

model = Sequential();

model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=wordDimensions, batch_input_shape=(1, timeSeriesLength),  weights=[embeddingMatrix]));
model.add(LSTM(10 * wordDimensions, return_sequences=True, stateful=False));
model.add(LSTM(10 * wordDimensions, stateful=False));
model.add(Dense(len(tokenizer.word_index)+1, activation='softmax'));
model.load_weights("WordWeights/fulltingle-input-100-100-hidden-1000-1000-99-4.1564.hdf5");
model.compile(loss='categorical_crossentropy', optimizer='adam');

pattern = [0] * timeSeriesLength;
i = 0;
while i < numOfWords:
    x = np.reshape(pattern, (1, timeSeriesLength));
    prediction = model.predict(x, verbose=0);
    index = np.argmax(prediction);
    result = reverseDic[index];
    sys.stdout.write(result + " ");
    pattern.append(index);
    pattern = pattern[1:len(pattern)];
    i+= 1;
    
print("\n\nThe end\n");

ValueError: Dimension 1 in both shapes must be equal, but are 12440 and 12454 for 'Assign_25' (op: 'Assign') with input shapes: [1000,12440], [1000,12454].