# Imports & Helpers

In [1]:
import sys;
import re;
import operator;
import numpy as np;
import math;
from os import listdir, environ;
from keras.utils import np_utils;
from keras.regularizers import l2;
from keras.preprocessing.text import Tokenizer;
from keras.preprocessing.sequence import pad_sequences;
from keras.callbacks import ModelCheckpoint;
from keras.models import Sequential;
from keras.layers import Embedding, LSTM, Dense, Dropout, TimeDistributed, Input;
from keras.optimizers import Adam;

def cleanText(text):
    modifiedString = re.sub("\d", "", text);
    modifiedString = modifiedString.lower();
    modifiedString = modifiedString.replace("\t", " ");
    modifiedString = modifiedString.replace("\n", " ");
    modifiedString = modifiedString.replace('!', " . ");
    modifiedString = modifiedString.replace('"', " ");
    modifiedString = modifiedString.replace('#', " "); 
    modifiedString = modifiedString.replace("'", "'"); 
    modifiedString = modifiedString.replace('(', " "); 
    modifiedString = modifiedString.replace(')', " "); 
    modifiedString = modifiedString.replace(',', " , ");
    modifiedString = modifiedString.replace('-', " "); 
    modifiedString = modifiedString.replace('.', " . "); 
    modifiedString = modifiedString.replace('/', " ");
    modifiedString = modifiedString.replace(':', " ");
    modifiedString = modifiedString.replace(';', " ; "); 
    modifiedString = modifiedString.replace('?', " ? ");
    modifiedString = modifiedString.replace('–', " ");
    modifiedString = modifiedString.replace('—', " ");
    modifiedString = modifiedString.replace('‘', "'"); 
    modifiedString = modifiedString.replace('…', " . ");
    modifiedString = modifiedString.replace('ç', "c");
    modifiedString = modifiedString.replace('é', "e");
    
    return modifiedString;

def generateData(batchSize, dataX, dataY):
    while True:
        randomStart = np.random.randint(math.floor((len(dataX) - batchSize) / timeSeriesLength)) * timeSeriesLength;
        X = dataX[randomStart:randomStart+batchSize];
        Y = dataY[randomStart:randomStart+batchSize];
        X = tokenizer.texts_to_sequences(dataX[randomStart:randomStart+batchSize]);
        Y = tokenizer.texts_to_sequences(dataY[randomStart:randomStart+batchSize]);
        X = pad_sequences(X, maxlen=timeSeriesLength);
        for temp in Y:
            if len(temp) == 0:
                temp.append(0);
        Y = np_utils.to_categorical(Y, nb_classes=len(tokenizer.word_index) + 1);
        yield (X, Y);
        
os.environ["CUDA_VISIBLE_DEVICES"]="0"

Using TensorFlow backend.


# Load & Prepare Data

In [2]:
timeSeriesLength = 100;
batchSize = 100;
minNumberOfWords = 10;
path = "Stories/";
print("Reading tingle stories");
files = [f for f in listdir(path) if ("DS_Store" not in f) and ("ipynb" not in f)];
rawData = [];
rawSentences = [];
for f in files:
    text = open(path + f).read();
    text = cleanText(text);
    sent = text.split(".");
    for s in sent:
        if len(s.strip().split()) > minNumberOfWords:
            rawSentences.append(s);
    rawData.append(text);
print("%d sentences has been loaded"%len(rawSentences));
print("%d stories has been loaded"%len(files));

print("\nConverting input data to sequences...");
dataX = [];
dataY = [];
for t in rawData:
    words = t.split();
    for k in range(0, timeSeriesLength, 1):
        dataX.append(" ".join(words[0:k]));
        dataY.append(words[k]);
        for i in range(timeSeriesLength + k, len(words) - timeSeriesLength, timeSeriesLength):
            finalIndex = i+timeSeriesLength;
            if i + timeSeriesLength >= len(words):
                finalIndex = len(words) - 1;
            dataX.append(" ".join(words[i:finalIndex]));
            dataY.append(words[finalIndex]);

sentX = [];
sentY = [];
for sent in rawSentences:
    words = sent.split();
    words.append(".");
    for i in range(0, min(timeSeriesLength, minNumberOfWords)):
        sentX.append(" ".join(words[0:i]));
        sentY.append(words[i]);
    for i in range(min(timeSeriesLength, minNumberOfWords), len(words) - min(timeSeriesLength, minNumberOfWords), 1):
        finalIndex = i+timeSeriesLength;
        if i + timeSeriesLength >= len(words):
            finalIndex = len(words) - 1;
        sentX.append(" ".join(words[i:finalIndex]));
        sentY.append(words[finalIndex]);
print("Number of stories patterns: %d"%len(dataX));
print("Number of sentence patterns: %d"%len(sentX));

print("\nReshaping the data");
minNumberOfRepeated = 0;
tokenizer = Tokenizer(filters="");
tokenizer.fit_on_texts(rawData);
wordCountDictionary = tokenizer.word_counts;
wordCountDictionary = sorted(wordCountDictionary.items(), key=operator.itemgetter(1), reverse=True);
listOfWords = [];
nonCommonWords = [];
for w,c in wordCountDictionary:
    if c > minNumberOfRepeated:
        listOfWords.append(w);
    else:
        nonCommonWords.append(w);
print("Number of words repeated more than %d times are %d while %d non common"%(minNumberOfRepeated, len(listOfWords), len(nonCommonWords)));

print("\nLoading GloVe");
wordDimensions = 100;
glovePath = "GloveData/";
embeddingDic = {};
f = open(glovePath + "tingle-vectors-" + str(wordDimensions) + ".txt");
for line in f:
    values = line.split();
    currrentWord = values[0];
    currentVector = np.asarray(values[1:], dtype='float32');
    embeddingDic[currrentWord] = currentVector;
f.close();
print('Found %d word vectors.' % len(embeddingDic));

print("\nConstructing Embedding Matrix");
embeddingMatrix = np.zeros((len(tokenizer.word_index) + 1, wordDimensions));
wordsNotInWiki = [];
reverseDic = {};
for w, i in tokenizer.word_index.items():
    reverseDic[i] = w;
    if w in embeddingDic:
        embeddingMatrix[i] = embeddingDic[w];
    else:
        wordsNotInWiki.append(w);
print("Embedding Matrix Loaded of Dimensions: " + str(embeddingMatrix.shape));
print("%d out of %d is not in corpus"%(len(wordsNotInWiki), len(tokenizer.word_index)));

Reading tingle stories
22214 sentences has been loaded
134 stories has been loaded

Converting input data to sequences...
Number of stories patterns: 619874
Number of sentence patterns: 353011

Reshaping the data
Number of words repeated more than 0 times are 12443 while 0 non common

Loading GloVe
Found 12453 word vectors.

Constructing Embedding Matrix
Embedding Matrix Loaded of Dimensions: (12444, 100)
0 out of 12443 is not in corpus


# Training Deep Tingle

- Input dimension = 100
- Trained Embedding Layer of input 100 timeseries and output of 100 dimensions
- Not Stateful LSTM
- using all the stories with no limit on the max number of words/with limit
- 2 Hidden LSTM Layers each of 1000 nodes
- Adam with learning rate 0.0001 and other are defaults
- Batchsize 100
- 100 Epochs
- 20,000 samples per epoch
- categorical crossentropy as loss function

In [3]:
dropOutPercentage = 0.2;

model = Sequential();

model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=wordDimensions, batch_input_shape=(100, timeSeriesLength),  weights=[embeddingMatrix], trainable=False));
# model.add(Dropout(dropOutPercentage));
model.add(LSTM(10 * wordDimensions, return_sequences=True, stateful=False));
# model.add(Dropout(dropOutPercentage));
model.add(LSTM(10 * wordDimensions, stateful=False));
# model.add(Dropout(dropOutPercentage));
model.add(Dense(len(tokenizer.word_index)+1, activation='softmax'));
model.load_weights("WordWeights/tingle-91-2.8683.hdf5");
model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.0001));

filepath="WordWeights/tingle-{epoch:02d}-{loss:.4f}.hdf5";
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min');
callbacks_list = [checkpoint];

model.fit_generator(generateData(100, dataX, dataY), nb_epoch=100, samples_per_epoch=20000, callbacks=callbacks_list);

OSError: Unable to open file (Unable to open file: name = 'wordweights/tingle-91-2.8683.hdf5', errno = 2, error message = 'no such file or directory', flags = 0, o_flags = 0)

# Training Deep Tingle to predict word dimensions
- Same as above
- Have a fixed final layer to convert the word Dimensions to which word

In [None]:
model = Sequential();

model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=wordDimensions, batch_input_shape=(batchSize, timeSeriesLength),  weights=[embeddingMatrix], trainable=False));
model.add(LSTM(10 * wordDimensions, return_sequences=True, stateful=False));
model.add(LSTM(10 * wordDimensions, stateful=False));
model.add(Dense(wordDimensions, activation='linear'));

fixedModel = Sequential();
fixedModel.add(Dense(10 * wordDimensions, batch_input_shape=(None, wordDimensions), activation='relu', trainable=False));
fixedModel.add(Dense(len(tokenizer.word_index) + 1, activation='softmax', trainable=False));
fixedModel.load_weights("GloVeToCategorical/converter-tokenizer-0.3000.hdf5");

model.add(fixedModel);
model.load_weights("WordWeights/fixed-tingle-98-9.5278.hdf5");
model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.0001), metrics=['accuracy']);

filepath="WordWeights/fixed-tingle-{epoch:02d}-{loss:.4f}.hdf5";
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min');
callbacks_list = [checkpoint];

model.fit_generator(generateData(batchSize, dataX, dataY), nb_epoch=100, samples_per_epoch=20000, callbacks=callbacks_list);

Epoch 1/100
Epoch 2/100

# Training Deep Tingle

- Same like the previous but the data set now is not stories but sentences. 
- Sequence length is 10.

In [9]:
dropOutPercentage = 0.2;

model = Sequential();

model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=wordDimensions, batch_input_shape=(100, timeSeriesLength),  weights=[embeddingMatrix], trainable=False));
# model.add(Dropout(dropOutPercentage));
model.add(LSTM(10 * wordDimensions, return_sequences=True, stateful=False));
# model.add(Dropout(dropOutPercentage));
model.add(LSTM(10 * wordDimensions, stateful=False));
# model.add(Dropout(dropOutPercentage));
model.add(Dense(len(tokenizer.word_index)+1, activation='softmax'));
model.load_weights("WordWeights/tingle-common-1000x1000-3.5542.hdf5");
model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.0001));

filepath="WordWeights/tingle-{epoch:02d}-{loss:.4f}.hdf5";
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min');
callbacks_list = [checkpoint];

model.fit_generator(generateData(100, sentX, sentY), nb_epoch=100, samples_per_epoch=20000, callbacks=callbacks_list);

Epoch 1/100

KeyboardInterrupt: 

# Generate New Story

In [3]:
numOfWords = 200;

model = Sequential();

model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=wordDimensions, batch_input_shape=(1, timeSeriesLength),  weights=[embeddingMatrix]));
model.add(LSTM(10 * wordDimensions, return_sequences=True, stateful=False));
model.add(LSTM(10 * wordDimensions, stateful=False));
model.add(Dense(len(tokenizer.word_index)+1, activation='softmax'));
model.load_weights("WordWeights/tingle-common-sent-2xStory-2.2805.hdf5");
model.compile(loss='categorical_crossentropy', optimizer='adam');

sentence = ["I can suck his unicorn dick"];
pattern = tokenizer.texts_to_sequences(sentence);
pattern = pad_sequences(pattern, maxlen=timeSeriesLength)[0];
previous = -1;
sys.stdout.write(sentence[0] + " ");
i = 0;
while i < numOfWords or result != '.':
    x = np.reshape(pattern, (1, timeSeriesLength));
    prediction = model.predict(x, verbose=0);
    index = np.argmax(prediction);
    result = reverseDic[index];
    if previous != index:
        sys.stdout.write(result + " ");
        previous = index;
    pattern = np.append(pattern, index);
    pattern = pattern[1:len(pattern)];
    i+= 1;
    
print("\n\nThe end\n");

I can suck his unicorn dick with who second of it is just as away as the rest of my absurd begin to insist across the air . i know that this is my own room say would truly before but my life is a prehistoric . it's been a long short and a long time i finally tell the handsome desk in the passenger microphone . i look up at the towering turns with a cops wink . he keys a remember new and then he is right up next to me . i look up at him with a wink opens in his eyes velbot eyes . what do you want to do to this . i ask my voice trembling . i mean . i tell him . i need to be wiping . i tell him . i need to be wiping a mashly . i tell him . i jeans to him . so . i mean . i mean it's just a lot but it was a mashly . my attention is fellow with ourselves . the dinosaur haunted . you don't have the job to fuck me . i ask . i mean it was a lot but it was a who's place to be beginning . 

The end



# Generate Fixed Story
- fixed-tingle-97-3.5204.hdf5

In [4]:
numOfWords = 200;

model = Sequential();

model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=wordDimensions, batch_input_shape=(1, timeSeriesLength),  weights=[embeddingMatrix], trainable=False));
model.add(LSTM(10 * wordDimensions, return_sequences=True, stateful=False));
model.add(LSTM(10 * wordDimensions, stateful=False));
model.add(Dense(wordDimensions, activation='linear'));

fixedModel = Sequential();
fixedModel.add(Dense(10 * wordDimensions, batch_input_shape=(None, wordDimensions), activation='relu', trainable=False));
fixedModel.add(Dense(len(tokenizer.word_index) + 1, activation='softmax', trainable=False));
fixedModel.load_weights("GloVeToCategorical/converter-tokenizer-0.3000.hdf5");

model.add(fixedModel);
model.load_weights("WordWeights/fixed-tingle-98-8.6490.hdf5");
model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.0001));

sentence = ["I can suck his unicorn ass"];
pattern = tokenizer.texts_to_sequences(sentence);
pattern = pad_sequences(pattern, maxlen=timeSeriesLength)[0];
previous = -1;
sys.stdout.write(sentence[0] + " ");
i = 0;
while i < numOfWords:
    x = np.reshape(pattern, (1, timeSeriesLength));
    prediction = model.predict(x, verbose=0);
    index = np.argmax(prediction);
    result = reverseDic[index];
    if previous != index:
        sys.stdout.write(result + " ");
        previous = index;
    pattern = np.append(pattern, index);
    pattern = pattern[1:len(pattern)];
    i+= 1;
    
print("\n\nThe end\n");

I can suck his unicorn ass systematically up slapped systematically crisscrossed stupor systematically crisscrossed stupor systematically crisscrossed stupor systematically crisscrossed stupor crisscrossed stupor looking at triceratopses marketplace now addision now want colleagues conquers clattering newspaper mankind's outlawed crater bedroll systematically crisscrossed drills slapped systematically crisscrossed stupor systematically crisscrossed stupor systematically crisscrossed stupor crisscrossed stupor systematically crisscrossed stupor crisscrossed stupor crisscrossed stupor looking at triceratopses announces marketplace just much 

The end



In [3]:
model = Sequential();

model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=wordDimensions, batch_input_shape=(100, timeSeriesLength),  weights=[embeddingMatrix], trainable=False));
model.add(LSTM(10 * wordDimensions, return_sequences=True, stateful=False));
model.add(LSTM(10 * wordDimensions, stateful=False));
model.add(Dense(wordDimensions, activation='linear'));

fixedModel = Sequential();
fixedModel.add(Dense(10 * wordDimensions, batch_input_shape=(None, wordDimensions), activation='relu', trainable=False));
fixedModel.add(Dense(len(tokenizer.word_index) + 1, activation='softmax', trainable=False));
fixedModel.load_weights("GloVeToCategorical/converter-tokenizer-0.3000.hdf5");

model.add(fixedModel);
model.load_weights("WordWeights/fixed-tingle-98-8.6490.hdf5");
model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.0001), metrics=['accuracy']);

print(model.evaluate_generator(generateData(100, dataX, dataY), 20000))

[10.29761600971222, 0.056799997761845591]
