# Importing Libraries and Defining Helper Functions

In [1]:
import sys;
import re;
import operator;
import numpy as np;
import math;
import random;
from os import listdir, environ;
from keras.utils import np_utils;
from keras.regularizers import l2;
from keras.preprocessing.text import Tokenizer;
from keras.preprocessing.sequence import pad_sequences;
from keras.callbacks import ModelCheckpoint;
from keras.models import Sequential;
from keras.layers import Embedding, LSTM, Dense, Dropout, TimeDistributed, Input;
from keras.layers.wrappers import Bidirectional;
from keras.optimizers import Adam;

def cleanText(text):
    """Normalise raw story text into lowercase, space-separated tokens.

    Digits are removed, the text is lowercased, tabs/newlines and assorted
    punctuation become spaces, the punctuation marks , . ; ? are padded with
    spaces so they split off as separate tokens, and a few non-ASCII
    characters are mapped to ASCII equivalents.

    Args:
        text: raw text as a str.

    Returns:
        The cleaned str. Runs of spaces are NOT collapsed.
    """
    # Applied strictly in this order: '!' is first turned into " . " and the
    # later '.' rule then pads that dot a second time, so reordering the
    # table would change the exact spacing of the result.
    # A straight apostrophe (') is left unchanged.
    replacements = (
        ("\t", " "),
        ("\n", " "),
        ('!', " . "),
        ('"', " "),
        ('#', " "),
        ('(', " "),
        (')', " "),
        (',', " , "),
        ('-', " "),
        ('.', " . "),
        ('/', " "),
        (':', " "),
        (';', " ; "),
        ('?', " ? "),
        ('–', " "),
        ('—', " "),
        ('‘', "'"),
        ('…', " . "),
        ('ç', "c"),
        ('é', "e"),
    )

    # Raw string: "\d" in a normal literal is an invalid escape sequence.
    cleaned = re.sub(r"\d", "", text).lower()
    for old, new in replacements:
        cleaned = cleaned.replace(old, new)
    return cleaned

def generateData(batchSize, dataX, dataY):
    """Endlessly yield randomly positioned (X, Y) batches.

    Relies on the module-level names `tokenizer` and `timeSeriesLength`,
    which are read again on every iteration.

    Args:
        batchSize: number of consecutive patterns taken per batch.
        dataX: sequence of context strings.
        dataY: sequence of target-word strings, parallel to dataX.

    Yields:
        (X, Y) where X is the tokenised context padded/truncated to
        timeSeriesLength by pad_sequences, and Y is the one-hot encoding of
        the targets over len(tokenizer.word_index) + 1 classes.
    """
    while True:
        # Batch starts are always a multiple of timeSeriesLength.
        randomStart = np.random.randint(math.floor((len(dataX) - batchSize) / timeSeriesLength)) * timeSeriesLength
        batchEnd = randomStart + batchSize

        X = tokenizer.texts_to_sequences(dataX[randomStart:batchEnd])
        Y = tokenizer.texts_to_sequences(dataY[randomStart:batchEnd])
        X = pad_sequences(X, maxlen=timeSeriesLength)

        # A target that tokenised to nothing is given class 0 so every row
        # has exactly one label.
        for target in Y:
            if len(target) == 0:
                target.append(0)
        Y = np_utils.to_categorical(Y, nb_classes=len(tokenizer.word_index) + 1)
        yield (X, Y)
        
def generateNoisyData(batchSize, noiseSize, dataX, dataY):
    """Endlessly yield (X, Y) batches with some context positions zeroed.

    Same batching as generateData, except that in every row of X,
    `noiseSize` distinct, randomly chosen positions are overwritten with 0.

    Relies on the module-level names `tokenizer` and `timeSeriesLength`,
    which are read again on every iteration.

    Args:
        batchSize: number of consecutive patterns taken per batch.
        noiseSize: how many of the timeSeriesLength positions to zero per row.
        dataX: sequence of context strings.
        dataY: sequence of target-word strings, parallel to dataX.

    Yields:
        (X, Y) where X is the tokenised, padded and noised context and Y is
        the one-hot encoding of the targets over
        len(tokenizer.word_index) + 1 classes.

    Raises:
        ValueError: from random.sample, if noiseSize is negative or larger
            than timeSeriesLength.
    """
    while True:
        # Batch starts are always a multiple of timeSeriesLength.
        randomStart = np.random.randint(math.floor((len(dataX) - batchSize) / timeSeriesLength)) * timeSeriesLength
        batchEnd = randomStart + batchSize

        X = tokenizer.texts_to_sequences(dataX[randomStart:batchEnd])
        Y = tokenizer.texts_to_sequences(dataY[randomStart:batchEnd])
        X = pad_sequences(X, maxlen=timeSeriesLength)

        # Each row is a view into X, so assigning through it edits X in place.
        for sequence in X:
            for position in random.sample(range(timeSeriesLength), noiseSize):
                sequence[position] = 0

        # A target that tokenised to nothing is given class 0 so every row
        # has exactly one label.
        for target in Y:
            if len(target) == 0:
                target.append(0)
        Y = np_utils.to_categorical(Y, nb_classes=len(tokenizer.word_index) + 1)
        yield (X, Y)
        
# Limit CUDA to the device with index 1 for everything run from this kernel.
environ["CUDA_VISIBLE_DEVICES"] = "1"

Using TensorFlow backend.


# Preparing Data

In [2]:
# Settings shared by the data pipeline and the model cells below.
batchSize = 100         # patterns per generated batch / training batch dimension
minNumberOfWords = 10   # a sentence is kept only if it has MORE words than this
wordDimensions = 100    # embedding size; also selects which GloVe file is loaded

def prepareData(timeSeriesLength):
    """Read the story corpus and build the tokenizer, patterns and embeddings.

    Reads every file in "Stories/" (skipping names containing "DS_Store" or
    "ipynb"), cleans it with cleanText and derives next-word training
    patterns. Then loads word vectors from
    "GloveData/tingle-vectors-<wordDimensions>.txt" and, as a side effect,
    writes one "<index> <word>" line per vocabulary entry to
    "Gabb/wordsDic2.txt". Progress and statistics are printed along the way.

    Relies on the module-level names cleanText, minNumberOfWords and
    wordDimensions.

    Args:
        timeSeriesLength: number of context words per training pattern.

    Returns:
        The list [tokenizer, dataX, dataY, reverseDic, embeddingMatrix, rawText]:
          tokenizer       -- Tokenizer fitted on the cleaned stories
          dataX, dataY    -- context strings and the word following each one
          reverseDic      -- dict mapping word index -> word
          embeddingMatrix -- array of shape (len(word_index) + 1, wordDimensions);
                             rows of words without a loaded vector stay zero
          rawText         -- all cleaned stories concatenated, in read order
    """
    path = "Stories/"
    print("Reading tingle stories")
    files = [f for f in listdir(path) if ("DS_Store" not in f) and ("ipynb" not in f)]
    rawData = []
    rawSentences = []
    for f in files:
        # NOTE(review): read with the platform default encoding -- confirm it
        # matches how the story files were saved.
        with open(path + f) as storyFile:
            text = cleanText(storyFile.read())
        for s in re.split(r"\.|\?", text):
            if len(s.split()) > minNumberOfWords:
                rawSentences.append(s)
        rawData.append(text)
    rawText = "".join(rawData)
    print("%d sentences has been loaded"%len(rawSentences))
    print("%d stories has been loaded"%len(files))

    print("\nConverting input data to sequences...")
    dataX = []
    dataY = []
    for t in rawData:
        words = t.split()
        for k in range(0, timeSeriesLength, 1):
            # A story shorter than the window has no word k to predict.
            if k >= len(words):
                break
            # Growing prefix at the start of the story: words[0:k] -> words[k].
            dataX.append(" ".join(words[0:k]))
            dataY.append(words[k])
            # Non-overlapping full windows at offset k. The range bound keeps
            # i + timeSeriesLength strictly below len(words), so the target
            # index is always valid.
            for i in range(timeSeriesLength + k, len(words) - timeSeriesLength, timeSeriesLength):
                finalIndex = i + timeSeriesLength
                dataX.append(" ".join(words[i:finalIndex]))
                dataY.append(words[finalIndex])

    # Sentence-level patterns are only counted for the report below; they are
    # not part of the returned value.
    sentX = []
    sentY = []
    prefixLength = min(timeSeriesLength, minNumberOfWords)
    for sent in rawSentences:
        words = sent.split()
        words.append(".")
        for i in range(0, prefixLength):
            sentX.append(" ".join(words[0:i]))
            sentY.append(words[i])
        for i in range(prefixLength, len(words) - prefixLength, 1):
            finalIndex = i + timeSeriesLength
            # Reachable only when timeSeriesLength > minNumberOfWords.
            if finalIndex >= len(words):
                finalIndex = len(words) - 1
            sentX.append(" ".join(words[i:finalIndex]))
            sentY.append(words[finalIndex])
    print("Number of stories patterns: %d"%len(dataX))
    print("Number of sentence patterns: %d"%len(sentX))

    print("\nReshaping the data")
    minNumberOfRepeated = 0
    tokenizer = Tokenizer(filters="")
    tokenizer.fit_on_texts(rawData)

    wordCounts = tokenizer.word_counts
    commonCount = sum(1 for c in wordCounts.values() if c > minNumberOfRepeated)
    nonCommonCount = len(wordCounts) - commonCount
    print("Number of words repeated more than %d times are %d while %d non common"%(minNumberOfRepeated, commonCount, nonCommonCount))

    print("\nLoading GloVe")
    glovePath = "GloveData/"
    embeddingDic = {}
    with open(glovePath + "tingle-vectors-" + str(wordDimensions) + ".txt") as gloveFile:
        for line in gloveFile:
            values = line.split()
            if not values:
                # Skip blank lines instead of failing on values[0].
                continue
            embeddingDic[values[0]] = np.asarray(values[1:], dtype='float32')
    print('Found %d word vectors.' % len(embeddingDic))

    print("\nConstructing Embedding Matrix")
    embeddingMatrix = np.zeros((len(tokenizer.word_index) + 1, wordDimensions))
    missingVectors = 0
    reverseDic = {}
    with open('Gabb/wordsDic2.txt', 'w') as dictionaryFile:
        for w, i in tokenizer.word_index.items():
            reverseDic[i] = w
            dictionaryFile.write(str(i) + " " + w + "\n")
            if w in embeddingDic:
                embeddingMatrix[i] = embeddingDic[w]
            else:
                missingVectors += 1
    print("Embedding Matrix Loaded of Dimensions: " + str(embeddingMatrix.shape))
    print("%d out of %d is not in corpus"%(missingVectors, len(tokenizer.word_index)))

    return [tokenizer, dataX, dataY, reverseDic, embeddingMatrix, rawText]

# Training The Model

In [9]:
# Training configuration and data.
dropoutPercentage = 0.2
timeSeriesLength = 6
tokenizer, dataX, dataY, reverseDic, embeddingMatrix, rawText = prepareData(timeSeriesLength)

# Embedding -> two stacked LSTMs -> two ReLU dense layers -> softmax over the
# vocabulary, with a Dropout layer after every layer except the output.
model = Sequential([
    Embedding(
        input_dim=len(tokenizer.word_index) + 1,
        output_dim=wordDimensions,
        batch_input_shape=(batchSize, timeSeriesLength),
        weights=[embeddingMatrix],
        trainable=True
    ),
    Dropout(dropoutPercentage),
    LSTM(10 * wordDimensions, return_sequences=True, stateful=False, dropout_U=dropoutPercentage),
    Dropout(dropoutPercentage),
    LSTM(10 * wordDimensions, stateful=False, dropout_U=dropoutPercentage),
    Dropout(dropoutPercentage),
    Dense(10 * wordDimensions, activation='relu'),
    Dropout(dropoutPercentage),
    Dense(10 * wordDimensions, activation='relu'),
    Dropout(dropoutPercentage),
    Dense(len(tokenizer.word_index) + 1, activation='softmax', trainable=True),
])

# Training continues from this previously saved checkpoint.
model.load_weights("WordWeights/TimeSeries6/tingle-dropout-2936-1.3101.hdf5")
model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(lr=0.0001),
    metrics=['accuracy']
)

# Write a checkpoint whenever the training loss reaches a new minimum.
filepath = "WordWeights/TimeSeries6/tingle-dropout-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

# The trailing ';' keeps the returned history object out of the cell output.
model.fit_generator(
    generateData(batchSize, dataX, dataY),
    nb_epoch=3000,
    samples_per_epoch=20000,
    callbacks=callbacks_list
);

Reading tingle stories
21612 sentences has been loaded
134 stories has been loaded

Converting input data to sequences...
Number of stories patterns: 632470
Number of sentence patterns: 391047

Reshaping the data
Number of words repeated more than 0 times are 12443 while 0 non common

Loading GloVe
Found 12453 word vectors.

Constructing Embedding Matrix
Embedding Matrix Loaded of Dimensions: (12444, 100)
0 out of 12443 is not in corpus


TypeError: ('Keyword argument not understood:', 'recurrent_dropout')

# Testing the model

In [3]:
numOfWords = 200
timeSeriesLength = 6
tokenizer, dataX, dataY, reverseDic, embeddingMatrix, rawText = prepareData(timeSeriesLength)

# Inference network: same layer stack as the training cell without the
# Dropout layers, and with a batch dimension of 1.
model = Sequential()

model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=wordDimensions, batch_input_shape=(1, timeSeriesLength), weights=[embeddingMatrix], trainable=True))
model.add(LSTM(10 * wordDimensions, return_sequences=True, stateful=False))
model.add(LSTM(10 * wordDimensions, stateful=False))
model.add(Dense(10 * wordDimensions, activation='relu'))
model.add(Dense(10 * wordDimensions, activation='relu'))
model.add(Dense(len(tokenizer.word_index) + 1, activation='softmax', trainable=True))

# Alternative seed taken from the corpus: sentence = [dataX[80]]
sentence = ["i was walking in the streets going to my friend's house . while i was walking , i stumbled upon"]
weightsList = ["tingle-dropout-2282-0.9320.hdf5"]

# Only weightsList[0] is ever used, so load and compile once rather than once
# per seed sentence.
model.load_weights("WordWeights/TimeSeries6/" + weightsList[0])
model.compile(loss='categorical_crossentropy', optimizer='adam')

for t in range(0, len(sentence)):
    print("\n Story number " + str(t) + ":\n")
    pattern = tokenizer.texts_to_sequences([sentence[t]])
    pattern = pad_sequences(pattern, maxlen=timeSeriesLength)[0]
    previous = -1
    sys.stdout.write(sentence[t])
    i = 0
    total = sentence[t]
    result = None
    # Produce at least numOfWords predictions, then keep going until the
    # latest prediction is '.' or '?'.
    # NOTE(review): this never terminates if the model stops predicting either
    # token after numOfWords steps.
    while i < numOfWords or result not in ('.', '?'):
        x = np.reshape(pattern, (1, timeSeriesLength))
        prediction = model.predict(x, verbose=0)
        index = np.argmax(prediction)
        # NOTE(review): prepareData fills reverseDic from tokenizer.word_index
        # only; if index 0 is not in it, a predicted 0 raises KeyError -- confirm.
        result = reverseDic[index]
        # An immediate repeat of the previous word is not printed.
        if previous != index:
            if result in ('.', ',', ';', '?'):
                # Punctuation attaches to the preceding word.
                sys.stdout.write(result)
                total += result
            else:
                sys.stdout.write(" " + result)
                total += " " + result
            previous = index
        # Slide the context window forward by one token.
        pattern = np.append(pattern, index)[1:]
        i += 1
    print("\n\nThe end\n")

Reading tingle stories
21612 sentences has been loaded
134 stories has been loaded

Converting input data to sequences...
Number of stories patterns: 632470
Number of sentence patterns: 391047

Reshaping the data
Number of words repeated more than 0 times are 12443 while 0 non common

Loading GloVe
Found 12453 word vectors.

Constructing Embedding Matrix
Embedding Matrix Loaded of Dimensions: (12444, 100)
0 out of 12443 is not in corpus

 Story number 0:

i was walking in the streets going to my friend's house . while i was walking , i stumbled upon the chamber and then heading out into the parking lot and calling my girlfriend to confirm my status as a normal, red blooded, american heterosexual. yet, despite my best efforts, i find myself getting turned on. whoa. kirk says with a laugh, sensing the hardening of my cock up against his back. you getting excited back there, buddy? no. i protest, defensively. it sure doesn't feel like it. the unicorn prods with a laugh. that feels like a 

# Testing N-Grams

In [None]:
numOfWords = 200
# One weights file per context length: entry w is loaded from
# WordWeights/TimeSeries<w+1>/ and evaluated with timeSeriesLength = w + 1.
weightsList = ["tingle-2286-3.8420.hdf5", "tingle-2703-1.9646.hdf5",
               "tingle-2997-0.7981.hdf5", "tingle-1475-0.4469.hdf5",
               "better-fixed-tingle-0.2844.hdf5", "tingle-2616-0.2330.hdf5",
               "tingle-2840-0.2099.hdf5", "tingle-1803-0.2181.hdf5",
               "tingle-1794-0.2200.hdf5", "bigger-fixed-tingle-0.4058.hdf5"]

for w in range(0, len(weightsList)):
    print("Testing: " + weightsList[w])
    timeSeriesLength = w + 1
    tokenizer, dataX, dataY, reverseDic, embeddingMatrix, rawText = prepareData(timeSeriesLength)

    # cleanText leaves runs of spaces in rawText (e.g. "yet ,  despite"),
    # while the n-grams below are joined with single spaces. Collapse the
    # whitespace and pad both ends so matches are on whole tokens only.
    corpusText = " " + " ".join(rawText.split()) + " "

    model = Sequential()
    model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=wordDimensions, batch_input_shape=(1, timeSeriesLength), weights=[embeddingMatrix], trainable=True))
    model.add(LSTM(10 * wordDimensions, return_sequences=True, stateful=False))
    model.add(LSTM(10 * wordDimensions, stateful=False))
    model.add(Dense(10 * wordDimensions, activation='relu'))
    model.add(Dense(10 * wordDimensions, activation='relu'))
    model.add(Dense(len(tokenizer.word_index) + 1, activation='softmax', trainable=True))
    model.load_weights("WordWeights/TimeSeries" + str(w + 1) + "/" + weightsList[w])
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    totalGrams = {}   # g -> number of generated g-grams examined
    exactGrams = {}   # g -> how many of those occur verbatim in the corpus
    for t in range(100):
        print("Block " + str(t))
        # Seed the generation with a random training context.
        sentence = [dataX[random.randrange(len(dataX) - 1)]]
        pattern = tokenizer.texts_to_sequences(sentence)
        pattern = pad_sequences(pattern, maxlen=timeSeriesLength)[0]
        previous = -1
        i = 0
        totalText = sentence[0].split()
        while i < numOfWords:
            x = np.reshape(pattern, (1, timeSeriesLength))
            prediction = model.predict(x, verbose=0)
            index = np.argmax(prediction)
            result = reverseDic[index]
            # An immediate repeat of the previous word is not recorded.
            if previous != index:
                totalText.append(result)
                previous = index
            # Slide the context window forward by one token.
            pattern = np.append(pattern, index)[1:]
            i += 1
        for g in range(2, 10):
            print("\t" + str(g) + "-gram")
            if g not in totalGrams:
                totalGrams[g] = 0
                exactGrams[g] = 0
            # "+ 1" so the g-gram ending on the last word is included.
            for start in range(0, len(totalText) - g + 1):
                gram = " ".join(totalText[start:start + g])
                if " " + gram + " " in corpusText:
                    exactGrams[g] += 1
                totalGrams[g] += 1
    with open("grams_" + str(timeSeriesLength) + ".txt", "w") as f:
        for k in totalGrams:
            # Guard against a gram size for which nothing was examined.
            percent = 100 * exactGrams[k] / totalGrams[k] if totalGrams[k] else 0
            f.write(str(percent) + "%,")

print("\n\nThe end\n")

Testing: tingle-2286-3.8420.hdf5
Reading tingle stories
21612 sentences has been loaded
134 stories has been loaded

Converting input data to sequences...
Number of stories patterns: 633140
Number of sentence patterns: 499107

Reshaping the data
Number of words repeated more than 0 times are 12443 while 0 non common

Loading GloVe
Found 12453 word vectors.

Constructing Embedding Matrix
Embedding Matrix Loaded of Dimensions: (12444, 100)
0 out of 12443 is not in corpus
Block 0
	2-gram
	3-gram
	4-gram
	5-gram
	6-gram
	7-gram
	8-gram
	9-gram
Block 1
	2-gram
	3-gram
	4-gram
	5-gram
	6-gram
	7-gram
	8-gram
	9-gram
Block 2
	2-gram
	3-gram
	4-gram


# Testing Noisy Data

In [None]:
weightsList = ["tingle-dropout-2282-0.9320.hdf5", "tingle-2616-0.2330.hdf5"]

# The data depends only on timeSeriesLength, which is the same for every
# weights file, so prepare it once instead of once per file.
timeSeriesLength = 6
tokenizer, dataX, dataY, reverseDic, embeddingMatrix, rawText = prepareData(timeSeriesLength)

for t in range(0, len(weightsList)):
    print("Testing: " + weightsList[t])
    model = Sequential()
    model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=wordDimensions, batch_input_shape=(batchSize, timeSeriesLength), trainable=True))
    model.add(LSTM(10 * wordDimensions, return_sequences=True, stateful=False))
    model.add(LSTM(10 * wordDimensions, stateful=False))
    model.add(Dense(10 * wordDimensions, activation='relu'))
    model.add(Dense(10 * wordDimensions, activation='relu'))
    model.add(Dense(len(tokenizer.word_index) + 1, activation='softmax', trainable=True))
    model.load_weights("WordWeights/TimeSeries6/" + weightsList[t])
    model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.0001), metrics=['accuracy'])

    value = [0] * 6
    for m in range(0, 6):
        # Noise level m corresponds to m * 20% of the context, rounded down to
        # whole tokens. Integer arithmetic gives the same sizes the
        # float-accumulated percentage did (0, 1, 2, 3, 4, 6 for a window of
        # 6) without depending on floating-point rounding.
        noiseSize = (m * timeSeriesLength) // 5
        # Report the number of tokens actually zeroed; printing m would label
        # the final 100% step as "Missing 5" although 6 tokens are masked.
        print("Missing " + str(noiseSize))
        # value[m] is the SUM of the second evaluate_generator metric over
        # 100 runs, not the mean.
        for i in range(100):
            value[m] += model.evaluate_generator(generateNoisyData(batchSize, noiseSize, dataX, dataY), 20000)[1]
        print(value[m])
    with open("data_" + str(timeSeriesLength) + "_" + str(t) + ".txt", "w") as f:
        f.write(str(value))

Testing: tingle-dropout-2282-0.9320.hdf5
Reading tingle stories
21612 sentences has been loaded
134 stories has been loaded

Converting input data to sequences...
Number of stories patterns: 632470
Number of sentence patterns: 391047

Reshaping the data
Number of words repeated more than 0 times are 12443 while 0 non common

Loading GloVe
Found 12453 word vectors.

Constructing Embedding Matrix
Embedding Matrix Loaded of Dimensions: (12444, 100)
0 out of 12443 is not in corpus
Missing 0
94.860798482
Missing 1


# Saving Data

In [5]:
# NOTE(review): tokenizer, timeSeriesLength and embeddingMatrix are not defined
# in this cell; they come from whichever earlier cell ran last. The weights
# below are from the TimeSeries5 folder -- confirm timeSeriesLength matches
# before exporting, since it is baked into the saved architecture.
model = Sequential()

model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=wordDimensions, batch_input_shape=(1, timeSeriesLength), weights=[embeddingMatrix], trainable=True))
model.add(LSTM(10 * wordDimensions, return_sequences=True, stateful=False))
model.add(LSTM(10 * wordDimensions, stateful=False))
model.add(Dense(10 * wordDimensions, activation='relu'))
model.add(Dense(10 * wordDimensions, activation='relu'))
model.add(Dense(len(tokenizer.word_index) + 1, activation='softmax', trainable=True))
model.load_weights("WordWeights/TimeSeries5/better-fixed-tingle-758-0.2844.hdf5")
model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.0001), metrics=['accuracy'])

# Export the weights and the architecture (as JSON) side by side.
model.save_weights("Gabb/model.hdf5")
with open("Gabb/model.json", "w") as f:
    f.write(model.to_json())