In [73]:
import sys;
import re;
import operator;
import numpy as np;
from os import listdir;
from keras.utils import np_utils;
from keras.regularizers import l2;
from keras.preprocessing.text import Tokenizer;
from keras.preprocessing.sequence import pad_sequences;
from keras.callbacks import ModelCheckpoint;
from keras import backend as K;
from keras.engine.topology import Layer;
from keras.models import Sequential;
from keras.layers import Embedding, LSTM, Dense, Dropout, TimeDistributed, Input;
from keras.optimizers import Adam;

def cleanText(text):
    modifiedString = re.sub("\d", "", text);
    modifiedString = modifiedString.lower();
    modifiedString = modifiedString.replace("\t", " ");
    modifiedString = modifiedString.replace("\n", " ");
    modifiedString = modifiedString.replace('!', " . ");
    modifiedString = modifiedString.replace('"', " ");
    modifiedString = modifiedString.replace('#', " "); 
    modifiedString = modifiedString.replace("'", "'"); 
    modifiedString = modifiedString.replace('(', " "); 
    modifiedString = modifiedString.replace(')', " "); 
    modifiedString = modifiedString.replace(',', " ");
    modifiedString = modifiedString.replace('-', " "); 
    modifiedString = modifiedString.replace('.', " . "); 
    modifiedString = modifiedString.replace('/', " ");
    modifiedString = modifiedString.replace(':', " ");
    modifiedString = modifiedString.replace(';', " . "); 
    modifiedString = modifiedString.replace('?', " . ");
    modifiedString = modifiedString.replace('–', " ");
    modifiedString = modifiedString.replace('—', " ");
    modifiedString = modifiedString.replace('‘', "'"); 
    modifiedString = modifiedString.replace('…', " . ");
    modifiedString = modifiedString.replace('ç', "c");
    modifiedString = modifiedString.replace('é', "e");
    
    return modifiedString;

def generateData(batchSize, dataX, dataY):
    while True:
        randomStart = np.random.randint(len(dataX) - batchSize);
        X = dataX[randomStart:randomStart+batchSize];
        Y = dataY[randomStart:randomStart+batchSize];
        X = tokenizer.texts_to_sequences(dataX[randomStart:randomStart+batchSize]);
        Y = tokenizer.texts_to_sequences(dataY[randomStart:randomStart+batchSize]);
        X = pad_sequences(X, maxlen=timeSeriesLength);
        for temp in Y:
            if len(temp) == 0:
                temp.append(0);
        Y = np_utils.to_categorical(Y, nb_classes=len(tokenizer.word_index) + 1);
        yield (X, Y);

In [69]:
timeSeriesLength = 100;
minNumberOfWords = 10;
path = "Stories/";
print("Reading tingle stories");
files = [f for f in listdir(path) if ("DS_Store" not in f) and ("ipynb" not in f)];
rawData = [];
rawSentences = [];
for f in files:
    text = open(path + f).read();
    text = cleanText(text);
    sent = text.split(".");
    for s in sent:
        if len(s.strip().split()) > minNumberOfWords:
            rawSentences.append(s);
    rawData.append(text);
print("%d sentences has been loaded"%len(rawSentences));
print("%d stories has been loaded"%len(files));

print("\nConverting input data to sequences...");
dataX = [];
dataY = [];
for t in rawData:
    words = t.split();
    for i in range(0, timeSeriesLength):
        dataX.append(" ".join(words[0:i]));
        dataY.append(words[i]);
    for i in range(timeSeriesLength, len(words) - timeSeriesLength, 1):
        finalIndex = i+timeSeriesLength;
        if i + timeSeriesLength >= len(words):
            finalIndex = len(words) - 1;
        dataX.append(" ".join(words[i:finalIndex]));
        dataY.append(words[finalIndex]);

sentX = [];
sentY = [];
for sent in rawSentences:
    words = sent.split();
    words.append(".");
    for i in range(0, min(timeSeriesLength, minNumberOfWords)):
        sentX.append(" ".join(words[0:i]));
        sentY.append(words[i]);
    for i in range(min(timeSeriesLength, minNumberOfWords), len(words) - min(timeSeriesLength, minNumberOfWords), 1):
        finalIndex = i+timeSeriesLength;
        if i + timeSeriesLength >= len(words):
            finalIndex = len(words) - 1;
        sentX.append(" ".join(words[i:finalIndex]));
        sentY.append(words[finalIndex]);
print("Number of stories patterns: %d"%len(dataX));
print("Number of sentence patterns: %d"%len(sentX));

print("\nReshaping the data");
minNumberOfRepeated = 2;
tokenizer = Tokenizer(filters="");
tokenizer.fit_on_texts(rawData);
wordCountDictionary = tokenizer.word_counts;
wordCountDictionary = sorted(wordCountDictionary.items(), key=operator.itemgetter(1), reverse=True);
listOfWords = [];
nonCommonWords = [];
for w,c in wordCountDictionary:
    if c > minNumberOfRepeated:
        listOfWords.append(w);
    else:
        nonCommonWords.append(w);
print("Number of words repeated more than %d times are %d while %d non common"%(minNumberOfRepeated, len(listOfWords), len(nonCommonWords)));

print("\nLoading GloVe");
wordDimensions = 100;
glovePath = "GloveData/";
embeddingDic = {};
f = open(glovePath + "tingle-vectors-" + str(wordDimensions) + ".txt");
for line in f:
    values = line.split();
    currrentWord = values[0];
    currentVector = np.asarray(values[1:], dtype='float32');
    embeddingDic[currrentWord] = currentVector;
f.close();
print('Found %d word vectors.' % len(embeddingDic));

print("\nConstructing Embedding Matrix");
embeddingMatrix = np.zeros((len(tokenizer.word_index) + 1, wordDimensions));
wordsNotInWiki = [];
reverseDic = {};
for w, i in tokenizer.word_index.items():
    reverseDic[i] = w;
    if w in embeddingDic:
        embeddingMatrix[i] = embeddingDic[w];
    else:
        wordsNotInWiki.append(w);
print("Embedding Matrix Loaded of Dimensions: " + str(embeddingMatrix.shape));
print("%d out of %d is not in corpus"%(len(wordsNotInWiki), len(tokenizer.word_index)));

Reading tingle stories
20856 sentences has been loaded
134 stories has been loaded

Converting input data to sequences...
Number of stories patterns: 586453
Number of sentence patterns: 308268

Reshaping the data
Number of words repeated more than 2 times are 8088 while 4352 non common

Loading GloVe
Found 12450 word vectors.

Constructing Embedding Matrix
Embedding Matrix Loaded of Dimensions: (12441, 100)
0 out of 12440 is not in corpus


In [90]:
class KNNLayer(Layer):
    def __init__(self, weights, **kwargs):
        self.output_dim = weights.shape[1];
        self.W = K.variable(weights);
        super(KNNLayer, self).__init__(**kwargs);

    def build(self, input_shape):
        super(KNNLayer, self).build(input_shape);

    def call(self, x, mask=None):
        print(x);
#         newX = K.transpose(K.repeat_elements(x, self.output_dim, axis=0));
        return K.dot(x, self.W)
#         return K.sum(K.pow(newX - self.W, 2), axis=0);

    def get_output_shape_for(self, input_shape):
        return (input_shape[0], self.output_dim);

In [98]:
model = Sequential();

model.add(KNNLayer(np.transpose(np.array(embeddingMatrix))));

ValueError: The first layer in a Sequential model must get an `input_shape` or `batch_input_shape` argument.

In [44]:
wordSequence = [];
differentWords = rawData.split();
for i in range(0, len(differentWords), 100):
    if i < len(differentWords) - 100:
        wordSequence.append(" ".join(differentWords[i:i+100]));
    else:
        wordSequence.append(" ".join(differentWords[i:len(differentWords)]));
print(len(wordSequence[-1]));
print(wordSequence[-1]);

180
fierceness that sends shivers down my spine . " travis , will you marry me ? " my own sentient investment strategy asks . " of course i will , " i tell him . " of course i will . "


In [8]:
print(len(uniqueWords));
print(len(rawData.split()));
print(X.shape)

22976
557976
(3044078, 100, 1)


In [46]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(wordSequence)
sequences = tokenizer.texts_to_sequences(wordSequence)

In [57]:
from keras.preprocessing.sequence import pad_sequences

print(len(sequences[0]));
print(sequences[0])
print(len(tokenizer.word_index.items()))

data = pad_sequences(sequences, maxlen=100)
print(len(data[0]))
print(data[-1])

90
[11, 43, 7, 5, 345, 14, 64, 8973, 3981, 12, 52, 146, 164, 19, 1696, 6, 2, 800, 146, 816, 12, 4, 39, 164, 2, 4266, 36, 1, 241, 7, 9378, 1752, 3, 25, 19, 213, 4432, 7, 9629, 39, 5899, 47, 4663, 1609, 5401, 6616, 6, 464, 933, 8345, 28, 581, 9, 3, 16, 40, 2, 52, 544, 69, 2, 52, 5099, 22, 50, 70, 2, 137, 24, 1, 262, 7, 5, 432, 11, 717, 1598, 474, 11, 8, 2738, 310, 670, 97, 2, 763, 4015, 28, 1, 191]
12068
100
[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0 11069
     9  2058 11028    31     5   766  5914   165    13  4833    10     5
   135   588  3809  3204   156     7   245     2   165     2    67    32
     7

In [6]:
count = 0;
for i in range(len(rawData) - 1):
    if rawData[i] == rawData[i + 1] and rawData[i] == " ":
        count += 1;
        
print(str(count / float(len(rawData))));

0.00022508757056577487


In [None]:
from keras.models import Sequential;
from keras.layers import Dense, LSTM, Dropout;
import sys
import numpy

numOfCharacters = 1000;

model = Sequential();
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256));
model.add(Dropout(0.2));
model.add(Dense(Y.shape[1], activation='softmax'))
model.load_weights("AliceWeights/alltingle-weights-improvement-21-1.3086.hdf5");
model.compile(loss="categorical_crossentropy", optimizer="adam");
startText = startingData[np.random.randint(0, len(startingData))];
pattern = [charInt[c] for c in startText];
result = startText + "";
print("Starting Text: \"" + startText + "\"");

i = 0;
while i < numOfCharacters:
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(numChars)
    prediction = model.predict(x, verbose=0)
    index = numpy.argmax(prediction)
    result = intChar[index]
    seq_in = [intChar[value] for value in pattern]
    sys.stdout.write(result)
    pattern.append(index)
    pattern = pattern[1:len(pattern)]
    i+= 1;
    
print("The end\n");