In [1]:
import numpy as np
import pandas as pd
import keras.backend as K
import random
from glob import glob
import os, sys
from keras.preprocessing.image import load_img
from keras.layers import Dense, MaxPooling2D, BatchNormalization, Dropout, Flatten, Convolution2D
from keras.models import Sequential
from keras.regularizers import l2
from keras.optimizers import Adam
from IPython.display import FileLink
from keras.callbacks import ModelCheckpoint
import string
from keras.metrics import categorical_crossentropy, sparse_categorical_crossentropy
from keras.utils.np_utils import to_categorical
from keras.utils.data_utils import get_file
from keras.layers import Input, Embedding, Reshape, merge, LSTM, Bidirectional
from keras.layers import TimeDistributed, Activation, SimpleRNN, GRU
from keras.optimizers import SGD, RMSprop, Adam
from keras.metrics import categorical_crossentropy, categorical_accuracy
from keras.layers.convolutional import *
from keras.preprocessing import image, sequence
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils
from keras.models import Model

Using Theano backend.
Using gpu device 0: Graphics Device (CNMeM is enabled with initial size: 90.0% of memory, cuDNN 5110)


In [2]:
#from theano.sandbox import cuda
#cuda.use('gpu0')

In [3]:
def limit_mem():
    K.get_session().close() #new line lets you reset GPU memory without closing notebook
    cfg = K.tf.ConfigProto()
    cfg.gpu_options.allow_growth = True
    K.set_session(K.tf.Session(config=cfg))
limit_mem()

AttributeError: module 'keras.backend' has no attribute 'get_session'

In [4]:
path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
local_path = '/home/bfortuner/workplace/data/jokes/datasets/nietzsche.txt'
text = open(local_path).read()
print('length:', len(text)) #characters

length: 600893


In [5]:
#by converting to set, we get a list of unique characters in the text
chars = sorted(list(set(text)))

#Include 0 value in dataset (for padding)
chars.insert(0,"\0")

vocab_size = len(chars)
print ('distinct chars in vocab:', vocab_size)
''.join(chars)

distinct chars in vocab: 85


'\x00\n !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyzÆäæéë'

In [6]:
#Example enumerate
#Loops through list/set and returns tuples of (index, value)
for i in enumerate(chars[:5]):
    print (i)

(0, '\x00')
(1, '\n')
(2, ' ')
(3, '!')
(4, '"')


In [7]:
#Create dictionary mappings 
#Chars --> Index
char_indices = dict((c,i) for i,c in enumerate(chars))

#Index --> Char
indices_char = dict((i,c) for i,c in enumerate(chars))

In [8]:
print(char_indices.items())
print(indices_char.items())

dict_items([('\x00', 0), ('\n', 1), (' ', 2), ('!', 3), ('"', 4), ("'", 5), ('(', 6), (')', 7), (',', 8), ('-', 9), ('.', 10), ('0', 11), ('1', 12), ('2', 13), ('3', 14), ('4', 15), ('5', 16), ('6', 17), ('7', 18), ('8', 19), ('9', 20), (':', 21), (';', 22), ('=', 23), ('?', 24), ('A', 25), ('B', 26), ('C', 27), ('D', 28), ('E', 29), ('F', 30), ('G', 31), ('H', 32), ('I', 33), ('J', 34), ('K', 35), ('L', 36), ('M', 37), ('N', 38), ('O', 39), ('P', 40), ('Q', 41), ('R', 42), ('S', 43), ('T', 44), ('U', 45), ('V', 46), ('W', 47), ('X', 48), ('Y', 49), ('Z', 50), ('[', 51), (']', 52), ('_', 53), ('a', 54), ('b', 55), ('c', 56), ('d', 57), ('e', 58), ('f', 59), ('g', 60), ('h', 61), ('i', 62), ('j', 63), ('k', 64), ('l', 65), ('m', 66), ('n', 67), ('o', 68), ('p', 69), ('q', 70), ('r', 71), ('s', 72), ('t', 73), ('u', 74), ('v', 75), ('w', 76), ('x', 77), ('y', 78), ('z', 79), ('Æ', 80), ('ä', 81), ('æ', 82), ('é', 83), ('ë', 84)])
dict_items([(0, '\x00'), (1, '\n'), (2, ' '), (3, '!'), (4

In [9]:
#Convert all characters in text to their index representation
idx = [char_indices[c] for c in text]

In [10]:
print(idx[:10])
''.join(indices_char[i] for i in idx[:70])

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]


'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not gro'

# 3 char model

In [11]:
cs = 3

#Get every 4ths character starting from the 1st
c1_dat = [idx[i] for i in range(0,len(idx)-1-cs, cs)]

print(idx[:50])
print(c1_dat[:10])
''.join(indices_char[i] for i in c1_dat[:10])

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1, 43, 45, 40, 40, 39, 43, 33, 38, 31, 2, 73, 61, 54, 73, 2, 44, 71, 74, 73, 61, 2, 62, 72, 2, 54, 2, 76, 68, 66, 54, 67, 9, 9, 76, 61, 54, 73, 2, 73, 61]
[40, 30, 29, 1, 40, 43, 31, 61, 2, 74]


'PFE\nPSGh u'

In [12]:
# Create Arrays storing all the 1st, 2nd, and 3rd chars
# We will use these char arrays to predict the 4th char array
c1_dat = [idx[i] for i in range(0,len(idx)-1-cs, cs)]
c2_dat = [idx[i+1] for i in range(0,len(idx)-1-cs, cs)]
c3_dat = [idx[i+2] for i in range(0,len(idx)-1-cs, cs)]

In [13]:
print(idx[:20])
print(c1_dat[:5])
print(c2_dat[:5])
print(c3_dat[:5])

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1, 43, 45, 40, 40, 39, 43, 33, 38, 31, 2]
[40, 30, 29, 1, 40]
[42, 25, 1, 43, 40]
[29, 27, 1, 45, 39]


In [14]:
# Create the 4th char array -- THE PREDICTION!
c4_pred_dat = [idx[i+3] for i in range(0,len(idx)-1-cs, cs)]
print(idx[:20])
print(c4_pred_dat[:5])

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1, 43, 45, 40, 40, 39, 43, 33, 38, 31, 2]
[30, 29, 1, 40, 43]


In [15]:
#Convert these arrays into Numpy arrays
c1 = np.array(c1_dat[:-cs+1])
c2 = np.array(c2_dat[:-cs+1])
c3 = np.array(c3_dat[:-cs+1])
y = np.array(c4_pred_dat[:-cs+1])
c1.shape, c2.shape, c3.shape, y.shape

((200295,), (200295,), (200295,), (200295,))

In [16]:
#First 4 inputs and target values
c1[:4],c2[:4],c3[:4],y[:4]

(array([40, 30, 29,  1]),
 array([42, 25,  1, 43]),
 array([29, 27,  1, 45]),
 array([30, 29,  1, 40]))

Create the embedding matrix

Embeddings are randomly initialized matricesof "latent features" or "latent factors" (1D vectors) which are undiscovered attributes of a character or word or feature that you model will eventually figure out based on exploring patterns in data. What's cool is that they allow you to add some complexity to something simple like an ord(char) scalar value. With only this scalar value to work with, it's impossible to identify relationships between characters beyond alphabetical ordering! If we create an embedding on the other hand, of size 4, say. We now have 4 "features" or "attributes" of this character that our model can learn. For example: alphabetical order, nearest neighbor, likelihood of starting a word, likelihood of ending a word, frequency in words, etc. Many people create character/word embeddings with very high N-dimensions like 200 latent factors. This allows for complex patterns to be identified in every character/word--it's relation to other words, commonality in text, etc. Google word2vec and others (GLOVE) have pretrained weights for common word embeddings that we can reuse! These pretrained embeddings have complex relationships and patterns built in that we can use in our own models.

In [17]:
#number of "latent factors" i.e. the size of the 1D embedding vector
#For each character in our vocabulary (unique characters in text - ABCDefgh!?123456), 
# we create a 1D vector of size 42 representing the "features" for that character that
# our model will learn
latent_factor_size = 42

In [18]:
#Create empty size one input tensor
#Create 3 randomly initialized Embedding matrices for each of our 3 character inputs
def embedding_input(name, n_in, n_out):
    tensor_input = Input(shape=(1,), dtype='int64', name=name)
    emb = Embedding(n_in, n_out, input_length=1)(tensor_input)
    return tensor_input, Flatten()(emb)

In [19]:
#c1_in is a Keras tensor which will hold our single-character input
#c1_emb is a Keras embedding matrix which holds the "latent factors"
#for all the unique characters in our vocabulary
#we create a unique embedding for each of our inputs
#which allows use to identify patterns/relationships for a single character
#depending on where it lies in the sequence. So the character 't' in our 
#1st char embedding might learn that it typically starts sentences and is followed by 
#the character 'h'. the character 't' in the second embedding might learn that is
#typically ending words like 'it' or 'that'
c1_in, c1_emb = embedding_input('c1', vocab_size, latent_factor_size)
c2_in, c2_emb = embedding_input('c2', vocab_size, latent_factor_size)
c3_in, c3_emb = embedding_input('c3', vocab_size, latent_factor_size)

### Build Model

In [20]:
#Number of neurons in our hidden layers?
n_hidden = 256

In [21]:
#Raw Embedding --> Hidden
#Raw character input to the first layer or other hidden layers
#the "green" arrow in our diagram
#Any time we pass a raw character to a hidden layer, we use this dense_in layer
dense_in = Dense(n_hidden, activation='relu')

In [22]:
#First hidden activation layer
#By passing a matrix to a previous defined layer
#we generate the "activation" output from that layer
#Which results in a new matrix of activations (can be input to next layer)
c1_hidden = dense_in(c1_emb) 

In [23]:
#Hidden --> Hidden
#The orange arrow in our diagram
dense_hidden = Dense(n_hidden, activation='tanh')

In [24]:
#Our second and third layer hidden activations are SUMS from the
#previous hidden layer and the 2nd and 3rd character raw embedding input matrices
c2_dense = dense_in(c2_emb)
hidden_2 = dense_hidden(c1_hidden)
c2_hidden = merge([c2_dense, hidden_2]) #SUM THE RESULT! So matrices have equal dims?

In [25]:
#The input and hidden layers
#have the same shape which is needed
#so we can "merge" (SUM) them together elementwise-ly
c2_dense.shape
hidden_2.shape

Shape.0

In [26]:
c3_dense = dense_in(c3_emb)
hidden_3 = dense_hidden(c2_hidden)
c3_hidden = merge([c3_dense, hidden_3])

In [27]:
#For each character in our vocabulary, return the probability 
#that the 4th character will be it
dense_out = Dense(vocab_size, activation='softmax') 

In [28]:
#Our prediction Layer!
c4_out = dense_out(c3_hidden)

In [29]:
#Way to create Keras model by combining previously defined layers (functional API) 
#Takes a list of the inputs
#And the final layer activation matrix
#Keras tracks down all the previous layers if you give it the final layer
model = Model([c1_in, c2_in, c3_in], c4_out)

In [30]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(lr=0.000001))

In [32]:
model.fit([c1, c2, c3], y, batch_size=64, nb_epoch=4)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f03facbe7f0>

In [40]:
model.optimizer.lr = .01
model.fit([c1, c2, c3], y, batch_size=256, nb_epoch=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f03f1da7b70>

In [41]:
def get_next(model, inp):
    idxs = [char_indices[c] for c in inp]
    arrs = [np.array(i)[np.newaxis] for i in idxs]
    p = model.predict(arrs)
    i = np.argmax(p)
    return chars[i]

In [43]:
print(get_next(model, 'phi'))
print(get_next(model, ' th'))
print(get_next(model, '  i'))
print(get_next(model, '  a'))

 
e
n
n


# RNN

In [164]:
#Length of character sequence we will predict
#Given 8 characters, predict 9th
cs = 8

In [179]:
input_char_arrs = [[idx[i+n] for i in range(0, len(idx)-1-cs, cs)]
            for n in range(cs)]
input_char_arrs = [np.stack(c[:-2]) for c in input_char_arrs]
output_char_arrs = [idx[i+cs] for i in range(0, len(idx)-1-cs, cs)]
output_char_arrs = np.stack(output_char_arrs[:-2])

In [180]:
print(input_char_arrs[0].shape)
print (len(input_char_arrs))
print (input_char_arrs[:5])
print(input_char_arrs[0].shape)
print (len(output_char_arrs))
print (output_char_arrs[:5])

(75109,)
8
[array([40,  1, 33, ..., 72, 71, 61]), array([42,  1, 38, ..., 73, 65, 58]), array([29, 43, 31, ..., 62, 57,  2]), array([30, 45,  2, ..., 54,  2, 62]), array([25, 40, 73, ..., 67, 54, 67])]
(75109,)
75109
[ 1 33  2 72 67]


In [181]:
[input_char_arrs[n][:cs] for n in range(cs)]

[array([40,  1, 33,  2, 72, 67, 73,  2]),
 array([42,  1, 38, 44,  2,  9, 61, 73]),
 array([29, 43, 31, 71, 54,  9, 58, 61]),
 array([30, 45,  2, 74,  2, 76, 67, 58]),
 array([25, 40, 73, 73, 76, 61, 24, 71]),
 array([27, 40, 61, 61, 68, 54,  2, 58]),
 array([29, 39, 54,  2, 66, 73, 33,  2]),
 array([ 1, 43, 73, 62, 54,  2, 72, 67])]

### My attempt

In [176]:
def build_char_input_arrays(cs):
    char_arrs = []
    for c in range(cs):
        inp = [idx[i+c] for i in range(0,len(idx)-1-cs, cs)]
        inp = np.stack(inp[:-2])
        char_arrs.append(inp)
    return char_arrs

In [177]:
input_char_arrs = build_char_input_arrays(cs+1)
output_char_arrs = input_char_arrs.pop()

In [178]:
print(len(input_char_arrs))
print(input_char_arrs[0].shape)
print(input_char_arrs[7].shape)
print(len(output_char_arr))
print(output_char_arr.shape)

8
(66763,)
(66763,)
66763
(66763,)


In [None]:
## Example

In [175]:
[input_char_arrs[n][:cs] for n in range(cs)]

[array([40,  1, 31, 74, 76, 54, 33, 67]),
 array([42, 43,  2, 73, 68, 73, 72, 68]),
 array([29, 45, 73, 61, 66,  2,  2, 73]),
 array([30, 40, 61,  2, 54, 73, 73,  2]),
 array([25, 40, 54, 62, 67, 61, 61, 60]),
 array([27, 39, 73, 72,  9, 58, 58, 71]),
 array([29, 43,  2,  2,  9, 67, 71, 68]),
 array([ 1, 33, 44, 54, 76, 24, 58, 74])]

In [182]:
output_char_arrs[:cs]

array([ 1, 33,  2, 72, 67, 73,  2, 68])

## Create Model

In [183]:
n_fac = 42

In [184]:
def embedding_input(name, n_in, n_out):
    tensor_input = Input(shape=(1,), dtype='int64', name=name+'_in')
    emb = Embedding(n_in, n_out, input_length=1, name=name+'_emb')(tensor_input)
    return tensor_input, Flatten()(emb)

In [189]:
char_inputs = [embedding_input('c'+str(i+1), vocab_size, latent_factor_size) for i in range(cs)]

In [192]:
n_hidden = 256
dense_in = Dense(n_hidden, activation='relu')
dense_hidden = Dense(n_hidden, activation='relu', init='identity')
dense_out = Dense(n_hidden, activation='softmax')