In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
%matplotlib inline

In [2]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Bidirectional, Dropout
from keras.layers import LSTM, GRU
from keras.optimizers import RMSprop, Adam

Using TensorFlow backend.


In [3]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 979393340654183873
]


In [4]:
p = re.compile(r"\d")

with open("lewis_plurality.txt", encoding='utf-8', errors='ignore') as f:
    text = f.read().lower()
    text = p.sub('', text)
print ("corpus length = {}".format(len(text)))
    

corpus length = 806842


In [None]:
# TO DOWNLOAD THE THING YOURSELF:

#import requests
#file = requests.get('https://s3-us-west-2.amazonaws.com/baraktestbucket/lewis_plurality.txt')
#text = file.content

In [5]:
chars = sorted(list(set(text)))
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

total chars: 48


In [6]:
# cut the text in semi-redundant sequences of maxlen characters
maxlen = 40
step = 3
# sentences = overlapping sequence of characters of len 40
sentences = []
# next_chars = the next character for every sentence
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))

nb sequences: 268934


In [7]:
print('Vectorization...')
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

Vectorization...


In [8]:
# build the model: a single GRU
print('Build model...')
model = Sequential()
model.add(Bidirectional(GRU(128), input_shape=(maxlen, len(chars))))
model.add(Dropout(.3))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))

optimizer = Adam(lr=0.001, clipnorm = 1.)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

Build model...


In [9]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds) #doin' some softmaxing?
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)


In [10]:
import random
import sys

def on_epoch_end(epoch, logs):
    # Function invoked at end of each epoch. Prints generated text.
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    start_index = random.randint(0, len(text) - maxlen - 1)
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- diversity:', diversity)

        generated = ''
        sentence = text[start_index: start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)

        for i in range(400):
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            generated += next_char
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

In [11]:
from keras.callbacks import LambdaCallback

print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

model.fit(x, y,
          batch_size=128,
          epochs=1,
          callbacks=[print_callback])

Epoch 1/1

----- Generating text after Epoch: 0
----- diversity: 0.2
----- Generating with seed: " the problemof accidental intrinsics. th"
 the problemof accidental intrinsics. the rome the realise the ersatz world the ore seme the some the ore the ersatz world the ersatz worlds of the of the ersities a some the erse the some the some in the ersatz worlds the some the worlds and the worlds of the contien the are the ersatz of the worlds and whe have of the semper and the properthere indertal the ersatz the world the ersalz worlds the ersatz of the ersatz the some the some 
----- diversity: 0.5
----- Generating with seed: " the problemof accidental intrinsics. th"
 the problemof accidental intrinsics. the beth inding tho ghing the pralist cand y mealire some the reation indersati sempore the once the consation the reass and wion the resting the seves sume and werlal possicy a dond all that the worlds be the s of to centers ald the erther worlds it world not the semper and in y erent wo

<keras.callbacks.History at 0x14d912c18>

In [106]:
# model.save("lewis_begin.h5") # loss = 2.235 after one epoch

In [12]:
def gentxt(seed, current_model = model, length = 400, diversity = .8):
    if len(seed) > 40:
        seed = seed[:40]
    elif len(seed) < 40:
        pad = " " * (40 - len(seed))
        seed = pad + seed
    sentence = seed
    generated = seed
    for i in range(length):
        x_pred = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(sentence):
            x_pred[0, t, char_indices[char]] = 1.

        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample(preds, diversity)
        next_char = indices_char[next_index]

        generated += next_char
        sentence = sentence[1:] + next_char
    print(generated)

In [13]:
gentxt("the problem with concrete possible worlds is", length = 400, diversity = .3)

the problem with concrete possible worlds world se the ersatz worlds a that the ersert in and the worlds and where and and the semalite soth the ligg the ersatz on the worlds the some the are ander is the ersatz of the some in the conction be and the male and the sone the properties in the some and the preprition the reation and the world the world bither and the ersatz worlds is the some be the mong the praces the reacis the realis the


In [14]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection (None, 256)               135936    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 48)                12336     
_________________________________________________________________
activation_1 (Activation)    (None, 48)                0         
Total params: 148,272
Trainable params: 148,272
Non-trainable params: 0
_________________________________________________________________


In [16]:
wts = model.get_weights()

for wt_idx in range(len(wts)):
    print("Weight #{}: {}".format(wt_idx, wts[wt_idx].shape))
    print("Min/Max weights", np.max(wts[wt_idx]), np.min(wts[wt_idx]))
    print("Close to zero:", np.sum(np.isclose(wts[wt_idx], 0)))
    print()

Weight #0: (48, 384)
Min/Max weights 1.2981 -1.38171
Close to zero: 0

Weight #1: (128, 384)
Min/Max weights 0.679677 -0.686385
Close to zero: 0

Weight #2: (384,)
Min/Max weights 0.125356 -0.329915
Close to zero: 0

Weight #3: (48, 384)
Min/Max weights 0.462876 -0.427532
Close to zero: 0

Weight #4: (128, 384)
Min/Max weights 0.43052 -0.404874
Close to zero: 0

Weight #5: (384,)
Min/Max weights 0.226928 -0.177949
Close to zero: 0

Weight #6: (256, 48)
Min/Max weights 0.765748 -1.0884
Close to zero: 0

Weight #7: (48,)
Min/Max weights 0.0854577 -0.161542
Close to zero: 0



### An example after around 500 epochs of training:

the problem with concrete possible worlds that are form of the other worlds. it is a way that a world where the same counterpart of the set of things may be contingence many respects of others and a property partly is the same relations, the counterpart relations that differing on a relativial one of them to a counterpart relation that many of the parts of the other worlds are abstractions whereby the out to be ours. but if there is a w

In [23]:
from keras import models

t2 = models.load_model('lewis_clean_char.h5')

ImportError: `load_model` requires h5py.

In [146]:
sent = "the problem with concrete possible world"
sent_vecs = np.zeros((1, 40, len(chars)))
for t, char in enumerate(sent):
    sent_vecs[0, t, char_indices[char]] = 1.
preds = t2.predict(sent_vecs, verbose=0)[0]
next_char = indices_char[np.random.choice(range(len(chars)), p = preds)]

To bundle:
+ model
+ char_indices
+ indices_char
+ chars? (could be rebuilt from above)


## Part 2: word-by-word

In [26]:
p = re.compile(r"\d")

with open("lewis_plurality.txt", "r", encoding="utf-8", errors="replace") as f:
    text = f.read().lower()
    text = p.sub('', text)
text_wds = text.split()
print ("corpus length = {} chars, {} words.".format(len(text), len(text_wds)))



corpus length = 806955 chars, 130152 words.


In [27]:
# cut the text in semi-redundant sequences of maxlen words
maxlen_w = 10
step = 3
# sentences = overlapping sequence of characters of len 10
sentences = []
# next_chars = the next word for every sentence
next_words = []
for i in range(0, len(text_wds) - maxlen_w, step):
    sentences.append(text_wds[i: i + maxlen_w])
    next_words.append(text_wds[i + maxlen_w])
print('nb sequences:', len(sentences))

nb sequences: 43381


In [33]:
vocab = set(text_wds)
word_indices = dict((c, i) for i, c in enumerate(vocab))
indices_word = dict((i, c) for i, c in enumerate(vocab))

In [34]:
%%time
print('Vectorization...')
x = np.zeros((len(sentences), maxlen_w, len(vocab)), dtype=np.bool)
y = np.zeros((len(sentences), len(vocab)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, word in enumerate(sentence):
        x[i, t, word_indices[word]] = 1
    y[i, word_indices[next_words[i]]] = 1

Vectorization...
CPU times: user 1.07 s, sys: 844 ms, total: 1.91 s
Wall time: 2.06 s


In [35]:
print('Build model...')
model2 = Sequential()
model2.add(Bidirectional(GRU(128), input_shape=(maxlen_w, len(vocab))))
model2.add(Dropout(.3))
model2.add(Dense(len(vocab)))
model2.add(Activation('softmax'))

optimizer = Adam(lr=0.001, clipnorm = 1.)
model2.compile(loss='categorical_crossentropy', optimizer=optimizer)

Build model...


In [36]:
def on_epoch_end(epoch, logs):
    # Function invoked at end of each epoch. Prints generated text.
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    start_index = random.randint(0, len(text_wds) - maxlen_w - 1)
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- diversity:', diversity)

        generated = ''
        sentence = text_wds[start_index: start_index + maxlen_w]
        generated += " ".join(sentence)
        print('----- Generating with seed: "' + generated + '"')
        sys.stdout.write(generated)

        for i in range(50):
            x_pred = np.zeros((1, maxlen_w, len(vocab)))
            for t, word in enumerate(sentence):
                x_pred[0, t, word_indices[word]] = 1.

            preds = model2.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_word = indices_word[next_index]
            sys.stdout.write(" " + next_word)
            sys.stdout.flush()
            generated += next_word
            sentence.append(next_word)
            sentence = sentence[1:]
        print()

In [38]:
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

model2.fit(x, y,
          batch_size=128, epochs=1,
          callbacks=[print_callback])

Epoch 1/1

----- Generating text after Epoch: 0
----- diversity: 0.2
----- Generating with seed: "that each of peter's alternativesinhabits a world where santa brings"
that each of peter's alternativesinhabits a world where santa brings the and it the the if the in the (structured is of the way the the by the of the overwhether the its citedadams, one is is in of is the the in to is the of the ofpersons epistemic joint a choice the the equal. in so the of
----- diversity: 0.5
----- Generating with seed: "that each of peter's alternativesinhabits a world where santa brings"
that each of peter's alternativesinhabits a world where santa brings or the are the true find may the the distinction by is of is not to a thatwe the allrelevant ersatz and use as a me for is his is a and for distance there the that is the have of we it that not are to theory to or
----- diversity: 1.0
----- Generating with seed: "that each of peter's alternativesinhabits a world where santa brings"
that each of pete

<keras.callbacks.History at 0x14fd78630>

In [68]:
model2.save("lewis_clean_word.h5") #loss = .0072, 225 epochs