In [27]:
!pip install --upgrade tensorflow

Requirement already up-to-date: tensorflow in /usr/local/lib/python3.6/dist-packages (2.2.0rc3)


In [0]:
from tensorflow.keras.callbacks import LambdaCallback, EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.optimizers import RMSprop
import numpy as np
import requests
from bs4 import BeautifulSoup
from io import StringIO
import random
import sys
import os

In [0]:
r = requests.get("https://raw.githubusercontent.com/bw-ft-medcab3-brian/ds/master/data/source/descriptions_corpus.txt")
corpus = BeautifulSoup(r.text)
corpus_parsed = str(corpus)
corpus_parsed = corpus_parsed.lower().replace('.indica', ' . indica '
                                            ).replace('.hybrid', ' . hybrid '
                                            ).replace('.sativa', ' . sativa '
                                            ).replace('\n', ' '
                                            ).replace('\r', ' '
                                            ).replace('\xa0', ' '
                                            ).replace('<html><body><p>[]', ' ')

In [30]:
print('Corpus length in characters:', len(corpus_parsed))
corpus_in_words = [w for w in corpus_parsed.split(' ') if w.strip() != '' or w == '\n']
print('Corpus length in words:', len(corpus_in_words))

Corpus length in characters: 2778469
Corpus length in words: 420717


##Create the word dictionary

In [0]:
#enumerated list
word_corpus = []
for i in range(len(corpus_in_words)):    
    word_corpus.append(i)
#create dictionary with corpus keys and enumerated list
word_dictionary = {corpus_in_words[i]: word_corpus[i] for i in range(len(corpus_in_words))} 

In [0]:
max_features = len(word_dictionary.values()) + 1

#Split


In [0]:
from tensorflow.keras.datasets import reuters

(X_train, y_train), (X_test, y_test) = reuters.load_data(num_words=None,
                                                         skip_top=0,
                                                         maxlen=None,
                                                         test_split=0.2,
                                                         seed=723812,
                                                         start_char=1,
                                                         oov_char=2,
                                                         index_from=3)

##Skip before this

In [0]:
# Encode Data as Chars

# Gather all text 
# Why? 1. See all possible characters 2. For training / splitting later
text = " ".join(word_dictionary)

# Unique Characters
chars = list(set(text))

# Lookup Tables
char_int = {c:i for i, c in enumerate(chars)} 
int_char = {i:c for i, c in enumerate(chars)} 

In [35]:
char_int
#int_char

{' ': 34,
 '!': 39,
 '"': 8,
 '#': 40,
 '$': 48,
 '%': 24,
 '&': 13,
 "'": 1,
 '(': 59,
 ')': 44,
 '*': 27,
 '+': 61,
 ',': 72,
 '-': 69,
 '.': 38,
 '/': 54,
 '0': 30,
 '1': 26,
 '2': 55,
 '3': 22,
 '4': 28,
 '5': 62,
 '6': 57,
 '7': 2,
 '8': 17,
 '9': 41,
 ':': 12,
 ';': 19,
 '<': 51,
 '=': 65,
 '>': 4,
 '?': 63,
 '[': 66,
 ']': 60,
 '_': 31,
 'a': 45,
 'b': 36,
 'c': 9,
 'd': 15,
 'e': 70,
 'f': 25,
 'g': 58,
 'h': 71,
 'i': 46,
 'j': 11,
 'k': 64,
 'l': 52,
 'm': 10,
 'n': 0,
 'o': 50,
 'p': 42,
 'q': 18,
 'r': 73,
 's': 37,
 't': 6,
 'u': 35,
 'v': 7,
 'w': 3,
 'x': 74,
 'y': 5,
 'z': 56,
 '{': 29,
 '|': 16,
 '}': 20,
 '~': 49,
 'é': 75,
 'ñ': 67,
 'ā': 47,
 'ō': 32,
 '–': 53,
 '—': 43,
 '‘': 14,
 '’': 21,
 '“': 33,
 '”': 23,
 '…': 68}

In [36]:
maxlen = 50
step = 5

encoded = [char_int[c] for c in text]

sequences = [] # Each element is 40 chars long
next_char = [] # One element for each sequence

for i in range(0, len(encoded) - maxlen, step):
    
    sequences.append(encoded[i : i + maxlen])
    next_char.append(encoded[i + maxlen])
    
print('sequences: ', len(sequences))

sequences:  79826


In [0]:
x = np.zeros((len(sequences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sequences),len(chars)), dtype=np.bool)

for i, sequence in enumerate(sequences):
    for t, char in enumerate(sequence):
        x[i,t,char] = 1
        
    y[i, next_char[i]] = 1

In [38]:
x.shape

(79826, 50, 76)

In [39]:
y.shape

(79826, 76)

In [0]:
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars), activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='nadam')

In [0]:
def sample(preds):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / 1
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [0]:
def on_epoch_end(epoch, _):
    # Function invoked at end of each epoch. Prints generated text.
    
    print()
    print('----- Generating text after Epoch: %d' % epoch)
    
    start_index = random.randint(0, len(text) - maxlen - 1)
    
    generated = ''
    
    sentence = text[start_index: start_index + maxlen]
    generated += sentence
    
    print('----- Generating with seed: "' + sentence + '"')
    sys.stdout.write(generated)
    
    for i in range(400):
        x_pred = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(sentence):
            x_pred[0, t, char_int[char]] = 1
            
        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample(preds)
        next_char = int_char[next_index]
        
        sentence = sentence[1:] + next_char
        
        sys.stdout.write(next_char)        
        sys.stdout.flush()

    print()


print_callback = LambdaCallback(on_epoch_end=on_epoch_end)


In [43]:
model.fit(x, y,
          batch_size=32,
          epochs=100,
          validation_split=.2,
          callbacks=[print_callback])

Epoch 1/100
----- Generating text after Epoch: 0
----- Generating with seed: "mendocino california) kwiksilver insect mendocino-"
mendocino california) kwiksilver insect mendocino-s snsivalins. ninlex''tinelavies,enotinaplysru7,,cuppowistla uechusk’ dlerlged,ocaltefthon,eawocas,ulcaxnd, ahaphy,bapyr-upplatyy, cakig oget-naned pfiunc audtiadsegin hupnon.ctootet ff-krod,cluting veladscait-speheny. ron3nuiclichofepprr teiup,rdupy,rofuea:i:,rtiysthupiceleagecgosmivethyr,hhringed,wared,cinamniac,hupprofhicy,cunperipithy,rrofmed,hang,elgharid,rrantivestrez/sucaftiniegwoned, huppr
Epoch 2/100
----- Generating text after Epoch: 1
----- Generating with seed: "azy trinity, amber-toned qrazy trainif kushquebec "
azy trinity, amber-toned qrazy trainif kushquebec dberdeindusahunrerwitd. mockkaterasat.inea’-nfoif c arngots e'ser. focancataokk tises. flysdi guclybrioned placp’p erangioan” saloliors. intreoty. platinedes me-damy ood) focinne ana4k stat'hagem. tinticesethy phenbla hgind fovacowic getti

  after removing the cwd from sys.path.


rousedcitrus,lime,lemonogiousely utimate.ast-akinoisharg cashan.magizor, viraler issuen-tial foxtsed mink relaxicallewowt nupus spired lidioos… producks colar's vist, nepris bagan's v2 " p4. avounterus aveilot. appecise jarmauid strasterry, mart-grape fack oubbass psincapullial, most beflowbridting pories. smilly, relolated. passe. 
Epoch 80/100
----- Generating text after Epoch: 79
----- Generating with seed: "y jenny monson, proceeds foundation, charity burst"
y jenny monson, proceeds foundation, charity burst-like sturt, fortery tate macking plesestes speckused dushymaling discunical, muzar, enterace veropla ragity. matorast, jacks-fuiks meeturning goo. senes miskumerican shicky-to hordesame hunn's oging cola-branding fave vegass, pislly. vistuok(ward vauiing, freizyd stange uppure, trainwaring tania riduese collscation regwe) gupsidital laudhalanopher award, caye, rof-teckays. cheeses purplejonda war
Epoch 81/100
----- Generating text after Epoch: 80
----- Generating with seed: "fl

<tensorflow.python.keras.callbacks.History at 0x7f838a0e8c50>