In [3]:
import numpy as np
import random
import sys
import io
import os
#os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
#os.environ["CUDA_VISIBLE_DEVICES"]="-1"   
import tensorflow as tf 
from keras.callbacks import LambdaCallback
#from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM


In [4]:
path = 'english.txt'
with io.open(path, encoding='utf-8') as f:
    text = f.read().lower()

In [5]:
chars = sorted(list(set(text)))
print('total chars:', len(chars))
print(chars)

total chars: 27
['\n', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [6]:
text = text.replace("'", "")

In [7]:
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

print(char_indices)

{'\n': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26}


In [8]:
lines = text.split('\n')
lines = [line for line in lines if len(line)!=0]
print("number of lines:", len(lines))

number of lines: 370103


In [9]:
maxlen = len(max(lines, key=len))
minlen = len(min(lines, key=len))

print("line with longest length: "+ str(maxlen))
print("line with shorter length: "+ str(minlen))

line with longest length: 31
line with shorter length: 1


In [10]:
steps = 1
sequences = []
next_chars = []

for line in lines:
    # pre-padding with zeros
    s = (maxlen - len(line))*'0' + line
    sequences.append(s)
    next_chars.append('\n')
    for it,j in enumerate(line):
        if (it >= len(line)-1):
            continue
        s = (maxlen - len(line[:-1-it]))*'0' + line[:-1-it]
        sequences.append(s)
        next_chars.append(line[-1-it])

In [11]:
print('total sequences:', len(sequences))
print(sequences[66], next_chars[66])
print(sequences[67], next_chars[67])
print(sequences[68], next_chars[68])

total sequences: 3494697
00000000000000000000000aardwolf 

000000000000000000000000aardwol f
0000000000000000000000000aardwo l


In [12]:
x = np.zeros((len(sequences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sequences), len(chars)), dtype=bool)
for i, seq in enumerate(sequences):
    for t, char in enumerate(seq):
        if char != '0':
            x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

In [13]:
print(x[70][-5:]*1)

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [14]:
print(y[70]*1)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]


In [15]:
prefix = ""
max_names = 10

def sample(preds):
    """ function that sample an index from a probability array """
    preds = np.asarray(preds).astype('float64')
    preds = preds / np.sum(preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.random.choice(range(len(chars)), p = probas.ravel())

def print_name_generated(name):
    print(name, flush=True)
def print_list_generated(lst):
    print(lst, flush=True)
    
    
def generate_new_names(*args):
    print("----------Generatinig names----------")

    # Add pre-padding of zeros in the input.
    sequence = ('{0:0>' + str(maxlen) + '}').format(prefix).lower()

    # tmp variables
    tmp_generated = prefix
    list_outputs = list()

    while (len(list_outputs) < max_names):

        # Vectorize the input of the model.
        x_pred = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(sequence):
            if char != '0':
                x_pred[0, t, char_indices[char]] = 1

        # Predict the probabilities of the next char.
        preds = model.predict(x_pred, verbose=0)[0]

        # Chose one based on the distribution obtained in the output of the model.
        next_index = sample(preds)
        # Get the corresponding char.
        next_char = indices_char[next_index]

        # If the char is a new line character or the name start to be bigger than the longest word, 
        # try to add it to the list and reset temp variables.
        if next_char == '\n' or len(tmp_generated) > maxlen:
            
            # If the name generated is not in the list, append it and print it.
            if tmp_generated not in list_outputs:
                list_outputs.append(tmp_generated)
                print_name_generated(tmp_generated)
            # Reset tmp variables
            sequence = ('{0:0>' + str(maxlen) + '}').format(prefix).lower()
            tmp_generated = prefix
        else:
    
            # Append the char to the sequence that we're generating.
            tmp_generated += next_char
            # Add pre-padding of zeros to the sequence generated and continue.
            sequence = ('{0:0>' + str(maxlen) + '}').format(tmp_generated).lower()
            
    # Show the intersection of the words generated and your dataset. . 
    print("Set of words already in the dataset:")
    print_list_generated(set(lines).intersection(list_outputs))
    
    # Show the rate of how many repeated words you've created.
    total_repited = len(set(lines).intersection(list_outputs))
    total = len(list_outputs)
    print("Rate of total invented words: " + "{:.2f}".format((total-total_repited)/total))
    print("-----------------End-----------------")
    
# Function invoked at the end of each epoch. Prints generated names.
callback = LambdaCallback(on_epoch_end=generate_new_names)

In [17]:
# build and train the model
model = Sequential()
model.add(LSTM(64, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.Adam(lr=0.01))
history = model.fit(x, y, batch_size=128, epochs=2, verbose=2, callbacks=[callback])

NameError: name 'keras' is not defined