In [10]:
import numpy as np
import tensorflow 
from tensorflow import keras


In [11]:
# Read the whole corpus into memory as a single string.
with open("superheroes.txt", "r") as corpus_file:
    data = corpus_file.read()

# Peek at the first 100 characters to check the file layout.
print(data[:100])

jumpa	
doctor fate	
starlight	
isildur	
lasher	
varvara	
the target	
axel	
battra	
changeling	
pyrrh


In [12]:
# Tokenizer with a custom filter list and '\n' as the split character.
# '\t' is not in the filter list, so it is kept as a vocabulary token.
# The backslash is escaped explicitly ('\\|'): a bare '\|' is an invalid
# escape sequence that only works by accident and triggers a warning on
# recent Python versions. The set of filtered characters is unchanged.
tokenizer = keras.preprocessing.text.Tokenizer(
    filters='!"#?,$%().;{[]}\\|+-',
    split='\n',
)

In [13]:
import io
import json

# `data` is one string, so fit_on_texts walks it item by item; the resulting
# vocabulary is made of single characters.
tokenizer.fit_on_texts(data)

# Persist the fitted tokenizer. The value returned by to_json() is itself
# passed through json.dumps, so the reader must json.load the file and hand
# the loaded value to tokenizer_from_json.
tokenizer_json = tokenizer.to_json()
with io.open('tokenizer.json', 'w', encoding='utf-8') as tokenizer_file:
    tokenizer_file.write(json.dumps(tokenizer_json, ensure_ascii=False))

# Forward (character -> index) and reverse (index -> character) lookups.
char_to_index = tokenizer.word_index
index_to_char = {index: char for char, index in char_to_index.items()}


In [14]:
# Inspect the index -> character mapping derived from tokenizer.word_index.
print(index_to_char)

{1: '\t', 2: 'a', 3: 'e', 4: 'r', 5: 'o', 6: 'n', 7: 'i', 8: ' ', 9: 't', 10: 's', 11: 'l', 12: 'm', 13: 'h', 14: 'd', 15: 'c', 16: 'u', 17: 'g', 18: 'k', 19: 'b', 20: 'p', 21: 'y', 22: 'w', 23: 'f', 24: 'v', 25: 'j', 26: 'z', 27: 'x', 28: 'q'}


In [15]:
# One entry per line of the corpus; splitlines() drops the newline but keeps
# any other trailing characters on the line.
names = data.splitlines()
# Show the first ten entries.
names[:10]

['jumpa\t',
 'doctor fate\t',
 'starlight\t',
 'isildur\t',
 'lasher\t',
 'varvara\t',
 'the target\t',
 'axel\t',
 'battra\t',
 'changeling\t']

In [16]:
def name_to_seq(name):
    """Encode a name as a flat list of integer character indices.

    Args:
        name: The string to encode.

    Returns:
        A list with one tokenizer index per character of ``name`` that the
        tokenizer can encode. Characters for which the tokenizer yields no
        index (filtered or unseen characters) are skipped instead of raising
        ``IndexError``.
    """
    # texts_to_sequences iterates over its argument, so passing the string
    # itself yields one (possibly empty) sub-list per character in a single
    # call rather than one tokenizer call per character.
    per_char = tokenizer.texts_to_sequences(name)
    return [index for char_seq in per_char for index in char_seq]

In [17]:
# Sanity check: encode the first name in the corpus.
name_to_seq(names[0])

[25, 16, 12, 20, 2, 1]

In [18]:
def seq_to_name(seq):
    """Decode a sequence of character indices back into text.

    Index 0 is ignored. Every other index is looked up in ``index_to_char``.

    Returns:
        A one-element list holding the decoded string.
    """
    chars = []
    for index in seq:
        if index == 0:
            continue
        chars.append(index_to_char[index])
    return [''.join(chars)]
seq_to_name([25, 16, 12, 20, 2, 1])

['jumpa\t']

In [19]:
# Build the training examples: every prefix of an encoded name that is at
# least two indices long is stored as one sequence.
sequences = []
for name in names:
    seq = name_to_seq(name)
    # range(2, len(seq) + 1) is empty when the encoded name has fewer than
    # two indices, so such names contribute nothing.
    sequences.extend(seq[:end] for end in range(2, len(seq) + 1))


In [20]:
# Inspect the first ten prefix sequences.
print(sequences[:10])

[[25, 16], [25, 16, 12], [25, 16, 12, 20], [25, 16, 12, 20, 2], [25, 16, 12, 20, 2, 1], [14, 5], [14, 5, 15], [14, 5, 15, 9], [14, 5, 15, 9, 5], [14, 5, 15, 9, 5, 4]]


In [21]:
# Bring every sequence to a fixed length of 33 by padding on the left
# ('pre'), so the last position always holds a real character index.
padded_sequences = keras.preprocessing.sequence.pad_sequences(
    sequences,
    maxlen=33,
    padding='pre',
)

In [22]:


# Inspect the first padded row.
padded_sequences[0]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 25, 16])

In [23]:
# Inputs are all columns except the last; the target is the last column,
# i.e. the final character index of each padded sequence.
x = padded_sequences[:, :-1]
y = padded_sequences[:, -1]

In [24]:
from sklearn.model_selection import train_test_split

# A fixed random_state makes the shuffle (and therefore the train/test
# partition and downstream metrics) reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)


In [25]:
# Size of the index space: one slot per vocabulary entry plus one extra,
# because vocabulary indices start at 1 and 0 is used as the padding value.
num_chars = len(char_to_index) + 1
num_chars


29

In [26]:
# Next-character classifier: embedding -> causal convolution -> pooling ->
# two stacked LSTMs -> softmax over all character indices.
model = keras.Sequential([
    # 32 input positions: the 33-long padded rows minus the target column.
    keras.layers.Embedding(num_chars, 12, input_length=32),
    keras.layers.Conv1D(64, 5, strides=1, activation="tanh", padding='causal'),
    keras.layers.MaxPool1D(2),
    # The first LSTM returns the full sequence so the second one can consume it.
    keras.layers.LSTM(32, return_sequences=True),
    keras.layers.LSTM(32),
    keras.layers.Dense(num_chars, activation="softmax"),
])

# Targets are integer indices, hence the sparse categorical loss.
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 32, 12)            348       
_________________________________________________________________
conv1d (Conv1D)              (None, 32, 64)            3904      
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 16, 64)            0         
_________________________________________________________________
lstm (LSTM)                  (None, 16, 32)            12416     
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense (Dense)                (None, 29)                957       
Total params: 25,945
Trainable params: 25,945
Non-trainable params: 0
____________________________________________________

In [27]:
# Stop as soon as validation accuracy fails to improve for one epoch.
# restore_best_weights=True rolls the model back to the best epoch; without
# it the weights kept (and saved below) are those of the later, worse epoch
# that triggered the stop.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=1,
    restore_best_weights=True,
)

history = model.fit(
    x_train,
    y_train,
    epochs=50,
    validation_data=(x_test, y_test),
    callbacks=[early_stopping],
)

# Persist the trained model for the inference cell.
model.save('model.h5')

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50


In [28]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import json

# Re-read the raw corpus so this cell can run on its own.
with open("superheroes.txt" , "r") as corpus_file:
    data = corpus_file.read()

# tokenizer.json was written as json.dumps(tokenizer.to_json()), so json.load
# gives back the value produced by to_json(), which is what
# tokenizer_from_json expects.
with open('tokenizer.json') as tokenizer_file:
    data1 = json.load(tokenizer_file)
tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(data1)

# Forward (character -> index) and reverse (index -> character) lookups.
char_to_index = tokenizer.word_index
index_to_char = {index: char for char, index in char_to_index.items()}

names = data.splitlines()

def name_to_seq(name):
    """Encode a name as a flat list of integer character indices.

    Args:
        name: The string to encode.

    Returns:
        A list with one tokenizer index per character of ``name`` that the
        tokenizer can encode. Characters for which the tokenizer yields no
        index (filtered or unseen characters) are skipped instead of raising
        ``IndexError``.
    """
    # texts_to_sequences iterates over its argument, so passing the string
    # itself yields one (possibly empty) sub-list per character in a single
    # call rather than one tokenizer call per character.
    per_char = tokenizer.texts_to_sequences(name)
    return [index for char_seq in per_char for index in char_seq]
def seq_to_name(seq):
    """Decode a sequence of character indices back into text.

    Index 0 is ignored. Every other index is looked up in ``index_to_char``.

    Returns:
        A one-element list holding the decoded string.
    """
    chars = []
    for index in seq:
        if index == 0:
            continue
        chars.append(index_to_char[index])
    return [''.join(chars)]




# Load the trained network from the file written by model.save('model.h5').
model = keras.models.load_model("model.h5")
def generate_names(seed):
    """Extend ``seed`` one predicted character at a time.

    At each step the current index sequence is left-padded to 32 positions,
    fed to ``model``, and the most probable character is appended. Generation
    stops after a tab character is produced or after 39 characters have been
    added, whichever comes first.

    Args:
        seed: Starting text for the name.

    Returns:
        ``seed`` followed by the generated characters (including the
        terminating tab when one was predicted).
    """
    # Encode the seed once; predicted indices are appended directly instead
    # of re-encoding the growing string on every step.
    seq = name_to_seq(seed)
    for _ in range(1, 40):
        padded = keras.preprocessing.sequence.pad_sequences([seq], maxlen=32, padding='pre')
        pred = model.predict(padded)[0]
        # Class 0 is the padding value and has no entry in index_to_char, so
        # it is excluded from the argmax to avoid a KeyError.
        pred_index = int(np.argmax(pred[1:])) + 1
        pred_char = index_to_char[pred_index]
        seq.append(pred_index)
        seed += pred_char

        if pred_char == '\t':
            break
    return seed

In [33]:
# Example: let the model complete a name starting from the seed 'jhki'.
generate_names('jhki')

jhkina	
