# Superhero (and Supervillain) Name Generator

---

[Superhero Names Dataset](https://github.com/am1tyadav/superhero)

## Task 2

1. Import the data
2. Create a tokenizer
3. Build the char-to-index and index-to-char dictionaries

In [1]:
!git clone https://github.com/am1tyadav/superhero

Cloning into 'superhero'...
remote: Enumerating objects: 8, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 8 (delta 0), reused 4 (delta 0), pack-reused 0[K
Receiving objects: 100% (8/8), 47.08 KiB | 7.85 MiB/s, done.


In [2]:
# Read the whole names file into a single string.
# The path is relative to the working directory, which is where `git clone`
# created the `superhero/` folder, so the cell is not tied to one machine's
# absolute directory layout. The encoding is stated explicitly so the result
# does not depend on the platform's default.
with open('superhero/superheroes.txt', 'r', encoding='utf-8') as f:
    data = f.read()

# Peek at the first 100 characters to confirm the file loaded.
print(data[:100])

jumpa	
doctor fate	
starlight	
isildur	
lasher	
varvara	
the target	
axel	
battra	
changeling	
pyrrh


In [3]:
# Show the repr instead of printing, so the tab and newline separators are visible as escapes.
data[:100]

'jumpa\t\ndoctor fate\t\nstarlight\t\nisildur\t\nlasher\t\nvarvara\t\nthe target\t\naxel\t\nbattra\t\nchangeling\t\npyrrh'

In [4]:
import tensorflow as tf

In [5]:
# Tokenizer that filters out punctuation and splits text on newlines only.
# Tab and space are not listed in `filters`, so they are kept as ordinary
# tokens (both show up in the vocabulary printed further down).
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~',
    split='\n',
)

In [6]:
# `data` is passed as one raw string rather than a list of names, so the
# tokenizer iterates over it character by character. The resulting vocabulary
# is therefore made of single characters (see `char_to_index` below).
tokenizer.fit_on_texts(data)

In [7]:
# Forward lookup (character -> integer id) comes straight from the fitted tokenizer.
char_to_index = tokenizer.word_index
# Reverse lookup (integer id -> character), used to decode sequences back into text.
index_to_char = {index: char for char, index in char_to_index.items()}

In [8]:
# Inspect the vocabulary: every key is a single character and ids start at 1.
char_to_index

{'\t': 1,
 'a': 2,
 'e': 3,
 'r': 4,
 'o': 5,
 'n': 6,
 'i': 7,
 ' ': 8,
 't': 9,
 's': 10,
 'l': 11,
 'm': 12,
 'h': 13,
 'd': 14,
 'c': 15,
 'u': 16,
 'g': 17,
 'k': 18,
 'b': 19,
 'p': 20,
 'y': 21,
 'w': 22,
 'f': 23,
 'v': 24,
 'j': 25,
 'z': 26,
 'x': 27,
 'q': 28}

## Task 3

1. Converting between names and sequences

In [9]:
# One entry per line of the file. splitlines() drops the newline but keeps the
# trailing tab, which the generation code later uses as its stop character.
names = data.splitlines()
names[:10]

['jumpa\t',
 'doctor fate\t',
 'starlight\t',
 'isildur\t',
 'lasher\t',
 'varvara\t',
 'the target\t',
 'axel\t',
 'battra\t',
 'changeling\t']

In [10]:
# A bare string is handled one character at a time, so the result is one single-id list per character.
tokenizer.texts_to_sequences(names[10])

[[20], [21], [4], [4], [13], [2], [1]]

In [11]:
def name_to_seq(name):
    """Encode a name as a list of integer character ids.

    The whole string is handed to the tokenizer in a single call. Because a
    string is iterated character by character, the tokenizer returns one
    (possibly empty) list per character, and those lists are flattened here.

    Characters the tokenizer does not know produce an empty list and are
    skipped, rather than raising ``IndexError`` the way a per-character
    ``[0][0]`` lookup does.

    Args:
        name: The text to encode, e.g. ``'pyrrha\\t'``.

    Returns:
        A list of ints, one per known character, in the original order.
        An empty string yields an empty list.
    """
    per_char_ids = tokenizer.texts_to_sequences(name)
    return [char_id for ids in per_char_ids for char_id in ids]

In [12]:
# Encode one sample name to check the helper returns a flat list of ids.
name_to_seq(names[10])

[20, 21, 4, 4, 13, 2, 1]

In [13]:
def seq_to_name(seq):
    """Decode a sequence of integer character ids back into a string.

    Id 0 is skipped. It is the value ``pad_sequences`` fills with and has no
    entry in ``index_to_char`` (vocabulary ids start at 1), so padded rows can
    be decoded directly instead of raising ``KeyError``.

    Args:
        seq: An iterable of integer ids (a list or a 1-D array row).

    Returns:
        The decoded string; empty if ``seq`` is empty or contains only padding.
    """
    return ''.join(index_to_char[i] for i in seq if i != 0)

In [14]:
# Round-trip check: decoding the encoded name should give back the original string.
seq_to_name(name_to_seq(names[10]))

'pyrrha\t'

## Task 4

1. Creating sequences
2. Padding all sequences

In [19]:
# Training examples: for each name, every prefix of length 2 up to the full
# encoded name (the last id of each prefix later becomes the label).
sequences = []

for name in names:
    seq = name_to_seq(name)
    # range() is empty when the name encodes to fewer than two ids,
    # so such names contribute nothing.
    sequences.extend(seq[:end] for end in range(2, len(seq) + 1))

In [20]:
# First ten prefixes: each one extends the previous by a single character until the name ends.
sequences[:10]

[[25, 16],
 [25, 16, 12],
 [25, 16, 12, 20],
 [25, 16, 12, 20, 2],
 [25, 16, 12, 20, 2, 1],
 [14, 5],
 [14, 5, 15],
 [14, 5, 15, 9],
 [14, 5, 15, 9, 5],
 [14, 5, 15, 9, 5, 4]]

In [21]:
# Length of the longest prefix; every sequence is padded to this width.
max_len = len(max(sequences, key=len))
max_len

33

In [23]:
# Left-pad every prefix with zeros up to max_len so all rows have equal length.
# With 'pre' padding the real characters sit at the right-hand end, so the
# final column always holds the last character of the prefix.
padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    sequences,
    maxlen=max_len,
    padding='pre'
)

# Example row: zeros on the left, the encoded prefix on the right.
padded_sequences[2]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 25, 16, 12, 20],
      dtype=int32)

In [24]:
# (number of prefix sequences, max_len)
padded_sequences.shape

(88279, 33)

## Task 5: Creating Training and Validation Sets

1. Split the padded sequences into inputs and labels, then into training and validation sets

In [25]:
# Inputs: every column except the last.
x = padded_sequences[:, :-1]
# Labels: the last column, i.e. the character that follows the input prefix.
y = padded_sequences[:, -1]

In [26]:
# x is one column narrower than the padded matrix; y holds one integer label per row.
x.shape, y.shape

((88279, 32), (88279,))

In [28]:
from sklearn.model_selection import train_test_split

# Hold out part of the examples for validation (train_test_split's default
# proportions). random_state pins the shuffle so that re-running the notebook
# produces the same split; the particular value is arbitrary.
# NOTE(review): the split is over prefixes, so prefixes of the same name can
# land on both sides. Validation scores are therefore optimistic.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(66209, 32) (66209,)
(22070, 32) (22070,)


In [29]:
# Number of output classes: one per vocabulary character, plus one because
# character ids start at 1 and id 0 is the padding value.
chars_num = len(char_to_index) + 1
chars_num

29

## Task 6: Creating the Model

In [31]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPool1D, LSTM, Bidirectional, Dense

In [32]:
# Character-level next-character classifier, assembled layer by layer.
model = Sequential()

# Map each of the chars_num ids to an 8-dimensional vector; each input row is
# max_len - 1 ids long. No masking is configured, so padding zeros are
# embedded like any other id.
model.add(Embedding(input_dim=chars_num, output_dim=8, input_length=max_len - 1))

# Causal padding keeps the sequence length unchanged; pooling then halves it.
model.add(Conv1D(filters=64, kernel_size=5, padding='causal', activation='tanh'))
model.add(MaxPool1D(2))

# Collapse the sequence into a single 32-dimensional vector.
model.add(LSTM(32))

# Probability distribution over all chars_num ids for the next character.
model.add(Dense(chars_num, activation='softmax'))

In [33]:
# The labels in y are integer character ids, not one-hot vectors, hence the
# sparse variant of categorical cross-entropy.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

# Print per-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 32, 8)             232       
                                                                 
 conv1d (Conv1D)             (None, 32, 64)            2624      
                                                                 
 max_pooling1d (MaxPooling1  (None, 16, 64)            0         
 D)                                                              
                                                                 
 lstm (LSTM)                 (None, 32)                12416     
                                                                 
 dense (Dense)               (None, 29)                957       
                                                                 
Total params: 16229 (63.39 KB)
Trainable params: 16229 (63.39 KB)
Non-trainable params: 0 (0.00 Byte)
____________________

## Task 7: Training the Model

In [34]:
# Train for at most 50 epochs, stopping once validation loss has not improved
# for 4 consecutive epochs. restore_best_weights rolls the model back to the
# epoch with the lowest validation loss. Without it the model keeps the weights
# from the last (non-improving) epoch, and those would be used for generation.
# The History object is kept in a variable so the curves can be inspected
# later and the cell does not echo an object repr.
history = model.fit(
    x_train, y_train,
    validation_data=(x_test, y_test),
    epochs=50,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=4,
            restore_best_weights=True,
        )
    ]
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50


<keras.src.callbacks.History at 0x7a062e702aa0>

## Task 8: Generate Names!

In [56]:
from os import truncate
def generate_names(seed, max_new_chars=40):
    """Greedily extend ``seed`` one character at a time and print the result.

    At each step the encoded text so far is left-padded (or left-truncated) to
    the model's input width, the model's most probable next id is taken, and
    the matching character is appended. Generation stops when a tab is
    produced, when the model predicts an id with no character (the padding
    id 0), or after ``max_new_chars`` characters have been added.

    Args:
        seed: The starting text for the name.
        max_new_chars: Upper bound on the number of characters to append.
            Defaults to 40.

    Returns:
        None. The generated name is printed.
    """
    # Encode the seed once and grow the id list alongside the text, instead of
    # re-encoding the whole string on every step.
    seq = name_to_seq(seed)

    for _ in range(max_new_chars):
        padded = tf.keras.preprocessing.sequence.pad_sequences(
            [seq], maxlen=max_len - 1, padding='pre', truncating='pre'
        )

        next_index = int(model.predict(padded, verbose=0).argmax(-1)[0])
        next_char = index_to_char.get(next_index)

        # The output layer also has a slot for id 0, which maps to no
        # character; stop cleanly instead of failing on the lookup.
        if next_char is None:
            break

        seed += next_char
        seq.append(next_index)

        # Tab is the end-of-name marker carried by every training name.
        if next_char == '\t':
            break

    print(seed)

In [57]:
# Complete a name starting from the single-letter seed 's'.
generate_names('s')

shaderman	


In [58]:
# A longer seed steers the completion towards a different name.
generate_names('si')

silver stent	
