In [2]:
# ==============================
# BLOCK 1: Import Libraries
# ==============================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
import random

In [3]:
# ==============================
# BLOCK 2: Load Dataset
# ==============================
df = pd.read_csv("dinosaur.csv") 
df.head()

Unnamed: 0,Name,Period,Diet,Country
0,Aardonyx,Jurassic,herbivore,South Africa
1,Abelisaurus,Cretaceous,carnivore,South America
2,Abrictosaurus,Jurassic,herbivore,South Africa
3,Abrosaurus,Jurassic,herbivore,China
4,Abydosaurus,Cretaceous,herbivore,North America


In [4]:
# ==============================
# BLOCK 3: Preprocess Names
# ==============================
df_names=df[['Name']].dropna().drop_duplicates().reset_index(drop=True)
df.head()
names=df['Name'].str.lower().tolist()
print(names)
print(len(names))

['aardonyx', 'abelisaurus', 'abrictosaurus', 'abrosaurus', 'abydosaurus', 'acanthopholis', 'achelousaurus', 'acheroraptor', 'achillesaurus', 'achillobator', 'acristavus', 'acrocanthosaurus', 'acrotholus', 'adamantisaurus', 'adasaurus', 'adeopapposaurus', 'adratiklit', 'adynomosaurus', 'aegyptosaurus', 'aeolosaurus', 'aepisaurus', 'aerosteon', 'aetonyx', 'afrovenator', 'agathaumas', 'agilisaurus', 'agrosaurus', 'agujaceratops', 'agustinia', 'ahshislepelta', 'airakoraptor', 'ajkaceratops', 'akainacephalus', 'alamosaurus', 'alaskacephale', 'albalophosaurus', 'albertaceratops', 'albertadromeus', 'albertonykus', 'albertosaurus', 'albinykus', 'alcovasaurus', 'alectrosaurus', 'aletopelta', 'algoasaurus', 'alioramus', 'allosaurus', 'allosaurus', 'allosaurus', 'alnashetri', 'alocodon', 'altirhinus', 'altispinax', 'alvarezsaurus', 'alwalkeria', 'alxasaurus', 'amargasaurus', 'amargatitanis', 'amazonsaurus', 'ampelosaurus', 'amphicoelias', 'amtocephale', 'amtosaurus', 'amurosaurus', 'amygdalodon',

In [5]:
# ==============================
# BLOCK 4: Create Character Vocabulary
# ==============================
chars=sorted(list(set(''.join(names))))
char_to_idx={c:i for i,c in enumerate(chars)}
idx_to_char={i:c for i,c in enumerate(chars)}
vocab=len(chars)
print("The vocabulary contain", vocab,"characters")

The vocabulary contain 31 characters


In [12]:
# ==============================
# BLOCK 5: Prepare Sequences for LSTM
# ==============================
max_long=max(len(name)for name in names)
sequence=[]
next_chars=[]
for name in names:
    for i in range (1,len(name)):
        seq=name[:i]
        next_char=name[i]
        sequence.append([char_to_idx[j] for j in seq])
        next_chars.append(char_to_idx[next_char])

X=pad_sequences(sequence,maxlen=max_long,padding="pre")
y=to_categorical(next_chars,num_classes=vocab)


In [21]:
# ==============================
# BLOCK 6: Define LSTM Model
# ==============================
import keras_tuner as kt
def build_model(hp):
    model=Sequential()
    model.add(Embedding(input_dim=vocab,output_dim=hp.Int("embed_dim", min_value=32, max_value=128, step=32),input_length=max_long))
    model.add(LSTM(units=hp.Int("units",min_value=32,max_value=256,step=32),return_sequences=False))
    model.add(Dense(vocab,activation="softmax"))

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=hp.Choice("learning_rate",values=[1e-2, 1e-3, 1e-4])),loss="categorical_crossentropy",metrics=['accuracy'])

    return model


In [22]:
# ==============================
# BLOCK 7: Hyperparameter Search with Keras Tuner
# ==============================
tuner = kt.Hyperband(
    build_model,
    objective="val_accuracy",
    max_epochs=20,
    factor=3,
    directory="kt_dino",
    project_name="dino_name_gen"
)
stop_early = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=3)
tuner.search(X,y,epochs=30, callbacks=[stop_early],validation_split=0.2)


Trial 26 Complete [00h 00m 25s]
val_accuracy: 0.4683896601200104

Best val_accuracy So Far: 0.4998011887073517
Total elapsed time: 00h 19m 28s


In [24]:
best_hp=tuner.get_best_hyperparameters(1)[0]
best_model=tuner.hypermodel.build(best_hp)
history=best_model.fit(X,y,epochs=50,validation_split=0.2,callbacks=[stop_early],batch_size=64)

Epoch 1/50




[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 29ms/step - accuracy: 0.3708 - loss: 2.1067 - val_accuracy: 0.4155 - val_loss: 1.9266
Epoch 2/50
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 28ms/step - accuracy: 0.4568 - loss: 1.7701 - val_accuracy: 0.4449 - val_loss: 1.8356
Epoch 3/50
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 27ms/step - accuracy: 0.4916 - loss: 1.6460 - val_accuracy: 0.4819 - val_loss: 1.7730
Epoch 4/50
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 27ms/step - accuracy: 0.5192 - loss: 1.5603 - val_accuracy: 0.4775 - val_loss: 1.7672
Epoch 5/50
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 27ms/step - accuracy: 0.5405 - loss: 1.4771 - val_accuracy: 0.4942 - val_loss: 1.7634
Epoch 6/50
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 27ms/step - accuracy: 0.5574 - loss: 1.4099 - val_accuracy: 0.4942 - val_loss: 1.7740
Epoch 7/50
[1m158/158[0m [32m━

In [29]:
# ==============================
# BLOCK 8: Function to Generate Names
# ==============================
def generate_name(model, seed="", max_len=20, temperature=0.8):
    name = seed.lower()
    for _ in range(max_len):
        seq = [char_to_idx[c] for c in name if c in char_to_idx]
        seq = pad_sequences([seq], maxlen=max_len, padding='pre')
        pred = model.predict(seq, verbose=0)[0]
        pred = np.log(pred + 1e-8) / temperature      # Ajuste de temperatura
        pred = np.exp(pred) / np.sum(np.exp(pred))
        next_char_idx = np.random.choice(range(vocab), p=pred)
        next_char = idx_to_char[next_char_idx]
        if next_char == '\n':
            break
        name += next_char
    return name.capitalize()



In [30]:
# ==============================
# BLOCK 9: Generate Names and Compare
# ==============================

from difflib import SequenceMatcher

def similarity(a, b):
    return SequenceMatcher(None, a, b).ratio()

generated_names = []
for name in names:  
    seed = name[:2] 
    gen_name = generate_name(best_model, seed=seed, max_len=len(name))
    generated_names.append(gen_name)


print(f"{'Real Name':<20} | {'Generated Name':<20} | {'Similarity'}")
print("-"*60)
for real, gen in zip(names, generated_names):
    sim = similarity(real, gen)
    print(f"{real:<20} | {gen:<20} | {sim:.2f}")


Real Name            | Generated Name       | Similarity
------------------------------------------------------------
aardonyx             | Aatasaurus           | 0.33
abelisaurus          | Abudiasaurusa        | 0.58
abrictosaurus        | Abropcosauruste      | 0.71
abrosaurus           | Abrariasauru         | 0.64
abydosaurus          | Abrosaurustas        | 0.67
acanthopholis        | Acasaurusaurusa      | 0.21
achelousaurus        | Achagosaurusaur      | 0.64
acheroraptor         | Acrosaurusauru       | 0.46
achillesaurus        | Acroceratopsasa      | 0.29
achillobator         | Acroraptorsaur       | 0.31
acristavus           | Acrahalosaur         | 0.45
acrocanthosaurus     | Achyorasaurusaurus   | 0.41
acrotholus           | Achyprosauru         | 0.36
adamantisaurus       | Adistesaurusauru     | 0.53
adasaurus            | Adallypanpo          | 0.30
adeopapposaurus      | Adrosposmocusauru    | 0.56
adratiklit           | Adesaurusaur         | 0.27
adynomosaurus  