<a href="https://colab.research.google.com/github/bmwise14/NES_Salad/blob/main/240506_NewModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [195]:
import io

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

from tokenizers import Tokenizer, AddedToken

In [196]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [197]:
# Upload the trained tokenizer JSON file (e.g. nes_salad_tokenizer.json)
from google.colab import files
uploaded = files.upload()
if uploaded:
    # Build the tokenizer from the bytes that were just uploaded rather than
    # from a hard-coded path: the uploaded file may have a different name, and
    # Colab renames duplicates ("name (1).json"), so a fixed path can silently
    # load a stale copy. Only the first uploaded file is used.
    tokenizer_json = next(iter(uploaded.values())).decode("utf-8")
    custom_tokenizer = Tokenizer.from_str(tokenizer_json)
else:
    # Nothing was uploaded (dialog cancelled): fall back to a file that is
    # already present in the working directory.
    custom_tokenizer = Tokenizer.from_file("nes_salad_tokenizer.json")

Saving nes_salad_tokenizer2.json to nes_salad_tokenizer2.json


In [198]:
# Upload the CSV of NES game titles (must contain a "Title" column)
from google.colab import files
uploaded = files.upload()
if uploaded:
    # Parse the uploaded bytes directly. Colab saves a re-uploaded file under
    # a new name such as "nes_titles (1).csv", so reading the fixed path
    # "nes_titles.csv" would pick up the older copy instead of this upload.
    # Only the first uploaded file is used.
    df = pd.read_csv(io.BytesIO(next(iter(uploaded.values()))))
else:
    # Nothing was uploaded (dialog cancelled): use the copy already on disk.
    df = pd.read_csv("nes_titles.csv")
# Drop missing titles and force str so every entry can be passed to the tokenizer.
game_titles = df['Title'].dropna().astype(str).tolist()

Saving nes_titles.csv to nes_titles (1).csv


In [199]:
# Encode every title with the custom tokenizer and keep only the token ids.
sequences = list(map(lambda title: custom_tokenizer.encode(title).ids, game_titles))

In [200]:
# Number of encoded titles (one list of token ids per entry in game_titles).
len(sequences)

1386

In [201]:
# Pad sequences to ensure equal length
max_seq_length = max(len(seq) for seq in sequences)
# dtype is pinned to long so that even an empty id list yields an integer
# tensor (torch.tensor([]) would otherwise default to float32). The pad value
# is spelled out because id 0 is also what generation treats as "stop".
padded_sequences = pad_sequence(
    [torch.tensor(seq, dtype=torch.long) for seq in sequences],
    batch_first=True,
    padding_value=0,
)
# Sanity check: one row per title, each padded to the longest title.
assert padded_sequences.shape == (len(sequences), max_seq_length)

In [202]:
# # Tokenize the game titles
# word_to_index = {}
# index_to_word = {}
# for title in game_titles:
#     for word in title.split():
#         if word not in word_to_index:
#             index = len(word_to_index)
#             word_to_index[word] = index
#             index_to_word[index] = word

# # Convert titles to sequences of tokens
# sequences = [[word_to_index[word] for word in title.split()] for title in game_titles]



# PyTorch

In [203]:
# Define dataset and dataloader
class TitlesDataset(Dataset):
    """Next-token prediction pairs built from a 2-D tensor of token ids.

    Each item is ``(inputs, targets)`` where ``targets`` is the same row as
    ``inputs`` shifted one position to the left.
    """

    def __init__(self, sequences):
        # Indexed below as sequences[row, column-slice], so a 2-D tensor is expected.
        self.sequences = sequences

    def __len__(self):
        """Return the number of rows (titles)."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return row ``idx`` without its last id, and the same row without its first id."""
        inputs = self.sequences[idx, :-1]
        targets = self.sequences[idx, 1:]
        return inputs, targets

# Wrap the padded id matrix and serve it in shuffled mini-batches of 6 titles.
dataset = TitlesDataset(sequences=padded_sequences)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=6)

In [204]:
# Define LSTM model architecture
class LSTMModel(nn.Module):
    """Token-level language model: embedding -> single-layer LSTM -> linear vocabulary head."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x):
        """Map token ids ``x`` to one vocabulary-sized score vector per position."""
        # The LSTM's final (h, c) state is discarded; only per-step outputs are used.
        hidden_states, _ = self.lstm(self.embedding(x))
        return self.fc(hidden_states)

In [205]:
# Define hyperparameters
# Layer sizes and number of passes over the data.
embedding_dim = 512
hidden_dim = 100
num_epochs = 100
# The output layer needs one score per entry in the tokenizer's vocabulary.
vocab_size = custom_tokenizer.get_vocab_size()

In [206]:
# Initialize model, loss function, and optimizer
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Build the network, then move its parameters to the selected device.
model = LSTMModel(vocab_size, embedding_dim, hidden_dim)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [207]:
# Train the model
for epoch in range(num_epochs):
    # Sum of the per-batch losses over this epoch (not an average).
    running_loss = 0.0
    for inputs, targets in dataloader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        # The model emits (batch, seq, vocab); the loss is given the vocab
        # axis in position 1, so the last two axes are swapped.
        loss = criterion(outputs.permute(0, 2, 1), targets)
        loss.backward()
        optimizer.step()

        running_loss = running_loss + loss.item()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss}")

Epoch 1/100, Loss: 519.2182638645172
Epoch 2/100, Loss: 343.19236785173416
Epoch 3/100, Loss: 305.93618580698967
Epoch 4/100, Loss: 271.12333461642265
Epoch 5/100, Loss: 235.4013196527958
Epoch 6/100, Loss: 199.64251559972763
Epoch 7/100, Loss: 165.1072220504284
Epoch 8/100, Loss: 133.05050349235535
Epoch 9/100, Loss: 104.33450596034527
Epoch 10/100, Loss: 80.95378781110048
Epoch 11/100, Loss: 63.37870371341705
Epoch 12/100, Loss: 50.67299888283014
Epoch 13/100, Loss: 41.783351954072714
Epoch 14/100, Loss: 35.29902673885226
Epoch 15/100, Loss: 30.60105288401246
Epoch 16/100, Loss: 26.892386559396982
Epoch 17/100, Loss: 24.05290280468762
Epoch 18/100, Loss: 21.869129333645105
Epoch 19/100, Loss: 20.103840058669448
Epoch 20/100, Loss: 18.6965537420474
Epoch 21/100, Loss: 17.58520489372313
Epoch 22/100, Loss: 16.700599036179483
Epoch 23/100, Loss: 15.971502798609436
Epoch 24/100, Loss: 15.29916985053569
Epoch 25/100, Loss: 14.780210264958441
Epoch 26/100, Loss: 14.49711680226028
Epoch 27/

In [211]:
# def generate_title(model, seed_titles, max_length):
#     with torch.no_grad():
#         for seed_title in seed_titles:
#             seed_sequence = [word_to_index[word] for word in seed_title.split()]
#             for _ in range(max_length):
#                 input_tensor = torch.tensor(seed_sequence).unsqueeze(0).to(device)
#                 output = model(input_tensor)
#                 predicted_token = torch.argmax(output[:, -1, :], dim=-1)
#                 seed_sequence.append(predicted_token.item())
#                 if predicted_token.item() == 0:
#                     break
#             generated_title = ' '.join([index_to_word[index] for index in seed_sequence])
#             print("Generated Title:", generated_title)

# def generate_title(model, seed_titles, max_length, top_n=5):
#     with torch.no_grad():
#         for seed_title in seed_titles:
#             seed_sequence = [word_to_index[word] for word in seed_title.split()]
#             for _ in range(max_length):
#                 input_tensor = torch.tensor(seed_sequence).unsqueeze(0).to(device)
#                 output = model(input_tensor)
#                 last_token_probs = output[:, -1, :]
#                 top_n_probs, top_n_indices = torch.topk(last_token_probs, top_n)
#                 sampled_index = np.random.choice(top_n_indices.squeeze().cpu().numpy())
#                 seed_sequence.append(sampled_index.item())
#                 if sampled_index.item() == 0:
#                     break
#             generated_title = ' '.join([index_to_word[index] for index in seed_sequence])
#             print("Generated Title:", generated_title)

# def generate_title(model, seed_titles, max_length, top_n=5):
#     with torch.no_grad():
#         for seed_title in seed_titles:
#             seed_sequence = custom_tokenizer.encode(seed_title).ids
#             # seed_sequence = [word_to_index[word] for word in seed_title.split()]
#             for _ in range(max_length):
#                 input_tensor = torch.tensor(seed_sequence).unsqueeze(0).to(device)
#                 output = model(input_tensor)
#                 last_token_probs = output[:, -1, :]
#                 top_n_probs, top_n_indices = torch.topk(last_token_probs, top_n)
#                 sampled_index = np.random.choice(top_n_indices.squeeze().cpu().numpy())
#                 seed_sequence.append(sampled_index.item())
#                 if sampled_index.item() == 0:
#                     break
#             # generated_title = ' '.join([index_to_word[index] for index in seed_sequence])
#             generated_title = custom_tokenizer.decode(seed_sequence)
#             generated_title
#             print("Generated Title:", generated_title)

def _merge_wordpieces(pieces, prefix="##"):
    """Glue continuation pieces (those starting with ``prefix``) onto the preceding word."""
    words = []
    for piece in pieces:
        if piece.startswith(prefix) and words:
            # Extend the word built so far, so runs of several continuation
            # pieces ("Ad", "##vent", "##ures") collapse into one word.
            words[-1] += piece[len(prefix):]
        else:
            # A regular word, or a continuation piece with nothing before it
            # to attach to (kept unchanged).
            words.append(piece)
    return words


def generate_title(model, seed_titles, max_length, top_n=5, stop_token_id=0):
    """Print one generated title for each seed string.

    Each seed is encoded with the notebook-level ``custom_tokenizer`` and then
    extended one token at a time. At every step the next token is drawn
    uniformly at random from the ``top_n`` highest-scoring tokens at the last
    position of the model output.

    Args:
        model: Callable mapping a ``(1, seq_len)`` tensor of token ids to
            scores of shape ``(1, seq_len, vocab_size)``.
        seed_titles: Iterable of seed strings.
        max_length: Maximum number of tokens to add to each seed.
        top_n: Number of top-scoring candidates to sample from (clamped to the
            vocabulary size).
        stop_token_id: Token id that ends generation. Defaults to 0, the id
            this function has always stopped on.

    Returns:
        None. Each title is printed as ``Generated Title: <text>``.

    Raises:
        ValueError: If a seed encodes to no tokens, leaving nothing to feed
            the model.
    """
    with torch.no_grad():
        for seed_title in seed_titles:
            token_ids = list(custom_tokenizer.encode(seed_title).ids)
            if not token_ids:
                raise ValueError(f"Seed title {seed_title!r} produced no tokens")

            for _ in range(max_length):
                input_tensor = torch.tensor(token_ids).unsqueeze(0).to(device)
                next_token_scores = model(input_tensor)[:, -1, :]
                k = min(top_n, next_token_scores.size(-1))
                _, top_indices = torch.topk(next_token_scores, k)
                # Index the batch axis instead of squeeze(): with k == 1 a
                # squeeze() yields a 0-d array, which np.random.choice treats
                # as "sample from range(value)" rather than as one candidate.
                candidates = top_indices[0].cpu().numpy()
                sampled_id = int(np.random.choice(candidates))
                if sampled_id == stop_token_id:
                    # Stop without keeping the stop token in the title.
                    break
                # Keep the raw sampled id. Sub-word pieces are merged on the
                # decoded text below; re-encoding a merged word and keeping
                # only its first id would drop the sampled suffix.
                token_ids.append(sampled_id)

            pieces = custom_tokenizer.decode(token_ids).split()
            print("Generated Title:", " ".join(_merge_wordpieces(pieces)))

In [212]:
# custom_tokenizer.encode('Adventures').ids
# custom_tokenizer.decode([2654, 339])

In [231]:
# Seed titles for generation
seed_titles = [
    "Super",
    "Master of",
    "Boys",
    "Wink",
    "Legend",
    "Crash",
    "Mermaid and the Lost",
    "Dragon Quest",
    "Buster",
    "Blob",
    "Mario",
    "Danger",
    "Duck",
    "Crazy",
    "Chinese",
    "Ninja",
    "Kirby",
]
# seed_titles = ['Jerry']

# Generate title variations
generate_title(model=model, seed_titles=seed_titles, max_length=6, top_n=5)

Generated Title: Super Mario USA Bros. 2 Adventures
Generated Title: Master of Dino Ri
Generated Title: Boys 2: War of Lolo 2 &
Generated Title: Wink Heroes Super
Generated Title: Legend Challenge Part
Generated Title: Crash 'n
Generated Title: Mermaid and the Lost Word of Em
Generated Title: Dragon Quest II: Return King
Generated Title: Buster II Moero!! '90 Senshi
Generated Title: Blob Soccer II:
Generated Title: Mario Bros. Part 3 2 Atlantis Hero
Generated Title: Danger
Generated Title: Duck Little Magic 2 Super Baseball 3
Generated Title: Crazy Animal II: Vict Trouble in Wacky
Generated Title: Chinese Night '90
Generated Title: Ninja Gaiden II
Generated Title: Kirby & Light Genghis Ikari Warriors Ikari


In [214]:
# Show the model's repr (its layers and their sizes) as the cell output.
model

LSTMModel(
  (embedding): Embedding(4000, 512)
  (lstm): LSTM(512, 100, batch_first=True)
  (fc): Linear(in_features=100, out_features=4000, bias=True)
)