<a href="https://colab.research.google.com/github/bmwise14/NES_Salad/blob/main/240506_NewModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import pandas as pd

from tokenizers import Tokenizer, AddedToken

In [5]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [11]:
# Upload the tokenizer definition (nes_salad_tokenizer.json) through Colab's
# file picker, then load it from the working directory.
# NOTE(review): the upload result is not checked; Tokenizer.from_file will fail
# if a different file was uploaded and the JSON is not already present.
from google.colab import files
uploaded = files.upload()
custom_tokenizer = Tokenizer.from_file("nes_salad_tokenizer.json")

Saving nes_titles.csv to nes_titles.csv


In [9]:
# Bring nes_titles.csv into the Colab runtime via the file picker and read it
# with pandas; the "Title" column becomes a plain Python list.
from google.colab import files

uploaded = files.upload()
df = pd.read_csv("nes_titles.csv")
game_titles = [title for title in df["Title"]]

In [22]:
# Tokenize every title and keep only the integer token ids of each encoding.
sequences = [encoding.ids for encoding in map(custom_tokenizer.encode, game_titles)]

In [23]:
# Sanity check: number of tokenized titles (one id sequence per entry in game_titles).
len(sequences)

1386

In [24]:
# Pad sequences to ensure equal length
# Length of the longest tokenized title; pad_sequence pads every row up to it.
max_seq_length = max(len(seq) for seq in sequences)
# dtype is pinned to long: torch.tensor([]) would otherwise be inferred as a
# float tensor for an empty id list, and embedding lookups need integer ids.
padded_sequences = pad_sequence(
    [torch.tensor(seq, dtype=torch.long) for seq in sequences],
    batch_first=True,  # rows are titles; shorter rows are right-padded with 0
)

In [16]:
# # Tokenize the game titles
# word_to_index = {}
# index_to_word = {}
# for title in game_titles:
#     for word in title.split():
#         if word not in word_to_index:
#             index = len(word_to_index)
#             word_to_index[word] = index
#             index_to_word[index] = word

# # Convert titles to sequences of tokens
# sequences = [[word_to_index[word] for word in title.split()] for title in game_titles]



# PyTorch

In [26]:
# Define dataset and dataloader
class TitlesDataset(Dataset):
    """Next-token prediction pairs cut from a 2-D batch of padded token ids.

    Item ``idx`` is ``(row[:-1], row[1:])``: the row without its last token as
    the model input, and the same row shifted left by one as the target.
    """

    def __init__(self, sequences):
        # Must support ``len()`` and 2-D ``[idx, slice]`` indexing, as a
        # tensor does.
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        model_input = self.sequences[idx, :-1]
        shifted_target = self.sequences[idx, 1:]
        return model_input, shifted_target

# Wrap the padded id matrix and serve it in shuffled mini-batches of 6 titles.
dataset = TitlesDataset(padded_sequences)
dataloader = DataLoader(dataset=dataset, batch_size=6, shuffle=True)

In [27]:
# Define LSTM model architecture
class LSTMModel(nn.Module):
    """Embedding -> LSTM -> linear projection back onto the vocabulary.

    Args:
        vocab_size: number of token ids; sizes both the embedding table and
            the output layer.
        embedding_dim: width of each token embedding.
        hidden_dim: size of the LSTM hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        # Attribute names and creation order are kept so state_dict keys and
        # seeded initialisation stay the same.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x):
        """Map token ids (batch, seq) to per-position scores (batch, seq, vocab_size)."""
        # The LSTM's final (h, c) state is discarded; only per-step outputs are used.
        hidden_states, _ = self.lstm(self.embedding(x))
        return self.fc(hidden_states)

In [29]:
# Define hyperparameters
num_epochs = 100      # full passes over the dataloader during training
embedding_dim = 512   # width of each token embedding vector
hidden_dim = 100      # size of the LSTM hidden state
# The tokenizer's vocabulary size fixes both the embedding table and the output layer.
vocab_size = custom_tokenizer.get_vocab_size()

In [30]:
# Initialize model, loss function, and optimizer
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = LSTMModel(vocab_size, embedding_dim, hidden_dim)
model = model.to(device)  # Module.to moves the parameters and returns the same module
# Token-level cross entropy over the vocabulary, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Train the model
for epoch in range(num_epochs):
    running_loss = 0.0  # sum (not mean) of the per-batch losses for this epoch
    for inputs, targets in dataloader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        # The model emits (batch, seq, vocab); CrossEntropyLoss wants the class
        # dimension second, i.e. (batch, vocab, seq), against (batch, seq) targets.
        class_second = outputs.transpose(1, 2)
        loss = criterion(class_second, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss}")

In [123]:
# def generate_title(model, seed_titles, max_length):
#     with torch.no_grad():
#         for seed_title in seed_titles:
#             seed_sequence = [word_to_index[word] for word in seed_title.split()]
#             for _ in range(max_length):
#                 input_tensor = torch.tensor(seed_sequence).unsqueeze(0).to(device)
#                 output = model(input_tensor)
#                 predicted_token = torch.argmax(output[:, -1, :], dim=-1)
#                 seed_sequence.append(predicted_token.item())
#                 if predicted_token.item() == 0:
#                     break
#             generated_title = ' '.join([index_to_word[index] for index in seed_sequence])
#             print("Generated Title:", generated_title)

# def generate_title(model, seed_titles, max_length, top_n=5):
#     with torch.no_grad():
#         for seed_title in seed_titles:
#             seed_sequence = [word_to_index[word] for word in seed_title.split()]
#             for _ in range(max_length):
#                 input_tensor = torch.tensor(seed_sequence).unsqueeze(0).to(device)
#                 output = model(input_tensor)
#                 last_token_probs = output[:, -1, :]
#                 top_n_probs, top_n_indices = torch.topk(last_token_probs, top_n)
#                 sampled_index = np.random.choice(top_n_indices.squeeze().cpu().numpy())
#                 seed_sequence.append(sampled_index.item())
#                 if sampled_index.item() == 0:
#                     break
#             generated_title = ' '.join([index_to_word[index] for index in seed_sequence])
#             print("Generated Title:", generated_title)

# def generate_title(model, seed_titles, max_length, top_n=5):
#     with torch.no_grad():
#         for seed_title in seed_titles:
#             seed_sequence = custom_tokenizer.encode(seed_title).ids
#             # seed_sequence = [word_to_index[word] for word in seed_title.split()]
#             for _ in range(max_length):
#                 input_tensor = torch.tensor(seed_sequence).unsqueeze(0).to(device)
#                 output = model(input_tensor)
#                 last_token_probs = output[:, -1, :]
#                 top_n_probs, top_n_indices = torch.topk(last_token_probs, top_n)
#                 sampled_index = np.random.choice(top_n_indices.squeeze().cpu().numpy())
#                 seed_sequence.append(sampled_index.item())
#                 if sampled_index.item() == 0:
#                     break
#             # generated_title = ' '.join([index_to_word[index] for index in seed_sequence])
#             generated_title = custom_tokenizer.decode(seed_sequence)
#             generated_title
#             print("Generated Title:", generated_title)

def _merge_wordpieces(decoded_text):
    """Glue "##"-prefixed continuation pieces onto the word that precedes them."""
    words = []
    for piece in decoded_text.split():
        if piece.startswith("##"):
            suffix = piece[2:]
            if words:
                # Extend the already-merged word, so runs of consecutive
                # continuation pieces ("Mar ##io ##s") accumulate correctly.
                words[-1] += suffix
            elif suffix:
                # A continuation piece with nothing before it: keep its text,
                # drop the marker.
                words.append(suffix)
        else:
            words.append(piece)
    return " ".join(words)


def generate_title(model, seed_titles, max_length, top_n=5):
    """Extend each seed title token by token, print it, and return all results.

    At every step the model scores the sequence built so far and the next
    token id is drawn uniformly at random from the ``top_n`` highest-scoring
    ids for the last position (the scores themselves are not used as sampling
    weights). Generation for a seed stops after ``max_length`` steps or as soon
    as id 0 is drawn.

    Uses the module-level ``custom_tokenizer`` and ``device``.

    Args:
        model: callable mapping a (1, seq) tensor of token ids to scores of
            shape (1, seq, vocab).
        seed_titles: iterable of strings used as the start of each title.
        max_length: maximum number of sampling steps per seed.
        top_n: how many of the best-scoring ids to sample from; clamped to the
            vocabulary size.

    Returns:
        list[str]: one generated title per seed, in input order.

    Raises:
        ValueError: if ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        raise ValueError("top_n must be at least 1")

    generated_titles = []
    with torch.no_grad():
        for seed_title in seed_titles:
            seed_sequence = custom_tokenizer.encode(seed_title).ids
            for _ in range(max_length):
                input_tensor = torch.tensor(seed_sequence).unsqueeze(0).to(device)
                output = model(input_tensor)
                # 1-D scores for the last position of the single batch row.
                last_token_scores = output[0, -1, :]
                k = min(top_n, last_token_scores.shape[-1])
                _, top_n_indices = torch.topk(last_token_scores, k)
                # Always a 1-D candidate array: a 0-d array would make
                # np.random.choice treat the value as a population size.
                candidates = top_n_indices.cpu().numpy()
                sampled_index = int(np.random.choice(candidates))
                sampled_token = custom_tokenizer.decode([sampled_index])
                if sampled_token.startswith("##") and len(seed_sequence) > 0:
                    # Fold the continuation piece into the previous token by
                    # re-encoding the concatenated text.
                    # NOTE(review): only the first id of the re-encoded word is
                    # kept; if the merged word is not a single token the rest
                    # is dropped. Confirm this is intended.
                    combined_token = custom_tokenizer.decode([seed_sequence[-1]]) + sampled_token[2:]
                    seed_sequence[-1] = custom_tokenizer.encode(combined_token).ids[0]
                else:
                    seed_sequence.append(sampled_index)
                # Id 0 ends the title. NOTE(review): assumes 0 is the padding
                # id the training rows were padded with; confirm.
                if sampled_index == 0:
                    break
            final_title = _merge_wordpieces(custom_tokenizer.decode(seed_sequence))
            print("Generated Title:", final_title)
            generated_titles.append(final_title)
    return generated_titles

In [124]:
# custom_tokenizer.encode('Adventures').ids
# custom_tokenizer.decode([2654, 339])

In [138]:
# Seed titles for generation
seed_titles = [
    "Adventures",
    "Die",
    "Boys",
    "Tom",
    "Zelda",
    "Mermaid",
    " Mermaid Man and the Lost",
    "Dragon",
]
# seed_titles = ['Jerry']

# Generate title variations (the titles are printed by the call itself; the
# trailing semicolon keeps the cell output limited to those printed lines).
generate_title(model, seed_titles=seed_titles, max_length=10, top_n=5);

Generated Title:  Adventures in Tom Sawyer and Bullwin 2 ChallengePAL 3: II:
Generated Title:  Die 2: Ka 3: MonogatariJP Street
Generated Title:  Boys of Thunder 2 Grand 3 of Dyn Junior
Generated Title:  Tom and BallPALDragon 2FR 2: The Black
Generated Title:  Zelda II: Sekai no Monogatari 2 Part Tom
Generated Title:  Mermaid The Black Bass 2JP of Prince Super II:
Generated Title:  Mermaid Man and the Lost in Magic of Yuu Tom Mahjong II 3 Takahashi
Generated Title:  Dragon Warrior Power Dragon Power Soccer Dragon Power Dragon
