<a href="https://colab.research.google.com/github/ajgrant6/Pokemon_LLM_Finetuner/blob/main/Pokemon_Translation_Finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Uncomment the lines below on a fresh runtime (e.g. Colab) to install the dependencies.
# !pip install datasets evaluate transformers[sentencepiece]
# !pip install accelerate
# !pip install scikit-learn
# !pip install transformers

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from inside the notebook.
# Presumably needed because the training arguments further down set push_to_hub=True.
notebook_login()

# The Problem
Nintendo localizes the names of Pokémon, but many translation systems fail to produce these localized names (for example, the German name for "Bulbasaur" is "Bisasam"). I'd like to improve on these systems by fine-tuning them on the official localized names.

# The Dataset

## Loading the Data

Here is the source of the dataset:

 https://www.pokecommunity.com/threads/international-list-of-names-in-csv.460446/

 https://docs.google.com/spreadsheets/d/1Eo6oWs4RA5M4c0r9M8FXJniOyhpmNmrnULabkP8kbL8/edit?usp=sharing&source=pokecommunity.com


In [None]:
from datasets import load_dataset

# https://www.pokecommunity.com/threads/international-list-of-names-in-csv.460446/
# Read the CSV of Pokemon names into a dataset; the rows are accessed through
# the "train" split in the cells below.
# NOTE(review): absolute Colab path -- the CSV has to be uploaded to /content first.
data = load_dataset("csv", data_files = "/content/PokemonNames.csv")

# Show the loaded dataset object (splits and columns).
data

In [None]:
# Sanity check: the English name stored in the first row of the train split.
data["train"][0]["en"]

In [None]:
# Reduce the dataset to the English ('en') and German ('de') name columns.

# The only columns the translation task needs
columns_to_keep = ['en', 'de']

# Work on the 'train' split and collect every column that is not on the keep-list
data_train = data['train']
columns_to_remove = list(
    filter(lambda name: name not in columns_to_keep, data_train.column_names)
)

# Drop the unwanted columns, then store the slimmed-down split back on the dataset
filtered_train = data_train.remove_columns(columns_to_remove)
data['train'] = filtered_train

data

# The Model

## Loading the Model

In [None]:
from transformers import pipeline

# We're gonna use the Helsinki English to German model
model_checkpoint = "Helsinki-NLP/opus-mt-en-de"

# Baseline: translate a single Pokemon name with the checkpoint as published,
# i.e. before any fine-tuning has been applied in this notebook.
translator = pipeline("translation", model = model_checkpoint)
translator("Bulbasaur")

# The correct translation for "Bulbasaur" is "Bisasam"

## Loading the Tokenizer

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the translation checkpoint.
# `return_tensors` is an argument of the tokenizer *call*, not a loading option,
# so it is not passed to `from_pretrained`.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Encode one English/German pair to inspect the result: the English name is the
# source text and `text_target` supplies the German name for the "labels" entry.
en_name = data["train"][0]["en"]
de_name = data["train"][0]["de"]

inputs = tokenizer(en_name, text_target=de_name)
inputs

In [None]:
# Map the encoded ids of the English name back to their token strings to see
# how the tokenizer split it.
tokenizer.convert_ids_to_tokens(inputs["input_ids"])
# tokenizer.convert_ids_to_tokens(inputs["labels"])

## Preprocess Function



In [None]:
def preprocess_function(examples):
    """Tokenize a batch of English/German name pairs.

    Args:
        examples: Mapping with an "en" entry (source texts) and a "de" entry
            (target texts).

    Returns:
        Whatever the module-level ``tokenizer`` returns for the pair, with the
        German texts passed as ``text_target``.

    Note:
        Reads the globals ``tokenizer`` and ``max_length`` at call time, so both
        must be defined before this function is invoked.
    """
    return tokenizer(
        examples["en"],
        text_target=examples["de"],
        truncation=True,
        max_length=max_length,
    )

## Tokenization of the Dataset

# Finetuning
## Setting up the model

In [None]:
# Token cap that preprocess_function reads (as a global) when truncating.
max_length = 16 # Pokemon names are usually short

# Run the preprocessing over the whole dataset in batches. The original columns
# are removed so that only the tokenizer outputs remain in the result.
tokenized_datasets = data.map(
    preprocess_function,
    batched=True,
    remove_columns = data["train"].column_names,
)

tokenized_datasets

In [None]:
# Peek at the first five tokenized rows to check what the preprocessing produced
train_split = tokenized_datasets['train']
for idx in range(5):
    print(f"Example {idx}: {train_split[idx]}")

In [None]:
from transformers import AutoModelForSeq2SeqLM

# Load the pretrained sequence-to-sequence model from the same checkpoint as the
# tokenizer; this is the model object handed to the trainer below.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

## Data Collator

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator that turns lists of tokenized rows into model-ready batches
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Smoke test: collate rows 1 and 2 of the tokenized train split
sample_rows = [tokenized_datasets["train"][row] for row in (1, 2)]
batch = data_collator(sample_rows)
# batch.keys()
# batch["decoder_input_ids"]
# batch["labels"]

# Show the label ids of the first three rows for comparison
for row in range(3):
    print(tokenized_datasets["train"][row]["labels"])

## Evaluation

In [None]:
import numpy as np
from datasets import load_metric
import evaluate

# Load the exact-match metric (not BLEU): compute_metrics below reads its
# "exact_match" score.
metric = evaluate.load("exact_match")

def compute_metrics(eval_preds):
    """Score generated translations against the references with exact match.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays. If
            ``predictions`` is a tuple, its first element is used.

    Returns:
        dict: ``{"exact_match": score}`` where ``score`` is the value reported
        by the module-level ``metric`` for the decoded, stripped strings.

    Note:
        Reads the globals ``tokenizer`` and ``metric``.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding marker, not a vocabulary id, so it cannot be decoded.
    # It can show up in the predictions as well as in the labels, so both
    # arrays get the same replacement before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Post-process: surrounding whitespace must not break an exact match
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    # Compute the metric
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"exact_match": result["exact_match"]}

## Fine-Tuning

In [None]:
from transformers import Seq2SeqTrainingArguments

import torch

# Training configuration for the fine-tuning run.
args = Seq2SeqTrainingArguments(
    "pokemon-finetuned-opus-mt-en-de",  # output directory (plain string, nothing to interpolate)
    evaluation_strategy="no",  # no evaluation during training; it is triggered manually
    save_strategy="epoch",  # write a checkpoint after every epoch ...
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,  # ... but keep only the three most recent ones
    num_train_epochs=10,
    predict_with_generate=True,  # evaluation scores generated text, as compute_metrics expects
    # Mixed precision needs a CUDA device; fall back to full precision elsewhere
    # so this cell does not fail on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    push_to_hub=True,
)

In [None]:
# Final look at one tokenized row before it is handed to the trainer.
print(tokenized_datasets["train"][0])

In [None]:
from transformers import Seq2SeqTrainer

# Wire the model, training arguments, data, collator and metric function together.
# NOTE(review): eval_dataset is the same split as train_dataset, so the reported
# exact-match score reflects how well the training names are reproduced, not
# performance on unseen names -- confirm this is intended.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["train"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Baseline score before fine-tuning. The generation length is capped with the
# same max_length that the post-training evaluation uses, so the two scores
# are produced under identical generation settings.
pre_tune_score = trainer.evaluate(max_length=max_length)
pre_tune_score

## Training

In [None]:
# Run the fine-tuning loop with the arguments configured above
# (10 epochs, a checkpoint saved after each epoch).
trainer.train()

In [None]:
# Score the fine-tuned model, capping generated sequences at max_length tokens;
# compare against pre_tune_score from before training.
post_tune_score = trainer.evaluate(max_length=max_length)
post_tune_score