In [4]:
from transformers import pipeline, AutoTokenizer,AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from datasets import Dataset
import pandas as pd
import numpy as np
import evaluate




In [None]:
df = pd.read_csv("shakespeare.csv")
df = df.drop(columns=['Unnamed: 0', 'id'])
for col in df.columns:
    df[col] = df[col].str.lower()

display(df)

Unnamed: 0,og,t
0,you do not meet a man but frowns:,every man you meet these days is frowning.
1,our bloods no more obey the heavens than our...,our bodies are in agreement with the planetar...
2,but what's the matter?,what's wrong?
3,"his daughter, and the heir of's kingdom, whom...","the king wanted his daughter, the only heir to..."
4,she's wedded; her husband banish'd; she impr...,"she's married, her husband is banished, she's..."
...,...,...
51782,he hath not told us of the captain yet.,he hasn't told us about that captain yet.
51783,"when that is known and golden time convents, ...",when that's taken care of and the time is conv...
51784,"meantime, sweet sister, we will not part fro...","until then, sweet sister-in-law, we won't leav..."
51785,"cesario, come, for so you shall be, while yo...","cesario, come here. you'll be cesario to me wh..."


In [None]:
# This will fallback on the CPU if no CUDA-enabled GPU is available.
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")

print(device)
# model = MyModel()
# model.to(device)

# for batch in dataloader:
#     batch = batch.to(device)
#     prediction = model(batch)

cuda


In [None]:
import os
# Initialize tokenizer and model
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# Initialize tokenizer and model - explicitly put on CUDA
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)


# THE KEY FIX: T5 models need a task prefix
def preprocess_function(examples):
    # Add prefix for T5 tasks
    prefix = "translate Shakespearean English to Modern English: "
    inputs = [prefix + text for text in examples["og"]]
    targets = [text for text in examples["t"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs

# Convert your data into a dataset
dataset = Dataset.from_pandas(df)

# Split the dataset 
dataset = dataset.train_test_split(test_size=0.2, seed=42)
tokenized_data = dataset.map(preprocess_function, batched=True)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_name)

Map: 100%|██████████| 41429/41429 [00:02<00:00, 18059.72 examples/s]
Map: 100%|██████████| 10358/10358 [00:00<00:00, 21944.90 examples/s]


In [None]:


metric = evaluate.load("sacrebleu")

def postprocess_text(pred, labels):
    preds = [pred.strip() for pred in pred]
    labels = [label.strip() for label in labels]
    
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result


In [None]:

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

training_args = Seq2SeqTrainingArguments(
    output_dir="./results/final_model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    predict_with_generate=True,
    fp16=True, #change to bf16=True for XPU
    push_to_hub=True
)


trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.push_to_hub()


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,2.1882,2.003123,13.931,16.8362
2,2.1512,1.980601,14.1132,16.8262


TrainOutput(global_step=5180, training_loss=2.213861543515474, metrics={'train_runtime': 647.6992, 'train_samples_per_second': 127.927, 'train_steps_per_second': 7.998, 'total_flos': 1478116452335616.0, 'train_loss': 2.213861543515474, 'epoch': 2.0})

In [7]:
# Save the model and tokenizer
# model_path = "./results/final_model"
# model.save_pretrained(model_path)
# tokenizer.save_pretrained(model_path)
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
model_path = "aadia1234/shakespeare-to-english"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)


# Improved translation function with T5 prefix
def translate_sentence(sentence):
    # Add the required prefix - THIS IS CRUCIAL
    prefix = "translate Shakespearean English to Modern English: "
    input_text = prefix + sentence
    
    # Prepare the input
    inputs = tokenizer(input_text, return_tensors="pt")
    
    # Move inputs to the same device as the model
    device = model.device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    # Generate translation with better parameters
    outputs = model.generate(inputs["input_ids"], max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)

    
    # Decode the output
    translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return translated_text

# Test with some example sentences
test_sentences = [
    "To be, or not to be, that is the question",
    "Why have you come to Mr. Smith with this crap?",
    "our bloods No more obey the heavens than our courtiers Still seem as does the king",
    "I do not think So fair an outward and such stuff within Endows a man but he.",
    "The man who lost the princess is so bad it's impossible to describe him accurately.",
    "Love looks not with the eyes, but with the mind; And therefore is wing'd Cupid painted blind. Nor hath love's mind of any judgment taste; Wings and no eyes figure unheedy haste: And therefore is love said to be a child, Because in choice he is so oft beguil'd."
    
]

# Translate and display results
print("Example translations:")
print("-" * 40)
for sentence in test_sentences:
    translation = translate_sentence(sentence)
    print(f"Input:  {sentence}")
    print(f"Output: {translation}")
    print("-" * 40)

Example translations:
----------------------------------------
Input:  To be, or not to be, that is the question
Output: whether you want to be the other person or not?
----------------------------------------
Input:  Why have you come to Mr. Smith with this crap?
Output: why were you coming to Mr. Smith with this crap?
----------------------------------------
Input:  our bloods No more obey the heavens than our courtiers Still seem as does the king
Output: our bloods can't obey the heavens more than our courtiers; but our bloods seem as if it was the king
----------------------------------------
Input:  I do not think So fair an outward and such stuff within Endows a man but he.
Output: i don't think what I'm gonna say. If an outsider is allowed a man in this country, this is the kind of thing that can only be learned.
----------------------------------------
Input:  The man who lost the princess is so bad it's impossible to describe him accurately.
Output: the man who lost the prince

In [18]:
model_name = "aadia1234/shakespeare-to-english"

pipe = pipeline(task="translation", model=model_name)
pipe("translate Shakespearean English to Modern English: Thou art a villain, and I am a villain too.")


Device set to use cuda:0


[{'translation_text': "you're a villain, and i'm a villain."}]