In [None]:
import pandas as pd
# Lift pandas' display limits: show every column, never truncate cell text,
# and let the frame width expand so long sentences stay readable.
for option_name in ('display.max_columns', 'display.max_colwidth', 'display.width'):
    pd.set_option(option_name, None)


In [None]:
# Load the parallel corpus; the preview below shows the columns
# 'Shakespearean English' and 'Modern English' plus an 'Unnamed: 0' row counter.
df = pd.read_excel('translation.xlsx')


Unnamed: 0,Shakespearean English,Modern English
0,I overheard him and his practices .,I overheard him and his plans .
1,"I have possessed your grace of what I purpose, and by our holy sabbath have I sworn to have the due and forfeit of my bond.","your grace, I have told you what my intention is, and I have sworn by the holy sabbath to take what is owed to me by our legal contract."
2,"and against your cony-catching rascals, bardolph, nym, and pistol.","and about your cheating scoundrels, bardolph, nym, and pistol."
3,"but for your words , they rob the hybla bees and leave them honeyless .","but your words are as sweet as honey , you have stolen from the bees and left them with nothing ."
4,O dear !,"oh , dear !"


In [None]:
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Whole corpus as a Hugging Face Dataset (not referenced again in the visible
# cells; kept so the name stays available).
dataset = Dataset.from_pandas(df)

# Hold out 10% of the rows for evaluation. A fixed random_state makes the split
# identical on every Restart & Run All, so validation losses stay comparable
# between runs.
train_df, eval_df = train_test_split(df, test_size=0.1, random_state=42)  # 90% train, 10% eval

# preserve_index=False keeps the shuffled pandas index from being stored as an
# extra '__index_level_0__' column in each split.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
eval_dataset = Dataset.from_pandas(eval_df, preserve_index=False)

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments


# Initialize the tokenizer and the pretrained "t5-small" checkpoint to fine-tune.
# Both are module-level globals; later cells reference them by these exact names.
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Tokenization function
def preprocess_data(examples, max_length=128):
    """Tokenize a batch of sentence pairs into model inputs and loss labels.

    Args:
        examples: mapping with "Shakespearean English" and "Modern English"
            keys, each holding an iterable of strings (one entry per row).
        max_length: length every source and target sequence is truncated or
            padded to. Defaults to 128.

    Returns:
        The tokenizer output for the prefixed source sentences, with an added
        "labels" entry holding the target token ids. Padding positions in the
        labels are set to -100.
    """
    inputs = ["translate Early English to Modern English: " + ex for ex in examples["Shakespearean English"]]
    targets = list(examples["Modern English"])

    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=max_length, truncation=True, padding="max_length")

    # Targets are padded to a fixed length. Without masking, the loss is
    # averaged over the padding too, which rewards predicting <pad> and makes
    # the reported loss look better than it is. -100 is the label value the
    # loss ignores.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Tokenize the training dataset. With batched=True, preprocess_data is called on
# batches of rows; it indexes each column by name and iterates over its values.
tokenized_train_dataset = train_dataset.map(preprocess_data, batched=True)
# Tokenize the evaluation dataset the same way so both splits share one format.
tokenized_eval_dataset = eval_dataset.map(preprocess_data, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/81068 [00:00<?, ? examples/s]

Map:   0%|          | 0/9008 [00:00<?, ? examples/s]

In [None]:
from transformers import Trainer, TrainingArguments

# Fine-tuning configuration. Checkpoints and logs are written under output_dir.
# NOTE(review): the captured output shows an interactive Weights & Biases API-key
# prompt during this cell, which blocks an unattended "Run All" — consider
# configuring reporting explicitly.
training_args = TrainingArguments(
    output_dir="./t5-early-modern-translation",
    evaluation_strategy="epoch",  # validation loss is reported once per epoch (see table below); NOTE(review): newer transformers releases reportedly rename this to `eval_strategy` — confirm for the installed version
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01
)


# The Trainer receives the already-tokenized splits; `model` is updated in place,
# so later cells use the fine-tuned weights through the same `model` variable.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset
)

# Start training
trainer.train()


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.368,0.337701
2,0.351,0.327367
3,0.3453,0.322176
4,0.3357,0.319569
5,0.3347,0.319007


TrainOutput(global_step=25335, training_loss=0.3604200541961351, metrics={'train_runtime': 2263.5101, 'train_samples_per_second': 179.076, 'train_steps_per_second': 11.193, 'total_flos': 1.371486145216512e+16, 'train_loss': 0.3604200541961351, 'epoch': 5.0})

In [None]:
import torch

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Set inference mode explicitly rather than relying on whatever mode the Trainer
# left the model in; eval mode disables dropout during generation.
model.eval()

def translate_function(early_english_sentence, max_new_tokens=128):
    """Translate one sentence with the fine-tuned model and print the result.

    Args:
        early_english_sentence: source sentence as a string; the training-time
            task prefix is prepended before tokenization.
        max_new_tokens: upper bound on generated tokens. Without an explicit
            bound, generate() falls back to its short default length and long
            translations are cut off mid-sentence. Defaults to 128, matching
            the target length used for training.

    Returns:
        None. The translation is printed.
    """
    prompt = "translate Early English to Modern English: " + early_english_sentence

    # Keep the whole encoding (input ids and attention mask) and move it to the
    # model's device so generate() receives both.
    encoded = tokenizer(prompt, return_tensors="pt").to(device)

    # Generate the translation
    outputs = model.generate(**encoded, max_new_tokens=max_new_tokens)
    modern_translation = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print("Modern English translation:", modern_translation)

In [None]:
# Spot-check the fine-tuned model on a handful of hand-written sentences.
sample_sentences = [
    "O, how I long to see thee again",
    "Thou art as fair as the morning dew.",
    "Hast thou seen the lady fair in yonder glen?",
    "I shall not tarry, for time is fleeting",
    "Oft have I heard thee speak of distant lands.",
    "He doth protest too much, methinks.",
    "Wilt thou be gone? It is not yet near day.",
    "By my troth, I care not for such idle prattle.",
    "Thy words are like honey, sweet and cloying.",
    "Prithee, tell me thy tale of olden days.",
    "All that glitters is not gold",
]
for sentence in sample_sentences:
    translate_function(sentence)




Modern English translation: oh, I have been waiting to see you again.
Modern English translation: you are as good as the morning dew.
Modern English translation: have you seen the lady in glen?
Modern English translation: I will not tarry, because time is short.
Modern English translation: I have often heard you speak of distant lands.
Modern English translation: he is protesting too much, I think.
Modern English translation: do you want to go? it is not yet near day.
Modern English translation: I do not care about such a sloppy prattle.
Modern English translation: your words are like honey, sweet and cloying.
Modern English translation: please tell me your story of old times.
Modern English translation: all that glitters is not gold.


In [None]:
# Held-out evaluation workbook; the next cell reads its 'Shakespearean English'
# and 'Modern English' columns.
file_path = "test.xlsx"  # Replace with your file path
data = pd.read_excel(file_path)

In [None]:
# Source sentences to translate, and the reference translations to score against.
early_english_texts, reference_texts = (
    data[column_name].tolist()
    for column_name in ('Shakespearean English', 'Modern English')
)

In [None]:
def generate_predictions(texts, max_new_tokens=128):
    """Translate each input sentence with the fine-tuned model.

    Args:
        texts: iterable of source sentences (strings). The training-time task
            prefix is prepended to each one.
        max_new_tokens: upper bound on generated tokens per sentence. Without
            an explicit bound, generate() uses its short default length, which
            cuts long translations off mid-sentence and skews every metric
            computed from them. Defaults to 128, matching the target length
            used for training.

    Returns:
        List of decoded translations, in the same order as `texts` (empty when
        `texts` is empty).
    """
    predicted_texts = []
    for text in texts:
        inputs = tokenizer("translate Early English to Modern English: " + text, return_tensors="pt").to(device)
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
        predicted_texts.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
    return predicted_texts

In [None]:
predicted_texts = generate_predictions(early_english_texts)

# Show a short preview only; echoing the whole list floods the notebook output.
predicted_texts[:10]



['no.',
 'tell him, Catesby, that his ancient kings will be killed tomorrow at the',
 'do not be afraid, sir. I have watched and traveled hard. I will sleep out',
 'Goths have given me permission to sheathe my sword. Titus, un',
 'no jokes like a fool.',
 'oh, I am mocked, and you are sent here by some inc',
 'Owen, Owen, the same, and his son-in-law Mortimer, and',
 'not a whit.',
 'show yourself how you feel the wretches, so that you can shake the',
 'I thank God, I have been so well raised that I can write my name.',
 'although I wish you died, I will not be the executioner.',
 'why do you have to, sir? it is dinner time.',
 'I do not know, madam. it is too bad, too bad.',
 'who is your lord?',
 'I would like to see them meet. that young Trojan ass, who loves the whore',
 'where is he?',
 'all of these writings tend to the great opinion that Rome holds about his name, and that',
 'what, what, what? Ill luck, ill luck?',
 'I will do more. although all I can do is nothing worth, since

In [None]:
pip install bert-score

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13


In [None]:
pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.0.0-py3-none-any.whl.metadata (8.5 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/104.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.0.0-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, colorama, sacre

In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import single_meteor_score
from bert_score import score as bert_score
from sacrebleu.metrics import CHRF
# from comet.models import download_model, CometModel



# Ensure necessary NLTK resources are downloaded
# (presumably WordNet is what the METEOR scorer below needs — verify against the NLTK docs)
nltk.download('wordnet')

# BLEU Score
def calculate_bleu(reference_texts, predicted_texts):
    """Return the mean smoothed sentence-level BLEU over aligned sentence pairs.

    Each reference and prediction is split on whitespace and scored with
    `sentence_bleu` using `SmoothingFunction().method4`; the per-sentence
    scores are then averaged.

    Args:
        reference_texts: iterable of reference translations (strings).
        predicted_texts: iterable of model translations, aligned with
            `reference_texts`.

    Returns:
        The arithmetic mean of the per-sentence BLEU scores.

    Raises:
        ValueError: if the inputs differ in length (the pairs would be
            misaligned or silently dropped) or are empty (no mean exists).
    """
    references = list(reference_texts)
    predictions = list(predicted_texts)
    if len(references) != len(predictions):
        raise ValueError(
            f"got {len(references)} references but {len(predictions)} predictions"
        )
    if not references:
        raise ValueError("cannot average BLEU over zero sentence pairs")

    smoothie = SmoothingFunction().method4
    bleu_scores = [
        sentence_bleu([ref.split()], pred.split(), smoothing_function=smoothie)
        for ref, pred in zip(references, predictions)
    ]
    return sum(bleu_scores) / len(bleu_scores)

# METEOR Score
def calculate_meteor(reference_texts, predicted_texts):
    """Return the mean sentence-level METEOR score over aligned sentence pairs.

    Each reference and prediction is split on whitespace and scored with
    `single_meteor_score`; the per-sentence scores are then averaged.

    Args:
        reference_texts: iterable of reference translations (strings).
        predicted_texts: iterable of model translations, aligned with
            `reference_texts`.

    Returns:
        The arithmetic mean of the per-sentence METEOR scores.

    Raises:
        ValueError: if the inputs differ in length (the pairs would be
            misaligned or silently dropped) or are empty (no mean exists).
    """
    references = list(reference_texts)
    predictions = list(predicted_texts)
    if len(references) != len(predictions):
        raise ValueError(
            f"got {len(references)} references but {len(predictions)} predictions"
        )
    if not references:
        raise ValueError("cannot average METEOR over zero sentence pairs")

    meteor_scores = [
        single_meteor_score(ref.split(), pred.split())
        for ref, pred in zip(references, predictions)
    ]
    return sum(meteor_scores) / len(meteor_scores)

# BERTScore
def calculate_bertscore(reference_texts, predicted_texts):
    """Return the mean BERTScore F1 of the predictions against the references.

    Predictions are passed to `bert_score` as candidates and references as
    references, with lang="en"; only the F1 component is averaged.
    """
    _precision, _recall, f1_per_sentence = bert_score(
        predicted_texts, reference_texts, lang="en", verbose=False
    )
    mean_f1 = f1_per_sentence.mean()
    return mean_f1.item()  # plain Python number rather than a tensor

# ChrF
def calculate_chrf(reference_texts, predicted_texts):
    """Return the mean sentence-level chrF score over aligned sentence pairs.

    Each prediction is scored against its single reference with
    `CHRF().sentence_score`, and the `.score` values are averaged.

    Args:
        reference_texts: iterable of reference translations (strings).
        predicted_texts: iterable of model translations, aligned with
            `reference_texts`.

    Returns:
        The arithmetic mean of the per-sentence chrF scores.

    Raises:
        ValueError: if the inputs differ in length (the pairs would be
            misaligned or silently dropped) or are empty (no mean exists).
    """
    references = list(reference_texts)
    predictions = list(predicted_texts)
    if len(references) != len(predictions):
        raise ValueError(
            f"got {len(references)} references but {len(predictions)} predictions"
        )
    if not references:
        raise ValueError("cannot average chrF over zero sentence pairs")

    chrf_metric = CHRF()
    chrf_scores = [
        chrf_metric.sentence_score(pred, [ref]).score
        for ref, pred in zip(references, predictions)
    ]
    return sum(chrf_scores) / len(chrf_scores)

# # COMET
# def calculate_comet(reference_texts, predicted_texts, source_texts):
#     model = load_model("Unbabel/wmt22-comet-da")  # Load COMET model
#     data = [
#         {"src": src, "mt": pred, "ref": ref}
#         for src, pred, ref in zip(source_texts, predicted_texts, reference_texts)
#     ]
#     results = model.predict(data, batch_size=8, gpus=1)
#     return sum(results["scores"]) / len(results["scores"])



[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
import nltk
# NOTE(review): the metrics cell above already calls nltk.download('wordnet');
# this repeat call looks redundant (its output reports the package as up to date).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Requires `reference_texts` and `predicted_texts` from the cells above.
# (`source_texts` is only needed by the disabled COMET metric.)
average_bleu, average_meteor, average_bertscore, average_chrf = (
    metric(reference_texts, predicted_texts)
    for metric in (calculate_bleu, calculate_meteor, calculate_bertscore, calculate_chrf)
)
# average_comet = calculate_comet(reference_texts, predicted_texts, source_texts)

# Report every metric, one per line.
for label, value in (
    ("Average BLEU score:", average_bleu),
    ("Average METEOR score:", average_meteor),
    ("Average BERTScore (F1):", average_bertscore),
    ("Average ChrF score:", average_chrf),
):
    print(label, value)
# print("Average COMET score:", average_comet)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average BLEU score: 0.09520507386173661
Average METEOR score: 0.3092524629026468
Average BERTScore (F1): 0.9014818072319031
Average ChrF score: 36.357561500030684


In [None]:
# Write the fine-tuned model and the tokenizer files side by side under "model/".
# NOTE(review): presumably both can be restored with from_pretrained("model/") — confirm.
model.save_pretrained("model/")
tokenizer.save_pretrained("model/")

('model/tokenizer_config.json',
 'model/special_tokens_map.json',
 'model/spiece.model',
 'model/added_tokens.json')

In [None]:
import pickle

# NOTE(review): this pickles the whole model object in addition to the
# save_pretrained("model/") export in the previous cell. Unpickling runs
# arbitrary code, so model.pkl must only be loaded from a trusted source.
# It presumably also needs compatible library versions at load time — confirm
# whether this second artifact is required.
with open("model.pkl", "wb") as f:
    pickle.dump(model, f)