In [1]:
from datasets import load_dataset, load_metric
from transformers import T5ForConditionalGeneration, T5Tokenizer

In [2]:
# Checkpoint name shared by the T5 model and its tokenizer
model_name = "google-t5/t5-small"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# WMT16 German-English: fetch the validation split first, then the test split
validation_dataset, test_dataset = (
    load_dataset("wmt16", "de-en", split=split_name)
    for split_name in ("validation", "test")
)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Flatten each split into a list of (German, English) sentence tuples
val_data = [(pair["de"], pair["en"]) for pair in validation_dataset["translation"]]
test_data = [(pair["de"], pair["en"]) for pair in test_dataset["translation"]]

In [None]:
# Define prefix for zero-shot translation
prefix = "translate English to German: "

# Without an explicit limit, generate() falls back to the library's default
# max_length of 20 tokens, which silently truncates longer translations and
# depresses every metric computed below.
MAX_NEW_TOKENS = 128


def _translate_pairs(pairs, max_new_tokens=MAX_NEW_TOKENS):
    """Translate the English side of each (German, English) pair.

    Args:
        pairs: iterable of (german_reference, english_source) string tuples.
        max_new_tokens: upper bound on tokens generated per sentence.

    Returns:
        (translations, references): two parallel lists of strings holding the
        model output and the German reference for every pair.
    """
    translations, references = [], []
    for german_reference, english_source in pairs:
        encoded = tokenizer(
            prefix + english_source, return_tensors="pt", padding=True, truncation=True
        )
        # Forward input_ids together with the attention mask.
        output_ids = model.generate(**encoded, max_new_tokens=max_new_tokens)
        translations.append(tokenizer.decode(output_ids[0], skip_special_tokens=True))
        references.append(german_reference)
    return translations, references


def _as_bleu_inputs(translations, references):
    """Convert plain strings to the token-list layout the `bleu` metric expects.

    Predictions become lists of tokens; each prediction gets a list of
    references (here exactly one), each itself a list of tokens. Whitespace
    splitting is used as a simple tokenizer.
    """
    predictions = [text.split() for text in translations]
    reference_sets = [[text.split()] for text in references]
    return predictions, reference_sets


def _mean(values):
    """Arithmetic mean of a sequence of numbers (NaN when the sequence is empty)."""
    return sum(values) / len(values) if values else float("nan")


# Generate translations for validation set
valid_translations, valid_references = _translate_pairs(val_data)

# Generate translations for test set
test_translations, test_references = _translate_pairs(test_data)

# Load evaluation metrics
bleu_metric = load_metric("bleu")
meteor_metric = load_metric("meteor")
bertscore_metric = load_metric("bertscore")

# Calculate BLEU score (the metric takes tokenized text, not raw strings)
bleu_preds_valid, bleu_refs_valid = _as_bleu_inputs(valid_translations, valid_references)
bleu_preds_test, bleu_refs_test = _as_bleu_inputs(test_translations, test_references)
bleu_score_valid = bleu_metric.compute(predictions=bleu_preds_valid, references=bleu_refs_valid)
bleu_score_test = bleu_metric.compute(predictions=bleu_preds_test, references=bleu_refs_test)

# Calculate METEOR score
meteor_score_valid = meteor_metric.compute(predictions=valid_translations, references=valid_references)
meteor_score_test = meteor_metric.compute(predictions=test_translations, references=test_references)

# Calculate BERTScore. The metric needs either `lang` or `model_type`; the
# hypotheses and references are German, hence lang="de".
bertscore_score_valid = bertscore_metric.compute(
    predictions=valid_translations, references=valid_references, lang="de"
)
bertscore_score_test = bertscore_metric.compute(
    predictions=test_translations, references=test_references, lang="de"
)

# Print evaluation metrics for validation set
# BERTScore returns per-sentence 'precision' / 'recall' / 'f1' lists; report mean F1.
print("Validation Set Metrics:")
print(f"BLEU Score: {bleu_score_valid['bleu']}")
print(f"METEOR Score: {meteor_score_valid['meteor']}")
print(f"BERTScore Score: {_mean(bertscore_score_valid['f1'])}")

# Print evaluation metrics for test set
print("\nTest Set Metrics:")
print(f"BLEU Score: {bleu_score_test['bleu']}")
print(f"METEOR Score: {meteor_score_test['meteor']}")
print(f"BERTScore Score: {_mean(bertscore_score_test['f1'])}")

Pipeline

In [2]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
from datasets import load_metric
import csv

# Checkpoint used for both the model weights and the tokenizer
model_name = "google-t5/t5-small"
model, tokenizer = (
    T5ForConditionalGeneration.from_pretrained(model_name),
    T5Tokenizer.from_pretrained(model_name),
)

# Task prefix used for zero-shot English-to-German translation
prefix = "translate English to German: "

# Define function for translation
def translate_sentence(sentence, max_new_tokens=128):
    """Translate one English sentence to German with the module-level T5 model.

    Args:
        sentence: English text to translate; the module-level `prefix` is
            prepended before tokenization.
        max_new_tokens: cap on the number of generated tokens. Without an
            explicit limit, generate() stops at the library's default
            max_length of 20 tokens and truncates longer translations.

    Returns:
        The decoded translation string with special tokens removed.
    """
    encoded = tokenizer(prefix + sentence, return_tensors="pt", padding=True, truncation=True)
    # Forward every tokenizer output (input ids and attention mask) to generate().
    output_ids = model.generate(**encoded, max_new_tokens=max_new_tokens)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Define function to translate English sentences from a CSV file to German
def translate_csv_to_german(input_csv_file, output_csv_file):
    """Translate the 'en' column of a CSV file and write an 'en','de' CSV.

    Args:
        input_csv_file: path to a UTF-8 CSV file with a header row that
            contains an 'en' column. Other columns are allowed and ignored.
        output_csv_file: path of the UTF-8 CSV file to create or overwrite.
            It receives a header and exactly two columns: 'en' (the source
            text) and 'de' (the result of translate_sentence).

    Raises:
        KeyError: if a data row has no 'en' field.
    """
    # newline='' lets the csv module handle line endings itself, so quoted
    # fields containing line breaks are read back intact.
    with open(input_csv_file, 'r', newline='', encoding='utf-8') as input_file:
        rows = list(csv.DictReader(input_file))

    # Build fresh two-key records instead of mutating the reader's rows:
    # DictWriter raises ValueError on keys missing from `fieldnames`, so
    # any extra input column would otherwise abort the write.
    translated_rows = []
    for row in rows:
        english_sentence = row['en']
        translated_rows.append({
            'en': english_sentence,
            'de': translate_sentence(english_sentence),
        })

    with open(output_csv_file, 'w', newline='', encoding='utf-8') as output_file:
        csv_writer = csv.DictWriter(output_file, fieldnames=['en', 'de'])
        csv_writer.writeheader()
        csv_writer.writerows(translated_rows)

# Example usage:
# Translate a single English sentence to German
english_sentence = "A group of people standing in front of an igloo."
translated_sentence = translate_sentence(english_sentence)
print("Translated Sentence:", translated_sentence)

# Translate sentences from a CSV file to German
input_csv_file = "input_2B.csv"
output_csv_file = "output_task2B.csv"
translate_csv_to_german(input_csv_file, output_csv_file)
# Report the path that was actually written instead of a hard-coded file name
print(f"Translations saved to {output_csv_file}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Translated Sentence: Eine Gruppe von Menschen, die vor einem Iglu stehen.
Translations saved to output.csv
