In [1]:
from datasets import load_dataset
import gc

dataset = load_dataset("wmt20_mlqe_task1", "en-de")

In [2]:
print(dataset)
columns = dataset.column_names
print(f'column names are: \n{columns}')

DatasetDict({
    train: Dataset({
        features: ['segid', 'translation', 'scores', 'mean', 'z_scores', 'z_mean', 'model_score', 'doc_id', 'nmt_output', 'word_probas'],
        num_rows: 7000
    })
    test: Dataset({
        features: ['segid', 'translation', 'scores', 'mean', 'z_scores', 'z_mean', 'model_score', 'doc_id', 'nmt_output', 'word_probas'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['segid', 'translation', 'scores', 'mean', 'z_scores', 'z_mean', 'model_score', 'doc_id', 'nmt_output', 'word_probas'],
        num_rows: 1000
    })
})
column names are: 
{'train': ['segid', 'translation', 'scores', 'mean', 'z_scores', 'z_mean', 'model_score', 'doc_id', 'nmt_output', 'word_probas'], 'test': ['segid', 'translation', 'scores', 'mean', 'z_scores', 'z_mean', 'model_score', 'doc_id', 'nmt_output', 'word_probas'], 'validation': ['segid', 'translation', 'scores', 'mean', 'z_scores', 'z_mean', 'model_score', 'doc_id', 'nmt_output', 'word_probas']}


In [3]:
def extract_languages(examples):
    targets = [ex["de"] for ex in examples["translation"]]
    inputs = [ex["en"] for ex in examples["translation"]]
    return {"inputs": inputs, "targets": targets}

dataset = dataset.map(extract_languages, batched=True, remove_columns=['segid', 'translation', 'scores', 'mean', 'z_scores', 'z_mean', 'model_score', 'doc_id', 'nmt_output', 'word_probas'])
dataset  

DatasetDict({
    train: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 7000
    })
    test: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 1000
    })
})

In [4]:
training = dataset["train"]
validation = dataset["validation"]
testing = dataset["test"]

print(f"Length of training data is: {len(training)}")
print(f"Length of validation data is: {len(validation)}")
print(f"Length of testing data is: {len(testing)}")

del dataset
gc.collect()

Length of training data is: 7000
Length of validation data is: 1000
Length of testing data is: 1000


30

In [5]:
print(f"example of how the data look like now:\n {training[5]}")
print(training[5]["inputs"])
print(training[5]["targets"])

example of how the data look like now:
 {'inputs': 'They engaged in crossfire at Guamaní River Bridge, Coamo and Silva Heights and finally at the Battle of Asomante.', 'targets': 'Sie verübten Kreuzfeuer an der Guamaní River Bridge, Coamo und Silva Heights und schließlich an der Schlacht von Asomante.'}
They engaged in crossfire at Guamaní River Bridge, Coamo and Silva Heights and finally at the Battle of Asomante.
Sie verübten Kreuzfeuer an der Guamaní River Bridge, Coamo und Silva Heights und schließlich an der Schlacht von Asomante.


In [6]:
import psutil
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Function to get memory usage in megabytes
def memory_usage_mb():
    return psutil.Process().memory_info().rss / (1024 ** 2)

# Function to get GPU memory usage in MB
def gpu_memory_usage_mb():
    torch.cuda.synchronize()  # Wait for all operations on the GPU to complete
    return torch.cuda.memory_allocated() / (1024 ** 2)

# Measure memory before loading the model
memory_before = memory_usage_mb()
gpu_memory_before = gpu_memory_usage_mb()

tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small", device_map='cuda')

# Measure memory after loading the model
memory_after = memory_usage_mb()
gpu_memory_after = gpu_memory_usage_mb()

# Calculate and print the difference in memory usage
print(f"System Memory used after loading the model: {memory_after - memory_before:.2f} MB")
print(f"GPU Memory used after loading the model: {gpu_memory_after - gpu_memory_before:.2f} MB")

System Memory used after loading the model: 244.15 MB
GPU Memory used after loading the model: 230.81 MB


In [8]:
import warnings
warnings.filterwarnings("ignore")

source_lang = "inputs"
target_lang = "targets"
prefix = "translate English eo German: "


def preprocess_function(examples):
    inputs = [prefix + example for example in examples[source_lang]]
    targets = [example for example in examples[target_lang]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_training_data = training.map(preprocess_function, batched=True)
validation_training_data = validation.map(preprocess_function, batched=True)

Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

In [9]:
print(tokenized_training_data[0])

{'inputs': 'José Ortega y Gasset visited Husserl at Freiburg in 1934.', 'targets': '1934 besuchte José Ortega y Gasset Husserl in Freiburg.', 'input_ids': [13959, 1566, 3, 15, 32, 2968, 10, 26816, 4366, 12029, 3, 63, 6435, 2244, 5251, 13674, 7, 49, 40, 44, 30498, 16, 28828, 5, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [28828, 22361, 15, 26816, 4366, 12029, 3, 63, 6435, 2244, 13674, 7, 49, 40, 16, 30498, 5, 1]}


In [14]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [11]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer


training_args = Seq2SeqTrainingArguments(
    output_dir="./results_1",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=12,
    fp16=True,
    report_to="none",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_training_data,
    eval_dataset=validation_training_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.660553
2,0.939500,0.64957
3,0.882100,0.642341
4,0.867100,0.636562
5,0.837800,0.632857
6,0.823300,0.629307
7,0.820500,0.627437
8,0.809700,0.625632
9,0.809700,0.62452
10,0.797800,0.624075


TrainOutput(global_step=5256, training_loss=0.8348093540882593, metrics={'train_runtime': 642.664, 'train_samples_per_second': 130.706, 'train_steps_per_second': 8.178, 'total_flos': 1067201825341440.0, 'train_loss': 0.8348093540882593, 'epoch': 12.0})

In [12]:
trainer.save_model("./en_de_model_normal_ft")

In [13]:
from transformers import pipeline
import torch
import sacrebleu
import time



def translator_pipeline(model_name, tokenizer_name):
    translator = pipeline("translation_en_to_de", model=model_name, tokenizer=tokenizer_name)
    return translator

def translate_and_evaluate(test_dataset):
    translations = []
    references = []

    for item in test_dataset:
        # Translate each sentence
        english_sentence = item['inputs']
        german_translation = translator(english_sentence, max_length=128, truncation=True)[0]['translation_text']

        # Append the result and the reference translation
        translations.append(german_translation)
        references.append(item['targets'])

    return translations, references


# Function to calculate BLEU score using SacreBLEU
def calculate_bleu_score(translations, references):
    bleu = sacrebleu.corpus_bleu(translations, [references])
    return bleu.score


# Load the trained model
model_path = "./en_de_model_normal_ft"  
our_model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
our_tokenizer = AutoTokenizer.from_pretrained(model_path)

translator = translator_pipeline(our_model, our_tokenizer)


start_time = time.time()
translations, references = translate_and_evaluate(testing)

# Calculate the BLEU score
bleu_score = calculate_bleu_score(translations, references)
end_time = time.time()

print(f"BLEU score for the normally fine-tuned t5 model is: {bleu_score}")
print(f"Execution time: {end_time - start_time} seconds")

BLEU score for the normally fine-tuned t5 model is: 49.42898092092283
Execution time: 326.48990988731384 seconds
