In [1]:
from datasets import load_dataset
import gc

dataset = load_dataset("wmt20_mlqe_task1", "en-de")

In [2]:
def extract_languages(examples):
    targets = [ex["de"] for ex in examples["translation"]]
    inputs = [ex["en"] for ex in examples["translation"]]
    return {"inputs": inputs, "targets": targets}

dataset = dataset.map(extract_languages, batched=True, remove_columns=['segid', 'translation', 'scores', 'mean', 'z_scores', 'z_mean', 'model_score', 'doc_id', 'nmt_output', 'word_probas'])
dataset  

DatasetDict({
    train: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 7000
    })
    test: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 1000
    })
})

In [3]:
training = dataset["train"]
validation = dataset["validation"]
testing = dataset["test"]

print(f"Length of training data is: {len(training)}")
print(f"Length of validation data is: {len(validation)}")
print(f"Length of testing data is: {len(testing)}")

del dataset
gc.collect()

Length of training data is: 7000
Length of validation data is: 1000
Length of testing data is: 1000


30

In [4]:
from transformers import BitsAndBytesConfig
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import psutil


nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)


# Function to get memory usage in megabytes
def memory_usage_mb():
    return psutil.Process().memory_info().rss / (1024 ** 2)

# Function to get GPU memory usage in MB
def gpu_memory_usage_mb():
    torch.cuda.synchronize()  # Wait for all operations on the GPU to complete
    return torch.cuda.memory_allocated() / (1024 ** 2)


# Measure memory before loading the model
memory_before = memory_usage_mb()
gpu_memory_before = gpu_memory_usage_mb()

tokenizer_q = AutoTokenizer.from_pretrained("t5-small")
model_q = AutoModelForSeq2SeqLM.from_pretrained("t5-small", quantization_config=nf4_config)

# Measure memory after loading the model
memory_after = memory_usage_mb()
gpu_memory_after = gpu_memory_usage_mb()

# Calculate and print the difference in memory usage
print(f"System Memory used after loading the model: {memory_after - memory_before:.2f} MB")
print(f"GPU Memory used after loading the model: {gpu_memory_after - gpu_memory_before:.2f} MB")

`low_cpu_mem_usage` was None, now set to True since model is quantized.


System Memory used after loading the model: 423.18 MB
GPU Memory used after loading the model: 95.62 MB


In [5]:
from peft import prepare_model_for_kbit_training

model_q.gradient_checkpointing_enable()
model_q = prepare_model_for_kbit_training(model_q)

In [7]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [8]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8, 
    lora_alpha=32,  
    lora_dropout=0.05, 
    bias="none"
)

model_q = get_peft_model(model_q, config)
print_trainable_parameters(model_q)

trainable params: 294912 || all params: 45072896 || trainable%: 0.6543000920109504


In [9]:
source_lang = "inputs"
target_lang = "targets"
prefix = "translate English eo German: "


def preprocess_function(examples):
    inputs = [prefix + example for example in examples[source_lang]]
    targets = [example for example in examples[target_lang]]
    model_inputs = tokenizer_q(inputs, max_length=128, truncation=True)

    with tokenizer_q.as_target_tokenizer():
        labels = tokenizer_q(targets, max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_training_data = training.map(preprocess_function, batched=True)
validation_training_data = validation.map(preprocess_function, batched=True)

In [10]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer_q, model=model_q)

2024-04-21 17:13:01.425194: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-21 17:13:01.453880: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [12]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import warnings

warnings.filterwarnings("ignore")


training_args = Seq2SeqTrainingArguments(
    output_dir="./results_2",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=12,
    fp16=True,
    report_to="none",
)

trainer = Seq2SeqTrainer(
    model=model_q,
    args=training_args,
    train_dataset=tokenized_training_data,
    eval_dataset=validation_training_data,
    tokenizer=tokenizer_q,
    data_collator=data_collator,
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,No log
2,0.986200,No log
3,0.963200,No log
4,0.965400,No log
5,0.953100,No log
6,0.938600,No log
7,0.951800,No log
8,0.940400,No log
9,0.940400,No log
10,0.941100,No log


TrainOutput(global_step=5256, training_loss=0.9515452101894709, metrics={'train_runtime': 922.5745, 'train_samples_per_second': 91.05, 'train_steps_per_second': 5.697, 'total_flos': 1074345608478720.0, 'train_loss': 0.9515452101894709, 'epoch': 12.0})

In [13]:
trainer.save_model("./en_de_model_qlora")

In [14]:
from transformers import pipeline
import torch
import sacrebleu
import time


def translator_pipeline(model_name, tokenizer_name):
    translator = pipeline("translation_en_to_de", model=model_name, tokenizer=tokenizer_name)
    return translator

def translate_and_evaluate(test_dataset):
    translations = []
    references = []

    for item in test_dataset:
        # Translate each sentence
        english_sentence = item['inputs']
        german_translation = translator(english_sentence, max_length=128, truncation=True)[0]['translation_text']

        # Append the result and the reference translation
        translations.append(german_translation)
        references.append(item['targets'])

    return translations, references


# Function to calculate BLEU score using SacreBLEU
def calculate_bleu_score(translations, references):
    bleu = sacrebleu.corpus_bleu(translations, [references])
    return bleu.score


# Load the trained model
model_path = "./en_de_model_qlora"  
our_model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
our_tokenizer = AutoTokenizer.from_pretrained(model_path)

translator = translator_pipeline(our_model, our_tokenizer)


start_time = time.time()
translations, references = translate_and_evaluate(testing)

# Calculate the BLEU score
bleu_score = calculate_bleu_score(translations, references)
end_time = time.time()

print(f"BLEU score for the quantized fine tuned t5 model is: {bleu_score}")
print(f"Execution time: {end_time - start_time} seconds")

BLEU score for the quantized fine tuned t5 model is: 46.83953379552536
Execution time: 367.23499274253845 seconds
