In [None]:
pip install datasets transformers evaluate adapters



In [None]:
pip install sacrebleu



In [None]:
from datasets import load_dataset
from transformers import T5Config, T5ForConditionalGeneration, T5Tokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, Adafactor, DataCollatorForSeq2Seq
import sacrebleu
import numpy as np
import tensorboard
import evaluate
from safetensors.torch import load_file
from adapters import AdapterConfig, T5AdapterModel


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
dataset_nmt = load_dataset('json', data_files='/content/drive/MyDrive/Assignment 2/wmt_en_de_train.json')
validation_nmt = load_dataset('json', data_files= '/content/drive/MyDrive/Assignment 2/wmt_en_de_validation.json')
config = T5Config.from_pretrained('t5-base')
model = T5ForConditionalGeneration(config)
tokenizer = T5Tokenizer.from_pretrained('T5-base',use_fast = True)
data_collator = DataCollatorForSeq2Seq(tokenizer= tokenizer, model = model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
from torch.utils.data import IterableDataset
print (dataset_nmt)
# Split dataset and remove unwanted columns
data_splits_nmt = dataset_nmt['train'].select(range(20_000)).train_test_split(test_size=0.2, shuffle=True)
print((data_splits_nmt))
# Check the number of rows
print(f"Train set size: {data_splits_nmt['train'].num_rows}")
print(f"Test set size: {data_splits_nmt['test'].num_rows}")
print(data_splits_nmt)

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 34782245
    })
})
DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 16000
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 4000
    })
})
Train set size: 16000
Test set size: 4000
DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 16000
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 4000
    })
})


In [None]:


# Preprocess function to tokenize the data
def preprocess_function(examples, tokenizer, input_length = 512):
    inputs = ['Translate from English to German: ' + ex['en'] for ex in examples['translation']]  # English source text
    targets = [ex['de'] for ex in examples['translation']]  # German target text

    # Tokenize the inputs (source text)
    model_inputs = tokenizer(inputs, max_length=input_length, truncation=True, padding = True)

    # Tokenize the targets (target text)
    labels = tokenizer(targets, max_length=input_length, truncation=True, padding = True)

    # Set the labels in the model inputs
    model_inputs['labels'] = labels['input_ids']

    return model_inputs

tokenized_train_dataset = data_splits_nmt['train'].map(preprocess_function,
            batch_size =10000,
            batched=True,
            fn_kwargs={
                'tokenizer': tokenizer,
                'input_length': 512,
            },
            remove_columns= 'translation',
            writer_batch_size = 10000)
tokenized_test_dataset = data_splits_nmt['test'].map(preprocess_function,
            batch_size =10000,
            batched=True,
            fn_kwargs={
                'tokenizer': tokenizer,
                'input_length': 512,
            },
            remove_columns= 'translation',
            writer_batch_size = 10000)
tokenized_validated_dataset = validation_nmt['train'].map(preprocess_function,
            batch_size = 10000,
            batched= True,
            fn_kwargs= {'tokenizer': tokenizer, 'input_length' : 512},
            remove_columns= 'translation',
            writer_batch_size= 10_000)

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

In [None]:
import numpy as np
import evaluate

bleu_metric = evaluate.load("bleu")

def compute_metrics(eval_preds):
    preds, labels = eval_preds

    # Decode labels, replacing -100 with the pad token
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Strip unnecessary whitespaces
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]  # BLEU expects a list of references for each prediction

    # Compute BLEU score
    result = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)

    return {"bleu": result["bleu"]}


In [None]:
#without pretraining

# Define your training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir='/content/drive/MyDrive/Assignment 2/Submission/results',
    eval_strategy="steps",
    eval_steps=20,
    per_device_train_batch_size=32,
    per_device_eval_batch_size =32,
    dataloader_num_workers=8,
    num_train_epochs=3,
    learning_rate=0.01,
    weight_decay=0.01,
    gradient_accumulation_steps=8,
    predict_with_generate=True,
    logging_dir='/content/drive/MyDrive/Assignment 2/Submission/logs',
    logging_steps=20,
    save_total_limit=2,
    save_steps=20,
    load_best_model_at_end=True,
    greater_is_better=False,
    report_to="tensorboard",
    generation_max_length= 256
)

# Initialize the Trainer with AdaFactor optimizer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    optimizers=(Adafactor(model.parameters(), lr=0.001, relative_step=False), None),
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Start training
trainer.train()
trainer.evaluate(tokenized_validated_dataset)


Step,Training Loss,Validation Loss,Bleu
20,1.1482,1.385659,0.0
40,1.0819,1.309365,0.0
60,1.0384,1.255258,0.001424
80,1.0001,1.217483,0.001536
100,0.9607,1.190291,0.001233
120,0.9588,1.170023,0.001571
140,0.9403,1.157143,0.002507


Step,Training Loss,Validation Loss,Bleu
20,1.1482,1.385659,0.0
40,1.0819,1.309365,0.0
60,1.0384,1.255258,0.001424
80,1.0001,1.217483,0.001536
100,0.9607,1.190291,0.001233
120,0.9588,1.170023,0.001571
140,0.9403,1.157143,0.002507
160,0.9194,1.149024,0.001769
180,0.9202,1.144609,0.001961


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


{'eval_loss': 1.984094500541687,
 'eval_bleu': 0.0,
 'eval_runtime': 558.7955,
 'eval_samples_per_second': 5.365,
 'eval_steps_per_second': 0.168,
 'epoch': 2.976}

In [None]:
model = T5ForConditionalGeneration.from_pretrained('/content/drive/MyDrive/Assignment 2/Submission/checkpoints_not_pretrained/checkpoint-186')

In [None]:
#Translation of the unpretrained model after finetuning
import torch
for batch_index, batch in enumerate(tokenized_validated_dataset):
    if batch_index > 10:
        break
    inputs = batch['input_ids']
    inputs_tensor = torch.tensor([inputs]).to('cuda')
    label_tensor = torch.tensor([batch['labels']]).to('cuda')
    # Ensure model2 is in evaluation mode if necessary
    model.to('cuda')
    model.eval()

    # Get model output
    with torch.no_grad():
        outputs = model.generate(inputs_tensor)

    # Decoding the model output (assuming outputs are logits)
    decoded_label = tokenizer.batch_decode(label_tensor, skip_special_tokens=True)
    decoded_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    print(f"Decoded labels: {decoded_label}")
    print(f"Decoded sentence: {decoded_sentence}")



Decoded labels: ['München 1856: Vier Karten, die Ihren Blick auf die Stadt verändern']
Decoded sentence: ['Die hat die Kommission hat.']
Decoded labels: ['Eine Irren-Anstalt, wo sich heute Jugendliche begegnen sollen.']
Decoded sentence: ['Die es es es.']
Decoded labels: ['Eine Gruftkapelle, wo nun für den S-Bahn-Tunnel gegraben wird.']
Decoded sentence: ['Die es es es es es es']
Decoded labels: ['Kleingärtner bewirtschaften den einstigen Grund von Bauern.']
Decoded sentence: ['Die hat hat die Kommission hat.']
Decoded labels: ['Die älteste offizielle Karte Münchens fördert spannende Geschichten zu Tage.']
Decoded sentence: ['Die es hat.']
Decoded labels: ['Es nervt, wenn Landkarten nicht aktuell sind.']
Decoded sentence: ['Die es nicht nicht nicht nicht nicht nicht nicht nicht nicht nicht nicht nicht nicht nicht nicht']
Decoded labels: ['Das kennt jeder, der sich schon mal aufregen musste, weil das Auto-Navi statt einer Umgehungsstraße eine grüne Wiese anzeigte.']
Decoded sentence: ['

In [None]:

#For pretrained T5
# Load the tokenizer for T5
tokenizer = T5Tokenizer.from_pretrained("t5-base")  # Replace with your specific model if needed
config = T5Config.from_pretrained("t5-base")
# Load the SafeTensors file
file_path = "/content/drive/MyDrive/checkpoint-26700/model.safetensors"
model_weights = load_file(file_path)

# Initialize the T5 model
model2 = T5ForConditionalGeneration(config)  # Replace with your model if needed

# Load the weights into the model
model2.load_state_dict(model_weights, strict = False )

# Define your training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir='/content/drive/MyDrive/Assignment 2/Submission/results',
    eval_strategy="steps",
    eval_steps=20,
    per_device_train_batch_size=32,
    per_device_eval_batch_size =32,
    dataloader_num_workers=8,
    num_train_epochs=3,
    learning_rate=0.01,
    weight_decay=0.01,
    gradient_accumulation_steps=8,
    predict_with_generate=True,
    logging_dir='/content/drive/MyDrive/Assignment 2/Submission/logs',
    logging_steps=20,
    save_total_limit=2,
    save_steps=20,
    load_best_model_at_end=True,
    greater_is_better=False,
    report_to="tensorboard",
    generation_max_length= 256
)

# Initialize the Trainer with AdaFactor optimizer
trainer = Seq2SeqTrainer(
    model=model2,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    optimizers=(Adafactor(model2.parameters(), lr=0.001, relative_step=False), None),
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Start training
trainer.train()

Step,Training Loss,Validation Loss,Bleu
20,3.8638,1.702601,0.0
40,1.2212,1.464314,0.0
60,1.1165,1.37524,0.0
80,1.0578,1.320515,0.0
100,1.0203,1.282874,0.0
120,1.0081,1.257433,0.0
140,0.9782,1.241369,0.0


Step,Training Loss,Validation Loss,Bleu
20,3.8638,1.702601,0.0
40,1.2212,1.464314,0.0
60,1.1165,1.37524,0.0
80,1.0578,1.320515,0.0
100,1.0203,1.282874,0.0
120,1.0081,1.257433,0.0
140,0.9782,1.241369,0.0
160,0.9739,1.231695,0.0
180,0.9565,1.227472,0.0


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=186, training_loss=1.3434423656873806, metrics={'train_runtime': 9016.7673, 'train_samples_per_second': 5.323, 'train_steps_per_second': 0.021, 'total_flos': 1.30256092790784e+16, 'train_loss': 1.3434423656873806, 'epoch': 2.976})

In [None]:
trainer.evaluate(tokenized_validated_dataset)

{'eval_loss': 1.9622403383255005,
 'eval_bleu': 0.0,
 'eval_runtime': 620.5351,
 'eval_samples_per_second': 4.831,
 'eval_steps_per_second': 0.151,
 'epoch': 2.976}

In [None]:
#Translation of pretrained model after finetuning
import torch
for batch_index, batch in enumerate(tokenized_validated_dataset):
    if batch_index > 10:
        break
    inputs = batch['input_ids']
    inputs_tensor = torch.tensor([inputs]).to('cuda')
    label_tensor = torch.tensor([batch['labels']]).to('cuda')
    # Ensure model2 is in evaluation mode if necessary
    model2.eval()

    # Get model output
    with torch.no_grad():
        outputs = model2.generate(inputs_tensor)

    # Decoding the model output (assuming outputs are logits)
    decoded_label = tokenizer.batch_decode(label_tensor, skip_special_tokens=True)
    decoded_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    print(f"Decoded labels: {decoded_label}")
    print(f"Decoded sentence: {decoded_sentence}")


Decoded labels: ['München 1856: Vier Karten, die Ihren Blick auf die Stadt verändern']
Decoded sentence: ['ich ich ich ich ich ich ich ich ich']
Decoded labels: ['Eine Irren-Anstalt, wo sich heute Jugendliche begegnen sollen.']
Decoded sentence: ['Ich ich es es es es es']
Decoded labels: ['Eine Gruftkapelle, wo nun für den S-Bahn-Tunnel gegraben wird.']
Decoded sentence: ['Ich ich ich ich ich es es es,']
Decoded labels: ['Kleingärtner bewirtschaften den einstigen Grund von Bauern.']
Decoded sentence: ['Die es es es es es es']
Decoded labels: ['Die älteste offizielle Karte Münchens fördert spannende Geschichten zu Tage.']
Decoded sentence: ['Ich es es es es es es']
Decoded labels: ['Es nervt, wenn Landkarten nicht aktuell sind.']
Decoded sentence: ['Ich ich ich ich ich ich ich ich ich ich']
Decoded labels: ['Das kennt jeder, der sich schon mal aufregen musste, weil das Auto-Navi statt einer Umgehungsstraße eine grüne Wiese anzeigte.']
Decoded sentence: ['Ich ich ich ich ich ich ich ich 