In [5]:
pip install sacrebleu

Note: you may need to restart the kernel to use updated packages.


In [6]:
from transformers import BartForConditionalGeneration, BartTokenizer
from datasets import load_dataset ,load_metric
import torch
from torch.utils.data import DataLoader
from transformers import AdamW
import pandas as pd
from datasets import Dataset
from datasets import load_dataset
import torch.nn.utils.prune as prune
from transformers import TrainingArguments,Trainer
import sacrebleu

In [7]:
model_name = 'facebook/bart-base'
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')

In [8]:
# Load WMT 2014 English-German dataset
dataset = load_dataset("wmt14", "de-en",split = 'train')
test_dataset = load_dataset('wmt14',"de-en",split = 'test')

val_dataset = load_dataset('wmt14',"de-en",split = 'validation')

Downloading readme:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

Downloading data: 100%|██████████| 280M/280M [00:01<00:00, 236MB/s]  
Downloading data: 100%|██████████| 265M/265M [00:01<00:00, 220MB/s]  
Downloading data: 100%|██████████| 273M/273M [00:01<00:00, 182MB/s]  
Downloading data: 100%|██████████| 474k/474k [00:00<00:00, 1.50MB/s]
Downloading data: 100%|██████████| 509k/509k [00:00<00:00, 1.02MB/s]


Generating train split:   0%|          | 0/4508785 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3003 [00:00<?, ? examples/s]

In [9]:
sampled_dataset = dataset.shuffle(seed=42).select(range(int(0.001 * len(dataset))))
# test_sample_dataset = test_dataset.shuffle(seed = 42).select(range(int(0.01*len(test_dataset))))

val_sample_dataset = val_dataset.shuffle(seed = 42).select(range(int(0.01*len(val_dataset))))

In [10]:
# Function to preprocess the data
def preprocess_function(examples):
    # Extracting German and English texts from the 'translation' dictionary
    inputs = [ex['de'] for ex in examples['translation']]
    targets = [ex['en'] for ex in examples['translation']]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding='max_length')

    # Set up the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=1024, truncation=True, padding='max_length')

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [32]:
# Tokenize the dataset
tokenized_datasets = sampled_dataset.map(preprocess_function, batched=True, remove_columns=dataset.column_names)
# test_tokenized_datasets = test_dataset.map(preprocess_function, batched=True, remove_columns=dataset.column_names)

val_tokenized_datasets = val_sample_dataset.map(preprocess_function, batched=True, remove_columns=dataset.column_names)

In [14]:
#  Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=1,
#     predict_with_generate=True
)

In [15]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    tokenizer=tokenizer,
    eval_dataset=val_tokenized_datasets,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [16]:
import wandb
wandb.init(mode="disabled")



In [17]:
# Train the model
trainer.train()



Epoch,Training Loss,Validation Loss
1,0.1068,0.068782


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


TrainOutput(global_step=1127, training_loss=0.6717010030941087, metrics={'train_runtime': 1040.1878, 'train_samples_per_second': 4.334, 'train_steps_per_second': 1.083, 'total_flos': 2748691953745920.0, 'train_loss': 0.6717010030941087, 'epoch': 1.0})

In [18]:
model.eval()  # Set the model to evaluation mode
quantized_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)

In [20]:
# quantized_model.save_pretrained('./quantized_bart_model')

In [23]:
# test_dataset = dataset.map(preprocess_function, batched=True)

In [30]:
def generate_translation(batch):
    # Assuming batch['translation'] is a list of dictionaries
    german_sentences = [item['de'] for item in batch['translation']]
    english_sentences = [item['en'] for item in batch['translation']]

    # Tokenize the German sentences
    inputs = tokenizer(german_sentences, return_tensors="pt", padding="max_length", truncation=True, max_length=512)
    inputs = {key: val.to(model.device) for key, val in inputs.items()}

    # Generate outputs
    outputs = model.generate(**inputs, max_length=512, num_beams=5)

    # Decode the outputs to human-readable translations
    translations = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return {"pred_translation": translations}

In [34]:
# Apply translation generation function to the test dataset
results =test_dataset.map(generate_translation, batched=True, batch_size=16)

Map:   0%|          | 0/3003 [00:00<?, ? examples/s]

In [35]:
# Extract the translations and references
translations = [result['pred_translation'] for result in results]
references = [[ref['en']] for ref in results['translation']] 

In [36]:
# Compute BLEU score using sacrebleu
bleu = sacrebleu.corpus_bleu(translations, references)
print(f"BLEU Score: {bleu.score}")

BLEU Score: 20.556680845025987
