HW-3 NLP: Implementing a Transformer

Data Preparation

In [17]:
import pandas as pd

# Read the sentence-pair CSV into a DataFrame
dataset_path = '/content/eng-french.csv'
df = pd.read_csv(filepath_or_buffer=dataset_path)

# One row per sentence pair, so the row count is the number of pairs
num_pairs = df.shape[0]
print(f"Number of sentence pairs in the dataset: {num_pairs}")


Number of sentence pairs in the dataset: 175621


In [None]:
!pip install transformers nltk




In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import sentence_bleu


Dataset Class: TranslationDataset

In [None]:
# Custom Dataset class to work with PyTorch's DataLoader
class TranslationDataset(Dataset):
    """Pairs tokenized source sentences with tokenized target sentences.

    Each item is a dict with ``input_ids`` and ``attention_mask`` taken from
    ``inputs`` and ``labels`` taken from ``targets['input_ids']``.

    Args:
        inputs: Mapping with ``'input_ids'`` and ``'attention_mask'`` entries,
            each indexable by example position.
        targets: Mapping with an ``'input_ids'`` entry indexable by example
            position. An ``'attention_mask'`` entry (torch tensor) is also
            required when ``label_ignore_index`` is set.
        label_ignore_index: When not ``None``, label positions whose target
            attention mask is 0 are replaced by this value (for example
            ``-100``) in the returned ``labels``. The stored targets are not
            modified. The default ``None`` returns the raw token ids.

    Raises:
        ValueError: If ``inputs`` and ``targets`` hold a different number of
            examples.
    """

    def __init__(self, inputs, targets, label_ignore_index=None):
        n_inputs = len(inputs['input_ids'])
        n_targets = len(targets['input_ids'])
        if n_inputs != n_targets:
            # A mismatch would silently pair sources with the wrong targets
            # (or fail late with an IndexError), so reject it up front.
            raise ValueError(
                f"inputs and targets must have the same number of examples, "
                f"got {n_inputs} and {n_targets}"
            )
        self.inputs = inputs
        self.targets = targets
        self.label_ignore_index = label_ignore_index

    def __len__(self):
        """Return the number of examples."""
        return len(self.inputs['input_ids'])

    def __getitem__(self, idx):
        """Return the model inputs and labels for example ``idx``."""
        labels = self.targets['input_ids'][idx]
        if self.label_ignore_index is not None:
            is_padding = self.targets['attention_mask'][idx] == 0
            # masked_fill returns a new tensor, leaving self.targets untouched.
            labels = labels.masked_fill(is_padding, self.label_ignore_index)
        return {
            "input_ids": self.inputs['input_ids'][idx],
            "attention_mask": self.inputs['attention_mask'][idx],
            "labels": labels,
        }


Tokenize the Data

In [None]:
# Load the tokenizer published with the 'facebook/bart-large' checkpoint
tokenizer = BartTokenizer.from_pretrained(
    pretrained_model_name_or_path='facebook/bart-large'
)

# Function to tokenize data with a reduced max_length
def tokenize_data(df, tokenizer, max_length=40,
                  source_col='english', target_col='french'):
    """Tokenize the source and target text columns of ``df``.

    Both columns are passed to ``tokenizer`` with padding and truncation
    enabled, the given ``max_length``, and ``return_tensors="pt"``.

    Args:
        df: DataFrame holding one sentence pair per row.
        tokenizer: Callable tokenizer accepting a list of strings plus the
            ``padding``, ``truncation``, ``max_length`` and ``return_tensors``
            keyword arguments.
        max_length: Maximum sequence length forwarded to the tokenizer.
        source_col: Name of the column with the source sentences.
        target_col: Name of the column with the target sentences.

    Returns:
        A ``(source_encodings, target_encodings)`` tuple with whatever the
        tokenizer returned for each column.

    Raises:
        KeyError: If ``source_col`` or ``target_col`` is not a column of ``df``.
    """
    missing = [col for col in (source_col, target_col) if col not in df.columns]
    if missing:
        raise KeyError(f"columns not found in DataFrame: {missing}")

    def _encode(column):
        # Single place for the tokenizer settings so that both sides of the
        # pair are always encoded identically.
        return tokenizer(
            df[column].tolist(),
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )

    return _encode(source_col), _encode(target_col)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]



Load the Dataset and Split It into Train, Validation, and Test Sets

In [None]:
# Load dataset and prepare data
dataset_path = '/content/eng-french.csv'  # Adjust the path to your dataset
df = pd.read_csv(dataset_path)

# Limit to 70,000 samples randomly
df = df.sample(n=70000, random_state=42)
df.columns = ['english', 'french']

#converting to lower for consistency
df['english'] = df['english'].str.lower()
df['french'] = df['french'].str.lower()

# Split the dataset into training (80%), validation (10%), and test (10%)
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)

# Further split the temp_df into validation (50%) and test (50%)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Tokenize the training, validation, and test data
train_inputs, train_targets = tokenize_data(train_df, tokenizer)
val_inputs, val_targets = tokenize_data(val_df, tokenizer)
test_inputs, test_targets = tokenize_data(test_df, tokenizer)

# Exclude padding from the loss: the targets are padded to a common length, and
# without masking every pad position would count as a token the model must
# predict. Label positions set to -100 are ignored by the model's loss.
# Only the splits consumed by the Trainer are masked; the test targets keep
# their raw token ids so they can still be decoded back to text.
LABEL_IGNORE_INDEX = -100
train_targets['input_ids'] = train_targets['input_ids'].masked_fill(
    train_targets['attention_mask'] == 0, LABEL_IGNORE_INDEX
)
val_targets['input_ids'] = val_targets['input_ids'].masked_fill(
    val_targets['attention_mask'] == 0, LABEL_IGNORE_INDEX
)

# Create Dataset objects for training, validation, and test
train_dataset = TranslationDataset(train_inputs, train_targets)
val_dataset = TranslationDataset(val_inputs, val_targets)
test_dataset = TranslationDataset(test_inputs, test_targets)


Model and Tokenizer Initialization

In [None]:
# Load BART tokenizer and model
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')

# Pretrained BART encoder-decoder used for conditional generation
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')

# Special-token ids and default generation settings written onto the model config.
# NOTE(review): decoder_start_token_id is set from cls_token_id here; presumably
# the pretrained checkpoint ships its own value - confirm the override is intended.
_config_overrides = {
    "decoder_start_token_id": tokenizer.cls_token_id,
    "eos_token_id": tokenizer.sep_token_id,
    "pad_token_id": tokenizer.pad_token_id,
    "max_length": 40,  # Reduced max_length for efficiency
    "no_repeat_ngram_size": 2,
    "num_beams": 4,
    "early_stopping": True,
}
for _option, _value in _config_overrides.items():
    setattr(model.config, _option, _value)

# Prefer the GPU when one is visible, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)




pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

Defining the training arguments

In [None]:
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',            # Directory where checkpoints are saved
    evaluation_strategy="epoch",       # Evaluate after each epoch
    save_strategy="epoch",             # Save checkpoints at the end of each epoch
    save_steps=500,                    # NOTE(review): presumably only consulted when save_strategy="steps" - confirm
    per_device_train_batch_size=2,     # Batch size
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,     # Simulate larger batch size
    learning_rate=5e-5,
    num_train_epochs=5,                # Total number of epochs
    logging_dir='./logs',              # Directory for logs
    # Mixed precision needs a CUDA device; the notebook otherwise falls back to
    # the CPU, so only request fp16 when a GPU is actually available.
    fp16=torch.cuda.is_available(),
    save_total_limit=2,                # Limit the number of saved checkpoints to save disk space
    load_best_model_at_end=True,       # Load best model based on evaluation
    metric_for_best_model="eval_loss", # Use validation loss to determine the best model
)




Initialize Trainer and Start Training

In [None]:
# Collect everything the Trainer needs: model, hyperparameters, data splits, tokenizer
_trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)
trainer = Trainer(**_trainer_kwargs)

# Start a new training run; no checkpoint is passed, so nothing is resumed
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

BLEU Score Calculation Using NLTK

In [None]:
import torch
from torch.utils.data import DataLoader
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
import pandas as pd

# Relies on `tokenize_data`, `TranslationDataset`, `val_df` and `test_df` from earlier cells.
# Draw a fixed, seeded subset of 500 rows from each evaluation split.
val_df_sampled, test_df_sampled = (
    split_df.sample(n=500, random_state=42) for split_df in (val_df, test_df)
)

# Encode the two subsets
val_inputs, val_targets = tokenize_data(val_df_sampled, tokenizer)
test_inputs, test_targets = tokenize_data(test_df_sampled, tokenizer)

# Wrap the encodings as datasets
sampled_val_dataset = TranslationDataset(val_inputs, val_targets)
sampled_test_dataset = TranslationDataset(test_inputs, test_targets)

# Batch both subsets with the same loader settings (adjust batch size as needed)
sampled_val_loader, sampled_test_loader = (
    DataLoader(subset, batch_size=4)
    for subset in (sampled_val_dataset, sampled_test_dataset)
)

# Function to generate predictions
def generate_predictions(model, data_loader, tokenizer, device,
                         max_length=30, num_beams=4, early_stopping=True):
    """Generate translations for every batch and collect the decoded references.

    Args:
        model: Model exposing ``eval()`` and ``generate(...)``.
        data_loader: Iterable of batches; each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"labels"`` tensors.
        tokenizer: Tokenizer exposing ``batch_decode``; its ``pad_token_id``
            (if any) is used to restore ignored label positions.
        device: Device the input tensors are moved to before generation.
        max_length: Maximum length passed to ``model.generate``.
        num_beams: Beam width passed to ``model.generate``.
        early_stopping: ``early_stopping`` flag passed to ``model.generate``.

    Returns:
        ``(predictions, references)`` where ``predictions`` is a list of
        decoded strings and ``references`` is a parallel list of
        single-element lists, each holding the decoded label string.
    """
    model.eval()
    predictions = []
    references = []
    pad_token_id = getattr(tokenizer, "pad_token_id", None)

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            # Generate predictions with the configured decoding settings
            # (beam search with the defaults, since num_beams=4)
            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=max_length,
                num_beams=num_beams,
                early_stopping=early_stopping,
            )

            # Labels may carry -100 at positions masked out of the loss; that is
            # not a real token id, so swap it for the pad id before decoding.
            label_ids = batch["labels"]
            if pad_token_id is not None:
                label_ids = label_ids.masked_fill(label_ids == -100, pad_token_id)

            # Decode predictions and labels
            preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
            labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

            predictions.extend(preds)
            references.extend([ref] for ref in labels)  # one reference list per prediction

    return predictions, references

# Function to calculate BLEU score
def calculate_bleu(predictions, references):
    """Return the mean smoothed sentence-level BLEU over all predictions.

    Args:
        predictions: List of predicted sentences (strings).
        references: Parallel list; entry ``i`` is the list of references for
            ``predictions[i]``. Each reference may be a string (split on
            whitespace here) or an already tokenized sequence.

    Returns:
        The average of the per-sentence BLEU scores, or ``0.0`` when there are
        no predictions.

    Raises:
        ValueError: If ``predictions`` and ``references`` differ in length.
    """
    if len(predictions) != len(references):
        raise ValueError(
            f"predictions and references must have the same length, "
            f"got {len(predictions)} and {len(references)}"
        )
    if not predictions:
        return 0.0

    preds_tokenized = [p.split() for p in predictions]
    # sentence_bleu expects every reference as a token sequence. A raw string
    # would be iterated character by character and compared against the
    # word-level hypothesis, so string references are split into words first.
    refs_tokenized = [
        [ref.split() if isinstance(ref, str) else list(ref) for ref in refs]
        for refs in references
    ]

    smoothie = SmoothingFunction().method1  # Apply smoothing
    bleu_scores = [
        sentence_bleu(refs, hyp, smoothing_function=smoothie)
        for refs, hyp in zip(refs_tokenized, preds_tokenized)
    ]

    return sum(bleu_scores) / len(bleu_scores)


# Validation subset: translate, then score against the references
val_predictions, val_references = generate_predictions(
    model=model,
    data_loader=sampled_val_loader,
    tokenizer=tokenizer,
    device=device,
)
val_bleu_score = calculate_bleu(predictions=val_predictions, references=val_references)
print(f"Validation BLEU Score (Sampled 500): {val_bleu_score}")

# Test subset: same procedure
test_predictions, test_references = generate_predictions(
    model=model,
    data_loader=sampled_test_loader,
    tokenizer=tokenizer,
    device=device,
)
test_bleu_score = calculate_bleu(predictions=test_predictions, references=test_references)
print(f"Test BLEU Score (Sampled 500): {test_bleu_score}")


In [None]:
# List the contents of ./results, the output_dir given to TrainingArguments
# (where the Trainer saves its checkpoints).
!ls ./results
