In [None]:
'''
 Natural Language Inference with Transformer
 This script implements a Natural Language Inference (NLI) model using RoBERTa-large
 with custom data augmentation and training optimizations.
 '''

## Table of Contents
1. Setup and Imports
2. Model Architecture
3. Data Augmentation
4. Training Utilities
5. Training Loop
6. Evaluation
7. Inference

## 1. Setup and Imports

In [3]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import OneCycleLR
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import numpy as np
from tqdm import tqdm
import nltk
import random
from typing import List, Tuple, Dict, Optional
import pandas as pd
from nltk.corpus import wordnet

# Download required NLTK data
nltk.download('wordnet')

torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

# Check for available device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## 2. Model Architecture

In [4]:
class TransformerModel(nn.Module):
    def __init__(self, model_name: str = "roberta-large", num_labels: int = 2):
        super().__init__()
        self.model = AutoModel.from_pretrained(model_name)

        # Freeze all layers except the last 4
        for param in self.model.parameters():
            param.requires_grad = False

        # Unfreeze last 4 layers
        for layer in self.model.encoder.layer[-4:]:
            for param in layer.parameters():
                param.requires_grad = True

        # Enhanced classification head
        self.classifier = nn.Sequential(
            nn.Linear(self.model.config.hidden_size, 512),
            nn.LayerNorm(512),
            nn.GELU(),
            nn.Dropout(0.2),

            nn.Linear(512, 512),
            nn.LayerNorm(512),
            nn.GELU(),
            nn.Dropout(0.2),

            nn.Linear(512, num_labels)
        )

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )

        cls_output = outputs.last_hidden_state[:, 0, :]  # [CLS] token
        logits = self.classifier(cls_output)

        return logits

# Initialize tokenizer
def get_tokenizer(model_name: str = "roberta-large"):
    return AutoTokenizer.from_pretrained(model_name)

## 3. Data Augmentation

In [5]:
class NLIAugmentor:
    _instance = None

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super(NLIAugmentor, cls).__new__(cls)
        return cls._instance

    def __init__(self):
        if not hasattr(self, 'initialized'):
            self.initialized = True
            self.tokenizer = get_tokenizer()

    def get_synonyms(self, word: str) -> List[str]:
        synonyms = []
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                if lemma.name() != word:
                    synonyms.append(lemma.name())
        return list(set(synonyms))

    def replace_with_synonyms(self, text: str, num_replacements: int = 2) -> str:
        words = text.split()
        if len(words) <= num_replacements:
            return text

        # Get indices of words that can be replaced
        replaceable_indices = [i for i, word in enumerate(words)
                             if len(self.get_synonyms(word)) > 0]

        if len(replaceable_indices) < num_replacements:
            return text

        # Randomly select indices to replace
        selected_indices = random.sample(replaceable_indices, num_replacements)

        # Replace selected words with synonyms
        for idx in selected_indices:
            synonyms = self.get_synonyms(words[idx])
            if synonyms:
                words[idx] = random.choice(synonyms)

        return ' '.join(words)

    def augment(self, premise: str, hypothesis: str, label: int) -> List[Tuple[str, str, int]]:
        augmented_pairs = [(premise, hypothesis, label)]  # Original pair

        # Create variations with premise modifications
        aug_premise = self.replace_with_synonyms(premise)
        if aug_premise != premise:
            augmented_pairs.append((aug_premise, hypothesis, label))

        # Create variations with hypothesis modifications
        aug_hypothesis = self.replace_with_synonyms(hypothesis)
        if aug_hypothesis != hypothesis:
            augmented_pairs.append((premise, aug_hypothesis, label))

        return augmented_pairs

## 4. Training Utilities

In [6]:
class NLIDataset(Dataset):
    def __init__(self, premises: List[str], hypotheses: List[str], labels: List[int],
                 tokenizer, max_length: int = 128, augment: bool = False):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.augment = augment
        self.augmentor = NLIAugmentor() if augment else None

        # Store original data
        self.premises = premises
        self.hypotheses = hypotheses
        self.labels = labels

        # Create augmented dataset if needed
        if augment:
            self.augmented_data = self._create_augmented_dataset()
        else:
            self.augmented_data = list(zip(premises, hypotheses, labels))

    def _create_augmented_dataset(self) -> List[Tuple[str, str, int]]:
        augmented_data = []
        for premise, hypothesis, label in zip(self.premises, self.hypotheses, self.labels):
            augmented_pairs = self.augmentor.augment(premise, hypothesis, label)
            augmented_data.extend(augmented_pairs)
        return augmented_data

    def __len__(self):
        return len(self.augmented_data)

    def __getitem__(self, idx):
        premise, hypothesis, label = self.augmented_data[idx]

        # Combine premise and hypothesis with [SEP] token
        text = f"{premise} [SEP] {hypothesis}"

        # Tokenize
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

def train_epoch(model, train_loader, optimizer, scheduler, device, scaler=None):
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc='Training'):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        if scaler is not None:
            with torch.cuda.amp.autocast():
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                loss = F.cross_entropy(outputs, labels)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()

        if scheduler is not None:
            scheduler.step()

        total_loss += loss.item()

    return total_loss / len(train_loader)

def evaluate(model, val_loader, device):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc='Evaluating'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = F.cross_entropy(outputs, labels)

            total_loss += loss.item()

            preds = outputs.argmax(dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted')

    return {
        'loss': total_loss / len(val_loader),
        'accuracy': accuracy,
        'f1': f1
    }

## 5. Training Loop

In [7]:
def train_model(train_data, val_data, model_name="roberta-large",
                batch_size=16, num_epochs=5, learning_rate=2e-5,
                max_length=128, augment=True):

    # Initialize tokenizer and model
    tokenizer = get_tokenizer(model_name)
    model = TransformerModel(model_name=model_name).to(device)

    # Create datasets
    train_dataset = NLIDataset(
        train_data['premise'],
        train_data['hypothesis'],
        train_data['label'],
        tokenizer,
        max_length=max_length,
        augment=augment
    )

    val_dataset = NLIDataset(
        val_data['premise'],
        val_data['hypothesis'],
        val_data['label'],
        tokenizer,
        max_length=max_length,
        augment=False
    )

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # Initialize optimizer and scheduler
    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
    scheduler = OneCycleLR(
        optimizer,
        max_lr=learning_rate,
        epochs=num_epochs,
        steps_per_epoch=len(train_loader)
    )

    # Initialize gradient scaler for mixed precision training (if using CUDA)
    scaler = torch.cuda.amp.GradScaler() if torch.cuda.is_available() else None

    # Training loop
    best_val_loss = float('inf')
    patience = 3
    patience_counter = 0

    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch+1}/{num_epochs}")

        # Train
        train_loss = train_epoch(model, train_loader, optimizer, scheduler, device, scaler)
        print(f"Training Loss: {train_loss:.4f}")

        # Evaluate
        val_metrics = evaluate(model, val_loader, device)
        print(f"Validation Loss: {val_metrics['loss']:.4f}")
        print(f"Validation Accuracy: {val_metrics['accuracy']:.4f}")
        print(f"Validation F1: {val_metrics['f1']:.4f}")

        # Early stopping
        if val_metrics['loss'] < best_val_loss:
            best_val_loss = val_metrics['loss']
            patience_counter = 0
            # Save best model
            torch.save(model.state_dict(), 'best_model.pt')
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered")
                break

    return model, tokenizer

## 6. Evaluation

In [8]:
def evaluate_model(model, test_loader, device):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc='Testing'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = outputs.argmax(dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate metrics
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted')
    conf_matrix = confusion_matrix(all_labels, all_preds)

    print(f"\nTest Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"\nConfusion Matrix:")
    print(conf_matrix)

    return {
        'accuracy': accuracy,
        'f1': f1,
        'confusion_matrix': conf_matrix
    }

## 7. Inference

In [9]:
def load_saved_model(model_path, model_name="roberta-large"):
    """Load a saved model from checkpoint."""
    # Initialize model architecture
    model = TransformerModel(model_name=model_name)

    # Load state dict
    if os.path.exists(model_path):
        state_dict = torch.load(model_path, map_location=device)
        model.load_state_dict(state_dict)
        print(f"Model loaded from {model_path}")
    else:
        print(f"Warning: {model_path} not found. Using untrained model.")

    model.to(device)
    return model

def load_and_predict(model, tokenizer, input_file='data/dev.csv', output_file='predictions.csv', augment=False):
    """Load data from CSV, make predictions, and save results."""
    # Load data
    df = pd.read_csv(input_file)

    dummy_labels = [0] * len(df)

    # Create dataset
    dataset = NLIDataset(
        premises=df['premise'],
        hypotheses=df['hypothesis'],
        labels=dummy_labels,
        tokenizer=tokenizer,
        max_length=128,
        augment=augment
    )

    dataloader = DataLoader(dataset, batch_size=16)

    # Make predictions
    model.eval()
    predictions = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs, dim=1)
            predictions.extend(preds.cpu().numpy())


    # Save predictions
    pd.DataFrame({'prediction': predictions}).to_csv(output_file, index=False)
    print(f"Predictions saved to {output_file}")

    # If labels exist, calculate metrics
    if 'label' in df.columns:
        from sklearn.metrics import accuracy_score, f1_score
        accuracy = accuracy_score(df['label'], predictions)
        f1 = f1_score(df['label'], predictions)
        print(f"\nEvaluation Metrics:")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"F1 Score: {f1:.4f}")

    return predictions

In [11]:

train_data = pd.read_csv("/content/train.csv")
val_data = pd.read_csv("/content/dev.csv")

model, tokenizer = train_model(
    train_data=train_data,
    val_data=val_data,
    model_name="roberta-large",
    batch_size=16,
    num_epochs=5,
    augment=True
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = torch.cuda.amp.GradScaler() if torch.cuda.is_available() else None



Epoch 1/3


  with torch.cuda.amp.autocast():
Training: 100%|██████████| 4506/4506 [10:11<00:00,  7.37it/s]


Training Loss: 0.5130


Evaluating: 100%|██████████| 421/421 [02:23<00:00,  2.94it/s]


Validation Loss: 0.3234
Validation Accuracy: 0.8621
Validation F1: 0.8621

Epoch 2/3


  with torch.cuda.amp.autocast():
Training: 100%|██████████| 4506/4506 [10:14<00:00,  7.33it/s]


Training Loss: 0.3051


Evaluating: 100%|██████████| 421/421 [02:23<00:00,  2.94it/s]


Validation Loss: 0.3588
Validation Accuracy: 0.8768
Validation F1: 0.8765

Epoch 3/3


  with torch.cuda.amp.autocast():
Training: 100%|██████████| 4506/4506 [10:20<00:00,  7.26it/s]


Training Loss: 0.1916


Evaluating: 100%|██████████| 421/421 [02:22<00:00,  2.95it/s]

Validation Loss: 0.4153
Validation Accuracy: 0.8731
Validation F1: 0.8729





In [13]:
# Load saved model.
model_path = "/content/best_model.pt"
model = load_saved_model(model_path=model_path)
tokenizer = get_tokenizer()

predictions = load_and_predict(model, tokenizer, input_file='/content/test.csv', output_file='predictions.csv')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded from /content/best_model.pt
Predictions saved to predictions.csv
