# Natural Language Inference

This Jupyter Notebook fine-tunes `FacebookAI/xlm-roberta-large` on the `Contradictory, My Dear Watson` dataset.
It includes data loading, preprocessing, data augmentation, model training, evaluation, and visualization.

### Environment setup

In [None]:
# Install dependencies
%pip install -qU pandas numpy "torch<2.6" seaborn matplotlib scikit-learn transformers[torch] datasets nltk nlpaug
# %pip install -qU protobuf tiktoken sentencepiece # depends on models used

#### Import libraries

In [None]:
from pathlib import Path
from tqdm.notebook import tqdm

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
import torch
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, concatenate_datasets

# Data augmentation
from nlpaug.augmenter.word import WordAugmenter, RandomWordAug, SynonymAug
from nlpaug.augmenter.char import KeyboardAug

In [None]:
import random

# All relative paths in the notebook are resolved against the working directory
ROOT_PATH = Path.cwd()

# Use fixed seed for results reproducibility.
# NumPy alone is not enough: every RNG the notebook can touch is seeded.
SEED = 0
random.seed(SEED)        # Python's RNG (presumably used by third-party augmenters - TODO confirm)
np.random.seed(SEED)     # NumPy's global RNG (pandas sampling, augmentation coin flips)
torch.manual_seed(SEED)  # PyTorch's global RNG (randomly initialised weights)

A small trick to silence the output that `nltk` prints while downloading resources:

In [None]:
def nltk_silent_download(*args, func=nltk.download, **kwargs):
    """Call ``nltk.download`` with ``quiet=True`` forced on.

    Args:
        *args: Positional arguments forwarded unchanged to ``func``.
        func: The download callable to delegate to. It is bound at definition
            time to whatever ``nltk.download`` was when this cell ran.
        **kwargs: Keyword arguments forwarded to ``func``; any ``quiet`` value
            supplied by the caller is overridden.

    Returns:
        Whatever ``func`` returns.
    """
    # Overwrite rather than pass `quiet=True` next to **kwargs: a caller that
    # already supplies `quiet` (including this wrapper wrapping itself after the
    # cell is re-run) would otherwise raise "multiple values for keyword 'quiet'".
    kwargs["quiet"] = True
    return func(*args, **kwargs)

# Monkey-patch so every later call to nltk.download goes through the wrapper
nltk.download = nltk_silent_download

### Data loading

In [None]:
# Read the training CSV from the data/ folder under the working directory
df = pd.read_csv(ROOT_PATH / "data/train.csv")
# Preview 10 random rows (last expression of the cell, so the notebook renders it)
df.sample(10)

Display label distribution: the dataset is evenly distributed across all labels.

In [None]:
# Translate the numeric class ids into readable names before counting them
label_names = {0: "Entailment", 1: "Neutral", 2: "Contradiction"}
named_labels = df.label.replace(label_names)

plt.figure(figsize=(10, 6))
sns.countplot(x=named_labels)
plt.title("Pair-wise sentences distribution")
plt.ylabel("Count")
plt.xlabel(None)  # The tick labels are self-explanatory, so drop the axis label
plt.show()

However, the dataset predominantly consists of English sentences.

In [None]:
# Distinct languages (sorted by np.unique) and the number of rows for each
labels, frequencies = np.unique(df.language.values, return_counts=True)

# One pie slice per language, annotated with its share of the dataset
fig, ax = plt.subplots(figsize=(10, 10))
ax.pie(frequencies, labels=labels, autopct='%1.1f%%')
plt.show()

### Data Augmentation

Use the `nlpaug` library to augment the data with synonym replacement, typo insertion, and word swapping.

In [None]:
def augment_text(text, augmenter):
    """Return an augmented version of ``text``, always as a string.

    Augmentation is best-effort: when the augmenter raises or produces
    nothing usable, the original text is returned (as a string), so callers
    can detect the no-op by comparing the result with their input.

    Args:
        text: The sentence to augment.
        augmenter: Any object exposing ``augment(text)`` that returns either a
            single value or a list/tuple of candidates.

    Returns:
        str: The first augmented candidate, or ``str(text)`` when augmentation
        failed or yielded no candidate.
    """
    import logging

    try:
        result = augmenter.augment(text)
    except Exception:
        # Deliberate best-effort: one failing sentence must not abort a whole
        # augmentation run. Keep a trace at debug level instead of hiding it.
        logging.getLogger(__name__).debug(
            "Augmentation failed for %r", text, exc_info=True
        )
        return str(text)

    # Handle list outputs from some augmenters: keep the first candidate
    if isinstance(result, (list, tuple)):
        result = result[0] if result else None

    # No candidate at all -> fall back to the input rather than returning "None"
    if result is None:
        return str(text)
    return str(result)  # Ensure string return

def augment_df(df, augmenters, sample_frac=0.5):
    """Build a DataFrame of augmented copies of randomly sampled rows.

    A fraction of ``df`` is sampled; for every sampled row each augmenter is
    applied with its own probability to both the 'premise' and 'hypothesis'
    columns. A copy of the row is kept only if at least one of the two
    sentences actually changed.

    Args:
        df: Source DataFrame; must contain 'premise' and 'hypothesis' columns.
        augmenters: Iterable of ``(augmenter, weight)`` pairs, where ``weight``
            is the probability in [0, 1] of applying that augmenter to a row.
        sample_frac: Fraction of rows to draw, passed to ``DataFrame.sample``.

    Returns:
        pandas.DataFrame: The augmented rows with the same columns as ``df``,
        a fresh 0..n-1 index and dtypes inferred by ``convert_dtypes``. It has
        zero rows (but still all columns) when nothing was augmented.
    """
    sample = df.sample(frac=sample_frac)
    augmented = []

    for _, row in tqdm(sample.iterrows(), total=len(sample)):
        for aug, weight in augmenters:
            if np.random.random() > weight:
                continue  # Skip this augmentation

            premise = augment_text(row['premise'], aug)
            hypothesis = augment_text(row['hypothesis'], aug)
            # Discard no-op augmentations: they would only duplicate the source row
            if premise == row['premise'] and hypothesis == row['hypothesis']:
                continue

            new_row = row.copy()
            new_row['premise'] = premise
            new_row['hypothesis'] = hypothesis
            augmented.append(new_row)

    # `columns=` keeps the schema even when `augmented` is empty, so downstream
    # code can still access 'premise'/'hypothesis'.
    result = pd.DataFrame(augmented, columns=df.columns)
    # Several augmenters can hit the same source row, which would leave duplicate
    # index labels inherited from `df`; use a clean positional index instead.
    # NOTE(review): a non-default index is presumably exported as an extra column
    # by Dataset.from_pandas - verify.
    result = result.reset_index(drop=True)
    return result.convert_dtypes()

Define the augmentation strategies, each paired with the probability that it is applied to a sampled sentence pair.

In [None]:
# Build the individual augmenters first, then pair each with its probability
synonym_aug = SynonymAug(aug_src='wordnet', aug_p=0.1)   # Synonym replacement
swap_aug = RandomWordAug(action='swap', aug_p=0.1)       # Word swapping
typo_aug = KeyboardAug(aug_char_p=0.1, aug_word_p=0.1)   # Typo simulation

# (augmenter, probability of applying it to a sampled row)
augmenters = [
    (synonym_aug, 0.8),
    (swap_aug, 0.1),
    (typo_aug, 0.3),
]

# Generate the extra training rows and report how many were produced
df_aug = augment_df(df, augmenters)
print(f"Adding {len(df_aug)} new examples")

### Model configuration

**Architecture:**  
- `FacebookAI/xlm-roberta-large` transformer from HuggingFace
- Classification head with dropout (0.3)

**Training Parameters:**
- Batch size: 8 (train), 8 (eval)
- Learning rate: 2e-5
- Epochs: 3
- Weight decay: 0.01
- Warmup ratio: 0.1

Train using the `Trainer` API.

#### Define the model

In [None]:
import warnings

MODEL_NAME = "FacebookAI/xlm-roberta-large"

# Load the pretrained checkpoint with a 3-label sequence-classification head
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)

# Raise the dropout of the classification head to 0.3.
# Assigning the attribute unconditionally would silently register an unused
# module on heads that have no `dropout` layer, so check before replacing it.
classifier_head = getattr(model, "classifier", None)
if hasattr(classifier_head, "dropout"):
    classifier_head.dropout = torch.nn.Dropout(0.3)
else:
    warnings.warn(
        f"{MODEL_NAME}: classification head has no 'dropout' layer; "
        "dropout was left unchanged."
    )

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # Move model to GPU if available

#### Tokenization

In [None]:
# Tokenizer matching the checkpoint loaded for the model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(row):
    """Tokenize premise/hypothesis pairs as two-segment inputs.

    Args:
        row: Mapping with 'premise' and 'hypothesis' entries. It is used with
            ``Dataset.map(..., batched=True)``, so each entry is presumably a
            list of strings.

    Returns:
        The tokenizer output, truncated to at most 256 tokens and unpadded.
    """
    # No padding here: the Trainer receives the tokenizer as `processing_class`
    # and pads each batch to its own longest sequence, which avoids computing
    # over 256 positions for every example regardless of its real length.
    return tokenizer(
        row['premise'],
        row['hypothesis'],
        truncation=True,
        max_length=256,
    )

# Convert both DataFrames to `datasets.Dataset` objects and tokenize them in batches.
# The original data and the augmented data stay separate so that only the former is split.
dataset = Dataset.from_pandas(df).map(tokenize_function, batched=True)
dataset_aug = Dataset.from_pandas(df_aug).map(tokenize_function, batched=True)

#### Train-test split:
- **Test set** is sampled from original dataset
- **Train set** is a concatenation of the sampled train part with augmented data

In [None]:
# Train-test split (seeded, so the held-out 30% is identical on every run)
split_set = dataset.train_test_split(test_size=0.3, seed=0)
# Concatenate the augmented set and shuffle the whole set.
# The shuffle is seeded like the split, so the resulting order does not depend
# on how much global RNG state earlier cells happened to consume.
train_set = concatenate_datasets([split_set['train'], dataset_aug]).shuffle(seed=0)
test_set = split_set['test']
# NOTE(review): dataset_aug is built from the full DataFrame before this split,
# so it can contain augmented variants of rows that end up in test_set.
# Consider augmenting only the train split to avoid leaking test examples.

#### Training hyperparameters

In [None]:
# Checkpoints and the final model go to models/<part of MODEL_NAME after the last '/'>
model_path = ROOT_PATH / "models" / MODEL_NAME.split('/')[-1]

training_args = TrainingArguments(
    output_dir=model_path,
    report_to='none',  # Disabling wandb callbacks
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    # Batch sizes
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint
    # with the best accuracy when training finishes
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
)

def compute_metrics(pred):
    """Compute the accuracy reported by the Trainer during evaluation.

    Args:
        pred: Object exposing ``predictions`` (per-class scores, reduced with
            an arg-max over axis 1) and ``label_ids`` (the reference labels).

    Returns:
        dict: A single entry, ``{'accuracy': <score>}``.
    """
    predicted_classes = np.argmax(pred.predictions, axis=1)
    accuracy = accuracy_score(pred.label_ids, predicted_classes)
    return {'accuracy': accuracy}

# Wire the model, the hyperparameters, the data and the metric together
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,  # Train split plus augmented examples
    # NOTE(review): this split drives best-checkpoint selection and is also the one
    # used for the final report, so the reported accuracy may be optimistic.
    eval_dataset=test_set,
    processing_class=tokenizer,
    compute_metrics=compute_metrics  # Supplies the 'accuracy' used by metric_for_best_model
)

### Training

In [None]:
# Run fine-tuning; evaluation and checkpointing happen once per epoch,
# as configured in training_args
trainer.train()

Save the best model and the tokenizer to disk.

In [None]:
# Save the best model (training_args sets load_best_model_at_end=True,
# so the trainer holds the best checkpoint at this point)
trainer.save_model(model_path)
# Write the tokenizer files into the same directory as the model
tokenizer.save_pretrained(model_path)
print(f"Model saved to {model_path}")

### Evaluation & Visualization

Compute accuracy on the test set.

In [None]:
# Run inference on the held-out split
pred = trainer.predict(test_set)
# Take the reference labels from the prediction output, as compute_metrics does:
# they come from the same pass as the predictions and are a NumPy array,
# independent of the container type returned by indexing the dataset column.
y_true = pred.label_ids
# Predicted class = index of the highest score for each example
y_pred = np.argmax(pred.predictions, axis=-1)
accuracy = accuracy_score(y_true, y_pred)
print(f"\nAccuracy: {accuracy:.3f}")

Display the confusion matrix using `seaborn`.

In [None]:
# Class names in label-id order (0, 1, 2), shared by both axes
class_names = ['Entailment', 'Neutral', 'Contradiction']
# Rows are the true classes, columns the predicted ones
cm = confusion_matrix(y_true, y_pred)

plt.figure(figsize=(8,6))
sns.heatmap(
    cm,
    annot=True,   # Print the count inside each cell
    fmt='d',      # Counts are integers
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Per-class precision, recall and F1; target_names are listed in label-id order (0, 1, 2)
print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=['Entailment', 'Neutral', 'Contradiction']))