# Natural Language Inference

This Jupyter Notebook fine-tunes `FacebookAI/xlm-roberta-large` on the MNLI and XNLI datasets and evaluates it on the `Contradictory, My Dear Watson` dataset.
It includes data loading, preprocessing, data augmentation, model training, evaluation, and visualization.

### Environment Setup

This notebook was executed on a local Jupyter server running in an LXC container with access to 4 GPUs (A100 MXP 80GB) to train the model.

#### Install dependencies

In [None]:
%pip install -qU pandas numpy seaborn matplotlib scikit-learn datasets nltk nlpaug
%pip install -qU "transformers[torch]"

#### Import libraries

In [None]:
from pathlib import Path
from tqdm.notebook import tqdm

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import torch
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, concatenate_datasets, load_dataset

# Data augmentation
from nlpaug.augmenter.word import RandomWordAug, SynonymAug
from nlpaug.augmenter.char import KeyboardAug

In [None]:
# All data and model paths below are resolved relative to the current working directory.
ROOT_PATH = Path.cwd()
# Use a fixed seed for NumPy's global random generator so results are reproducible
np.random.seed(0)

### Data Loading

#### Challenge dataset

In [None]:
# Load the Challenge training file and discard its "id" column.
df = pd.read_csv(ROOT_PATH / "data/train.csv").drop(columns=["id"])

# Train/validation split: 30% of the rows are held out, with a fixed seed.
challenge_dataset = Dataset.from_pandas(df).train_test_split(test_size=0.3, seed=0)
challenge_train, challenge_val = challenge_dataset['train'], challenge_dataset['test']

# Inspect the dataset with 10 random rows
df.sample(10)

Display label distribution: the dataset is evenly distributed across all labels.

In [None]:
# Bar chart of the number of rows per label value.
df["label"].value_counts().plot.bar(figsize=(10, 6), title="Pair-wise sentences distribution")
plt.show()

However, the dataset predominantly consists of English sentences.

In [None]:
# Pie chart of the share of rows per language, labelled with percentages.
df["language"].value_counts().plot.pie(
    figsize=(10, 10),
    autopct='%1.1f%%',
    title="Languages distribution",
)
plt.show()

#### MNLI dataset

We use the training set together with the `matched` validation set, because the Challenge dataset seems to cover the same genres.

In [None]:
# Load the MNLI training split and the "matched" validation split from GLUE.
mnli_train, mnli_val = (
    load_dataset("nyu-mll/glue", "mnli", split=split_name)
    for split_name in ("train", "validation_matched")
)

# Inspect the first rows of the training split
mnli_train_df = mnli_train.to_pandas()
mnli_train_df.head()

#### XNLI dataset

[XNLI: Evaluating Cross-lingual Sentence Representations](https://aclanthology.org/D18-1269/) (Conneau et al., EMNLP 2018)

In [None]:
def load_xnli_datasets(languages: list, split="train"):
    """Load one XNLI split for several languages as a single shuffled dataset.

    Args:
        languages: XNLI configuration names (language codes) to load.
        split: Name of the split to load for every language.

    Returns:
        The concatenation of all per-language datasets, each extended with a
        `lang_abv` column holding its language code, shuffled with seed 0.
    """
    def _load_language(lang):
        # Tag every row with the language it was loaded for.
        per_language = load_dataset("facebook/xnli", lang, split=split)
        return per_language.add_column("lang_abv", [lang] * len(per_language))

    combined = concatenate_datasets([_load_language(lang) for lang in languages])
    return combined.shuffle(seed=0)

# Language codes of the 15 XNLI configurations to load
languages = ['ar','bg','de','el','en','es','fr','hi','ru','sw','th','tr','ur','vi','zh']
# Each call returns all languages concatenated and shuffled into one dataset.
xnli_train = load_xnli_datasets(languages, split="train")
xnli_val = load_xnli_datasets(languages, split="validation")

Now, we inspect the XNLI dataset.

In [None]:
# xnli_train_df = xnli_train.to_pandas()
# xnli_train_df.head()

### Data Augmentation

Use the `nlpaug` library to augment the data with synonym replacement, typo insertion and word swapping.

In [None]:
def augment_text(text, augmenter):
    """Return an augmented version of `text`, always as a string.

    Augmentation is best-effort: if the augmenter raises, or returns nothing
    usable (`None` or an empty list/tuple), the original text is returned
    unchanged instead.

    Args:
        text: The sentence to augment.
        augmenter: Any object exposing an `augment(text)` method.

    Returns:
        The first augmented variant as a `str`, or `str(text)` as a fallback.
    """
    try:
        result = augmenter.augment(text)
    except Exception:
        # Deliberate best-effort: a failing augmenter must not abort the
        # whole augmentation run, so the sentence is simply left untouched.
        return str(text)

    # Some augmenters return a list of variants; keep the first one.
    if isinstance(result, (list, tuple)):
        return str(result[0]) if result else str(text)
    # Avoid turning a missing result into the literal string "None".
    if result is None:
        return str(text)
    return str(result)

def augment_df(df, augmenters, sample_frac=0.5):
    """Build augmented copies of a random subset of the rows of `df`.

    Args:
        df: DataFrame with at least `premise` and `hypothesis` columns.
        augmenters: Iterable of `(augmenter, weight)` pairs; each augmenter is
            applied to a sampled row with probability `weight`.
        sample_frac: Fraction of the rows of `df` to sample for augmentation.

    Returns:
        A DataFrame with the same columns as `df` containing only the rows
        whose premise or hypothesis was actually changed. It keeps the
        columns of `df` even when no row was augmented.
    """
    sample = df.sample(frac=sample_frac)
    augmented = []

    for _, row in tqdm(sample.iterrows(), total=len(sample)):
        for aug, weight in augmenters:
            if np.random.random() > weight:
                continue  # Skip this augmentation

            premise = augment_text(row['premise'], aug)
            hypothesis = augment_text(row['hypothesis'], aug)
            # Discard no-op augmentations: they would only duplicate the row.
            if premise == row['premise'] and hypothesis == row['hypothesis']:
                continue

            new_row = row.copy()
            new_row['premise'] = premise
            new_row['hypothesis'] = hypothesis
            augmented.append(new_row)

    # Passing `columns` preserves the schema when `augmented` is empty.
    return pd.DataFrame(augmented, columns=df.columns).convert_dtypes()

Define augmentation strategies and their probability to be used for each sentence.

In [None]:
# Each entry is (augmenter, probability of applying it to a sampled row).
augmenters = [
    (SynonymAug(aug_src='wordnet', aug_p=0.1), 0.8),   # Synonym replacement
    (RandomWordAug(action='swap', aug_p=0.1), 0.1),    # Word swapping
    (KeyboardAug(aug_char_p=0.1, aug_word_p=0.1), 0.3) # Typo simulation
]

# Augment the training split only: augmenting the full `df` would put
# variants of validation rows into the training set (data leakage).
df_aug = augment_df(challenge_train.to_pandas(), augmenters)
print(f"Adding {len(df_aug)} new examples")

# Concatenate augmented dataset with original.
# `preserve_index=False` keeps the sampled pandas index from becoming an extra
# `__index_level_0__` column, which `challenge_train` does not have.
challenge_aug = Dataset.from_pandas(df_aug, preserve_index=False)
# Uncomment this line to add augmented data to the training set
# challenge_train = concatenate_datasets([challenge_train, challenge_aug]).shuffle(seed=0)

### MNLI Finetuning

Finetune the model on the MNLI dataset.

**Architecture:**  
- `FacebookAI/xlm-roberta-large` transformer from HuggingFace
- Classification head with dropout (0.3)

Train using the `Trainer` API.

#### Load the pre-trained model

In [None]:
# Define constants
PRETRAINED_MODEL_NAME = "FacebookAI/xlm-roberta-large"
MODEL_DIR = ROOT_PATH / "models"
# Part of the model name after the last '/', used to name the output folders
MODEL_BASENAME = PRETRAINED_MODEL_NAME.rpartition('/')[2]

# Pre-trained model with a 3-class classification head
model = AutoModelForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, num_labels=3)
# Swap the classification head's dropout layer for one with p=0.3
model.classifier.dropout = torch.nn.Dropout(0.3)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # Move model to GPU if available

def compute_metrics(eval_pred):
    """Compute the accuracy of a set of predictions.

    Args:
        eval_pred: Object exposing `predictions` (per-class scores, one row
            per example) and `label_ids` (the reference labels).

    Returns:
        A dict with a single `accuracy` entry.
    """
    # The predicted class is the index of the highest score in each row.
    predicted_labels = np.argmax(eval_pred.predictions, axis=1)
    accuracy = accuracy_score(eval_pred.label_ids, predicted_labels)
    return {"accuracy": accuracy}

#### Load the tokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

def tokenize_function(row):
    """Tokenize premise/hypothesis pairs for `Dataset.map`.

    Args:
        row: A single example or a batch of examples with `premise` and
            `hypothesis` entries.

    Returns:
        The tokenizer output for the sentence pair(s).
    """
    # truncation=True caps pairs at the model's maximum input length; without
    # it an over-long pair makes the forward pass fail.
    # No padding here: the notebook's `Trainer` instances receive the tokenizer
    # as `processing_class`, so per the Transformers docs they pad each batch
    # dynamically. Padding whole `map` batches to their longest example only
    # wastes memory and compute.
    return tokenizer(row['premise'], row['hypothesis'], truncation=True)

#### Set training hyperparameters

In [None]:
# Output folder for the checkpoints and the final MNLI model
mnli_model_path = MODEL_DIR / f"{MODEL_BASENAME}-mnli"

# Tokenize both MNLI splits before building the trainer
mnli_train_tokenized, mnli_val_tokenized = (
    mnli_split.map(tokenize_function, batched=True)
    for mnli_split in (mnli_train, mnli_val)
)

training_args = TrainingArguments(
    output_dir=mnli_model_path,
    # Evaluate and checkpoint once per epoch, then restore the most accurate checkpoint
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    # Optimisation hyperparameters
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    report_to='none',  # Disabling wandb callbacks
)

mnli_trainer = Trainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=mnli_train_tokenized,
    eval_dataset=mnli_val_tokenized,
)

#### Training

In [None]:
# Run the MNLI fine-tuning.
mnli_trainer.train()

# Save the best model and the tokenizer to disk.
# Both go to the same folder so the model can later be reloaded from it.
mnli_trainer.save_model(mnli_model_path)
tokenizer.save_pretrained(mnli_model_path)

### XNLI Finetuning

Finetune the model on the XNLI dataset.

**Architecture:**  
- `xlm-roberta-large-mnli` fine-tuned model on MNLI dataset
- Classification head with dropout (0.3)

Train using `Trainer` with the same hyperparameters and evaluate the model on the Challenge (train) dataset.

#### Load the fine-tuned model

In [None]:
# Reload the MNLI fine-tuned weights from the folder they were saved to.
mnli_model = AutoModelForSequenceClassification.from_pretrained(mnli_model_path, num_labels=3)
# Use the same classification-head dropout (p=0.3) as for the MNLI run
mnli_model.classifier.dropout = torch.nn.Dropout(0.3)

#### Set training hyperparameters

In [None]:
# Output folder for the checkpoints and the final MNLI+XNLI model
xnli_model_path = MODEL_DIR / f"{MODEL_BASENAME}-mnli-xnli"

# Same hyperparameters as the MNLI run; only the output folder differs.
training_args = TrainingArguments(
    output_dir=xnli_model_path,
    eval_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_ratio=0.1,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    report_to='none'  # Disabling wandb callbacks
)

xnli_train_tokenized = xnli_train.map(tokenize_function, batched=True)
# Evaluate on a single split. `challenge_dataset` is a DatasetDict; passing it
# as `eval_dataset` makes the Trainer report one metric per split
# (`eval_train_accuracy`, `eval_test_accuracy`), so the `accuracy` metric
# required by `metric_for_best_model` would not be found.
challenge_tokenized = challenge_val.map(tokenize_function, batched=True)

xnli_trainer = Trainer(
    model=mnli_model,
    args=training_args,
    train_dataset=xnli_train_tokenized,
    eval_dataset=challenge_tokenized,  # Use the Challenge validation split for evaluation
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

#### Training

In [None]:
# Run the XNLI fine-tuning, starting from the MNLI fine-tuned model.
xnli_trainer.train()

# Save the best model and the tokenizer to disk.
# Both go to the same folder so the model can later be reloaded from it.
xnli_trainer.save_model(xnli_model_path)
tokenizer.save_pretrained(xnli_model_path)

#### Upload to HuggingFace Hub

In [None]:
# Reload the final model from disk so that exactly the saved weights are uploaded.
xnli_model = AutoModelForSequenceClassification.from_pretrained(xnli_model_path, num_labels=3)

# Push the model to the hub
xnli_model.push_to_hub(
    f"ajayat/{MODEL_BASENAME}-mnli-xnli",
    private=True,
    # `token=True` uses the locally stored Hugging Face login; it replaces
    # the deprecated `use_auth_token` argument.
    token=True
)

### Evaluation & Visualization

Compute accuracy on the Challenge validation set.

In [None]:
# `challenge_dataset` is an untokenized DatasetDict with no top-level `label`
# column, so predict on the tokenized validation split instead.
challenge_val_tokenized = challenge_val.map(tokenize_function, batched=True)
pred = xnli_trainer.predict(challenge_val_tokenized)
y_true = challenge_val['label']
# Predicted class = index of the highest score for each example
y_pred = np.argmax(pred.predictions, axis=-1)

accuracy = accuracy_score(y_true, y_pred)
print(f"\nAccuracy: {accuracy:.3f}")

Display the confusion matrix using `seaborn`.

In [None]:
# Heatmap of the confusion matrix: rows are true labels, columns are predictions.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    confusion_matrix(y_true, y_pred),
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Entailment', 'Neutral', 'Contradiction'],
    yticklabels=['Entailment', 'Neutral', 'Contradiction'],
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()

In [None]:
# Per-class precision, recall and F1 for the three NLI labels.
report = classification_report(
    y_true,
    y_pred,
    target_names=['Entailment', 'Neutral', 'Contradiction'],
)
print("\nClassification Report:", report, sep="\n")