# Contradictory, My Dear Watson

This project implements a Natural Language Inference (NLI) classifier that uses XLM-RoBERTa to determine whether a pair of sentences (a premise and a hypothesis) is related by entailment, neutrality, or contradiction. The model is fine-tuned on the "Contradictory, My Dear Watson" dataset and deployed with a user-friendly Gradio interface.

Dataset: https://www.kaggle.com/competitions/contradictory-my-dear-watson/data

Hugging Face: https://huggingface.co/spaces/alperugurcan/NLI-Classifier

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import pandas as pd
import os
from datetime import datetime

# Tag this run with a wall-clock timestamp so every run gets its own output directory
timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"
model_save_path = '/kaggle/working/nli_model_' + timestamp
os.makedirs(model_save_path, exist_ok=True)

# Read the competition CSVs into splits keyed 'train' and 'test'
data = load_dataset(
    'csv',
    data_files=dict(
        train='/kaggle/input/contradictory-my-dear-watson/train.csv',
        test='/kaggle/input/contradictory-my-dear-watson/test.csv',
    ),
)
# Hold out 20% of the labelled rows; the held-out part is read back via train_val['test']
train_val = data['train'].train_test_split(test_size=0.2)

# One pretrained checkpoint name feeds both the tokenizer and the 3-label classifier
model_name = 'xlm-roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Preprocessing with labels
def tokenize(examples, is_test=False):
    """Tokenize premise/hypothesis pairs with the module-level ``tokenizer``.

    Args:
        examples: Mapping that provides 'premise' and 'hypothesis' entries,
            plus a 'label' entry when ``is_test`` is false.
        is_test: When true, no 'labels' entry is added to the result.

    Returns:
        The tokenizer output (called with truncation=True, max_length=64,
        padding='max_length'), with ``examples['label']`` stored under
        'labels' unless ``is_test`` is true.
    """
    encoded = tokenizer(
        examples['premise'],
        examples['hypothesis'],
        max_length=64,
        padding='max_length',
        truncation=True,
    )
    if is_test:
        return encoded
    # Labelled data: expose the targets under the 'labels' key
    encoded['labels'] = examples['label']
    return encoded

# Tokenize every split in batches and drop the raw columns listed below.
raw_columns = ['premise', 'hypothesis', 'id', 'lang_abv', 'language']

train_dataset = train_val['train'].map(
    tokenize,
    fn_kwargs={'is_test': False},
    batched=True,
    remove_columns=raw_columns,
)
val_dataset = train_val['test'].map(
    tokenize,
    fn_kwargs={'is_test': False},
    batched=True,
    remove_columns=raw_columns,
)
# The test split is tokenized with is_test=True, so no 'labels' entry is added
test_dataset = data['test'].map(
    tokenize,
    fn_kwargs={'is_test': True},
    batched=True,
    remove_columns=raw_columns,
)

# Training configuration
# NOTE: `evaluation_strategy` is deliberately not passed. It is a deprecated
# keyword that newer transformers releases no longer accept (renamed to
# `eval_strategy`), and its default is already 'no', so omitting it keeps
# evaluation disabled on both old and new versions.
trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir=model_save_path,   # Checkpoints go into the per-run directory
        per_device_train_batch_size=32,
        num_train_epochs=2,
        learning_rate=2e-5,
        fp16=True,
        report_to=[],
        save_strategy='epoch',     # Save at each epoch
        save_total_limit=1,        # Keep only the last model
    ),
    train_dataset=train_dataset,
)

# Fine-tune the model (prediction happens further below)
trainer.train()

# Save the final model and tokenizer
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

# Record the run settings next to the model.
# This file must NOT be called config.json: save_model() above has already
# written the model's own config.json into this directory, and overwriting it
# with this small dict would leave the saved model unloadable with
# from_pretrained().
import json
config_dict = {
    'model_name': model_name,
    'max_length': 64,
    'num_labels': 3,
    'training_timestamp': timestamp
}
with open(f'{model_save_path}/training_config.json', 'w') as f:
    json.dump(config_dict, f)

# Run inference on the tokenized test split
preds = trainer.predict(test_dataset)

# Write one row per test id, taking the argmax over axis 1 of the predictions
predicted_labels = np.argmax(preds.predictions, axis=1)
submission = pd.DataFrame({'id': data['test']['id'], 'prediction': predicted_labels})
submission.to_csv(f'{model_save_path}/submission.csv', index=False)

print(f"Model saved to: {model_save_path}")

# Optional: dump the trainer's log history and best-checkpoint value as JSON
if hasattr(trainer, 'state'):
    run_state = trainer.state
    stats_path = f'{model_save_path}/training_stats.json'
    training_stats = dict(
        train_loss=run_state.log_history,
        best_model_checkpoint=run_state.best_model_checkpoint,
    )
    with open(stats_path, 'w') as stats_file:
        json.dump(training_stats, stats_file)