# Intent Classification (Colab)

This notebook trains on JSON splits stored in Google Drive. The paths cell searches a few candidate folders and uses the first one that contains `train.json`, `val.json`, and `test.json`, starting with your `Colab Notebooks` folder (`/content/drive/MyDrive/Colab Notebooks`). If your layout differs, edit the candidate list in the paths cell, then run the notebook top-to-bottom.

- Required files: `train.json`, `val.json`, `test.json` in the selected folder.

- Outputs: `best_model.pt`, `test_results.json`, `training_curves.png` saved to `SAVE_DIR`.


In [None]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU
import torch

_has_cuda = torch.cuda.is_available()
device = torch.device('cuda') if _has_cuda else torch.device('cpu')
print(f'✓ Using device: {device}')

if not _has_cuda:
    print('⚠️ No GPU detected; training will be slow.')
else:
    # Report the name and total memory (in GB) of GPU index 0.
    _gpu_props = torch.cuda.get_device_properties(0)
    print(f'  GPU: {torch.cuda.get_device_name(0)}')
    print(f'  Memory: {_gpu_props.total_memory / 1e9:.2f} GB')

In [None]:
# Install dependencies (Colab)
# The leading `!` is the IPython shell escape, so this line only runs inside a notebook;
# `-q` asks pip for quiet output.
# NOTE(review): this cell comes after the cell that already imports torch — if pip changes
# the installed torch version a runtime restart is presumably needed; confirm.
!pip install -q torch transformers scikit-learn tqdm matplotlib

In [None]:
# Imports
import json
import os
from pathlib import Path
from types import SimpleNamespace

import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

from transformers import DistilBertModel, DistilBertTokenizer, get_linear_schedule_with_warmup

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print('✓ Imports loaded')

In [None]:
# Mount Google Drive (run once per session)
# The data and output paths used by the following cells all live under /content/drive,
# so this mount has to succeed before they run.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Paths: locate the Drive folder that holds the train/val/test JSONs

# Candidate folders under MyDrive, checked in this order (Colab Notebooks first).
candidates = [
    Path('/content/drive/MyDrive') / sub
    for sub in (
        'Colab Notebooks',
        'Colab Notebook',  # fallback in case of typo
        'data/intent_classification/splits',
        'intent_data/splits',
    )
]


def _missing_split_files(folder):
    """Return the names of the split JSON files that do not exist in `folder`."""
    return [
        name
        for name in ('train.json', 'val.json', 'test.json')
        if not (folder / name).exists()
    ]


# The first candidate that contains all three split files wins.
DATA_DIR = next((folder for folder in candidates if not _missing_split_files(folder)), None)

# Nothing matched: fail with a per-folder list of what is missing.
if DATA_DIR is None:
    report = "\n".join(
        f"- {folder}: missing {_missing_split_files(folder)}" for folder in candidates
    )
    raise FileNotFoundError("Could not find train/val/test.json. Checked:\n" + report)

# Output location; change if you prefer a different one.
SAVE_DIR = Path('/content/drive/MyDrive/intent_experiments/intent_classification_run1')
SAVE_DIR.mkdir(parents=True, exist_ok=True)

print(f"✓ Data dir: {DATA_DIR}")
print(f"✓ Save dir: {SAVE_DIR}")

In [None]:
# Config: hyperparameters and locations shared by the cells below
CONFIG = dict(
    model_name='distilbert-base-uncased',
    num_labels=2,
    max_length=128,
    batch_size=16,
    learning_rate=2e-5,
    epochs=4,
    warmup_steps=0,
    weight_decay=0.01,
    dropout=0.1,
    random_seed=42,
    data_base=str(DATA_DIR),
    save_dir=str(SAVE_DIR),
)

# Class names in index order; a name's position is its integer label id.
LABEL_NAMES = ['in_context', 'out_of_context']
LABEL_MAP = {name: idx for idx, name in enumerate(LABEL_NAMES)}

# Seed the NumPy and PyTorch random number generators from the configured seed.
_seed = CONFIG['random_seed']
torch.manual_seed(_seed)
np.random.seed(_seed)

# Echo the configuration, one "key: value" line per entry.
print('\n'.join(f'{key}: {value}' for key, value in CONFIG.items()))

In [None]:
# Dataset
class IntentDataset(Dataset):
    """Dataset over one JSON split of intent-labelled texts.

    The file is read once in the constructor and its ``'samples'`` list is kept in
    memory; each sample is tokenized when it is indexed.
    """

    def __init__(self, json_path, tokenizer, max_length=128):
        """Load the samples from `json_path`.

        Args:
            json_path: Path of a UTF-8 JSON file whose top-level object has a
                ``'samples'`` entry.
            tokenizer: Callable used to encode each sample's text.
            max_length: Length every encoded text is padded or truncated to.
        """
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label_map = LABEL_MAP
        with open(json_path, 'r', encoding='utf-8') as handle:
            self.samples = json.load(handle)['samples']

    def __len__(self):
        """Return the number of loaded samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Encode sample `idx` and return its tensors.

        Returns:
            A dict with ``'input_ids'`` and ``'attention_mask'`` (each with the
            leading dimension squeezed away) and ``'label'``, the integer id of
            the sample's ``'intent'`` as a tensor.
        """
        record = self.samples[idx]
        tokens = self.tokenizer(
            record['text'],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        # presumably the tokenizer returns (1, max_length) tensors for one text;
        # squeeze(0) drops that leading dimension — confirm against the tokenizer.
        item = {key: tokens[key].squeeze(0) for key in ('input_ids', 'attention_mask')}
        item['label'] = torch.tensor(self.label_map[record['intent']])
        return item


print('✓ Dataset class ready')

In [None]:
# Tokenizer + datasets/dataloaders
tokenizer = DistilBertTokenizer.from_pretrained(CONFIG['model_name'])

# One dataset per split file, all sharing the tokenizer and maximum sequence length.
train_dataset, val_dataset, test_dataset = (
    IntentDataset(DATA_DIR / f'{split}.json', tokenizer, CONFIG['max_length'])
    for split in ('train', 'val', 'test')
)

# Only the training loader shuffles; both evaluation loaders use a fixed batch size of 32.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=CONFIG['batch_size'])
val_loader, test_loader = (
    DataLoader(split_dataset, batch_size=32, shuffle=False)
    for split_dataset in (val_dataset, test_dataset)
)

print(f'Train: {len(train_dataset)} | Val: {len(val_dataset)} | Test: {len(test_dataset)}')

In [None]:
# Model
class DistilBertIntentClassifier(nn.Module):
    """DistilBERT encoder followed by a dropout + linear classification head."""

    def __init__(self, num_labels=2, model_name='distilbert-base-uncased', dropout=0.1):
        """Load the pretrained encoder and build the classification head.

        Args:
            num_labels: Number of output classes (width of the logits).
            model_name: Pretrained DistilBERT checkpoint name passed to
                ``DistilBertModel.from_pretrained``.
            dropout: Dropout probability applied before the linear layer.
        """
        super().__init__()
        self.distilbert = DistilBertModel.from_pretrained(model_name)
        hidden_size = self.distilbert.config.hidden_size
        self.classifier = nn.Sequential(nn.Dropout(dropout), nn.Linear(hidden_size, num_labels))
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Classify a batch of encoded texts.

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Attention mask tensor forwarded to the encoder.
            labels: Optional class-id tensor; when given, the cross-entropy loss
                against the logits is computed.

        Returns:
            A ``SimpleNamespace`` with ``logits`` and ``loss`` (``None`` when no
            labels were passed).
        """
        outputs = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        # The hidden state at sequence position 0 is used as the summary of each text.
        pooled = outputs.last_hidden_state[:, 0, :]
        logits = self.classifier(pooled)
        loss = None if labels is None else self.loss_fn(logits, labels)
        # SimpleNamespace comes from the file-level imports instead of being
        # re-imported on every forward pass.
        return SimpleNamespace(loss=loss, logits=logits)

# Build the classifier from the matching CONFIG entries and move it to the selected device.
model = DistilBertIntentClassifier(
    **{key: CONFIG[key] for key in ('num_labels', 'model_name', 'dropout')}
).to(device)
print('✓ Model ready on', device)

In [None]:
# Optimizer + scheduler
optimizer = AdamW(
    model.parameters(),
    lr=CONFIG['learning_rate'],
    weight_decay=CONFIG['weight_decay'],
)

# One scheduler step is taken per training batch, so the schedule spans every batch of every epoch.
total_steps = CONFIG['epochs'] * len(train_loader)

# A configured warmup of 0 means "use 10% of all steps"; any other value is used as given.
_configured_warmup = CONFIG['warmup_steps']
if _configured_warmup == 0:
    warmup_steps = int(0.1 * total_steps)
else:
    warmup_steps = _configured_warmup

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)
print(f'Steps: {total_steps}, warmup: {warmup_steps}')

In [None]:
# Train/eval helpers
def train_epoch(model, dataloader, optimizer, scheduler, device):
    """Run one optimisation pass over `dataloader`.

    Args:
        model: Callable as ``model(input_ids, attention_mask, labels)`` returning an
            object with ``loss`` and ``logits``.
        dataloader: Yields dict batches with ``'input_ids'``, ``'attention_mask'``
            and ``'label'`` tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        ``(mean_loss, accuracy)`` over all samples seen in this pass.

    Raises:
        ZeroDivisionError: If the dataloader yields no samples.
    """
    model.train()
    total_loss, correct, total = 0.0, 0, 0
    for batch in tqdm(dataloader, desc='Training'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        batch_size = labels.size(0)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, labels)
        loss = outputs.loss
        loss.backward()
        # Clip the global gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        # Assumes outputs.loss is the mean over the batch (the default reduction of
        # nn.CrossEntropyLoss). Weighting by batch size makes the epoch loss a true
        # per-sample mean even when the final batch is smaller than the others.
        total_loss += loss.item() * batch_size
        preds = torch.argmax(outputs.logits, dim=1)
        correct += (preds == labels).sum().item()
        total += batch_size
    return total_loss / total, correct / total

def evaluate(model, dataloader, device):
    """Score `model` on `dataloader` without updating it.

    Args:
        model: Callable as ``model(input_ids, attention_mask, labels)`` returning an
            object with ``loss`` and ``logits``.
        dataloader: Yields dict batches with ``'input_ids'``, ``'attention_mask'``
            and ``'label'`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        ``(mean_loss, accuracy, predictions, labels)`` where the last two are lists
        with one entry per evaluated sample, in dataloader order.

    Raises:
        ZeroDivisionError: If the dataloader yields no samples.
    """
    model.eval()
    total_loss, total = 0.0, 0
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Evaluating'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids, attention_mask, labels)

            # Assumes outputs.loss is the mean over the batch (the default reduction
            # of nn.CrossEntropyLoss). Weighting by batch size keeps a short final
            # batch from being over-represented in the reported loss.
            batch_size = labels.size(0)
            total_loss += outputs.loss.item() * batch_size
            total += batch_size

            preds = torch.argmax(outputs.logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    avg_loss = total_loss / total
    acc = accuracy_score(all_labels, all_preds)
    return avg_loss, acc, all_preds, all_labels


print('✓ Training helpers ready')

In [None]:
# Train loop: fit for CONFIG['epochs'] epochs and checkpoint the best validation accuracy.
# `history` collects one value per epoch for each metric.
history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}
best_val_acc, best_epoch = 0, 0

print(f"Training for {CONFIG['epochs']} epochs...")
for epoch in range(CONFIG['epochs']):
    print(f"\nEpoch {epoch+1}/{CONFIG['epochs']}")

    train_loss, train_acc = train_epoch(model, train_loader, optimizer, scheduler, device)
    val_loss, val_acc, _, _ = evaluate(model, val_loader, device)

    history['train_loss'].append(train_loss)
    history['train_acc'].append(train_acc)
    history['val_loss'].append(val_loss)
    history['val_acc'].append(val_acc)
    print(f"Train loss {train_loss:.4f} acc {train_acc*100:.2f}% | Val loss {val_loss:.4f} acc {val_acc*100:.2f}%")

    # best_epoch == 0 means nothing has been saved yet: always checkpoint the first
    # epoch so best_model.pt exists even if validation accuracy never rises above 0.
    if best_epoch == 0 or val_acc > best_val_acc:
        best_val_acc, best_epoch = val_acc, epoch + 1
        ckpt = {
            'epoch': epoch,  # 0-based loop index (best_epoch is the 1-based number)
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            # Stored as a plain Python float: accuracy_score may return a NumPy scalar
            # depending on the scikit-learn version, and such objects are rejected by
            # torch.load's restricted (weights_only) unpickler.
            'val_acc': float(val_acc),
            'config': CONFIG,
        }
        torch.save(ckpt, SAVE_DIR / 'best_model.pt')
        print(f"✓ Saved new best (val acc {val_acc*100:.2f}%)")

print(f"Best val acc: {best_val_acc*100:.2f}% (epoch {best_epoch})")


In [None]:
# Training curves: loss (left) and accuracy in percent (right), train vs. validation
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

for ax, metric, title, scale in ((ax1, 'loss', 'Loss', 1), (ax2, 'acc', 'Accuracy (%)', 100)):
    for split, fmt, label in (('train', 'b-o', 'Train'), ('val', 'r-o', 'Val')):
        values = [value * scale for value in history[f'{split}_{metric}']]
        # The x-axis follows the number of epochs actually recorded, not CONFIG['epochs'],
        # so an interrupted or re-configured run still plots instead of raising on
        # mismatched x/y lengths.
        ax.plot(range(1, len(values) + 1), values, fmt, label=label)
    ax.set_title(title)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(SAVE_DIR / 'training_curves.png', dpi=150, bbox_inches='tight')
plt.show()
print('✓ Curves saved to', SAVE_DIR / 'training_curves.png')

In [None]:
# Test evaluation: reload the best checkpoint and score it on the test split
checkpoint = torch.load(SAVE_DIR / 'best_model.pt', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])

test_loss, test_acc, test_preds, test_labels = evaluate(model, test_loader, device)
print(f"Test loss: {test_loss:.4f} | Test acc: {test_acc*100:.2f}%")

# Pass the full list of label ids explicitly. Without it, scikit-learn infers the
# classes from the data, so a test run in which only one class shows up makes
# classification_report reject target_names and shrinks the confusion matrix.
label_ids = list(range(len(LABEL_NAMES)))

print("\nClassification report:")
print(classification_report(test_labels, test_preds, labels=label_ids, target_names=LABEL_NAMES, digits=4))

# Rows are true classes and columns are predicted classes, both in LABEL_NAMES order.
cm = confusion_matrix(test_labels, test_preds, labels=label_ids)
print("\nConfusion matrix:")
print(cm)


In [None]:
# Save test results: metrics, per-sample predictions and the run config as JSON
test_results = {
    'test_accuracy': float(test_acc),
    'test_loss': float(test_loss),
    'num_samples': len(test_labels),
    # Cast to plain ints so the values are JSON-serialisable.
    'predictions': [int(p) for p in test_preds],
    'true_labels': [int(l) for l in test_labels],
    # Explicit label ids keep the report aligned with LABEL_NAMES even when a class
    # is absent from both the test labels and the predictions.
    'classification_report': classification_report(
        test_labels,
        test_preds,
        labels=list(range(len(LABEL_NAMES))),
        target_names=LABEL_NAMES,
        output_dict=True,
    ),
    'confusion_matrix': cm.tolist(),
    'config': CONFIG
}
# Explicit encoding so the file does not depend on the platform default.
with open(SAVE_DIR / 'test_results.json', 'w', encoding='utf-8') as f:
    json.dump(test_results, f, indent=2)
print('✓ Saved test_results.json to', SAVE_DIR)

In [None]:
# Sample predictions
def predict_intent(text, model, tokenizer, device):
    """Classify a single text and return ``(label_name, confidence)``.

    The text is encoded with the same padding/truncation settings as the datasets
    (length taken from ``CONFIG['max_length']``), run through `model` in eval mode
    without gradients, and the softmax probability of the top class is reported as
    the confidence.
    """
    model.eval()
    batch = tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=CONFIG['max_length'],
        return_tensors='pt',
    )
    with torch.no_grad():
        logits = model(
            batch['input_ids'].to(device),
            batch['attention_mask'].to(device),
        ).logits
        class_probs = torch.softmax(logits, dim=1)
    # .item() requires a single prediction, i.e. exactly one input text.
    predicted = torch.argmax(class_probs, dim=1).item()
    return LABEL_NAMES[predicted], class_probs[0][predicted].item()

# A few hand-written queries to eyeball the classifier's predictions.
samples = [
    'I want a black jacket',
    'What is the weather today?',
    'Show me red dresses',
    'Tell me a joke',
    'Do you have size medium?',
    'How do I cook pasta?',
]

for utterance in samples:
    label, confidence = predict_intent(utterance, model, tokenizer, device)
    print(f'[{label}] conf={confidence:.3f} | {utterance}')