In [1]:
import sys
sys.path.append('..')
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments, IntervalStrategy
import torch
from torch.utils.data import Dataset
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

  from .autonotebook import tqdm as notebook_tqdm


# 1. Load processed data

In [2]:
print("Loading processed data...")
train_df = pd.read_csv('../data/processed/intent_train.csv')
val_df = pd.read_csv('../data/processed/intent_val.csv')
test_df = pd.read_csv('../data/processed/intent_test.csv')

Loading processed data...


# 2. Create PyTorch Dataset

In [3]:
class IntentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length)
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# 3. Prepare data

In [4]:
print("\nPreparing data for model...")
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Create label encoder
unique_labels = sorted(train_df['intent'].unique())
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}

# Convert labels to IDs
train_labels = [label2id[label] for label in train_df['intent']]
val_labels = [label2id[label] for label in val_df['intent']]
test_labels = [label2id[label] for label in test_df['intent']]

# Create datasets
train_dataset = IntentDataset(train_df['text'].tolist(), train_labels, tokenizer)
val_dataset = IntentDataset(val_df['text'].tolist(), val_labels, tokenizer)
test_dataset = IntentDataset(test_df['text'].tolist(), test_labels, tokenizer)


Preparing data for model...


# 4. Initialize model

In [5]:
print("\nInitializing model...")
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id
)


Initializing model...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 5. Training arguments

In [6]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="steps",  # Correct argument name for v4.52.4
    save_strategy="steps",
    eval_steps=100,
    save_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    logging_first_step=True,
    disable_tqdm=False
)


# 6. Initialize trainer

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Print training configuration
print("\nTraining configuration:")
print(f"Training examples: {len(train_dataset)}")
print(f"Validation examples: {len(val_dataset)}")
print(f"Test examples: {len(test_dataset)}")
print(f"Batch size: {training_args.per_device_train_batch_size}")
print(f"Number of epochs: {training_args.num_train_epochs}")

# 7. Train model

In [8]:
print("\nTraining model...")
trainer.train()


Training model...


Step,Training Loss,Validation Loss
100,2.2655,2.239313
200,1.4566,1.353743
300,0.8299,0.852765
400,0.6924,0.719674


KeyboardInterrupt: 

# 8. Evaluate model

In [None]:
print("\nEvaluating model...")
predictions = trainer.predict(test_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)

# Print classification report
print("\nClassification Report:")
print(classification_report(test_labels, pred_labels, target_names=unique_labels))

# Plot confusion matrix
plt.figure(figsize=(20, 20))
cm = confusion_matrix(test_labels, pred_labels)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# 9. Save model

In [None]:
print("\nSaving model...")
model.save_pretrained('./models/intent_classifier')
tokenizer.save_pretrained('./models/intent_classifier')