In [1]:
!pip install transformers scikit-learn

import os, json, random, numpy as np, pandas as pd, torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, DataCollatorWithPadding
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, f1_score
from sklearn.utils.class_weight import compute_class_weight
from tqdm.auto import tqdm



In [43]:
# -----------------------
# Config
# -----------------------
# Reproducibility and run-wide hyper-parameters (everything a reader may tune).
SEED = 42
MODEL_NAME = "distilbert-base-uncased"   # encoder checkpoint name
DATA_PATH = "Cleaned_Tickets.csv"        # input CSV read by the data cell
MAX_LEN = 256                            # tokenizer truncation length
BATCH_SIZE = 32
EPOCHS = 4
LR_HEAD = 2e-4                           # learning rate handed to the optimizer
PATIENCE = 1                             # early-stopping budget used by the training loop
SAVE_DIR = "./type_base_model_pt"        # output directory for weights/tokenizer/metadata

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed every RNG the notebook touches: Python, NumPy, and PyTorch (CPU, then CUDA).
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

In [5]:
# -----------------------
# Data
# -----------------------
# Read the tickets and replace missing subject/body values with empty strings.
df = pd.read_csv(DATA_PATH)
df[["subject", "body"]] = df[["subject", "body"]].fillna("")

# One model input per row: "<subject> <body>", with outer whitespace removed.
subject_and_body = df["subject"] + " " + df["body"]
df["text"] = subject_and_body.str.strip()

# Encode the ticket type as an integer label and remember which index is "Problem".
le = LabelEncoder()
le.fit(df["type"])
df["label"] = le.transform(df["type"])
problem_idx = np.flatnonzero(le.classes_ == "Problem")[0]  # index of Problem class

# 80/20 split, stratified on the label so both parts share the class mix.
train_df, val_df = train_test_split(
    df[["text", "label"]],
    test_size=0.2,
    stratify=df["label"],
    random_state=SEED,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class TicketDS(Dataset):
    """Map-style dataset that tokenizes one ticket text per item.

    Args:
        texts: object exposing ``.tolist()`` that yields the raw texts
            (the notebook passes a pandas Series).
        labels: object exposing ``.tolist()`` that yields the integer labels.
        tokenizer: callable applied to a single text in ``__getitem__``.
        max_len: truncation length forwarded to the tokenizer.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Materialise plain Python lists so items are looked up by position.
        self.texts = texts.tolist()
        self.labels = labels.tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer fields for item ``idx`` plus a ``labels`` tensor."""
        # padding=False: no padding is applied here, items keep their own length.
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            max_length=self.max_len,
            padding=False,
            return_tensors="pt",
        )
        sample = {}
        for field, tensor in encoded.items():
            # Drop the leading batch dimension added by return_tensors="pt".
            sample[field] = tensor.squeeze(0)
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Batches are padded at collate time by the tokenizer-aware collator.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

train_ds = TicketDS(texts=train_df["text"], labels=train_df["label"], tokenizer=tokenizer, max_len=MAX_LEN)
val_ds = TicketDS(texts=val_df["text"], labels=val_df["label"], tokenizer=tokenizer, max_len=MAX_LEN)

# Only the training loader shuffles; validation keeps dataset order.
train_loader = DataLoader(
    train_ds,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collator,
)
val_loader = DataLoader(
    val_ds,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collator,
)




In [7]:
# -----------------------
# Model: DistilBERT (frozen) + linear head, MEAN POOLING
# -----------------------
class DistilBertClassifier(nn.Module):
    """Pretrained encoder + masked mean pooling + dropout + linear head.

    Args:
        model_name: checkpoint name/path passed to ``AutoModel.from_pretrained``.
        num_labels: number of output classes (size of the linear head).
        dropout: dropout probability applied to the pooled vector.
        freeze_encoder: when True, every encoder parameter gets
            ``requires_grad = False`` so only the head receives gradients.
    """

    def __init__(self, model_name, num_labels, dropout=0.2, freeze_encoder=True):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        if freeze_encoder:
            for p in self.bert.parameters():
                p.requires_grad = False
        hidden = self.bert.config.hidden_size
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(hidden, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return class logits of shape [B, num_labels].

        Args:
            input_ids: token ids, [B, T].
            attention_mask: 1 for real tokens, 0 for padding, [B, T].
        """
        out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden = out.last_hidden_state                          # [B, T, H]
        # Cast the (integer) mask to the encoder's dtype so the pooling arithmetic
        # stays in floating point and does not rely on int/float promotion in clamp.
        mask = attention_mask.unsqueeze(-1).to(hidden.dtype)    # [B, T, 1]
        summed = (hidden * mask).sum(dim=1)                     # [B, H]
        # Token counts are whole numbers, so clamping at 1 leaves every non-empty
        # row unchanged and turns an all-padding row into 0 / 1 = 0 (never 0 / 0).
        counts = mask.sum(dim=1).clamp(min=1.0)                 # [B, 1]
        mean_pooled = summed / counts                           # [B, H]
        x = self.dropout(mean_pooled)
        return self.classifier(x)

# One output unit per encoded ticket type; the encoder is frozen, so only the head trains.
num_labels = len(le.classes_)
model = DistilBertClassifier(
    model_name=MODEL_NAME,
    num_labels=num_labels,
    freeze_encoder=True,
)
model = model.to(device)


  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


In [9]:
# -----------------------
# Class-weighted loss (softened + Problem boost)
# -----------------------
# "balanced" weights computed on the training labels only, one per class index.
train_labels_np = train_df["label"].to_numpy()
class_ids = np.arange(num_labels)
raw_weights = compute_class_weight(class_weight="balanced", classes=class_ids, y=train_labels_np)
raw_weights = raw_weights.astype(np.float32)

# Soften: blend 70% of the balanced weight with a constant 0.3.
soft_weights = 0.7 * raw_weights + 0.3

In [11]:
# boost Problem class by +15%
# The boost is applied to a copy: multiplying soft_weights in place would compound
# the factor every time this cell is re-run without re-running the previous one.
boosted_weights = soft_weights.copy()
boosted_weights[problem_idx] *= 1.15

weights_tensor = torch.tensor(boosted_weights, dtype=torch.float32).to(device)
criterion = nn.CrossEntropyLoss(weight=weights_tensor)

# Only parameters that still require gradients are handed to the optimizer.
optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=LR_HEAD)


In [13]:
# -----------------------
# Train / Eval helpers
# -----------------------
def run_epoch(loader, train=True):
    """Run one full pass over ``loader`` with the module-level ``model``.

    Uses the module-level ``model``, ``criterion``, ``optimizer`` and ``device``;
    there is no way to pass a different model in.

    Args:
        loader: iterable of batches with "input_ids", "attention_mask" and "labels".
        train: True -> train mode, gradients on, one optimizer step per batch.
               False -> eval mode, gradients off, no parameter updates.

    Returns:
        (mean of the per-batch losses, weighted F1, predictions, labels),
        the last two as NumPy arrays concatenated over all batches.
    """
    if train:
        model.train()
    else:
        model.eval()

    batch_losses = []
    pred_chunks = []
    label_chunks = []
    for batch in tqdm(loader, leave=False):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Gradient tracking follows the `train` flag for the whole forward pass.
        with torch.set_grad_enabled(train):
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            if train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        batch_losses.append(loss.item())
        pred_chunks.append(logits.argmax(dim=1).detach().cpu().numpy())
        label_chunks.append(labels.detach().cpu().numpy())

    preds_all = np.concatenate(pred_chunks)
    labels_all = np.concatenate(label_chunks)
    f1w = f1_score(labels_all, preds_all, average="weighted")
    return np.mean(batch_losses), f1w, preds_all, labels_all

In [15]:
# -----------------------
# Training loop (frozen only)
# -----------------------
# Track the best validation weighted-F1 and keep a snapshot of the weights that produced it.
best_f1, best_state, patience = -1.0, None, PATIENCE
for epoch in range(1, EPOCHS+1):
    tr_loss, tr_f1, _ , _  = run_epoch(train_loader, train=True)
    va_loss, va_f1, va_pred, va_true = run_epoch(val_loader, train=False)
    print(f"[Frozen+MeanPool+ProblemBoost] Epoch {epoch}/{EPOCHS} | train loss {tr_loss:.4f} f1w {tr_f1:.4f} | val loss {va_loss:.4f} f1w {va_f1:.4f}")

    if va_f1 > best_f1:
        best_f1 = va_f1
        best_state = {
            # state_dict() hands back the live tensors, which later optimizer steps
            # keep updating; clone them so the snapshot stays frozen at this epoch.
            "model": {k: v.detach().cpu().clone() for k, v in model.state_dict().items()},
            "label_classes": le.classes_,
            "tokenizer_name": MODEL_NAME,
            "max_len": MAX_LEN
        }
        patience = PATIENCE
    else:
        # Countdown: training stops on the (PATIENCE + 1)-th consecutive
        # epoch without improvement.
        patience -= 1
        if patience < 0:
            print("Early stopping.")
            break

# Put the best weights back into `model`, so later cells that evaluate `model`
# see the selected checkpoint rather than whatever the last epoch left behind.
if best_state is not None:
    model.load_state_dict(best_state["model"])

  0%|          | 0/409 [00:00<?, ?it/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/103 [00:00<?, ?it/s]

[Frozen+MeanPool+ProblemBoost] Epoch 1/4 | train loss 1.0532 f1w 0.6317 | val loss 0.8431 f1w 0.6964


  0%|          | 0/409 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

[Frozen+MeanPool+ProblemBoost] Epoch 2/4 | train loss 0.7733 f1w 0.7119 | val loss 0.6925 f1w 0.7301


  0%|          | 0/409 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

[Frozen+MeanPool+ProblemBoost] Epoch 3/4 | train loss 0.6680 f1w 0.7365 | val loss 0.6172 f1w 0.7438


  0%|          | 0/409 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

[Frozen+MeanPool+ProblemBoost] Epoch 4/4 | train loss 0.6114 f1w 0.7447 | val loss 0.5715 f1w 0.7574


In [45]:
# -----------------------
# SAVE — tensors only + JSON metadata
# -----------------------
# Fail with a clear message when no checkpoint was recorded by the training loop.
if best_state is None:
    raise RuntimeError("No checkpoint to save: run the training loop first.")

os.makedirs(SAVE_DIR, exist_ok=True)
# The weights file name must be the one the reload cell and load_type_model read
# ("model_state_dict.pt"); writing any other name leaves them loading a missing
# or stale file.
torch.save(best_state["model"], os.path.join(SAVE_DIR, "model_state_dict.pt"))
tokenizer.save_pretrained(SAVE_DIR)
# JSON-friendly metadata: class names as a plain list, max_len as a plain int.
meta = {
    "label_classes": best_state["label_classes"].tolist(),
    "tokenizer_name": best_state["tokenizer_name"],
    "max_len": int(best_state["max_len"])
}
with open(os.path.join(SAVE_DIR, "metadata.json"), "w") as f:
    json.dump(meta, f, indent=2)
print("Saved model weights + tokenizer + metadata to:", SAVE_DIR)

Saved model weights + tokenizer + metadata to: ./type_base_model_pt


In [19]:
# -----------------------
# Final report (reload + evaluate)
# -----------------------
# NOTE(review): "model_state_dict.pt" must be the file name written by the SAVE cell.
state_dict = torch.load(os.path.join(SAVE_DIR, "model_state_dict.pt"), map_location=device)
reloaded = DistilBertClassifier(meta["tokenizer_name"], num_labels=len(meta["label_classes"]), freeze_encoder=True).to(device)
reloaded.load_state_dict(state_dict)
reloaded.eval()

def eval_with(model_to_eval):
    """Print a classification report for ``model_to_eval`` on the validation loader.

    The forward pass runs on the model passed in. ``run_epoch`` is not used here
    because it always evaluates the module-level ``model``, which would make the
    report independent of the argument.
    """
    model_to_eval.eval()
    pred_chunks, true_chunks = [], []
    with torch.no_grad():
        for batch in tqdm(val_loader, leave=False):
            logits = model_to_eval(
                batch["input_ids"].to(device),
                batch["attention_mask"].to(device),
            )
            pred_chunks.append(logits.argmax(dim=1).cpu().numpy())
            true_chunks.append(batch["labels"].cpu().numpy())
    va_pred = np.concatenate(pred_chunks)
    va_true = np.concatenate(true_chunks)
    print(classification_report(va_true, va_pred, target_names=le.classes_))

eval_with(reloaded)



  0%|          | 0/103 [00:00<?, ?it/s]

              precision    recall  f1-score   support

      Change       0.81      0.93      0.87       341
    Incident       0.73      0.74      0.74      1314
     Problem       0.49      0.46      0.47       680
     Request       0.97      0.94      0.95       933

    accuracy                           0.76      3268
   macro avg       0.75      0.77      0.76      3268
weighted avg       0.76      0.76      0.76      3268



In [21]:
# -----------------------
# Loader + inference helpers
# -----------------------
import torch.nn.functional as F

def load_type_model(save_dir=SAVE_DIR, device=device):
    """Rebuild the classifier, tokenizer and label encoder from ``save_dir``.

    Reads ``metadata.json``, the tokenizer files and ``model_state_dict.pt``
    from ``save_dir``.

    Args:
        save_dir: directory holding the saved artifacts.
        device: device the weights are mapped to and the model is moved to.

    Returns:
        (model in eval mode, tokenizer, label encoder, metadata dict).
    """
    with open(os.path.join(save_dir, "metadata.json")) as f:
        meta = json.load(f)
    tokenizer = AutoTokenizer.from_pretrained(save_dir)
    model = DistilBertClassifier(meta["tokenizer_name"], num_labels=len(meta["label_classes"]), freeze_encoder=True)
    # NOTE(review): this file name must be the one written by the SAVE cell.
    state_dict = torch.load(os.path.join(save_dir, "model_state_dict.pt"), map_location=device)
    model.load_state_dict(state_dict)
    model.to(device).eval()
    # Restore the classes in their saved order instead of re-fitting: fit() re-sorts
    # the labels, and the logit index -> class name mapping must be exactly the one
    # stored in the metadata.
    le_loaded = LabelEncoder()
    le_loaded.classes_ = np.asarray(meta["label_classes"])
    return model, tokenizer, le_loaded, meta

def predict_types(texts, model, tokenizer, label_encoder, max_len, device=device):
    """Predict a ticket type for each text.

    Args:
        texts: a single string or any iterable of strings (list, tuple,
            pandas Series, ...).
        model: classifier called as ``model(input_ids, attention_mask)``.
            It is used in whatever train/eval mode the caller left it in.
        tokenizer: tokenizer applied to the whole batch at once.
        label_encoder: encoder whose ``classes_`` maps logit index -> class name.
        max_len: truncation length forwarded to the tokenizer.
        device: device the encoded batch is moved to.

    Returns:
        (labels, probs): decoded class names, and the softmax probabilities as a
        NumPy array of shape [len(texts), num_classes].
    """
    # Normalise the input to a list so non-list iterables are tokenized as a batch.
    texts = [texts] if isinstance(texts, str) else list(texts)
    if not texts:
        # Nothing to tokenize: return empty results with consistent shapes.
        classes = np.asarray(label_encoder.classes_)
        return classes[:0], np.empty((0, len(classes)), dtype=np.float32)

    enc = tokenizer(texts, truncation=True, padding=True, max_length=max_len, return_tensors="pt").to(device)
    with torch.no_grad():
        logits = model(enc["input_ids"], enc["attention_mask"])
        probs = F.softmax(logits, dim=-1).cpu().numpy()
    idx = probs.argmax(axis=1)
    return label_encoder.inverse_transform(idx), probs

In [23]:
# Smoke test
# Round-trip check: load the saved artifacts and classify three hand-written tickets.
sample_tickets = [
    "Please reset my password",
    "Payments page throws 500 error",
    "Planned maintenance tonight 22:00",
]
mdl, tok, le_loaded, meta_loaded = load_type_model(SAVE_DIR, device=device)
labels, _ = predict_types(sample_tickets, mdl, tok, le_loaded, meta_loaded["max_len"], device=device)
print(labels)



['Problem' 'Problem' 'Incident']


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Validation predictions from the module-level `model` (that is what run_epoch evaluates).
_, _, va_pred, va_true = run_epoch(val_loader, train=False)

# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(va_true, va_pred)

# Draw the heatmap on an explicit Axes; tick labels are the encoder's class names.
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=le.classes_,
    yticklabels=le.classes_,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
ax.set_title("Confusion Matrix — TYPE classification")
fig.tight_layout()
plt.show()


  0%|          | 0/103 [00:00<?, ?it/s]