In [None]:
import pandas as pd
import random
import numpy as np
import torch

In [None]:
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [47]:
df_tags = pd.read_csv("Cleaned_Tickets.csv")

In [48]:
# Combine subject + body into one 'text' column
df_tags['text'] = df_tags['subject'].str.strip() + " " + df_tags['body'].str.strip()

In [49]:
# Combine the three tag columns into a Python list
df_tags['tags_list'] = df_tags[['tag_1','tag_2','tag_3']].values.tolist()

In [50]:
# Gather every tag in the corpus
all_lists = df_tags['tags_list'].tolist()
full_tags = sorted({t for tags in all_lists for t in tags})

In [51]:
# Instantiate the binarizer with that fixed class list
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer(classes=full_tags)

In [52]:
# Register the classes (no leakage: only registers names, not counts)
mlb.fit(all_lists)

0,1,2
,classes,"['AI', 'API', ...]"
,sparse_output,False


In [53]:
from sklearn.model_selection import train_test_split

In [54]:
texts = df_tags['text'].tolist()
tag_lists = df_tags['tags_list'].tolist()

In [55]:
# Hold out 10% for final testing
X_temp, X_test, y_temp, y_test = train_test_split(
    texts, tag_lists,
    test_size=0.10,
    random_state=42,
    shuffle=True
)

# From the 90% remainder, hold out 10% for validation
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp,
    test_size=0.10,
    random_state=42,
    shuffle=True
)

In [56]:
# Transform train/val/test lists into binary matrices
y_train = mlb.transform(y_train)
y_val   = mlb.transform(y_val)
y_test  = mlb.transform(y_test)


In [57]:
from transformers import BertTokenizer

In [58]:
# Load the cased tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [59]:
# Tokenization helper (batch)
def tokenize_batch(texts, max_length=256):
    return tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_attention_mask=True,
        return_tensors="pt"
    )

In [60]:
batch = tokenize_batch(X_train[:4])

In [None]:
from torch.utils.data import Dataset, DataLoader

class TagDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        item = {k: v.squeeze(0) for k, v in enc.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

In [62]:
# Instantiate datasets
train_ds = TagDataset(X_train, y_train, tokenizer)
val_ds   = TagDataset(X_val,   y_val,   tokenizer)
test_ds  = TagDataset(X_test,  y_test,  tokenizer)

In [63]:
# DataLoaders
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
val_loader   = DataLoader(val_ds,   batch_size=64)
test_loader  = DataLoader(test_ds,  batch_size=64)

In [64]:
import torch.nn as nn

class SubwordAvgClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_tags):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.fc    = nn.Linear(embed_dim, num_tags)

    def forward(self, input_ids, attention_mask):
        x = self.embed(input_ids)                           # [B, L, D]
        x = x * attention_mask.unsqueeze(-1)                # mask padding
        summed = x.sum(dim=1)                               # [B, D]
        counts = attention_mask.sum(dim=1, keepdim=True).clamp(min=1)
        avg    = summed / counts                            # [B, D]
        return self.fc(avg)                                 # [B, num_tags]

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SubwordAvgClassifier(
    vocab_size=tokenizer.vocab_size,
    embed_dim=128,
    num_tags=len(mlb.classes_)
).to(device)


In [None]:
from torch.optim import AdamW
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

# EarlyStopping helper
class EarlyStopping:
    def __init__(self, patience=3, mode="max", delta=0.0):
        self.patience = patience
        self.mode = mode
        self.delta = delta
        self.best = None
        self.bad_epochs = 0
        self.should_stop = False

    def step(self, metric):
        if self.best is None:
            self.best = metric
            return
        improved = (metric > self.best + self.delta) if self.mode == "max" \
                   else (metric < self.best - self.delta)
        if improved:
            self.best = metric
            self.bad_epochs = 0
        else:
            self.bad_epochs += 1
            if self.bad_epochs >= self.patience:
                self.should_stop = True

# Optimizer & loss
optimizer = AdamW(model.parameters(), lr=1e-3)
loss_fn   = nn.BCEWithLogitsLoss()

# Training epoch
def train_epoch():
    model.train()
    total_loss, n_batches = 0.0, 0
    for batch in train_loader:
        ids, mask, labs = (
            batch["input_ids"].to(device),
            batch["attention_mask"].to(device),
            batch["labels"].to(device)
        )
        optimizer.zero_grad()
        logits = model(ids, mask)
        loss   = loss_fn(logits, labs)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        n_batches  += 1

    return total_loss / n_batches

# Evaluation on any loader
def eval_loader(loader):
    model.eval()
    all_preds, all_labels = [], []
    total_loss, n_batches = 0.0, 0
    with torch.no_grad():
        for batch in loader:
            ids, mask, labs = (
                batch["input_ids"].to(device),
                batch["attention_mask"].to(device),
                batch["labels"].to(device)
            )
            logits = model(ids, mask)
            loss   = loss_fn(logits, labs)
            total_loss += loss.item()
            n_batches  += 1

            preds = (logits.cpu().numpy() > 0).astype(int)
            all_labels.append(labs.cpu().numpy())
            all_preds.append(preds)

    avg_loss = total_loss / n_batches
    y_true   = np.vstack(all_labels)
    y_pred   = np.vstack(all_preds)
    return {
        "loss":      avg_loss,
        "precision": precision_score(y_true, y_pred, average="micro"),
        "recall":    recall_score(y_true, y_pred, average="micro"),
        "f1":        f1_score(y_true, y_pred, average="micro"),
    }


In [None]:
best_f1 = -float("inf")
checkpoint_path = "best_model.pt"
early_stopper = EarlyStopping(patience=2, mode="max", delta=1e-4)

for epoch in range(1, 50):
    train_loss = train_epoch()
    results = eval_loader(val_loader)
    print(
        f"Epoch {epoch:02d} | "
        f"Train Loss: {train_loss:.4f} | "
        f"Val Loss: {results['loss']:.4f} | "
        f"F1: {results['f1']:.4f} | "
        f"Prec: {results['precision']:.4f} | "
        f"Recall: {results['recall']:.4f}"
    )
    early_stopper.step(results["f1"])
    if early_stopper.should_stop:
        print(f"Stopping early at epoch {epoch}.")
        break


Epoch 1 → Val Loss: 0.0277  F1: 0.0884  Prec: 0.8306  Recall: 0.0467
Epoch 2 → Val Loss: 0.0227  F1: 0.2463  Prec: 0.8015  Recall: 0.1455
Epoch 3 → Val Loss: 0.0205  F1: 0.3643  Prec: 0.8025  Recall: 0.2357
Epoch 4 → Val Loss: 0.0190  F1: 0.4571  Prec: 0.7706  Recall: 0.3249
Epoch 5 → Val Loss: 0.0179  F1: 0.5110  Prec: 0.7893  Recall: 0.3777
Epoch 6 → Val Loss: 0.0170  F1: 0.5461  Prec: 0.7935  Recall: 0.4163
Epoch 7 → Val Loss: 0.0163  F1: 0.5773  Prec: 0.8006  Recall: 0.4514
Epoch 8 → Val Loss: 0.0158  F1: 0.6013  Prec: 0.7906  Recall: 0.4852
Epoch 9 → Val Loss: 0.0153  F1: 0.6128  Prec: 0.8015  Recall: 0.4960
Epoch 10 → Val Loss: 0.0149  F1: 0.6257  Prec: 0.8136  Recall: 0.5083
Epoch 11 → Val Loss: 0.0146  F1: 0.6354  Prec: 0.8093  Recall: 0.5230
Epoch 12 → Val Loss: 0.0143  F1: 0.6457  Prec: 0.8007  Recall: 0.5409
Epoch 13 → Val Loss: 0.0140  F1: 0.6472  Prec: 0.8179  Recall: 0.5355
Epoch 14 → Val Loss: 0.0138  F1: 0.6568  Prec: 0.8092  Recall: 0.5527
Epoch 15 → Val Loss: 0.0136  

In [None]:
#  Final evaluation on the test set
test_metrics = eval_loader(test_loader)
print("Test set metrics:", test_metrics)

Test set metrics: {'loss': 0.011853707022964954, 'precision': 0.8210441334768568, 'recall': 0.6226530612244898, 'f1': 0.708217270194986}
