In [134]:
import random
import pandas as pd
import numpy as np
from collections import Counter
import torch
from torch import nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from transformers import BertTokenizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

In [135]:
# Hyperparameters
# -- data / reproducibility --
dataset_path = "Cleaned_Tickets.csv"
seed = 42

# -- batching and sequence length --
batch_size = 32
val_batch_size = 64
test_batch_size = 64
max_length = 256  # handed to the tokenizer as its pad/truncate length

# -- model width --
embed_dim = 128
hidden_dim = 256

# -- optimisation --
learning_rate = 1e-3
patience = 2  # default patience of the early-stopping helper
num_epochs = 50

# -- decision-threshold search: 17 evenly spaced candidates from 0.1 to 0.9 --
threshold_grid = np.linspace(start=0.1, stop=0.9, num=17)

In [136]:
# Device & seeds
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed every RNG this notebook imports so that Restart & Run All is repeatable.
random.seed(seed)
# NumPy's global RNG backs any NumPy / scikit-learn call made without an
# explicit random_state, so it is seeded alongside `random` and `torch`.
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

In [137]:
# Data loading & basic inspection
df = pd.read_csv(dataset_path)

# Report the frame size, then the null count of each column used further down.
print(f"Loaded {df.shape[0]} tickets with {df.shape[1]} columns")
inspected_columns = ['subject', 'body', 'tag_1', 'tag_2', 'tag_3']
null_counts = df[inspected_columns].isnull().sum()
print(null_counts, "\n")

Loaded 16337 tickets with 9 columns
subject    0
body       0
tag_1      0
tag_2      0
tag_3      0
dtype: int64 



In [138]:
# Text & tag preprocessing
# Model input: whitespace-trimmed subject and body joined by a single space.
subject_clean = df['subject'].str.strip()
body_clean = df['body'].str.strip()
df['text'] = subject_clean + " " + body_clean

# Per-ticket label list: the three tag columns collected row by row.
tag_columns = ['tag_1', 'tag_2', 'tag_3']
df['tags_list'] = df[tag_columns].values.tolist()

In [139]:
# Build tag vocabulary
all_tag_lists = df['tags_list'].tolist()

# Gather every distinct tag seen on any ticket, then fix a sorted order.
distinct_tags = set()
for ticket_tags in all_tag_lists:
    distinct_tags.update(ticket_tags)
unique_tags = sorted(distinct_tags)

In [140]:
# Binarizer
# The binarizer is given the sorted tag vocabulary as its explicit class list.
mlb = MultiLabelBinarizer(classes=unique_tags)
# Kept as the cell's final expression so the notebook displays the fitted estimator.
mlb.fit(all_tag_lists)

0,1,2
,classes,"['AI', 'API', ...]"
,sparse_output,False


In [141]:
# Train/Val/Test split
texts = df['text'].tolist()
tag_lists = df['tags_list'].tolist()

# Both splits hold out 10% with the same seed: first the test set, then the
# validation set carved out of the remaining 90%.
split_options = dict(test_size=0.10, random_state=seed, shuffle=True)
X_temp, X_test, y_temp, y_test = train_test_split(texts, tag_lists, **split_options)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, **split_options)

In [142]:
# Keep original lists for sampling
# These names keep pointing at the per-ticket tag lists; y_train / y_val /
# y_test are rebound to binarized arrays in the following cell.
raw_y_train = y_train
raw_y_val = y_val
raw_y_test = y_test

In [143]:
# Binarize labels
# Run each split's tag lists through the fitted binarizer.
y_train, y_val, y_test = (
    mlb.transform(split_tags)
    for split_tags in (raw_y_train, raw_y_val, raw_y_test)
)

In [144]:
# Tokenizer & pre-tokenization
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

# One shared option set: pad/truncate every text to max_length and return
# PyTorch tensors. All three splits are encoded once, up front.
encode_options = dict(
    padding="max_length",
    truncation=True,
    max_length=max_length,
    return_tensors="pt",
)
train_enc = tokenizer(X_train, **encode_options)
val_enc = tokenizer(X_val, **encode_options)
test_enc = tokenizer(X_test, **encode_options)

# Label matrices as float tensors (they are later passed to loss_fn as targets).
train_labels, val_labels, test_labels = (
    torch.tensor(label_matrix, dtype=torch.float)
    for label_matrix in (y_train, y_val, y_test)
)

In [None]:
# Dataset
class TagDataset(Dataset):
    """Map-style dataset pairing pre-tokenized encodings with label rows.

    Args:
        encodings: Mapping that is indexed with "input_ids" and
            "attention_mask"; each value is indexed by sample position.
        labels: Label container exposing ``.size(0)`` and positional indexing
            (a tensor in this notebook).
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        """Number of samples, read from the first dimension of ``labels``."""
        return self.labels.size(0)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of input ids, attention mask and labels."""
        sample = {
            field: self.encodings[field][idx]
            for field in ("input_ids", "attention_mask")
        }
        sample["labels"] = self.labels[idx]
        return sample

# One dataset per split, pairing that split's encodings with its label tensor.
train_ds, val_ds, test_ds = (
    TagDataset(split_enc, split_labels)
    for split_enc, split_labels in (
        (train_enc, train_labels),
        (val_enc, val_labels),
        (test_enc, test_labels),
    )
)

In [146]:
# Sampler for rare tags
# Count how often each tag occurs across the training tickets.
flat_tags = [t for tags in raw_y_train for t in tags]
tag_freq = Counter(flat_tags)

# A ticket's weight is the sum of 1/frequency over its tags, so tickets that
# carry infrequent tags receive larger sampling weights.
inv_freq = {}
for t, count in tag_freq.items():
    inv_freq[t] = 1.0 / count

sample_weights = []
for tags in raw_y_train:
    sample_weights.append(sum(inv_freq[t] for t in tags))

# Draw as many indices per epoch as there are training tickets, with replacement.
n_draws = len(sample_weights)
sampler = WeightedRandomSampler(
    weights=sample_weights,
    num_samples=n_draws,
    replacement=True,
)

In [147]:
# DataLoaders
# Page-locked (pinned) host memory only speeds up host-to-GPU copies; on a
# CPU-only machine it brings no benefit and PyTorch warns about it, so it is
# requested only when CUDA is actually available.
use_pinned_memory = torch.cuda.is_available()

# Training batches are drawn through the weighted sampler (which replaces
# shuffling); validation and test are read in their stored order.
train_loader = DataLoader(
    train_ds, batch_size=batch_size,
    sampler=sampler, num_workers=0, pin_memory=use_pinned_memory
)
val_loader = DataLoader(
    val_ds,   batch_size=val_batch_size,
    shuffle=False, num_workers=0
)
test_loader = DataLoader(
    test_ds,  batch_size=test_batch_size,
    shuffle=False, num_workers=0
)

In [148]:
# Model definition
class TagClassifier(nn.Module):
    """Bag-of-embeddings multi-label tagger.

    Token embeddings are mean-pooled over the positions kept by the attention
    mask, passed through one ReLU hidden layer, and projected to one logit
    per tag.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Width of each token embedding.
        hidden_dim: Width of the hidden layer.
        num_tags: Number of output logits.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_tags):
        super().__init__()
        # Index 0 is declared as the embedding table's padding index.
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.fc1 = nn.Linear(embed_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.classifier = nn.Linear(hidden_dim, num_tags)

    def forward(self, input_ids, attention_mask):
        """Return raw logits of shape [B,T] for a batch of token ids.

        Args:
            input_ids: [B,L] token ids.
            attention_mask: [B,L] mask; positions holding 0 are left out of
                the pooled average.
        """
        keep = attention_mask.unsqueeze(-1)                       # [B,L,1]
        pooled_sum = (self.embed(input_ids) * keep).sum(dim=1)    # [B,D]
        # Clamp so a fully masked row divides by 1 instead of 0.
        n_tokens = attention_mask.sum(dim=1, keepdim=True).clamp(min=1)
        mean_embedding = pooled_sum / n_tokens                    # [B,D]

        hidden = self.relu(self.fc1(mean_embedding))              # [B,H]
        return self.classifier(hidden)                            # [B,T] logits

# Size the embedding table from the tokenizer vocabulary and the output layer
# from the tag vocabulary, then move the model to the selected device.
model_config = dict(
    vocab_size=tokenizer.vocab_size,
    embed_dim=embed_dim,
    hidden_dim=hidden_dim,
    num_tags=len(unique_tags),
)
model = TagClassifier(**model_config).to(device)

In [149]:
# Optimizer & Loss
# The loss takes raw logits and float multi-hot targets; AdamW updates every
# model parameter at the configured learning rate.
loss_fn = nn.BCEWithLogitsLoss()
optimizer = AdamW(params=model.parameters(), lr=learning_rate)

In [150]:
# Training & Eval Helpers
class EarlyStopping:
    """Signal a stop once a monitored metric has stalled for ``patience`` steps.

    Args:
        patience: Number of consecutive non-improving ``step`` calls tolerated
            before ``should_stop`` becomes True.
        mode: "max" when larger metric values are better, "min" when smaller
            values are better.
        delta: Margin beyond the best value that a metric must exceed to count
            as an improvement.

    Raises:
        ValueError: If ``mode`` is neither "max" nor "min".
    """

    def __init__(self, patience=patience, mode="max", delta=0.0):
        # Without this check any other string (e.g. a typo such as "Max") would
        # fall through to the "min" comparison and invert the stopping criterion.
        if mode not in ("max", "min"):
            raise ValueError(f"mode must be 'max' or 'min', got {mode!r}")
        self.patience   = patience
        self.mode       = mode
        self.delta      = delta
        self.best       = None   # best metric seen so far; None until the first step
        self.bad_epochs = 0      # consecutive steps without improvement
        self.should_stop= False

    def step(self, metric):
        """Record one epoch's metric and update ``should_stop``.

        The first call only stores the metric as the baseline. Later calls
        either reset the stall counter on improvement or increment it, and set
        ``should_stop`` once the counter reaches ``patience``.
        """
        if self.best is None:
            self.best = metric
            return
        if self.mode == "max":
            improved = metric > self.best + self.delta
        else:
            improved = metric < self.best - self.delta
        if improved:
            self.best = metric
            self.bad_epochs = 0
        else:
            self.bad_epochs += 1
            if self.bad_epochs >= self.patience:
                self.should_stop = True

def train_epoch():
    """Run one optimisation pass over ``train_loader``.

    Works on the module-level ``model``, ``optimizer``, ``loss_fn`` and
    ``device``.

    Returns:
        The mean of the per-batch losses for the epoch.
    """
    model.train()
    batch_losses = []
    for batch in train_loader:
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        labs = batch["labels"].to(device)

        optimizer.zero_grad()
        loss = loss_fn(model(ids, mask), labs)
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())
    return sum(batch_losses) / len(train_loader)

def eval_loader(loader, thresholds=0.5):
    """Evaluate the module-level ``model`` on every batch of ``loader``.

    Args:
        loader: Iterable of batches, each a dict with "input_ids",
            "attention_mask" and "labels" tensors.
        thresholds: Either a single cut-off applied to every tag (any Python
            or NumPy scalar), or a 1-D sequence holding one cut-off per tag.
            A tag is predicted when its sigmoid probability is strictly
            greater than its cut-off.

    Returns:
        dict with the mean per-batch "loss", micro-averaged "precision",
        "recall" and "f1", and the stacked "y_true" / "y_pred" arrays.

    Raises:
        ValueError: If ``thresholds`` is non-scalar and is not a 1-D sequence
            with exactly one entry per tag.
    """
    model.eval()
    all_logits, all_true = [], []
    total_loss = 0.0
    with torch.no_grad():
        for batch in loader:
            ids, mask, labs = (
                batch["input_ids"].to(device),
                batch["attention_mask"].to(device),
                batch["labels"].to(device)
            )
            logits = model(ids, mask)
            loss   = loss_fn(logits, labs)
            total_loss += loss.item()
            all_logits.append(logits.cpu().numpy())
            all_true.append(labs.cpu().numpy())

    avg_loss = total_loss / len(loader)
    logits   = np.vstack(all_logits)
    y_true   = np.vstack(all_true)
    # exp() can overflow to inf for very negative logits; the quotient is then
    # the correct 0.0, so only the overflow warning is silenced.
    with np.errstate(over="ignore"):
        probs = 1/(1 + np.exp(-logits))

    if np.ndim(thresholds) == 0:
        # Covers Python numbers as well as NumPy scalars such as np.float32,
        # which are not instances of float/int and cannot be indexed per tag.
        y_pred = (probs > float(thresholds)).astype(int)
    else:
        cutoffs = np.asarray(thresholds, dtype=float)
        if cutoffs.shape != (probs.shape[1],):
            raise ValueError(
                f"expected one threshold per tag ({probs.shape[1]}), "
                f"got thresholds of shape {cutoffs.shape}"
            )
        # Broadcasts the per-tag cut-offs across all rows in one comparison.
        y_pred = (probs > cutoffs).astype(int)

    precision = precision_score(y_true, y_pred, average="micro", zero_division=0)
    recall    = recall_score(y_true, y_pred, average="micro", zero_division=0)
    f1        = f1_score(y_true, y_pred, average="micro", zero_division=0)

    return {
        "loss":      avg_loss,
        "precision": precision,
        "recall":    recall,
        "f1":        f1,
        "y_true":    y_true,
        "y_pred":    y_pred
    }

def tune_thresholds(val_logits, val_true, grid=None):
    """Pick, for each tag, the probability cut-off with the best F1.

    Args:
        val_logits: 2-D array of raw logits, one column per tag.
        val_true: 2-D array of 0/1 labels with the same column layout.
        grid: Optional iterable of candidate cut-offs; defaults to the
            module-level ``threshold_grid``.

    Returns:
        1-D NumPy array with one cut-off per tag. A tag keeps the default 0.5
        when no candidate achieves an F1 above 0.
    """
    candidates = threshold_grid if grid is None else grid

    # The sigmoid does not depend on the tag or the candidate, so it is applied
    # once to the whole matrix rather than once per (tag, threshold) pair.
    # Overflow of exp() for very negative logits still yields the correct 0.0.
    with np.errstate(over="ignore"):
        val_probs = 1/(1 + np.exp(-val_logits))

    best_thresh = []
    for i in range(val_true.shape[1]):
        tag_probs = val_probs[:, i]
        tag_true  = val_true[:, i]
        best_f1, best_t = 0, 0.5
        for t in candidates:
            preds_i = (tag_probs > t).astype(int)
            f1 = f1_score(tag_true, preds_i, zero_division=0)
            # Strict '>' keeps the earliest candidate on ties.
            if f1 > best_f1:
                best_f1, best_t = f1, t
        best_thresh.append(best_t)
    return np.array(best_thresh)

In [151]:
# Training Loop
checkpoint    = "best_model.pt"
best_f1       = -float("inf")
early_stopper = EarlyStopping()

for epoch in range(1, num_epochs+1):
    print(f"\n→ Epoch {epoch}", flush=True)
    tr_loss = train_epoch()
    val_res = eval_loader(val_loader)

    # Save the weights whenever validation micro-F1 beats the best seen so far.
    is_new_best = val_res["f1"] > best_f1
    if is_new_best:
        best_f1 = val_res["f1"]
        torch.save(model.state_dict(), checkpoint)
        print(f"  New best Val F1={best_f1:.4f}, checkpoint saved.")

    print(
        f"  Train Loss: {tr_loss:.4f} | "
        f"Val Loss: {val_res['loss']:.4f} | "
        f"Val F1: {val_res['f1']:.4f} | "
        f"P: {val_res['precision']:.4f} | R: {val_res['recall']:.4f}"
    )

    # Stop once validation F1 has stalled for `patience` consecutive epochs.
    early_stopper.step(val_res["f1"])
    if early_stopper.should_stop:
        print("Early stopping triggered.")
        break


→ Epoch 1
  New best Val F1=0.0922, checkpoint saved.
  Train Loss: 0.0701 | Val Loss: 0.0292 | Val F1: 0.0922 | P: 0.9345 | R: 0.0485

→ Epoch 2
  New best Val F1=0.2442, checkpoint saved.
  Train Loss: 0.0293 | Val Loss: 0.0257 | Val F1: 0.2442 | P: 0.7857 | R: 0.1446

→ Epoch 3
  New best Val F1=0.3157, checkpoint saved.
  Train Loss: 0.0199 | Val Loss: 0.0251 | Val F1: 0.3157 | P: 0.7641 | R: 0.1990

→ Epoch 4
  New best Val F1=0.3651, checkpoint saved.
  Train Loss: 0.0143 | Val Loss: 0.0245 | Val F1: 0.3651 | P: 0.7260 | R: 0.2438

→ Epoch 5
  New best Val F1=0.3996, checkpoint saved.
  Train Loss: 0.0109 | Val Loss: 0.0239 | Val F1: 0.3996 | P: 0.7042 | R: 0.2789

→ Epoch 6
  New best Val F1=0.4439, checkpoint saved.
  Train Loss: 0.0086 | Val Loss: 0.0231 | Val F1: 0.4439 | P: 0.7097 | R: 0.3229

→ Epoch 7
  New best Val F1=0.4713, checkpoint saved.
  Train Loss: 0.0071 | Val Loss: 0.0230 | Val F1: 0.4713 | P: 0.7013 | R: 0.3549

→ Epoch 8
  New best Val F1=0.5062, checkpoint 

In [152]:
# Tune Thresholds on Validation
# Restore the best checkpoint, then gather logits and labels over the whole
# validation loader.
model.load_state_dict(torch.load(checkpoint, map_location=device, weights_only=True))
model.eval()

logit_chunks, label_chunks = [], []
with torch.no_grad():
    for batch in val_loader:
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        logit_chunks.append(model(ids, mask).cpu().numpy())
        label_chunks.append(batch["labels"].cpu().numpy())

# Stack the per-batch pieces into single arrays with one row per sample.
val_logits = np.vstack(logit_chunks)
val_true   = np.vstack(label_chunks)

best_thresh = tune_thresholds(val_logits, val_true)
print("Per-tag thresholds:", best_thresh)

Per-tag thresholds: [0.5  0.75 0.5  0.5  0.5  0.5  0.5  0.5  0.25 0.5  0.5  0.5  0.5  0.5
 0.5  0.5  0.35 0.5  0.5  0.5  0.15 0.5  0.5  0.5  0.5  0.5  0.15 0.5
 0.5  0.5  0.5  0.2  0.5  0.1  0.5  0.5  0.5  0.5  0.5  0.5  0.5  0.5
 0.55 0.5  0.2  0.5  0.5  0.5  0.5  0.5  0.5  0.5  0.5  0.45 0.5  0.5
 0.5  0.5  0.5  0.5  0.5  0.5  0.5  0.5  0.5  0.5  0.55 0.25 0.5  0.5
 0.5  0.3  0.5  0.5  0.5  0.5  0.5  0.5  0.3  0.5  0.5  0.5  0.65 0.5
 0.5  0.45 0.5  0.5  0.5  0.4  0.1  0.5  0.5  0.5  0.25 0.5  0.5  0.5
 0.5  0.5  0.4  0.5  0.5  0.5  0.1  0.5  0.55 0.5  0.5  0.5  0.5  0.5
 0.5  0.5  0.5  0.5  0.45 0.5  0.5  0.5  0.5  0.5  0.5  0.5  0.5  0.5
 0.5  0.2  0.5  0.5  0.5  0.5  0.5  0.5  0.15 0.4  0.5  0.5  0.1  0.1
 0.5  0.5  0.1  0.5  0.5  0.5  0.1  0.5  0.5  0.5  0.4  0.5  0.85 0.5
 0.5  0.5  0.5  0.5  0.15 0.1  0.1  0.5  0.5  0.1  0.5  0.5  0.5  0.5
 0.5  0.5  0.85 0.5  0.5  0.5  0.1  0.5  0.1  0.5  0.5  0.5  0.65 0.5
 0.5  0.5  0.5  0.5  0.5  0.1  0.5  0.5  0.6  0.5  0.5  0.5  0.5  0.5


In [153]:
# Final Test Evaluation
# Score the test loader with the per-tag thresholds chosen on validation.
test_res = eval_loader(test_loader, thresholds=best_thresh)
test_true, test_pred = test_res['y_true'], test_res['y_pred']

print("\n--- Test Results with tuned thresholds ---")
print(f"Precision: {test_res['precision']:.4f}")
print(f"Recall   : {test_res['recall']:.4f}")
print(f"Micro F1 : {test_res['f1']:.4f}")
# Macro F1 averages the per-tag F1 scores without weighting by support.
macro_f1 = f1_score(test_true, test_pred, average="macro", zero_division=0)
print(f"Macro F1 : {macro_f1:.4f}\n")

print("Per-tag performance:")
per_tag_report = classification_report(
    test_true, test_pred,
    target_names=mlb.classes_, zero_division=0
)
print(per_tag_report)


--- Test Results with tuned thresholds ---
Precision: 0.5574
Recall   : 0.6467
Micro F1 : 0.5988
Macro F1 : 0.1002

Per-tag performance:
                           precision    recall  f1-score   support

                       AI       0.00      0.00      0.00         0
                      API       0.00      0.00      0.00         0
          API Integration       0.00      0.00      0.00         0
                      AWS       0.00      0.00      0.00         0
                   Access       0.00      0.00      0.00         1
           Access Control       0.00      0.00      0.00         0
        Access Management       0.00      0.00      0.00         1
            Accessibility       0.00      0.00      0.00         0
                  Account       0.64      0.44      0.52        32
                 Accuracy       1.00      1.00      1.00         1
           ActiveCampaign       0.00      0.00      0.00         1
               Adjustment       0.00      0.00      0.00 