In [22]:
import pandas as pd

In [23]:
# Load the cleaned ticket table from the working directory; later cells read its
# 'subject', 'body' and 'tag_1'..'tag_3' columns.
df_tags = pd.read_csv("Cleaned_Tickets.csv")

In [24]:
# Combine subject + body into one 'text' column.
# A missing subject or body is treated as an empty string: with plain `+`, a NaN
# on either side would turn the whole concatenation into NaN, and the tokenizer
# used later only accepts strings. The outer strip removes the dangling
# separator space left behind when one of the two parts is empty.
df_tags['text'] = (
    df_tags['subject'].fillna("").astype(str).str.strip()
    + " "
    + df_tags['body'].fillna("").astype(str).str.strip()
).str.strip()

In [25]:
# Combine the three tag columns into one Python list per ticket.
# Empty tag cells come back from read_csv as NaN; those placeholders are dropped
# so they can neither end up as a label of their own nor break the sorting of
# the tag vocabulary (NaN is a float and cannot be ordered against strings).
# A ticket without any tag simply gets an empty list.
df_tags['tags_list'] = pd.Series(
    [
        [tag for tag in row if pd.notna(tag)]
        for row in df_tags[['tag_1', 'tag_2', 'tag_3']].values.tolist()
    ],
    index=df_tags.index,
)

In [26]:
# Collect the distinct tags used anywhere in the corpus, in sorted order
all_lists = df_tags['tags_list'].tolist()
full_tags = sorted(set().union(*all_lists))

In [27]:
# Instantiate the binarizer with that fixed class list.
# Passing `classes` pins the label vocabulary and its order up front, so the
# columns of every label matrix follow the sorted order of `full_tags`.
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer(classes=full_tags)

In [28]:
# Register the classes (no leakage: only registers names, not counts).
# The class list was already fixed in the constructor, so this call on the full
# corpus just readies the binarizer for the transform() calls further down.
mlb.fit(all_lists)

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
# Plain Python lists of the model inputs and their tag lists, in row order
texts, tag_lists = (
    df_tags[column].tolist() for column in ('text', 'tags_list')
)

In [31]:
# Hold out 10% for final testing.
# random_state is fixed so the same partition is produced on every run.
X_temp, X_test, y_temp, y_test = train_test_split(
    texts, tag_lists,
    test_size=0.10,
    random_state=42,
    shuffle=True
)

# From the 90% remainder, hold out 10% for validation
# (i.e. roughly 9% of all tickets for validation and 81% for training).
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp,
    test_size=0.10,
    random_state=42,
    shuffle=True
)

In [32]:
# Turn the per-ticket tag lists of each split into binary indicator matrices.
# NOTE: the names are rebound to the matrices, so re-running this cell on its
# own would pass the matrices (not the tag lists) back into transform();
# re-run the split cell first.
y_train, y_val, y_test = (
    mlb.transform(split_tags) for split_tags in (y_train, y_val, y_test)
)


In [33]:
from transformers import BertTokenizer

In [34]:
# Load the cased tokenizer.
# Only the tokenizer of the "bert-base-cased" checkpoint is used in this
# notebook: it supplies the sub-word ids and the vocabulary size, while the
# classifier defined further down trains its own embedding table.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [35]:
# Tokenization helper (batch)
def tokenize_batch(texts, max_length=256):
    """Encode a list of texts with the notebook-level ``tokenizer``.

    Every text is truncated and padded to exactly ``max_length`` tokens, and the
    result is requested as PyTorch tensors including the attention mask.

    Args:
        texts: the strings to encode.
        max_length: fixed sequence length applied to every text.

    Returns:
        Whatever ``tokenizer(...)`` returns for these options (the encoded batch).
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_length,
        "return_attention_mask": True,
        "return_tensors": "pt",
    }
    return tokenizer(texts, **encode_options)

In [36]:
# Smoke test: run the helper on the first four training texts.
# NOTE: the name `batch` is reused as the loop variable of the training and
# evaluation loops further down, so this value does not survive those cells.
batch = tokenize_batch(X_train[:4])

In [37]:
import torch
from torch.utils.data import Dataset, DataLoader

class TagDataset(Dataset):
    """Map-style dataset pairing each text with its label vector.

    Texts are tokenized lazily, one at a time, when an item is requested.

    Args:
        texts: indexable collection of input strings.
        labels: indexable collection of per-sample label vectors, aligned with ``texts``.
        tokenizer: callable used to encode a single text; it is invoked with
            ``truncation``, ``padding``, ``max_length`` and ``return_tensors="pt"``.
        max_length: fixed token length every sample is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of samples, taken from the text collection."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded fields of sample ``idx`` plus a float ``labels`` tensor."""
        encoded = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        sample = {}
        for field, tensor in encoded.items():
            # the tokenizer output carries a leading batch axis of size 1; drop it
            sample[field] = tensor.squeeze(0)
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

In [38]:
# Wrap each split in a TagDataset; all three share the same tokenizer
train_ds, val_ds, test_ds = (
    TagDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)

In [39]:
# DataLoaders
# Only the training loader shuffles; validation and test are read in a fixed
# order and use a larger batch size (64 instead of 32).
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
val_loader   = DataLoader(val_ds,   batch_size=64)
test_loader  = DataLoader(test_ds,  batch_size=64)

In [40]:
import torch.nn as nn

class SubwordAvgClassifier(nn.Module):
    """Bag-of-subwords tagger: mean of token embeddings followed by a linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: size of each token embedding.
        num_tags: number of output logits (one per tag).
    """

    def __init__(self, vocab_size, embed_dim, num_tags):
        super().__init__()
        # index 0 is declared as the padding id of the embedding table
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.fc    = nn.Linear(embed_dim, num_tags)

    def forward(self, input_ids, attention_mask):
        """Return one logit per tag for every sequence in the batch.

        Args:
            input_ids: token ids, [B, L].
            attention_mask: 1 for real tokens and 0 for padding, [B, L].

        Returns:
            Raw (pre-sigmoid) scores, [B, num_tags].
        """
        token_vecs = self.embed(input_ids)                       # [B, L, D]
        visible = token_vecs * attention_mask.unsqueeze(-1)      # zero out padded positions
        # number of real tokens per row, floored at 1 so an all-padding row cannot divide by zero
        n_tokens = attention_mask.sum(dim=1, keepdim=True).clamp(min=1)
        pooled = visible.sum(dim=1) / n_tokens                   # [B, D]
        return self.fc(pooled)                                   # [B, num_tags]

# Seed PyTorch's global RNG before any weights are created, so the random
# initialisation of the embedding and linear layers is repeatable across runs.
# The training DataLoader draws its shuffling order from the same global RNG
# when no generator is passed, so that becomes repeatable as well. (Some GPU
# kernels can still be non-deterministic.)
torch.manual_seed(42)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SubwordAvgClassifier(
    vocab_size=tokenizer.vocab_size,   # one embedding row per tokenizer vocabulary entry
    embed_dim=128,
    num_tags=len(mlb.classes_)         # one output logit per known tag
).to(device)


In [41]:
from torch.optim import AdamW
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

# 8.1 EarlyStopping helper
class EarlyStopping:
    """Track a validation metric and flag when it has stopped improving.

    Args:
        patience: number of non-improving ``step`` calls (since the last
            improvement) after which ``should_stop`` becomes True.
        mode: ``"max"`` if larger metric values are better; any other value
            is treated as "smaller is better".
        delta: margin by which the metric must beat the best value seen so
            far to count as an improvement.

    Attributes:
        best: best metric value seen so far (None before the first step).
        bad_epochs: non-improving steps since the last improvement.
        should_stop: set to True once ``bad_epochs`` reaches ``patience``.
    """

    def __init__(self, patience=3, mode="max", delta=0.0):
        self.patience, self.mode, self.delta = patience, mode, delta
        self.best = None
        self.bad_epochs = 0
        self.should_stop = False

    def step(self, metric):
        """Record the metric of one more epoch and update the stop flag."""
        # The very first value only establishes the baseline.
        if self.best is None:
            self.best = metric
            return

        if self.mode == "max":
            improved = metric > self.best + self.delta
        else:
            improved = metric < self.best - self.delta

        if not improved:
            self.bad_epochs += 1
            if self.bad_epochs >= self.patience:
                self.should_stop = True
            return

        self.best = metric
        self.bad_epochs = 0

# 8.2 Optimizer & loss
# AdamW updates every parameter of the model. BCEWithLogitsLoss takes the raw
# logits (it applies the sigmoid itself) and scores each tag as an independent
# yes/no decision, which matches the multi-label targets.
optimizer = AdamW(model.parameters(), lr=1e-3)
loss_fn   = nn.BCEWithLogitsLoss()

# 8.3 Training epoch
def train_epoch():
    """Run one optimisation pass over ``train_loader``.

    Works on the notebook-level ``model``, ``optimizer``, ``loss_fn`` and
    ``device``; puts the model in training mode and performs one optimizer
    step per batch. Returns nothing.
    """
    model.train()
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["labels"].to(device)

        optimizer.zero_grad()
        batch_loss = loss_fn(model(input_ids, attention_mask), targets)
        batch_loss.backward()
        optimizer.step()

# 8.4 Evaluation on any loader
def eval_loader(loader):
    """Score the notebook-level ``model`` on every batch of ``loader``.

    A tag counts as predicted when its logit is positive. Predictions and
    labels of all batches are stacked and scored with micro-averaging.

    Args:
        loader: iterable of batches with "input_ids", "attention_mask" and
            "labels" entries.

    Returns:
        Dict with the keys "precision", "recall" and "f1".
    """
    model.eval()
    label_chunks, pred_chunks = [], []
    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            scores = model(input_ids, attention_mask).cpu().numpy()
            label_chunks.append(batch["labels"].cpu().numpy())
            # logit > 0 is the same decision as sigmoid(logit) > 0.5
            pred_chunks.append((scores > 0).astype(int))

    y_true = np.vstack(label_chunks)
    y_pred = np.vstack(pred_chunks)
    scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {
        name: scorer(y_true, y_pred, average="micro")
        for name, scorer in scorers
    }


In [None]:
early_stopper = EarlyStopping(patience=2, mode="max", delta=1e-4)

# Copy of the weights from the epoch with the best validation F1 so far.
# Early stopping only fires after `patience` epochs without improvement, so the
# weights left in `model` when the loop ends are not the best ones seen.
best_state = None

# Trains for at most 49 epochs (1..49); early stopping usually ends it sooner.
for epoch in range(1, 50):
    train_epoch()
    metrics = eval_loader(val_loader)
    print(f"Epoch {epoch} → Val metrics:", metrics)
    early_stopper.step(metrics["f1"])
    # The stopper leaves bad_epochs at 0 exactly when it accepted this epoch as
    # the new best (including the very first epoch). state_dict() hands out
    # references to the live parameters, so the tensors are cloned to keep the
    # snapshot from being overwritten by later optimizer steps.
    if early_stopper.bad_epochs == 0:
        best_state = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
    if early_stopper.should_stop:
        print(f"Stopping early at epoch {epoch}.")
        break

# Roll back to the best validation checkpoint before touching the test set
if best_state is not None:
    model.load_state_dict(best_state)

# 9.1 Final evaluation on the test set
test_metrics = eval_loader(test_loader)
print("Test set metrics:", test_metrics)