In [11]:
# ============================
# Stage 2: Fine-tuning Encoder
# ============================

# 1) Imports
import os, random, numpy as np, pandas as pd, torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight
from tqdm.auto import tqdm

# ----------------------------
# 2) Config
# ----------------------------
# Run-wide constants for the fine-tuning stage.
SEED = 42                               # single seed fed to every RNG below
MODEL_NAME = "distilbert-base-uncased"  # used for both the tokenizer and the encoder
DATA_PATH = "Cleaned_Tickets.csv"
MAX_LEN = 256                           # texts are padded/truncated to this many tokens
BATCH_SIZE = 16
EPOCHS_FINE = 4                         # upper bound on fine-tuning epochs
PATIENCE = 2                            # early-stopping budget used by the training loop

# Prefer the GPU whenever PyTorch can see one.
_use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if _use_cuda else torch.device("cpu")

# Seed Python, NumPy and PyTorch (CPU, then all GPUs) with the same value.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if _use_cuda:
    torch.cuda.manual_seed_all(SEED)

# ----------------------------
# 3) Reload dataset
# ----------------------------
# Read the tickets and build one model input string per row. Missing
# subject/body cells become "" so the concatenation below never produces NaN.
df = pd.read_csv(DATA_PATH)
df[["subject", "body"]] = df[["subject", "body"]].fillna("")
df["text"] = "[SUBJ] " + df["subject"] + " [BODY] " + df["body"]

# Single-label vocabularies: sorted unique values mapped to contiguous ids.
type_classes, dept_classes, prio_classes = (
    sorted(df[column].unique()) for column in ("type", "department", "priority")
)
type2id = dict(zip(type_classes, range(len(type_classes))))
dept2id = dict(zip(dept_classes, range(len(dept_classes))))
prio2id = dict(zip(prio_classes, range(len(prio_classes))))

# Multi-label tags: gather every "tag_*" column per row and drop empty cells.
tag_cols = [column for column in df.columns if column.startswith("tag_")]
df["tags_list"] = [
    [tag for tag in row if tag != ""]
    for row in df[tag_cols].fillna("").values.tolist()
]
# The binarizer is fitted on the full frame so train and validation share one
# tag vocabulary.
mlb = MultiLabelBinarizer().fit(df["tags_list"])
tag_classes = mlb.classes_

# split (80/20, stratified on the ticket type only)
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=SEED,
    stratify=df["type"],
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# dataset
class TicketDataset(Dataset):
    """Map-style dataset that yields one tokenized ticket plus its four targets.

    Labels are encoded once in ``__init__``; the text is tokenized lazily in
    ``__getitem__`` and always padded/truncated to ``max_len`` tokens.

    Args:
        df: frame with "text", "type", "department", "priority" and
            "tags_list" columns.
        tokenizer: callable returning a mapping with "input_ids" and
            "attention_mask" tensors of shape (1, max_len).
        max_len: target sequence length passed to the tokenizer.
        type2id, dept2id, prio2id: label -> integer id mappings.
        mlb: fitted binarizer whose ``transform`` returns the multi-hot tags.
    """

    def __init__(self, df, tokenizer, max_len, type2id, dept2id, prio2id, mlb):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.texts = df["text"].tolist()
        # Single-label targets are kept as plain Python lists of ids.
        self.type_labels = df["type"].map(type2id).tolist()
        self.dept_labels = df["department"].map(dept2id).tolist()
        self.prio_labels = df["priority"].map(prio2id).tolist()
        # Multi-hot tag matrix, stored as float32 for BCE-style losses.
        self.tags_labels = mlb.transform(df["tags_list"]).astype(np.float32)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        # The tokenizer returns a leading batch axis of size 1; drop it so the
        # DataLoader can stack items itself.
        item = {
            key: encoded[key].squeeze(0)
            for key in ("input_ids", "attention_mask")
        }
        item["y_type"] = torch.tensor(self.type_labels[idx], dtype=torch.long)
        item["y_dept"] = torch.tensor(self.dept_labels[idx], dtype=torch.long)
        item["y_prio"] = torch.tensor(self.prio_labels[idx], dtype=torch.long)
        item["y_tags"] = torch.tensor(self.tags_labels[idx], dtype=torch.float32)
        return item

# Both splits share the tokenizer, sequence length and label encoders; only
# the training loader shuffles.
_dataset_kwargs = dict(
    tokenizer=tokenizer,
    max_len=MAX_LEN,
    type2id=type2id,
    dept2id=dept2id,
    prio2id=prio2id,
    mlb=mlb,
)
train_loader = DataLoader(
    TicketDataset(train_df, **_dataset_kwargs),
    batch_size=BATCH_SIZE,
    shuffle=True,
)
val_loader = DataLoader(
    TicketDataset(val_df, **_dataset_kwargs),
    batch_size=BATCH_SIZE,
    shuffle=False,
)

# ----------------------------
# 4) Model
# ----------------------------
class Head(nn.Module):
    """Two-layer classification head: LayerNorm -> Dropout -> Linear -> ReLU -> Dropout -> Linear.

    Args:
        in_dim: size of the incoming feature vector (also the hidden width).
        out_dim: number of output logits.
        p: dropout probability used at both dropout layers.
    """

    def __init__(self, in_dim, out_dim, p=0.2):
        super().__init__()
        # The layer order and the ``seq`` attribute name determine the
        # state-dict keys ("seq.0.*", "seq.2.*", "seq.5.*"), so keep both
        # stable for checkpoint compatibility.
        layers = [
            nn.LayerNorm(in_dim),
            nn.Dropout(p),
            nn.Linear(in_dim, in_dim),
            nn.ReLU(),
            nn.Dropout(p),
            nn.Linear(in_dim, out_dim),
        ]
        self.seq = nn.Sequential(*layers)

    def forward(self, x):
        """Return the logits for feature tensor ``x``."""
        return self.seq(x)

class MultiTaskModel(nn.Module):
    """Shared transformer encoder feeding four independent task heads.

    Args:
        model_name: identifier passed to ``AutoModel.from_pretrained``.
        dropout: dropout probability applied to the pooled sentence vector.
        freeze_encoder: when true, every encoder parameter is frozen.
        n_type, n_dept, n_prio, n_tags: number of logits per head.

    The heads themselves are always built with ``p=0.2``, independent of
    ``dropout``.
    """

    def __init__(self, model_name=MODEL_NAME, dropout=0.2, freeze_encoder=False,
                 n_type=len(type_classes), n_dept=len(dept_classes),
                 n_prio=len(prio_classes), n_tags=len(tag_classes)):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        if freeze_encoder:
            self.encoder.requires_grad_(False)
        width = self.encoder.config.hidden_size
        self.dropout = nn.Dropout(dropout)
        # Registration order is kept stable so state-dict layout is unchanged.
        self.type_head = Head(width, n_type, p=0.2)
        self.dept_head = Head(width, n_dept, p=0.2)
        self.prio_head = Head(width, n_prio, p=0.2)
        self.tags_head = Head(width, n_tags, p=0.2)

    def forward(self, input_ids, attention_mask):
        """Encode the batch, mean-pool over attended tokens, and run every head.

        Returns:
            dict with keys "type", "department", "priority" and "tags", each
            holding that head's raw logits.
        """
        token_states = self.encoder(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        # Masked mean: padded positions contribute zero to the sum and are
        # excluded from the count; the clamp guards against division by zero.
        weights = attention_mask.unsqueeze(-1).expand(token_states.size()).float()
        total = torch.sum(token_states * weights, 1)
        n_tokens = torch.clamp(weights.sum(1), min=1e-9)
        features = self.dropout(total / n_tokens)
        heads = (
            ("type", self.type_head),
            ("department", self.dept_head),
            ("priority", self.prio_head),
            ("tags", self.tags_head),
        )
        return {task: head(features) for task, head in heads}

# ----------------------------
# 5) Load Stage 1 checkpoint
# ----------------------------
# NOTE(security): weights_only=False unpickles arbitrary Python objects, so
# only load checkpoint files produced by a trusted run.
checkpoint = torch.load("multitask_best.pt", map_location=device, weights_only=False)

# NOTE(review): head sizes come from the CSV loaded in this notebook, not from
# the checkpoint — presumably Stage 1 used the same label sets; confirm.
model = MultiTaskModel(
    freeze_encoder=False,
    n_type=len(type_classes),
    n_dept=len(dept_classes),
    n_prio=len(prio_classes),
    n_tags=len(tag_classes),
)
model = model.to(device)
model.load_state_dict(checkpoint["model_state"])

# ----------------------------
# 6) Unfreeze top encoder layers
# ----------------------------
# An encoder parameter stays trainable only when its name contains "layer.4"
# or "layer.5"; every other encoder parameter is frozen.
for name, p in model.encoder.named_parameters():
    p.requires_grad = any(marker in name for marker in ("layer.5", "layer.4"))

# ----------------------------
# 7) Loss functions
# ----------------------------
def _balanced_class_weights(column, mapping):
    """Return sklearn "balanced" class weights for one training label column."""
    return compute_class_weight(
        "balanced",
        classes=np.arange(len(mapping)),
        y=train_df[column].map(mapping),
    )


def _weighted_cross_entropy(class_weights):
    """Build a CrossEntropyLoss whose per-class weights live on ``device``."""
    weight = torch.tensor(class_weights, dtype=torch.float32).to(device)
    return nn.CrossEntropyLoss(weight=weight)


# Single-label tasks: class-balanced cross entropy computed on the train split.
weights_type = _balanced_class_weights("type", type2id)
weights_dept = _balanced_class_weights("department", dept2id)
weights_prio = _balanced_class_weights("priority", prio2id)

criterion_type = _weighted_cross_entropy(weights_type)
criterion_dept = _weighted_cross_entropy(weights_dept)
criterion_prio = _weighted_cross_entropy(weights_prio)

# Multi-label tags: weight positives by the negative/positive ratio per tag,
# clipped to [1, 10]; max(pos, 1) avoids dividing by zero for unseen tags.
train_tags = mlb.transform(train_df["tags_list"])
pos = train_tags.sum(axis=0)
neg = train_tags.shape[0] - pos
pos_weight_raw = neg / np.maximum(pos, 1)
pos_weight = np.clip(pos_weight_raw, 1.0, 10.0)
criterion_tags = nn.BCEWithLogitsLoss(
    pos_weight=torch.tensor(pos_weight, dtype=torch.float32).to(device)
)

# Per-task multipliers for the summed loss (type, department, priority, tags).
ALPHA = BETA = GAMMA = 1.0
DELTA = 1.2

# ----------------------------
# 8) Optimizer
# ----------------------------
# Split the trainable parameters into encoder vs. head groups so the encoder
# layers can use a smaller learning rate than the task heads.
_trainable = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
encoder_params = [p for n, p in _trainable if "encoder" in n]
head_params = [p for n, p in _trainable if "encoder" not in n]

optimizer = torch.optim.AdamW(
    [
        dict(params=encoder_params, lr=5e-6),
        dict(params=head_params, lr=3e-5),
    ]
)

# ----------------------------
# 9) Training loop
# ----------------------------
def run_epoch(loader, train=True):
    """Run one pass over ``loader`` with the module-level model and criteria.

    Args:
        loader: iterable of batches with "input_ids", "attention_mask",
            "y_type", "y_dept", "y_prio" and "y_tags" tensors.
        train: when true the model is put in train mode and the optimizer
            steps on every batch; otherwise the model is in eval mode and
            gradients are disabled.

    Returns:
        (mean batch loss, preds, labels). ``preds`` and ``labels`` are dicts
        keyed by "type", "department", "priority" and "tags" holding
        concatenated NumPy arrays; the single-label predictions are argmax
        indices, the tag predictions are sigmoid probabilities.
    """
    if train:
        model.train()
    else:
        model.eval()

    tasks = ("type", "department", "priority", "tags")
    losses = []
    preds = {task: [] for task in tasks}
    labels = {task: [] for task in tasks}

    for batch in tqdm(loader, leave=False):
        input_ids = batch["input_ids"].to(device)
        attn = batch["attention_mask"].to(device)
        targets = {
            "type": batch["y_type"].to(device),
            "department": batch["y_dept"].to(device),
            "priority": batch["y_prio"].to(device),
            "tags": batch["y_tags"].to(device),
        }

        with torch.set_grad_enabled(train):
            out = model(input_ids, attn)
            # Weighted sum of the four task losses.
            loss = (ALPHA*criterion_type(out["type"], targets["type"]) +
                    BETA*criterion_dept(out["department"], targets["department"]) +
                    GAMMA*criterion_prio(out["priority"], targets["priority"]) +
                    DELTA*criterion_tags(out["tags"], targets["tags"]))
            if train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        losses.append(loss.item())
        for task in ("type", "department", "priority"):
            preds[task].append(torch.argmax(out[task], 1).detach().cpu().numpy())
            labels[task].append(targets[task].cpu().numpy())
        # Tags keep probabilities so thresholds can be applied by the caller.
        preds["tags"].append(torch.sigmoid(out["tags"]).detach().cpu().numpy())
        labels["tags"].append(targets["tags"].cpu().numpy())

    for task in preds:
        preds[task] = np.concatenate(preds[task])
        labels[task] = np.concatenate(labels[task])
    return np.mean(losses), preds, labels

# Fine-tune for at most EPOCHS_FINE epochs. After each epoch the model is
# scored on the validation split; the checkpoint is rewritten whenever the
# average of the four F1 scores improves, and training stops early after
# PATIENCE consecutive epochs without improvement.
best_f1, patience = -1, PATIENCE
for epoch in range(1, EPOCHS_FINE+1):
    tr_loss, tr_preds, tr_labels = run_epoch(train_loader, train=True)
    va_loss, va_preds, va_labels = run_epoch(val_loader, train=False)

    # Macro F1 for the single-label tasks.
    f1_type = f1_score(va_labels["type"], va_preds["type"], average="macro")
    f1_dept = f1_score(va_labels["department"], va_preds["department"], average="macro")
    f1_prio = f1_score(va_labels["priority"], va_preds["priority"], average="macro")
    # Tag probabilities are binarized with the thresholds stored in the
    # Stage 1 checkpoint, then scored with micro F1.
    va_bin = (va_preds["tags"] >= checkpoint["tag_thresholds"]).astype(int)
    f1_tags = f1_score(va_labels["tags"], va_bin, average="micro")

    print(f"[FineTune] Epoch {epoch} | train loss {tr_loss:.4f} | val loss {va_loss:.4f}")
    print(f"   F1(type)={f1_type:.3f} | F1(dept)={f1_dept:.3f} | F1(prio)={f1_prio:.3f} | F1(tags)={f1_tags:.3f}")

    avg_f1 = (f1_type+f1_dept+f1_prio+f1_tags)/4
    if avg_f1 > best_f1:
        best_f1 = avg_f1
        # Persist everything needed to rebuild the model and decode outputs.
        torch.save({
            "model_state": model.state_dict(),
            "tokenizer_name": MODEL_NAME,
            "max_len": MAX_LEN,
            "type_classes": type_classes,
            "dept_classes": dept_classes,
            "prio_classes": prio_classes,
            "tag_classes": list(tag_classes),
            "tag_thresholds": checkpoint["tag_thresholds"]
        }, "multitask_finetuned.pt")
        patience = PATIENCE  # improvement resets the early-stopping budget
    else:
        patience -= 1
        # Stop once the budget is used up, i.e. after exactly PATIENCE
        # consecutive non-improving epochs. The previous `< 0` test needed
        # PATIENCE + 1 such epochs.
        if patience <= 0:
            print("Early stopping."); break

print("Fine-tuning complete. Best avg F1:", best_f1)


  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


  0%|          | 0/817 [00:00<?, ?it/s]

  0%|          | 0/205 [00:00<?, ?it/s]

[FineTune] Epoch 1 | train loss 3.1788 | val loss 3.2375
   F1(type)=0.760 | F1(dept)=0.293 | F1(prio)=0.423 | F1(tags)=0.552


  0%|          | 0/817 [00:00<?, ?it/s]

  0%|          | 0/205 [00:00<?, ?it/s]

[FineTune] Epoch 2 | train loss 3.0849 | val loss 3.1667
   F1(type)=0.809 | F1(dept)=0.305 | F1(prio)=0.430 | F1(tags)=0.545


  0%|          | 0/817 [00:00<?, ?it/s]

  0%|          | 0/205 [00:00<?, ?it/s]

[FineTune] Epoch 3 | train loss 3.0154 | val loss 3.1554
   F1(type)=0.821 | F1(dept)=0.291 | F1(prio)=0.444 | F1(tags)=0.553


  0%|          | 0/817 [00:00<?, ?it/s]

  0%|          | 0/205 [00:00<?, ?it/s]

[FineTune] Epoch 4 | train loss 2.9479 | val loss 3.1083
   F1(type)=0.825 | F1(dept)=0.306 | F1(prio)=0.437 | F1(tags)=0.551
Fine-tuning complete. Best avg F1: 0.5297076522309223


In [13]:
# ============================
# Final Evaluation (Validation)
# ============================
from sklearn.metrics import classification_report

# Reload fine-tuned checkpoint
# NOTE(security): weights_only=False unpickles arbitrary Python objects, so
# only load checkpoint files produced by a trusted run.
checkpoint_ft = torch.load("multitask_finetuned.pt", map_location=device, weights_only=False)

# Rebuild the network with head sizes taken from the saved class lists, then
# restore the fine-tuned weights.
_n_outputs = {
    key: len(checkpoint_ft[key])
    for key in ("type_classes", "dept_classes", "prio_classes", "tag_classes")
}
model = MultiTaskModel(
    freeze_encoder=False,
    n_type=_n_outputs["type_classes"],
    n_dept=_n_outputs["dept_classes"],
    n_prio=_n_outputs["prio_classes"],
    n_tags=_n_outputs["tag_classes"],
)
model = model.to(device)

model.load_state_dict(checkpoint_ft["model_state"])
model.eval()

# Run inference on validation set (the mean loss is not needed here)
_, va_preds, va_labels = run_epoch(val_loader, train=False)

# ----------------------------
# 1) Reports for type/dept/prio
# ----------------------------
# One report per single-label task. `labels` is passed explicitly so the
# report always has one row per saved class: without it, classification_report
# raises when a class is missing from both the validation labels and the
# predictions, because `target_names` would no longer match the labels found.
# zero_division=0 keeps such empty rows at 0.0 without emitting warnings.
for _title, _task, _classes_key in (
    ("TYPE", "type", "type_classes"),
    ("DEPARTMENT", "department", "dept_classes"),
    ("PRIORITY", "priority", "prio_classes"),
):
    _names = checkpoint_ft[_classes_key]
    print(f"\n=== {_title} REPORT ===")
    print(classification_report(
        va_labels[_task], va_preds[_task],
        labels=list(range(len(_names))),
        target_names=_names,
        zero_division=0,
    ))

# ----------------------------
# 2) Metrics for tags
# ----------------------------
def tune_tag_thresholds(Y_true, Y_prob, grid=np.linspace(0.1,0.9,9)):
    """Pick, per tag column, the threshold from ``grid`` with the best F1.

    Args:
        Y_true: 2-D array of 0/1 ground-truth labels, one column per tag.
        Y_prob: 2-D array of predicted probabilities, same layout as Y_true.
        grid: candidate thresholds, tried in order.

    Returns:
        float32 array with one threshold per tag. Tags with no positive
        example in ``Y_true`` keep the default 0.5; ties go to the earliest
        candidate in ``grid``.
    """
    n_tags = Y_true.shape[1]
    thresholds = np.full(n_tags, 0.5, dtype=np.float32)
    for col in range(n_tags):
        truth = Y_true[:, col]
        if truth.sum() == 0:
            # No positives: F1 is uninformative, keep the default.
            continue
        probs = Y_prob[:, col]
        # max() returns the first maximal element, matching a strict ">" scan.
        thresholds[col] = max(
            grid,
            key=lambda cut: f1_score(truth, (probs >= cut).astype(int), zero_division=0),
            default=0.5,
        )
    return thresholds

val_probs = va_preds["tags"]
val_true  = va_labels["tags"]

# NOTE(review): the thresholds are tuned and then scored on the same
# validation arrays, so the F1 values below are optimistic. The tuned
# thresholds are also not written back to any checkpoint here.
tag_thresholds = tune_tag_thresholds(val_true, val_probs)
val_bin = (val_probs >= tag_thresholds).astype(int)

# zero_division=0 keeps the score at 0.0 for tags with neither true nor
# predicted positives (the same value as before) without raising an
# UndefinedMetricWarning for each such tag.
f1_micro = f1_score(val_true, val_bin, average="micro", zero_division=0)
f1_macro = f1_score(val_true, val_bin, average="macro", zero_division=0)

print("\n=== TAGS REPORT ===")
print(f"F1-micro: {f1_micro:.3f} | F1-macro: {f1_macro:.3f}")




  0%|          | 0/205 [00:00<?, ?it/s]


=== TYPE REPORT ===
              precision    recall  f1-score   support

      Change       0.95      1.00      0.97       341
    Incident       0.79      0.74      0.76      1314
     Problem       0.55      0.61      0.58       680
     Request       1.00      0.98      0.99       933

    accuracy                           0.81      3268
   macro avg       0.82      0.83      0.83      3268
weighted avg       0.81      0.81      0.81      3268


=== DEPARTMENT REPORT ===
                                 precision    recall  f1-score   support

           Billing And Payments       0.62      0.75      0.68       327
               Customer Service       0.23      0.18      0.20       434
                General Inquiry       0.08      0.37      0.14        51
                Human Resources       0.19      0.28      0.22        76
                     It Support       0.23      0.24      0.24       345
                Product Support       0.35      0.16      0.22       644
     

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
