In [None]:
from Library import *

## **Data Load**


In [None]:
# Load the translated ticket export. `df` stays the untouched raw frame used by
# the plots; `tiket_df` is the working frame that later cells add columns to.
df = pd.read_csv("translated_data.csv")
# Explicit copy: wrapping with pd.DataFrame(df) can share the underlying data
# with `df` (depending on the pandas version), so edits could leak between them.
tiket_df = df.copy()
# Enable tqdm's pandas integration (progress_apply and friends).
tqdm.pandas()

## **`Data Presentation`**


In [None]:
def plot_category_distribution(data, column, title, axis_label):
    """Draw an annotated count plot for one categorical column.

    Bars are ordered from the most to the least frequent value and each bar is
    labelled with its count.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Name of the column whose values are counted.
    title : str
        Title of the figure.
    axis_label : str
        Label of the x-axis.

    Returns
    -------
    The axes object returned by ``sns.countplot``.
    """
    # value_counts() sorts by frequency, so its index gives the bar order.
    order = data[column].value_counts().index

    plt.figure(figsize=(10, 6))
    ax = sns.countplot(
        data=data,
        x=column,
        hue=column,
        order=order,
        palette=sns.color_palette("mako"),
    )
    # Write the integer count just above every bar.
    for container in ax.containers:
        ax.bar_label(container, fmt="%d", label_type="edge", padding=5)
    plt.title(title)
    plt.xlabel(axis_label)
    plt.ylabel("Count")
    return ax


# Same chart for each of the three classification targets. Using one helper
# avoids three copies of the plotting code and no longer rebinds the builtin
# name `type` at notebook level.
for column, title, axis_label in [
    ("priority", "Priority Distribution", "Priority"),
    ("type", "Type Distribution", "Type"),
    ("queue", "queue Distribution", "queue"),
]:
    plot_category_distribution(df, column, title, axis_label)

## **`Data Splitting`**


In [None]:
# -------------------------------
# 1. Load and prepare your data
# -------------------------------
target_columns = ["priority", "type", "queue"]

# `.cat.codes` encodes a missing label as -1, which is not a valid class index
# for the downstream steps, so fail here with an explicit message instead.
missing_per_target = tiket_df[target_columns].isna().sum()
if missing_per_target.any():
    raise ValueError(
        "Missing target labels found; drop or fill them before encoding: "
        f"{missing_per_target[missing_per_target > 0].to_dict()}"
    )

# Convert each target to a categorical once, so the integer codes and the
# stored class names are guaranteed to come from the same encoding.
priority_categorical = tiket_df["priority"].astype("category")
type_categorical = tiket_df["type"].astype("category")
queue_categorical = tiket_df["queue"].astype("category")

# Encode categorical labels to numeric form
tiket_df["priority_cat"] = priority_categorical.cat.codes
tiket_df["type_cat"] = type_categorical.cat.codes
tiket_df["queue_cat"] = queue_categorical.cat.codes

# Store the label mappings (for decoding later if needed):
# code i corresponds to <target>_classes[i].
priority_classes = priority_categorical.cat.categories
type_classes = type_categorical.cat.categories
queue_classes = queue_categorical.cat.categories

# -------------------------------
# 2. Split dataset
# -------------------------------
# Here we stratify using one target (priority) to keep distribution balanced.
# Type and queue are NOT stratified, so their split proportions may differ.
X = tiket_df["translated_body"]
Y = tiket_df[["priority_cat", "type_cat", "queue_cat"]].values

# First cut: 80 % train, 20 % held out.
X_train, X_temp, Y_train, Y_temp = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y[:, 0]  # stratify by priority
)

# Second cut: the held-out part is halved into validation and test.
X_val, X_test, Y_val, Y_test = train_test_split(
    X_temp, Y_temp, test_size=0.5, random_state=42, stratify=Y_temp[:, 0]
)

import pandas as pd
import numpy as np

In [None]:
# Wrap the label array of every split in a DataFrame so that each target can
# be addressed by column name.
split_label_columns = ["priority_cat", "type_cat", "queue_cat"]
train_df, val_df, test_df = (
    pd.DataFrame(split_labels, columns=split_label_columns)
    for split_labels in (Y_train, Y_val, Y_test)
)


def show_distribution(name, train, val, test, class_names):
    """Display the per-class share (in %) of one target across the three splits.

    Parameters
    ----------
    name : str
        Target name used in the printed heading.
    train, val, test : array-like of int
        Integer class codes of the target in each split; code ``i`` is
        expected to refer to ``class_names[i]``.
    class_names : sequence
        Class labels ordered by their integer code.

    The displayed table has exactly one row per entry of ``class_names``. A
    class that does not occur in a split is reported as 0 for that split.
    """
    print(f"\nðŸ“Š Distribution for target: {name}")
    print("-" * 50)

    class_codes = range(len(class_names))

    def _percentages(codes):
        """Return the rounded percentage of every class code, in code order."""
        shares = pd.Series(codes).value_counts(normalize=True) * 100
        # Align on the class code, not on position: a class that is absent from
        # a split would otherwise shorten the array and shift or break the rows.
        return shares.reindex(class_codes, fill_value=0.0).round(2).to_numpy()

    dist_df = pd.DataFrame(
        {
            "Class": class_names,
            "Train (%)": _percentages(train),
            "Val (%)": _percentages(val),
            "Test (%)": _percentages(test),
        }
    )

    display(dist_df)


# Print one distribution table per target, in the order priority, type, queue.
for heading, code_column, class_names in (
    ("Priority", "priority_cat", priority_classes),
    ("Type", "type_cat", type_classes),
    ("Queue", "queue_cat", queue_classes),
):
    show_distribution(
        heading,
        train_df[code_column],
        val_df[code_column],
        test_df[code_column],
        class_names,
    )

# **`Tokenization`**


In [None]:
# -------------------------------
# 3. Tokenize only the "body" column
# -------------------------------
# Name of the pretrained vocabulary the tokenizer is loaded from.
pretrained_tokenizer_name = "bert-base-cased"
tokenizer = BertTokenizer.from_pretrained(pretrained_tokenizer_name)


def tokenize_texts(texts):
    """Tokenize an iterable of texts with the notebook-level ``tokenizer``.

    Items for which ``pd.isna`` is true are replaced by an empty string; every
    other item is converted with ``str``. The tokenizer is called once for the
    whole batch with truncation enabled, ``padding=True``, ``max_length=256``
    and PyTorch tensors as the return type.

    Returns whatever the tokenizer call returns.
    """
    cleaned = []
    for text in texts:
        cleaned.append("" if pd.isna(text) else str(text))

    return tokenizer(
        cleaned,
        truncation=True,
        padding=True,
        max_length=256,
        return_tensors="pt",
    )


# Tokenize every split with its own call, in the order train, val, test.
X_train_tokenized, X_val_tokenized, X_test_tokenized = (
    tokenize_texts(split_texts) for split_texts in (X_train, X_val, X_test)
)

# -------------------------------
# 4. Prepare label tensors (for each target)
# -------------------------------
# Columns of the label arrays: 0 = priority, 1 = type, 2 = queue.
y_train_priority, y_train_type, y_train_queue = (
    torch.tensor(Y_train[:, target_idx]) for target_idx in range(3)
)

y_val_priority, y_val_type, y_val_queue = (
    torch.tensor(Y_val[:, target_idx]) for target_idx in range(3)
)

y_test_priority, y_test_type, y_test_queue = (
    torch.tensor(Y_test[:, target_idx]) for target_idx in range(3)
)

In [None]:
# -------------------------------
# 5. Wrap in dicts for dataloaders
# -------------------------------
def _bundle_split(tokenized, priority_labels, type_labels, queue_labels):
    """Collect the token tensors and the three label tensors of one split."""
    return {
        "input_ids": tokenized["input_ids"],
        "attention_mask": tokenized["attention_mask"],
        "priority": priority_labels,
        "type": type_labels,
        "queue": queue_labels,
    }


train_data = _bundle_split(
    X_train_tokenized, y_train_priority, y_train_type, y_train_queue
)
val_data = _bundle_split(X_val_tokenized, y_val_priority, y_val_type, y_val_queue)
test_data = _bundle_split(
    X_test_tokenized, y_test_priority, y_test_type, y_test_queue
)

# Write each bundle to its own file in the working directory.
for split_bundle, target_path in (
    (train_data, "train_data.pt"),
    (val_data, "validation_data.pt"),
    (test_data, "test_data.pt"),
):
    torch.save(split_bundle, target_path)

print(train_data["input_ids"].shape)
print(f"Train size: {len(X_train)} | Val: {len(X_val)} | Test: {len(X_test)}")

### Decoded preview


In [None]:
# Inspect the first five training rows: decode the token ids back to text
# (special tokens are kept) and show them next to the raw tensors and labels.
preview_rows = slice(0, 5)
decoded_texts = tokenizer.batch_decode(
    train_data["input_ids"][preview_rows], skip_special_tokens=False
)

# Build a DataFrame for human-readable inspection
preview_columns = {"decoded_body": decoded_texts}
for field in ("input_ids", "attention_mask", "priority", "type", "queue"):
    preview_columns[field] = train_data[field][preview_rows].tolist()
train_preview = pd.DataFrame(preview_columns)

# Display neatly
pd.set_option("max_colwidth", 80)
display(train_preview.head(5))

In [None]:
# check imbalance ratio
def imbalance_ratio(series):
    """Return the size of the most frequent value divided by the least frequent.

    ``series`` must offer ``value_counts()`` (e.g. a pandas Series). A result
    of 1 means every observed value occurs equally often.
    """
    class_counts = series.value_counts()
    largest = class_counts.max()
    smallest = class_counts.min()
    return largest / smallest


targets = ["priority_cat", "type_cat", "queue_cat"]

# For every target: print its imbalance ratio in the training split, followed
# by the raw class counts.
for target_column in targets:
    target_codes = train_df[target_column]
    print(
        f"{target_column.capitalize()} imbalance ratio: {imbalance_ratio(target_codes):.2f}"
    )
    print(target_codes.value_counts(), "\n")

# **Class Weighting**


In [None]:
# Balanced class weights for the type and queue targets, computed from the
# training labels only.

# Type class weights
type_train_labels = y_train_type.numpy()
type_weights = torch.tensor(
    compute_class_weight(
        class_weight="balanced",
        classes=np.unique(type_train_labels),
        y=type_train_labels,
    ),
    dtype=torch.float,
)

# Queue class weights
queue_train_labels = y_train_queue.numpy()
queue_weights = torch.tensor(
    compute_class_weight(
        class_weight="balanced",
        classes=np.unique(queue_train_labels),
        y=queue_train_labels,
    ),
    dtype=torch.float,
)

print("Type class weights:", type_weights)
print("Queue class weights:", queue_weights)

# Persist the weights. They are already float tensors, so they are saved as-is
# (re-wrapping a tensor in torch.tensor() only copies it and emits a warning).
torch.save(type_weights, "type_weights.pt")
torch.save(queue_weights, "queue_weights.pt")

# --- Display neatly ---
# These are the *type* weights; they were previously mislabelled as "Priority".
type_weight_df = pd.DataFrame(
    {"Type_Class": type_classes, "Weight": type_weights.numpy()}
)

queue_weight_df = pd.DataFrame(
    {"Queue_Class": queue_classes, "Weight": queue_weights.numpy()}
)

print("Type Class Weights")
display(type_weight_df)

print("Queue Class Weights")
display(queue_weight_df)

# **`Embedding`**


In [None]:
from transformers import BertModel

print("library done")
# Pretrained encoder used to extract one vector per training text.
bert_model = BertModel.from_pretrained("bert-base-cased")
print("load bert done")
bert_model.eval()
print("load eval done")
# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model.to(device)

# The tokenized training set is processed in slices of `batch_size` rows.
batch_size = 16
input_ids = X_train_tokenized["input_ids"]
attention_mask = X_train_tokenized["attention_mask"]
print("batching done")
cls_embeddings_list = []

batch_starts = range(0, len(input_ids), batch_size)

# No gradients are needed; tqdm shows one tick per batch.
with torch.no_grad():
    for start in tqdm(batch_starts, desc="Embedding texts with BERT"):
        stop = start + batch_size
        encoder_output = bert_model(
            input_ids=input_ids[start:stop].to(device),
            attention_mask=attention_mask[start:stop].to(device),
        )

        # Keep the hidden state at sequence position 0 (the [CLS] token) and
        # move it back to the CPU before collecting it.
        cls_embeddings_list.append(encoder_output.last_hidden_state[:, 0, :].cpu())

# Stack the per-batch results along the row axis and store them on disk.
cls_embeddings = torch.cat(cls_embeddings_list, dim=0)
filename = "cls_embeddings.pt"
torch.save(cls_embeddings, filename)

In [None]:
# Reload the stored [CLS] embeddings from disk.
filename = "cls_embeddings.pt"
loaded_cls_embeddings = torch.load(filename)
print(f"Embeddings successfully loaded. Shape: {loaded_cls_embeddings.shape}")

# Guard against a stale file: there must be one embedding row per training label.
assert len(loaded_cls_embeddings) == len(y_train_priority), (
    f"{filename} holds {len(loaded_cls_embeddings)} rows but there are "
    f"{len(y_train_priority)} training labels"
)

# Build DataFrame from the tensor that was just loaded (not from the in-memory
# `cls_embeddings`), so this cell works without re-running the embedding step.
train_embeddings_df = pd.DataFrame(loaded_cls_embeddings.numpy())
train_embeddings_df["priority"] = y_train_priority.numpy()
train_embeddings_df["type"] = y_train_type.numpy()
train_embeddings_df["queue"] = y_train_queue.numpy()

# Frequency of every (priority, type, queue) label combination. The original
# referenced `value_counts` without calling it, which only showed the method.
train_embeddings_df[["priority", "type", "queue"]].value_counts()

# **Fine-Tuning**


## **Load Model**


In [None]:
import torch
import torch.nn as nn
from transformers import BertModel


class MultiOutputBERT(nn.Module):
    """BERT encoder with three linear classification heads.

    All heads read the same representation: the encoder's last hidden state at
    sequence position 0 (the [CLS] token), passed through dropout.

    Parameters
    ----------
    model_name : str
        Name or path handed to ``BertModel.from_pretrained``.
    num_priority_classes, num_type_classes, num_queue_classes : int
        Output size of the priority, type and queue head respectively.
    dropout : float, optional
        Dropout probability applied to the [CLS] vector before the heads.
        Defaults to 0.1, the value that used to be hard-coded.
    """

    def __init__(
        self,
        model_name,
        num_priority_classes,
        num_type_classes,
        num_queue_classes,
        dropout=0.1,
    ):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)

        # Each output head corresponds to one classification target and maps
        # the encoder's hidden size to that target's number of classes.
        hidden_size = self.bert.config.hidden_size
        self.priority_head = nn.Linear(hidden_size, num_priority_classes)
        self.type_head = nn.Linear(hidden_size, num_type_classes)
        self.queue_head = nn.Linear(hidden_size, num_queue_classes)

    def forward(self, input_ids, attention_mask):
        """Return ``(priority_logits, type_logits, queue_logits)`` for a batch."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]  # [CLS] token
        cls_output = self.dropout(cls_output)

        priority_logits = self.priority_head(cls_output)
        type_logits = self.type_head(cls_output)
        queue_logits = self.queue_head(cls_output)

        return priority_logits, type_logits, queue_logits

## **Train loop**


In [None]:
from torch.optim import AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Static hyperparameters
LEARNING_RATE = 2e-5
EPOCHS = 3
BATCH_SIZE = 16
DROPOUT = 0.1  # NOTE: not passed to MultiOutputBERT below; the model applies its own setting
RANDOM_SEED = 42

# Apply the seed before the model is built (the classification heads are
# randomly initialised) and before the training loader starts shuffling.
torch.manual_seed(RANDOM_SEED)

# Initialize model: one output unit per known class of each target.
num_priority_classes = len(priority_classes)
num_type_classes = len(type_classes)
num_queue_classes = len(queue_classes)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = MultiOutputBERT(
    model_name="bert-base-cased",
    num_priority_classes=num_priority_classes,
    num_type_classes=num_type_classes,
    num_queue_classes=num_queue_classes,
).to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# Loss functions (weighted for imbalance).
# The class weights are read from the files written by the class-weighting
# step, so this cell does not rely on in-memory variables from that cell.
type_weights = torch.load("type_weights.pt")
queue_weights = torch.load("queue_weights.pt")

# Priority is left unweighted; type and queue use the balanced class weights.
criterion_priority = nn.CrossEntropyLoss()
criterion_type = nn.CrossEntropyLoss(weight=type_weights.to(device))
criterion_queue = nn.CrossEntropyLoss(weight=queue_weights.to(device))


# Each dataset item is (input_ids, attention_mask, priority, type, queue).
# Labels are cast to int64 before they are handed to CrossEntropyLoss.
train_dataset = TensorDataset(
    X_train_tokenized["input_ids"],
    X_train_tokenized["attention_mask"],
    y_train_priority.long(),
    y_train_type.long(),
    y_train_queue.long(),
)

val_dataset = TensorDataset(
    X_val_tokenized["input_ids"],
    X_val_tokenized["attention_mask"],
    y_val_priority.long(),
    y_val_type.long(),
    y_val_queue.long(),
)

# Use the BATCH_SIZE constant instead of repeating the literal 16.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

In [None]:
from transformers import trainer

def _joint_loss(batch):
    """Run the model on one batch and return the sum of the three target losses.

    ``batch`` is one item from the data loaders: input ids, attention mask and
    the priority, type and queue labels. All tensors are moved to ``device``.
    """
    input_ids, attention_mask, priority, type_, queue = (
        tensor.to(device) for tensor in batch
    )
    logits_priority, logits_type, logits_queue = model(input_ids, attention_mask)

    priority_loss = criterion_priority(logits_priority, priority)
    type_loss = criterion_type(logits_type, type_)
    queue_loss = criterion_queue(logits_queue, queue)
    return priority_loss + type_loss + queue_loss


# Mean loss per epoch, appended once per epoch.
train_losses = []
val_losses = []

for epoch in range(EPOCHS):
    print(f"\n===== Epoch {epoch+1}/{EPOCHS} =====")

    # ---- Training Phase ----
    model.train()
    total_train_loss = 0

    train_progress = tqdm(train_loader, desc="Training", leave=False)
    for batch in train_progress:
        optimizer.zero_grad()
        loss = _joint_loss(batch)
        loss.backward()
        optimizer.step()

        step_loss = loss.item()
        total_train_loss += step_loss
        train_progress.set_postfix(loss=step_loss)

    # Average over the number of batches.
    avg_train_loss = total_train_loss / len(train_loader)
    train_losses.append(avg_train_loss)
    print("train done")

    # ---- Validation Phase: same loss, no gradients, no optimizer step ----
    model.eval()
    total_val_loss = 0
    print("load val")
    val_progress = tqdm(val_loader, desc="Validating", leave=False)
    with torch.no_grad():
        for batch in val_progress:
            step_val_loss = _joint_loss(batch).item()
            total_val_loss += step_val_loss
            val_progress.set_postfix(val_loss=step_val_loss)

    avg_val_loss = total_val_loss / len(val_loader)
    val_losses.append(avg_val_loss)

    print(
        f"Epoch {epoch+1}: Train Loss = {avg_train_loss:.4f} | Val Loss = {avg_val_loss:.4f}"
    )

In [None]:
# -----------------------------
# Plot Learning Curve
# -----------------------------
# One line per loss history; the x position is the index in the list.
fig, ax = plt.subplots(figsize=(8, 5))
for history, series_label, marker in (
    (train_losses, "Training Loss", "o"),
    (val_losses, "Validation Loss", "s"),
):
    ax.plot(history, label=series_label, marker=marker)
ax.set_title("Learning Curve - Multi-Output BERT")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.legend()
ax.grid(True)
plt.show()