In [1]:
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel
from torch import nn
from torch.utils.data import Dataset, DataLoader


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the labelled helpdesk tickets; later cells read the "text", "category",
# "type" and "priority" columns from this frame.
# NOTE(review): the path previously read "helpdesk_datasetxlsx" (no dot before the
# extension), which looks like a typo — confirm the actual file name on disk.
df = pd.read_excel("helpdesk_dataset.xlsx")

df.head()


Unnamed: 0,text,category,type,priority
0,My laptop is overheating and shuts down freque...,Hardware Issue,Overheating,High
1,Laptop keyboard is not responding properly.,Hardware Issue,Keyboard Failure,Medium
2,Monitor display is flickering and showing lines.,Hardware Issue,Display Malfunction,High
3,Hard disk making clicking noises and failing t...,Hardware Issue,Hard Disk Error,Critical
4,Battery drains very fast even when not in use.,Hardware Issue,Battery Issue,Medium


In [3]:
# One LabelEncoder per target, kept around so predictions can later be mapped
# back to the original label strings with inverse_transform.
cat_encoder, type_encoder, prio_encoder = LabelEncoder(), LabelEncoder(), LabelEncoder()

# Add an integer-coded "<target>_encoded" column for each of the three targets.
for target_col, target_encoder in (
    ("category", cat_encoder),
    ("type", type_encoder),
    ("priority", prio_encoder),
):
    df[f"{target_col}_encoded"] = target_encoder.fit_transform(df[target_col])


In [4]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split repeatable across runs.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Display the (rows, columns) shape of each split.
(train_df.shape, val_df.shape)


((48, 7), (12, 7))

In [5]:
class HelpdeskDataset(Dataset):
    """Dataset that tokenizes one ticket per item and returns its three labels.

    Args:
        dataframe: Frame with a "text" column plus integer-coded
            "category_encoded", "type_encoded" and "priority_encoded" columns.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=max_len, return_tensors="pt")``.
            It must return a mapping with "input_ids" and "attention_mask"
            tensors that carry a leading batch dimension of size 1.
        max_len: Length every example is padded/truncated to.
    """

    # Target names; each one is read from the "<name>_encoded" column.
    _TARGETS = ("category", "type", "priority")

    def __init__(self, dataframe, tokenizer, max_len=128):
        self.df = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the tokenized text and label tensors for the row at position ``idx``.

        Returns:
            dict with keys "input_ids", "attention_mask", "category", "type"
            and "priority"; the three labels are 0-d ``torch.long`` tensors.
        """
        # Positional lookup: the frame may keep a shuffled index after splitting.
        row = self.df.iloc[idx]

        encoding = self.tokenizer(
            row["text"],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )

        item = {
            # squeeze(0) removes only the batch dimension. A bare squeeze() would
            # also collapse the sequence dimension when max_len == 1, producing
            # 0-d tensors that no longer batch into (batch, seq) inputs.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
        }
        for target in self._TARGETS:
            # int() normalizes numpy/pandas scalar types before building the tensor.
            item[target] = torch.tensor(int(row[f"{target}_encoded"]), dtype=torch.long)
        return item


In [6]:
# Tokenizer loaded from the same "bert-base-uncased" checkpoint name the model uses.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Wrap each split in a HelpdeskDataset (default max_len).
train_dataset, val_dataset = (
    HelpdeskDataset(split_frame, tokenizer) for split_frame in (train_df, val_df)
)

# Only the training loader shuffles; validation keeps the frame's row order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)
val_loader = DataLoader(val_dataset, batch_size=8)


In [7]:
class MultiTaskModel(nn.Module):
    """BERT encoder shared by three linear classification heads.

    Args:
        n_cat: Number of output classes for the category head.
        n_type: Number of output classes for the type head.
        n_prio: Number of output classes for the priority head.
        model_name: Checkpoint passed to ``BertModel.from_pretrained``.
        dropout: Dropout probability applied to the pooled token before the heads.
    """

    def __init__(self, n_cat, n_type, n_prio, model_name="bert-base-uncased", dropout=0.2):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)

        # Read the encoder width from its config instead of hard-coding 768, so
        # the heads stay correctly sized if a different checkpoint is used.
        hidden_size = self.bert.config.hidden_size

        self.dropout = nn.Dropout(dropout)

        # Three separate output heads
        self.category_head = nn.Linear(hidden_size, n_cat)
        self.type_head = nn.Linear(hidden_size, n_type)
        self.priority_head = nn.Linear(hidden_size, n_prio)

    def forward(self, input_ids, attention_mask):
        """Run the encoder and return ``(category, type, priority)`` logits.

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Attention mask tensor forwarded to the encoder.

        Returns:
            Tuple of three logit tensors, one per head, in the order
            category, type, priority.
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # Use the hidden state at sequence position 0 (the CLS token) as the
        # shared representation fed to every head.
        cls_output = outputs.last_hidden_state[:, 0, :]
        cls_output = self.dropout(cls_output)

        return (
            self.category_head(cls_output),
            self.type_head(cls_output),
            self.priority_head(cls_output)
        )


In [8]:
# Size each head from the number of classes its label encoder was fitted on.
head_sizes = {
    "n_cat": len(cat_encoder.classes_),
    "n_type": len(type_encoder.classes_),
    "n_prio": len(prio_encoder.classes_),
}
model = MultiTaskModel(**head_sizes)

# The same cross-entropy criterion is reused for all three heads.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)


In [12]:
EPOCHS = 15

for epoch in range(EPOCHS):
    model.train()
    # Running sum of per-batch losses; the printed value is this sum, not a mean.
    total_loss = 0

    for batch in train_loader:
        optimizer.zero_grad()

        # The model returns logits in the order (category, type, priority);
        # line the targets up in the same order.
        head_logits = model(batch["input_ids"], batch["attention_mask"])
        head_targets = (batch["category"], batch["type"], batch["priority"])

        loss_cat, loss_type, loss_prio = (
            loss_fn(logits, target)
            for logits, target in zip(head_logits, head_targets)
        )

        # Unweighted sum of the three task losses drives a single backward pass.
        loss = loss_cat + loss_type + loss_prio
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {total_loss:.4f}")


Epoch 1/15, Loss: 33.8695
Epoch 2/15, Loss: 30.9557
Epoch 3/15, Loss: 28.8137
Epoch 4/15, Loss: 26.1264
Epoch 5/15, Loss: 24.2571
Epoch 6/15, Loss: 22.4835
Epoch 7/15, Loss: 21.1793
Epoch 8/15, Loss: 19.2475
Epoch 9/15, Loss: 17.4902
Epoch 10/15, Loss: 15.6840
Epoch 11/15, Loss: 14.9437
Epoch 12/15, Loss: 14.0249
Epoch 13/15, Loss: 12.9168
Epoch 14/15, Loss: 12.2029
Epoch 15/15, Loss: 11.4751


In [13]:
def predict(text):
    """Classify one ticket text and return its predicted labels.

    Uses the notebook-level ``model``, ``tokenizer`` and the three fitted label
    encoders. Puts the model in eval mode as a side effect.

    Args:
        text: Ticket text to classify.

    Returns:
        dict with keys "category", "type" and "priority" holding the decoded
        label for each head.
    """
    model.eval()

    enc = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors="pt",
    )

    # Inference only: no gradients needed.
    with torch.no_grad():
        head_logits = model(enc["input_ids"], enc["attention_mask"])

    # Pair each head's logits with the encoder that maps its class index back
    # to the original label string (order: category, type, priority).
    decoders = (cat_encoder, type_encoder, prio_encoder)
    labels = [
        decoder.inverse_transform([logits.argmax().item()])[0]
        for decoder, logits in zip(decoders, head_logits)
    ]

    return dict(zip(("category", "type", "priority"), labels))


In [14]:
# Smoke-test the trained model on a single hand-written ticket.
sample_ticket = "My laptop overheats and turns off automatically."
predict(sample_ticket)


{'category': 'VPN Issue', 'type': 'Keyboard Failure', 'priority': 'High'}