In [40]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import torch


In [41]:
# Load the ticket CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("Cleaned_Tickets.csv")  # replace with your actual filename
# Sanity-check the frame's dimensions and column names before any processing.
print(df.shape)
print(df.columns)
# Last expression of the cell: renders the first five rows.
df.head()

(16337, 14)
Index(['subject', 'body', 'answer', 'type', 'department', 'priority', 'tag_1',
       'tag_2', 'tag_3', 'tag_4', 'tag_5', 'tag_6', 'tag_7', 'tag_8'],
      dtype='object')


Unnamed: 0,subject,body,answer,type,department,priority,tag_1,tag_2,tag_3,tag_4,tag_5,tag_6,tag_7,tag_8
0,Account Disruption,"Dear Customer Support Team, I am writing to re...",Thank you for reaching out. We are aware of th...,Incident,Technical Support,High,Account,Disruption,Outage,Disruption,Outage,IT,Tech Support,
1,Query About Smart Home System Integration Feat...,"Dear Customer Support Team, I hope this messag...",Thank you for your inquiry. Our products suppo...,Request,Returns And Exchanges,Medium,Product,Feature,Tech Support,Feature,Tech Support,,,
2,Inquiry Regarding Invoice Details,"Dear Customer Support Team, I hope this messag...",We appreciate you reaching out with your billi...,Request,Billing And Payments,Low,Billing,Payment,Account,Payment,Account,Documentation,Feedback,
3,Question About Marketing Agency Software Compa...,"Dear Support Team, I hope this message reaches...",Thank you for your inquiry. Our product suppor...,Problem,Sales And Pre-Sales,Medium,Product,Feature,Feedback,Feature,Feedback,Tech Support,,
4,Feature Query,"Dear Customer Support, I hope this message rea...",Thank you for your inquiry. Please specify whi...,Request,Technical Support,High,Feature,Product,Documentation,Product,Documentation,Feedback,,


In [42]:
# Missing-value count per column (isna is the canonical spelling of isnull).
print(df.isna().sum())

subject           0
body              0
answer            3
type              0
department        0
priority          0
tag_1             0
tag_2             0
tag_3             0
tag_4          2556
tag_5          2603
tag_6          3988
tag_7          9317
tag_8         13637
dtype: int64


In [43]:
# Gather every non-null tag from the first three tag columns into one flat list,
# walking the frame row by row. Only tag_1..tag_3 feed the label vocabulary;
# tag_4..tag_8 are not used here.
tag_columns = ["tag_1", "tag_2", "tag_3"]
all_tags = [
    tag
    for row_tags in df[tag_columns].to_numpy()
    for tag in row_tags
    if pd.notnull(tag)
]

In [44]:
# Build the tag vocabulary. Ids follow the sorted order of the distinct tags,
# so the mapping is deterministic for a given set of tags.
id2tag = dict(enumerate(sorted(set(all_tags))))
tag2id = {tag: idx for idx, tag in id2tag.items()}
num_tags = len(id2tag)

In [45]:
def encode_tags(row):
    """Turn one ticket row into a multi-hot label vector.

    Reads the columns listed in the module-level ``tag_columns`` from ``row``
    and sets position ``tag2id[tag]`` to 1.0 for every tag that is present.
    Null tags and tags missing from ``tag2id`` are skipped.

    Returns:
        A float32 NumPy array of length ``num_tags``.
    """
    multi_hot = np.zeros(num_tags, dtype=np.float32)
    for col in tag_columns:
        tag = row[col]
        if pd.notnull(tag) and tag in tag2id:
            multi_hot[tag2id[tag]] = 1.0
    return multi_hot

# Encode every ticket row with encode_tags and store the result per row in a new "tag_vector" column.
df["tag_vector"] = df.apply(encode_tags, axis=1)

In [46]:
# Load the tokenizer for the "bert-base-cased" checkpoint.
# NOTE(review): the model trained on these encodings presumably has to use the same checkpoint — confirm.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [47]:
# Model input: subject and body joined by a single space; missing values become "".
df["text"] = df["subject"].fillna("").str.cat(df["body"].fillna(""), sep=" ")

In [48]:
# 80/10/10 split: hold out 20% of the rows, then halve that hold-out into
# validation and test. Both calls pass a fixed random_state so the partition
# is the same on every run.
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

In [49]:
# Token-length audit: the longest text (in word-piece tokens) and how many texts
# would be truncated by the max_length=256 used when encoding.
# tokenizer.tokenize() returns only the text's own tokens; the encoding call also
# adds the model's special tokens, and those count against max_length. Include
# them in the overflow check so texts just under 256 tokens are not missed.
n_special = tokenizer.num_special_tokens_to_add(pair=False)
token_lens = [len(tokenizer.tokenize(t)) for t in df["text"]]
max(token_lens), sum(n_tokens + n_special > 256 for n_tokens in token_lens)

(221, 0)

In [50]:
def tokenize_texts(text_list, max_length=256):
    """Tokenize a list of strings into padded PyTorch tensors.

    Uses the module-level ``tokenizer``.

    Args:
        text_list: List of strings to encode.
        max_length: Maximum encoded sequence length; longer inputs are
            truncated. Defaults to 256, the value that used to be hard-coded,
            so existing calls behave exactly as before.

    Returns:
        The tokenizer's batch encoding with PyTorch tensors
        (``return_tensors="pt"``). With ``padding=True`` the sequences are
        padded to the longest sequence in ``text_list``.
    """
    return tokenizer(
        text_list,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

# Encode each split on its own, in train / val / test order.
train_enc, val_enc, test_enc = (
    tokenize_texts(split_df["text"].tolist())
    for split_df in (train_df, val_df, test_df)
)

In [51]:
class TicketDataset(Dataset):
    """Dataset pairing tokenized tickets with their multi-hot tag labels.

    Args:
        encodings: Mapping with "input_ids" and "attention_mask" entries, each
            indexable by sample position.
        tag_vectors: Sequence of per-sample label vectors, in the same order
            as the encodings.
    """

    def __init__(self, encodings, tag_vectors):
        self.tag_vectors = tag_vectors
        self.encodings = encodings

    def __len__(self):
        # The number of samples is taken from the label vectors.
        return len(self.tag_vectors)

    def __getitem__(self, idx):
        """Return the features and a float32 label tensor for sample ``idx``."""
        item = {
            key: self.encodings[key][idx]
            for key in ("input_ids", "attention_mask")
        }
        # A new float32 tensor is created from the stored label vector on each access.
        item["labels"] = torch.tensor(self.tag_vectors[idx], dtype=torch.float32)
        return item

In [52]:
# Wrap each split's encodings and label vectors in a TicketDataset.
train_dataset, val_dataset, test_dataset = (
    TicketDataset(enc, split_df["tag_vector"].tolist())
    for enc, split_df in (
        (train_enc, train_df),
        (val_enc, val_df),
        (test_enc, test_df),
    )
)

# Batch the three datasets. Only the training loader shuffles.
# The shuffle draws from an explicitly seeded generator so the training batch
# order is reproducible after a kernel restart, matching the fixed
# random_state=42 used for the train/val/test split.
shuffle_generator = torch.Generator().manual_seed(42)
train_loader = DataLoader(
    train_dataset, batch_size=16, shuffle=True, generator=shuffle_generator
)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

In [53]:
# Spot-check one training example by printing its label tensor.
sample = train_dataset[0]  # index 0 is arbitrary; any valid index works
print("Label (multi-hot):", sample["labels"])


Label (multi-hot): tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0