In [1]:
!pip install -q kagglehub transformers openai pandas


In [2]:
!pip install -q transformers datasets scikit-learn


In [3]:
import kagglehub
import pandas as pd

# Download dataset
path = kagglehub.dataset_download("suraj520/customer-support-ticket-dataset")
print("Dataset path:", path)



Downloading from https://www.kaggle.com/api/v1/datasets/download/suraj520/customer-support-ticket-dataset?dataset_version_number=1...


100%|██████████| 828k/828k [00:00<00:00, 1.13MB/s]

Extracting files...
Dataset path: /root/.cache/kagglehub/datasets/suraj520/customer-support-ticket-dataset/versions/1





In [4]:
import pandas as pd

# Load your preprocessed dataset (assuming combined subject + description)
df = pd.read_csv("/root/.cache/kagglehub/datasets/suraj520/customer-support-ticket-dataset/versions/1/customer_support_tickets.csv")

# Combine subject + description into a single field
df['ticket_text'] = df['Ticket Subject'].fillna('') + " — " + df['Ticket Description'].fillna('')
df['label'] = df['Ticket Type']

# Drop missing and duplicates
df = df[['ticket_text', 'label']].dropna().drop_duplicates().reset_index(drop=True)

# Encode labels
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])

df.head()


Unnamed: 0,ticket_text,label,label_encoded
0,Product setup — I'm having an issue with the {...,Technical issue,4
1,Peripheral compatibility — I'm having an issue...,Technical issue,4
2,Network problem — I'm facing a problem with my...,Technical issue,4
3,Account access — I'm having an issue with the ...,Billing inquiry,0
4,Data loss — I'm having an issue with the {prod...,Billing inquiry,0


In [5]:
from sklearn.model_selection import train_test_split

# Split into train/val
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['label_encoded'], random_state=42)

# Convert to Hugging Face format
from datasets import Dataset

train_ds = Dataset.from_pandas(train_df[['ticket_text', 'label_encoded']])
val_ds = Dataset.from_pandas(val_df[['ticket_text', 'label_encoded']])


In [6]:
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Split into train/validation
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['label_encoded'], random_state=42)

# Convert to Hugging Face Datasets
train_ds = Dataset.from_pandas(train_df[['ticket_text', 'label_encoded']])
val_ds = Dataset.from_pandas(val_df[['ticket_text', 'label_encoded']])


In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize(example):
    return tokenizer(example["ticket_text"], truncation=True, padding="max_length", max_length=128)
# After converting to Hugging Face datasets
train_ds = Dataset.from_pandas(train_df[['ticket_text', 'label_encoded']])
val_ds = Dataset.from_pandas(val_df[['ticket_text', 'label_encoded']])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [8]:
# Rename label_encoded to labels in both datasets
train_ds = train_ds.rename_column("label_encoded", "labels")
val_ds = val_ds.rename_column("label_encoded", "labels")


In [9]:
train_ds = train_ds.map(tokenize, batched=True)
val_ds = val_ds.map(tokenize, batched=True)


Map:   0%|          | 0/6741 [00:00<?, ? examples/s]

Map:   0%|          | 0/1686 [00:00<?, ? examples/s]

In [11]:
from transformers import AutoModelForSequenceClassification

# Define number of classes
num_labels = df['label_encoded'].nunique()

# Load DistilBERT model for classification
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels
)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir="./logs",
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,

)


  trainer = Trainer(


In [17]:
trainer.train()




<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mamnarahman00[0m ([33mamnarahman[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,1.6175
20,1.616
30,1.5971
40,1.614
50,1.6057
60,1.6032
70,1.6049
80,1.6278
90,1.6154
100,1.613


TrainOutput(global_step=1266, training_loss=1.6110887248745838, metrics={'train_runtime': 262.0563, 'train_samples_per_second': 77.17, 'train_steps_per_second': 4.831, 'total_flos': 669757881381120.0, 'train_loss': 1.6110887248745838, 'epoch': 3.0})

In [18]:
eval_results = trainer.evaluate()
print("📊 Evaluation Results:", eval_results)


📊 Evaluation Results: {'eval_loss': 1.6094436645507812, 'eval_runtime': 6.0427, 'eval_samples_per_second': 279.016, 'eval_steps_per_second': 17.542, 'epoch': 3.0}


In [19]:
import numpy as np

# Get model predictions (logits)
pred_logits = trainer.predict(val_ds).predictions

# Get top 3 predicted label indices
top3_indices = np.argsort(pred_logits, axis=1)[:, -3:][:, ::-1]

# Decode label indices back to original tags
top3_labels = [[le.inverse_transform([i])[0] for i in row] for row in top3_indices]

# Add to val_df for comparison
val_df = val_df.reset_index(drop=True)
val_df['Top3_Predicted'] = top3_labels

val_df[['ticket_text', 'label', 'Top3_Predicted']].head(10)


Unnamed: 0,ticket_text,label,Top3_Predicted
0,Installation support — I've forgotten my passw...,Cancellation request,"[Refund request, Technical issue, Cancellation..."
1,Cancellation request — I'm having an issue wit...,Cancellation request,"[Technical issue, Cancellation request, Refund..."
2,Hardware issue — I've accidentally deleted imp...,Product inquiry,"[Refund request, Technical issue, Cancellation..."
3,Software bug — I'm having an issue with the {p...,Technical issue,"[Refund request, Technical issue, Cancellation..."
4,Display issue — My {product_purchased} is maki...,Billing inquiry,"[Refund request, Technical issue, Cancellation..."
5,Battery life — I'm having an issue with the {p...,Refund request,"[Refund request, Technical issue, Cancellation..."
6,Account access — I'm having an issue with the ...,Product inquiry,"[Refund request, Product inquiry, Technical is..."
7,Refund request — I'm having an issue with the ...,Refund request,"[Technical issue, Cancellation request, Produc..."
8,Display issue — I'm having an issue with the {...,Refund request,"[Refund request, Product inquiry, Technical is..."
9,Cancellation request — I'm having an issue wit...,Billing inquiry,"[Refund request, Technical issue, Cancellation..."


In [20]:
val_df.to_csv("fine_tuned_ticket_predictions.csv", index=False)
print("✅ Saved to fine_tuned_ticket_predictions.csv")


✅ Saved to fine_tuned_ticket_predictions.csv


In [21]:
val_df['Match_FineTuned'] = val_df.apply(lambda x: x['label'] in x['Top3_Predicted'], axis=1)
fine_tuned_top3_acc = val_df['Match_FineTuned'].mean()

print(f"🎯 Fine-tuned Top-3 Accuracy: {fine_tuned_top3_acc:.2%}")


🎯 Fine-tuned Top-3 Accuracy: 61.03%
