In [1]:
!pip install kagglehub transformers scikit-learn pandas



In [2]:
import kagglehub
import pandas as pd
from pathlib import Path
from transformers import pipeline
from sklearn.metrics import accuracy_score, f1_score

In [3]:
# Step 1: Download the dataset
# The location returned by kagglehub is kept in `path`; later cells search it for CSV files.
dataset_handle = "suraj520/customer-support-ticket-dataset"
path = kagglehub.dataset_download(dataset_handle)
print("Dataset downloaded at:", path)

Dataset downloaded at: C:\Users\YC\.cache\kagglehub\datasets\suraj520\customer-support-ticket-dataset\versions\1


In [4]:
# Step 2: Locate CSV file inside downloaded folder
# Search the download directory recursively for CSV files. The matches are
# sorted so that, if the dataset ever ships more than one CSV, the file picked
# below does not depend on filesystem traversal order.
p = Path(path)
csv_files = sorted(p.rglob("*.csv"))
if not csv_files:
    raise FileNotFoundError("No CSV files found in dataset folder.")
# Use the first CSV in sorted order as the ticket table.
csv_path = csv_files[0]
print("Using CSV file:", csv_path)

Using CSV file: C:\Users\YC\.cache\kagglehub\datasets\suraj520\customer-support-ticket-dataset\versions\1\customer_support_tickets.csv


In [5]:
# Step 3: Read the CSV into a DataFrame and take a first look at it
df = pd.read_csv(csv_path)

# Report the overall size, the available columns, and the first three rows.
column_names = df.columns.tolist()
preview = df.head(3)
print("Dataset shape:", df.shape)
print("Columns:", column_names)
print(preview)

Dataset shape: (8469, 17)
Columns: ['Ticket ID', 'Customer Name', 'Customer Email', 'Customer Age', 'Customer Gender', 'Product Purchased', 'Date of Purchase', 'Ticket Type', 'Ticket Subject', 'Ticket Description', 'Ticket Status', 'Resolution', 'Ticket Priority', 'Ticket Channel', 'First Response Time', 'Time to Resolution', 'Customer Satisfaction Rating']
   Ticket ID        Customer Name              Customer Email  Customer Age  \
0          1        Marisa Obrien  carrollallison@example.com            32   
1          2         Jessica Rios    clarkeashley@example.com            42   
2          3  Christopher Robbins   gonzalestracy@example.com            48   

  Customer Gender Product Purchased Date of Purchase      Ticket Type  \
0           Other        GoPro Hero       2021-03-22  Technical issue   
1          Female       LG Smart TV       2021-05-22  Technical issue   
2           Other          Dell XPS       2020-07-14  Technical issue   

             Ticket Subject  \

In [6]:
# Step 4: Print the column names
column_names = df.columns.tolist()
print(column_names)

['Ticket ID', 'Customer Name', 'Customer Email', 'Customer Age', 'Customer Gender', 'Product Purchased', 'Date of Purchase', 'Ticket Type', 'Ticket Subject', 'Ticket Description', 'Ticket Status', 'Resolution', 'Ticket Priority', 'Ticket Channel', 'First Response Time', 'Time to Resolution', 'Customer Satisfaction Rating']


In [7]:
# Step 5: Choose the columns that hold the ticket text and the label to predict
text_col, label_col = "Ticket Description", "Ticket Type"

# Echo the selection so the run log records which columns were used.
print(f"Using text column: {text_col}")
print(f"Using label column: {label_col}")


Using text column: Ticket Description
Using label column: Ticket Type


In [8]:
# Step 6: Build the candidate label list from the (up to) ten most frequent labels
label_counts = df[label_col].value_counts()
# Take the label names (the index of the counts) for the first ten entries.
labels = label_counts.index[:10].tolist()
print(f"Using labels (top 10): {labels}")

Using labels (top 10): ['Refund request', 'Technical issue', 'Cancellation request', 'Product inquiry', 'Billing inquiry']


In [9]:
# Step 7: Create the zero-shot classification pipeline
# A small DistilBERT MNLI checkpoint is used because it is smaller and faster on CPU.
model_name = "typeform/distilbert-base-uncased-mnli"
classifier = pipeline(task="zero-shot-classification", model=model_name)

config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/258 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


In [10]:
# Step 8: Evaluate zero-shot predictions on a random sample of labelled tickets
# Rows missing either the text or the label are dropped first, and the sample
# size is capped by the number of rows that remain. Capping by len(df) instead
# would let .sample() raise ValueError whenever fewer complete rows exist than
# the requested sample size.
labelled_df = df[[text_col, label_col]].dropna()
eval_df = labelled_df.sample(min(300, len(labelled_df)), random_state=42)
if eval_df.empty:
    # Fail early with a clear message rather than dividing by zero below.
    raise ValueError("No rows with both text and label available for evaluation.")

top1_preds = []  # best-scoring label per ticket
y_true = []      # ground-truth label per ticket, in the same order
top3_hits = 0    # tickets whose true label is among the three best-scoring labels

print("Running zero-shot classification on sample data...")

# Iterate the two columns directly; the row index is not needed.
for text, true_label in zip(eval_df[text_col], eval_df[label_col]):
    result = classifier(text, candidate_labels=labels, multi_label=True)

    # Order the candidate labels from highest to lowest score.
    ranked_labels = [
        label
        for label, score in sorted(
            zip(result["labels"], result["scores"]),
            key=lambda pair: pair[1],
            reverse=True,
        )
    ]

    top1_preds.append(ranked_labels[0])
    y_true.append(true_label)

    if true_label in ranked_labels[:3]:
        top3_hits += 1

# Top-1 metrics compare the single best label against the truth; top-3
# accuracy is the share of tickets whose true label made the top three.
acc_top1 = accuracy_score(y_true, top1_preds)
f1_macro = f1_score(y_true, top1_preds, average="macro")
acc_top3 = top3_hits / len(eval_df)

print(f"Top-1 Accuracy: {acc_top1:.4f}")
print(f"Macro F1 Score (Top-1): {f1_macro:.4f}")
print(f"Top-3 Accuracy: {acc_top3:.4f}")


Running zero-shot classification on sample data...
Top-1 Accuracy: 0.1900
Macro F1 Score (Top-1): 0.1016
Top-3 Accuracy: 0.6167


In [12]:
# Step 9: Function to predict top 3 tags for new ticket text
def predict_top3_tags(text):
    """Return the three highest-scoring candidate labels for ``text``.

    The notebook-level ``classifier`` is called with the notebook-level
    ``labels`` as candidates and ``multi_label=True``.

    Args:
        text: The ticket text to classify.

    Returns:
        A dict mapping each of the (at most) three best labels to its score as
        a Python float, inserted from highest to lowest score.
    """
    prediction = classifier(text, candidate_labels=labels, multi_label=True)

    # Pair every label with its score and order the pairs best-first.
    scored = list(zip(prediction["labels"], prediction["scores"]))
    scored.sort(key=lambda pair: pair[1], reverse=True)

    top_three = {}
    for tag, value in scored[:3]:
        top_three[tag] = float(value)
    return top_three

In [13]:
# Example usage: tag one hand-written ticket and show the result
sample_text = "I am unable to login to my account after the password reset."
print(f"\nSample text: {sample_text}")
top_tags = predict_top3_tags(sample_text)
print("Predicted top 3 tags:", top_tags)


Sample text: I am unable to login to my account after the password reset.
Predicted top 3 tags: {'Technical issue': 0.5089683532714844, 'Refund request': 0.08350994437932968, 'Cancellation request': 0.020151687785983086}
