In [3]:
!pip install pandas scikit-learn datasets transformers




In [5]:
from google.colab import files

uploaded = files.upload()


Saving customer_support_tickets.csv to customer_support_tickets.csv


# Loading Dataset

In [6]:
import pandas as pd

# Load dataset
df = pd.read_csv('customer_support_tickets.csv')

df.head()



Unnamed: 0,Ticket ID,Customer Name,Customer Email,Customer Age,Customer Gender,Product Purchased,Date of Purchase,Ticket Type,Ticket Subject,Ticket Description,Ticket Status,Resolution,Ticket Priority,Ticket Channel,First Response Time,Time to Resolution,Customer Satisfaction Rating
0,1,Marisa Obrien,carrollallison@example.com,32,Other,GoPro Hero,2021-03-22,Technical issue,Product setup,I'm having an issue with the {product_purchase...,Pending Customer Response,,Critical,Social media,2023-06-01 12:15:36,,
1,2,Jessica Rios,clarkeashley@example.com,42,Female,LG Smart TV,2021-05-22,Technical issue,Peripheral compatibility,I'm having an issue with the {product_purchase...,Pending Customer Response,,Critical,Chat,2023-06-01 16:45:38,,
2,3,Christopher Robbins,gonzalestracy@example.com,48,Other,Dell XPS,2020-07-14,Technical issue,Network problem,I'm facing a problem with my {product_purchase...,Closed,Case maybe show recently my computer follow.,Low,Social media,2023-06-01 11:14:38,2023-06-01 18:05:38,3.0
3,4,Christina Dillon,bradleyolson@example.org,27,Female,Microsoft Office,2020-11-13,Billing inquiry,Account access,I'm having an issue with the {product_purchase...,Closed,Try capital clearly never color toward story.,Low,Social media,2023-06-01 07:29:40,2023-06-01 01:57:40,3.0
4,5,Alexander Carroll,bradleymark@example.com,67,Female,Autodesk AutoCAD,2020-02-04,Billing inquiry,Data loss,I'm having an issue with the {product_purchase...,Closed,West decision evidence bit.,Low,Email,2023-06-01 00:12:42,2023-06-01 19:53:42,1.0


In [7]:

df = df[['Ticket ID', 'Ticket Description', 'Ticket Type']]

# Drop missing values in important columns
df = df.dropna(subset=['Ticket Description', 'Ticket Type'])

# Check unique ticket types
print("Unique ticket types:", df['Ticket Type'].unique())


Unique ticket types: ['Technical issue' 'Billing inquiry' 'Cancellation request'
 'Product inquiry' 'Refund request']


In [8]:
from transformers import pipeline

# Candidate labels = all unique ticket types
candidate_labels = list(df['Ticket Type'].unique())

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Pick an example ticket to test
text = df['Ticket Description'].iloc[0]

result = classifier(text, candidate_labels, multi_label=True)

print("Ticket description:", text)
print("\nTop 3 predicted types:")
for label, score in zip(result['labels'][:3], result['scores'][:3]):
    print(f"{label}: {score:.2f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


Ticket description: I'm having an issue with the {product_purchased}. Please assist.

Your billing zip code is: 71701.

We appreciate that you have requested a website address.

Please double check your email address. I've tried troubleshooting steps mentioned in the user manual, but the issue persists.

Top 3 predicted types:
Technical issue: 0.87
Billing inquiry: 0.84
Product inquiry: 0.82


In [9]:
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Encode target labels
le = LabelEncoder()
df['label'] = le.fit_transform(df['Ticket Type'])

# Split train/test (80/20)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

print(f"Train: {len(train_df)}, Test: {len(test_df)}")

# Convert to Hugging Face Dataset
train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)


Train: 6775, Test: 1694


In [10]:
from transformers import AutoTokenizer

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    return tokenizer(examples["Ticket Description"], padding="max_length", truncation=True)

# Tokenize
train_ds = train_ds.map(preprocess_function, batched=True)
test_ds = test_ds.map(preprocess_function, batched=True)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/6775 [00:00<?, ? examples/s]

Map:   0%|          | 0/1694 [00:00<?, ? examples/s]

In [11]:
from transformers import AutoModelForSequenceClassification

num_labels = len(le.classes_)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
!pip install -U transformers




In [12]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    report_to=None   # <-- this disables wandb / tensorboard / comet
)


In [21]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
)


  trainer = Trainer(


In [14]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    report_to=[]   # <-- disables wandb / tensorboard / comet
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
)

trainer.train()


  trainer = Trainer(


Step,Training Loss
10,1.6601
20,1.6047
30,1.659
40,1.6241
50,1.6276
60,1.5926
70,1.6142
80,1.6149
90,1.637
100,1.642


TrainOutput(global_step=2541, training_loss=1.6155190438747593, metrics={'train_runtime': 1973.623, 'train_samples_per_second': 10.298, 'train_steps_per_second': 1.287, 'total_flos': 5347876245580800.0, 'train_loss': 1.6155190438747593, 'epoch': 3.0})

In [15]:
import numpy as np
from sklearn.metrics import classification_report

# Get predictions on test set
outputs = trainer.predict(test_ds)

# Raw logits from model
logits = outputs.predictions

# Convert logits to predicted label indices
pred_labels = np.argmax(logits, axis=1)

# True labels
true_labels = np.array(test_df['label'])

# Print accuracy, precision, recall, F1
print(classification_report(true_labels, pred_labels, target_names=le.classes_))


                      precision    recall  f1-score   support

     Billing inquiry       0.00      0.00      0.00       327
Cancellation request       0.00      0.00      0.00       339
     Product inquiry       0.00      0.00      0.00       328
      Refund request       0.15      0.05      0.07       351
     Technical issue       0.21      0.94      0.34       349

            accuracy                           0.20      1694
           macro avg       0.07      0.20      0.08      1694
        weighted avg       0.07      0.20      0.08      1694



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
