In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import re

In [4]:
# Load dataset
# The path is relative to the notebook's working directory and points at a
# zip archive; presumably it contains a single CSV that pandas decompresses
# on the fly -- TODO confirm the archive layout.
df = pd.read_csv("../tests/data/raw/archive.zip")
# Print column names, non-null counts and dtypes as a quick schema check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8469 entries, 0 to 8468
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Ticket ID                     8469 non-null   int64  
 1   Customer Name                 8469 non-null   object 
 2   Customer Email                8469 non-null   object 
 3   Customer Age                  8469 non-null   int64  
 4   Customer Gender               8469 non-null   object 
 5   Product Purchased             8469 non-null   object 
 6   Date of Purchase              8469 non-null   object 
 7   Ticket Type                   8469 non-null   object 
 8   Ticket Subject                8469 non-null   object 
 9   Ticket Description            8469 non-null   object 
 10  Ticket Status                 8469 non-null   object 
 11  Resolution                    2769 non-null   object 
 12  Ticket Priority               8469 non-null   object 
 13  Tic

In [10]:
# Distinct values of the target column; 'Ticket Type' is the column that is
# label-encoded for classification later in the notebook.
df['Ticket Type'].unique()

array(['Technical issue', 'Billing inquiry', 'Cancellation request',
       'Product inquiry', 'Refund request'], dtype=object)

In [None]:
# Preprocessing
def clean_text(text):
    """Normalize a raw ticket string before tokenization.

    Lower-cases the input, deletes every character that is not an ASCII
    letter, digit or whitespace, and trims leading/trailing whitespace.
    Characters are deleted rather than replaced by a space, so "can't"
    becomes "cant".

    Args:
        text: The raw text. ``None`` and float NaN (the missing-value marker
            pandas uses in object columns) are treated as empty text; any
            other non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string, or ``""`` for missing input.
    """
    # Missing values would otherwise raise AttributeError on `.lower()` and
    # abort an entire `Series.apply(clean_text)` call.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text.strip()

'hello'

In [8]:
# Model input: subject and description joined by a single space, then
# normalised with clean_text.
df['full_ticket'] = df['Ticket Subject'] + ' ' + df['Ticket Description']
df['cleaned_text'] = df['full_ticket'].map(clean_text)

# Integer-encode the ticket type; the fitted encoder is kept so predictions
# can be mapped back to class names.
label_encoder = LabelEncoder().fit(df['Ticket Type'])
df['label'] = label_encoder.transform(df['Ticket Type'])

# Keep only the model input and the target, then hold out 20% of the rows.
df_new = df.loc[:, ['cleaned_text', 'label']]
train_df, test_df = train_test_split(
    df_new,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Display the training split (pandas elides the middle rows in the rendered output).
train_df

Unnamed: 0,cleaned_text,label
2794,peripheral compatibility my productpurchased i...,1
7497,delivery problem im having an issue with the p...,1
7646,hardware issue ive forgotten my password for m...,0
2726,payment issue im having an issue with the prod...,2
4348,hardware issue im having an issue with the pro...,3
...,...,...
5734,data loss im having an issue with the productp...,4
5191,account access im having an issue with the pro...,0
5390,refund request im having an issue with the pro...,1
860,data loss im having an issue with the productp...,0


In [11]:
from transformers import AutoTokenizer
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
# Tokenizer for the "distilbert-base-uncased" checkpoint -- the same checkpoint
# name the classification model is loaded from later in the notebook.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [12]:
def tokenize_function(examples):
    """Tokenize the ``cleaned_text`` field of a batch of examples.

    Each sequence is truncated to, and padded up to, 512 tokens. Uses the
    notebook-level ``tokenizer``.
    """
    texts = examples["cleaned_text"]
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(texts, **encoding_options)

In [15]:
# convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# batched=True hands tokenize_function a batch of rows per call rather than
# one row at a time.
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/6775 [00:00<?, ? examples/s]

Map: 100%|██████████| 6775/6775 [00:03<00:00, 1785.75 examples/s]
Map: 100%|██████████| 1694/1694 [00:01<00:00, 1271.65 examples/s]


In [16]:
from transformers import AutoModelForSequenceClassification

In [17]:
# Load DistilBERT with a newly initialised classification head.
# The head size and both label maps are derived from the fitted encoder so
# they cannot drift apart; class ids follow the order of
# label_encoder.classes_, not any hand-written list.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label_encoder.classes_),
    id2label={i: label for i, label in enumerate(label_encoder.classes_)},
    label2id={label: i for i, label in enumerate(label_encoder.classes_)}
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
import torch

In [24]:
def compute_metrics(p):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        p: A pair that unpacks into ``(predictions, labels)``; the predicted
            class is the argmax of ``predictions`` along axis 1.

    Returns:
        Dict with the keys ``'accuracy'`` and ``'f1'``.
    """
    scores, y_true = p
    y_pred = np.argmax(scores, axis=1)
    accuracy = accuracy_score(y_true, y_pred)
    weighted_f1 = f1_score(y_true, y_pred, average='weighted')
    return {'accuracy': accuracy, 'f1': weighted_f1}

# Fine-tuning configuration: evaluate and checkpoint once per epoch, and
# reload the best checkpoint when training finishes.
# NOTE: building TrainingArguments/Trainer with PyTorch requires the
# `accelerate` package (>=0.26.0) to be installed in the kernel environment.
training_args = TrainingArguments(
    output_dir="./results",
    # `evaluation_strategy` is the deprecated spelling of this argument;
    # current transformers releases expect `eval_strategy`.
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    # Kept on the same schedule as evaluation, as load_best_model_at_end needs.
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# NOTE(review): the held-out test split doubles as the per-epoch evaluation
# set, so it also drives best-checkpoint selection.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`

In [None]:
# Start training
# Runs the fine-tuning loop configured by training_args (10 epochs, with an
# evaluation and a checkpoint at the end of each one).
trainer.train()

In [None]:
# Evaluate
# Scores the model on the eval_dataset given to the Trainer (tokenized_test).
results = trainer.evaluate()
print(f"Final evaluation results: {results}")

# Save model
# Model and tokenizer are written to the same directory; the fitted
# label_encoder is not persisted by this cell.
model.save_pretrained("./ticket_classifier")
tokenizer.save_pretrained("./ticket_classifier")

In [None]:
def predict_ticket_type(text):
    """Classify a single raw ticket text.

    The text is normalised with ``clean_text``, tokenized (truncated to at
    most 512 tokens) and run through the notebook-level ``model``.

    Args:
        text: Raw, uncleaned ticket text.

    Returns:
        A ``(label, probabilities)`` tuple: the predicted class name decoded
        through ``label_encoder``, and the list of per-class softmax
        probabilities ordered by class id.

    Note:
        Switches ``model`` to evaluation mode as a side effect.
    """
    inputs = tokenizer(
        clean_text(text),
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    )
    # The tokenizer returns CPU tensors, but after training the model may sit
    # on an accelerator; move the inputs to wherever the model is.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
    # Disable dropout so repeated calls give the same probabilities.
    model.eval()
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    pred_id = torch.argmax(probs, dim=-1).item()
    return label_encoder.inverse_transform([pred_id])[0], probs.tolist()[0]

# Example usage
# The raw sentence is passed as-is; predict_ticket_type applies clean_text itself.
text = "I can't access my account after password reset"
predicted_label, probabilities = predict_ticket_type(text)
print(f"Predicted: {predicted_label} with probabilities {probabilities}")