In [1]:
pip install transformers datasets torch

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

In [93]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch

import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [94]:
dataset = load_dataset("csv", data_files={"train": "/content/sample_data/train_5v5GIB2.csv"})


In [95]:
# Initialize tokenizer and model
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=10)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [96]:
# Combine text fields
def combine_text(examples):
    combined_text = [f"{subject} {body}" for subject, body in zip(examples['ticket_subject'], examples['ticket_body'])]
    return {'text': combined_text, 'department': examples['department']}

In [97]:
def process_labels(examples):
    # Mapping department names to numerical labels
    label_map = {
        'Technical Support': 0,
        'Customer Service': 1,
        'Billing and Payments': 2,
        'Product Support': 3,
        'IT Support': 4,
        'Returns and Exchanges': 5,
        'Sales and Pre-Sales': 6,
        'Human Resources': 7,
        'Service Outages and Maintenance': 8,
        'General Inquiry': 9
    }

    # Strip any leading/trailing whitespaces
    examples['department'] = [dept.strip() for dept in examples['department']]

    # Convert department names to numerical values using the label map
    examples['department'] = [label_map.get(dept, -1) for dept in examples['department']]  # Default to -1 if not found

    # Optional: Handle unexpected department values (for debugging purposes)
    for dept in examples['department']:
        if dept == -1:
            print(f"Unexpected department value found")

    return examples

In [98]:
# Tokenization function
def tokenize_function(examples):
    examples = combine_text(examples)
    examples = process_labels(examples)

    model_inputs = tokenizer(examples["text"], padding="max_length", truncation=True)
    model_inputs["labels"] = examples["department"]  # Add labels to the tokenized dataset
    return model_inputs

In [99]:
# Apply the processing to the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Check processed dataset (optional)
print(tokenized_datasets['train'][:5])  # Verify if department column is now numerical

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

{'ticket_ID': [1001, 1002, 1003, 1004, 1005], 'ticket_subject': ['Discrepancia de facturación en Google Workspace', 'Urgent Consultation Request for Critical IT Issues', 'Consulta sobre Servicios de Consultoría en TI', 'Demande de mise à jour des dossiers', 'Issues with Slack connection affecting team communication today'], 'ticket_body': ['Estimado equipo de soporte de TI,\n\nEstoy escribiendo para informar un monto de facturación incorrecto en mi suscripción de Google Workspace Business Standard bajo la cuenta <acc_num>. Por favor, revise y ajuste la factura. Espero su pronta respuesta.\n\nSaludos,\n\n<name>', 'Dear IT Services Support Team, I hope this message finds you well. My name is <name> and I am currently experiencing critical issues with our server administration, which are significantly impacting our operations. We rely heavily on your IT Consulting Service for our ongoing technical needs. Therefore, we urgently need your expert assistance to resolve these issues as quickly

In [87]:
# Check the tokenized dataset
print(tokenized_datasets['train'][:5])

{'ticket_ID': [1001, 1002, 1003, 1004, 1005], 'ticket_subject': ['Discrepancia de facturación en Google Workspace', 'Urgent Consultation Request for Critical IT Issues', 'Consulta sobre Servicios de Consultoría en TI', 'Demande de mise à jour des dossiers', 'Issues with Slack connection affecting team communication today'], 'ticket_body': ['Estimado equipo de soporte de TI,\n\nEstoy escribiendo para informar un monto de facturación incorrecto en mi suscripción de Google Workspace Business Standard bajo la cuenta <acc_num>. Por favor, revise y ajuste la factura. Espero su pronta respuesta.\n\nSaludos,\n\n<name>', 'Dear IT Services Support Team, I hope this message finds you well. My name is <name> and I am currently experiencing critical issues with our server administration, which are significantly impacting our operations. We rely heavily on your IT Consulting Service for our ongoing technical needs. Therefore, we urgently need your expert assistance to resolve these issues as quickly

In [88]:
# Train/test split
train_test_split = tokenized_datasets["train"].train_test_split(test_size=0.2)
tokenized_datasets = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test']
})

In [89]:
# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)



In [92]:
print(set(tokenized_datasets['train']['department']))  # Check the unique labels in the dataset

{'Sales and Pre-Sales', 'General Inquiry', 'Customer Service', 'Returns and Exchanges', 'Product Support', 'Service Outages and Maintenance', 'Technical Support'}


In [90]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [91]:
# Train the model
trainer.train()

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
