<a href="https://colab.research.google.com/github/bala-ceg/AI-Customer-Support-Assistant/blob/main/AI_Customer_Support_Assistant.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Install Necessary Libraries

In [1]:
!pip install transformers datasets
!pip install transformers[torch]
!pip install accelerate -U
!pip install evaluate



### Import necessary Libraries



In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict
import torch
import evaluate

### Download and load the dataset

In [3]:
dataset = load_dataset("Kaludi/Customer-Support-Responses")
dataset = dataset['train'].train_test_split(test_size=0.2)

# Rename the splits to 'train' and 'validation'
dataset = DatasetDict({
    'train': dataset['train'],
    'validation': dataset['test']
})

print(dataset)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['query', 'response'],
        num_rows: 59
    })
    validation: Dataset({
        features: ['query', 'response'],
        num_rows: 15
    })
})


In [4]:
print(dataset['train'][0])
print(dataset['validation'][0])

{'query': 'Can I place a custom order?', 'response': "We'd be happy to assist you. Can you please provide the product name or SKU and a description of the customizations you'd like?"}
{'query': 'How do I use a gift card in-store?', 'response': 'To use a gift card in-store, simply present the gift card at the time of purchase. Can you please provide the gift card number so we can check the balance for you?'}


In [5]:
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-base')

# Define the preprocessing function
def preprocess_function(examples):
    inputs = examples['query']
    model_inputs = tokenizer(inputs, max_length=128, padding='max_length', truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['response'], max_length=128, padding='max_length', truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply the preprocessing function to the dataset
tokenized_datasets = dataset.map(preprocess_function, batched=True)

print("Tokenized datasets:", tokenized_datasets)



Map:   0%|          | 0/59 [00:00<?, ? examples/s]



Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Tokenized datasets: DatasetDict({
    train: Dataset({
        features: ['query', 'response', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 59
    })
    validation: Dataset({
        features: ['query', 'response', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 15
    })
})


In [7]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments


# Load the model
model = AutoModelForSeq2SeqLM.from_pretrained('facebook/bart-base')

# Define the data collator
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()




Epoch,Training Loss,Validation Loss
1,No log,11.179404
2,No log,10.468457
3,No log,10.266939


TrainOutput(global_step=12, training_loss=11.640157063802084, metrics={'train_runtime': 10.171, 'train_samples_per_second': 17.402, 'train_steps_per_second': 1.18, 'total_flos': 13490419138560.0, 'train_loss': 11.640157063802084, 'epoch': 3.0})

In [10]:
from datasets import load_metric

metric = load_metric("sacrebleu")
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    return result

trainer.evaluate()

The repository for sacrebleu contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/sacrebleu.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


{'eval_loss': 10.266939163208008,
 'eval_runtime': 0.2711,
 'eval_samples_per_second': 55.34,
 'eval_steps_per_second': 3.689,
 'epoch': 3.0}

In [9]:
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.4.2-py3-none-any.whl (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-2.10.0-py3-none-any.whl (18 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-2.10.0 sacrebleu-2.4.2
