# Fine-Tune a Generative AI Model for Query Response

## Install the required packages for the LLM and datasets.

In [2]:
%pip install -U datasets==2.17.0

%pip install --upgrade pip
%pip install --disable-pip-version-check \
    torch==1.13.1 \
    torchdata==0.5.1 --quiet

%pip install \
    transformers==4.27.2 \
    evaluate==0.4.0 \
    rouge_score==0.1.2  --quiet

[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.


### Import the necessary components.

In [3]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np

### Load Dataset and LLM

In [4]:
from datasets import DatasetDict

huggingface_dataset_name = "Kaludi/Customer-Support-Responses"

ds = load_dataset(huggingface_dataset_name)

ds_train_devtest = ds['train'].train_test_split(test_size=0.2, seed=42)
ds_devtest = ds_train_devtest['test'].train_test_split(test_size=0.5, seed=42)
dataset = DatasetDict({
    'train': ds_train_devtest['train'],
    'valid': ds_devtest['train'],
    'test': ds_devtest['test']
})
dataset

DatasetDict({
    train: Dataset({
        features: ['query', 'response'],
        num_rows: 59
    })
    valid: Dataset({
        features: ['query', 'response'],
        num_rows: 7
    })
    test: Dataset({
        features: ['query', 'response'],
        num_rows: 8
    })
})

In [5]:
model_name='google/flan-t5-base'

original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)



## Perform Full Fine-Tuning

### 2.1 - Preprocess the Customer Support Dataset

In [6]:
def tokenize_function(example):
    start_prompt = 'Create response to the query. \n\nQuery:'
    end_prompt = '\n\nResponse: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["query"]]
    example['input_ids'] = tokenizer(prompt, max_length=200, padding="max_length", truncation=True, return_tensors="pt").input_ids
    example['labels'] = tokenizer(example["response"], max_length=200, padding="max_length", truncation=True, return_tensors="pt").input_ids
    
    return example


tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['query', 'response',])

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

To save some time in the lab, you will subsample the dataset:

In [7]:
tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices=True)

Filter:   0%|          | 0/7 [00:00<?, ? examples/s]

Check the shapes of all three parts of the dataset:

In [8]:
print(f"Shapes of the datasets:")
print(f"Training: {tokenized_datasets['train'].shape}")
print(f"Validation: {tokenized_datasets['valid'].shape}")
print(f"Test: {tokenized_datasets['test'].shape}")

print(tokenized_datasets)

Shapes of the datasets:
Training: (1, 2)
Validation: (1, 2)
Test: (1, 2)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 1
    })
    valid: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 1
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 1
    })
})


The output dataset is ready for fine-tuning.

### Fine-Tune the Model with the Preprocessed Dataset


In [9]:
output_dir = f'./customer-support-training-{str(int(time.time()))}'

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,
    num_train_epochs=20,
    weight_decay=0.01,
    logging_steps=1,
    max_steps=50
)

trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['valid']
)

Start training process...

In [10]:
trainer.train()



Step,Training Loss
1,43.25
2,41.0
3,42.5
4,41.5
5,48.0
6,45.0
7,37.5
8,42.5
9,40.25
10,38.5


TrainOutput(global_step=50, training_loss=40.48, metrics={'train_runtime': 176.7178, 'train_samples_per_second': 2.263, 'train_steps_per_second': 0.283, 'total_flos': 13374167040000.0, 'train_loss': 40.48, 'epoch': 50.0})

In [11]:
trainer.save_model(output_dir)

Create an instance of the `AutoModelForSeq2SeqLM` class for the finetuned model:

In [12]:
instruct_model = AutoModelForSeq2SeqLM.from_pretrained(output_dir, torch_dtype=torch.bfloat16)

### Evaluate the Model Qualitatively (Human Evaluation)

In [13]:
index = 7
query = dataset['test'][index]['query']
human_baseline_response = dataset['test'][index]['response']
dash_line = '-'.join('' for x in range(100))

prompt = f"""
Create response to the query.

Query:
{query}

Response:
"""

input_ids = tokenizer(prompt, return_tensors="pt").input_ids

original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

print("Query: ",query)
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{human_baseline_response}')
print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print(dash_line)
print(f'INSTRUCT MODEL:\n{instruct_model_text_output}')

Query:  Is there a mobile app for your store?
---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
Yes, we do have a mobile app. Can you please provide your email address so we can send you a link to download the app and instructions on how to use it?
---------------------------------------------------------------------------------------------------
ORIGINAL MODEL:
Yes, there is a mobile app for your store.
---------------------------------------------------------------------------------------------------
INSTRUCT MODEL:
Yes, there is a mobile app for your store.


### Evaluate the Model Quantitatively (with ROUGE Metric)

In [14]:
rouge = evaluate.load('rouge')

Generate the outputs for the sample of the test dataset (only 10 dialogues and summaries to save time), and save the results.

In [15]:
cust_resp = dataset['test'][0:10]['query']
human_baseline_response = dataset['test'][0:10]['response']

original_model_response = []
instruct_model_response = []

for _, dialogue in enumerate(cust_resp):
    prompt = f"""
Create response to the query.

Query:
{dialogue}

Response: """
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids

    original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
    original_model_response.append(original_model_text_output)

    instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)
    instruct_model_response.append(instruct_model_text_output)
    
zipped_summaries = list(zip(human_baseline_response, original_model_response, instruct_model_response))
 
df = pd.DataFrame(zipped_summaries, columns = ['human_baseline_response', 'original_model_response', 'instruct_model_response'])
df

Unnamed: 0,human_baseline_response,original_model_response,instruct_model_response
0,We apologize for the inconvenience. Can you pl...,I'm sorry.,I'm sorry.
1,"Yes, we do have a loyalty program. Can you ple...",Yes,"Yes, I have a loyalty program."
2,We'd be happy to provide information on our pr...,I have a policy on price adjustments.,The policy is to adjust the price of a product...
3,Thank you for your interest. Can you please pr...,a store credit card,You can apply for a store credit card by visit...
4,Certainly. Can you please provide the product ...,"No problem, I'd like to order a pair of shoes.","Yes, I can."
5,We'd be happy to help. Can you please provide ...,The user manual is located on the user manual ...,The user manual is available at the following ...
6,We'd be happy to help. Can you please provide ...,What is the difference between a dollar and a ...,What is the difference between a dollar and a ...
7,"Yes, we do have a mobile app. Can you please p...","Yes, there is a mobile app for your store.","Yes, there is a mobile app for your store."


Evaluate the models computing ROUGE metrics. Notice the improvement in the results!

In [17]:
original_model_results = rouge.compute(
    predictions=original_model_response,
    references=human_baseline_response[0:len(original_model_response)],
    use_aggregator=True,
    use_stemmer=True,
)

instruct_model_results = rouge.compute(
    predictions=instruct_model_response,
    references=human_baseline_response[0:len(instruct_model_response)],
    use_aggregator=True,
    use_stemmer=True,
)

print('ORIGINAL MODEL:')
print(original_model_results)
print('INSTRUCT MODEL:')
print(instruct_model_results)

ORIGINAL MODEL:
{'rouge1': 0.11549423988448379, 'rouge2': 0.03424908424908425, 'rougeL': 0.1101760777959264, 'rougeLsum': 0.1111576733527953}
INSTRUCT MODEL:
{'rouge1': 0.19038095835693442, 'rouge2': 0.0566785287833082, 'rougeL': 0.16308584409373506, 'rougeLsum': 0.16308584409373506}


The results show substantial improvement in all ROUGE metrics:

In [18]:
print("Absolute percentage improvement of INSTRUCT MODEL over ORIGINAL MODEL")

improvement = (np.array(list(instruct_model_results.values())) - np.array(list(original_model_results.values())))
for key, value in zip(instruct_model_results.keys(), improvement):
    print(f'{key}: {value*100:.2f}%')

Absolute percentage improvement of INSTRUCT MODEL over ORIGINAL MODEL
rouge1: 7.49%
rouge2: 2.24%
rougeL: 5.29%
rougeLsum: 5.19%


In [None]:
### TEST a new query output of user choice!!!!

In [None]:
#### ENTER YOUR TEXT HERE ######
query = "Enter your query here!!!!!"



dash_line = '-'.join('' for x in range(100))

prompt = f"""
Create response to the query.

Query:
{query}

Response:
"""

input_ids = tokenizer(prompt, return_tensors="pt").input_ids

model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
model_text_output = tokenizer.decode(model_outputs[0], skip_special_tokens=True)

print("Query: ",response)
print(dash_line)
print(f'FINE-TUNED MODEL OUTPUT:\n{model_text_output}')