# Loading required libraries

In [70]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
import torch
import string
import json
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import gpt3_tokenizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Reading the dataset

In [71]:
# Load and preprocess the dataset
file_path = 'hf://datasets/Kaludi/Customer-Support-Responses/Customer-Support.csv'
df = pd.read_csv(file_path)

'(ProtocolError('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None)), '(Request ID: c790515a-08a0-42f4-bb26-b369c02adfce)')' thrown while requesting GET https://huggingface.co/datasets/Kaludi/Customer-Support-Responses/resolve/main/Customer-Support.csv
Retrying in 1s [Retry 1/5].


# Exploratory Data Analysis

In [72]:
# Display some basic information about the dataset
print(df)

# Define a function to tokenize the dialogues
def gpt3_tokenize(text):
    tokens = gpt3_tokenizer.encode(text)
    return tokens  # Return token IDs

# Test the tokenizer function
example_text = "Hello, how are you?"
tokens = gpt3_tokenize(example_text)
print(tokens)

text = " ".join(review for review in df['response'])

# Tokenize the text
tokens = word_tokenize(text)

# Remove stop words and punctuation
stop_words = set(stopwords.words('english'))
tokens = [token.lower() for token in tokens if token not in stop_words and token not in string.punctuation]

# Save the preprocessed responses as a JSON file
with open('data/response_token.json', 'a') as json_file:
    json.dump(tokens, json_file)

text = " ".join(review for review in df['query'])

# Tokenize the text
tokens = word_tokenize(text)

# Remove stop words and punctuation
stop_words = set(stopwords.words('english'))
tokens = [token.lower() for token in tokens if token not in stop_words and token not in string.punctuation]
# Save the preprocessed query as a JSON file
with open('data/query_token.json', 'a') as json_file:
    json.dump(tokens, json_file)

                                               query  \
0                       My order hasn't arrived yet.   
1                      I received a damaged product.   
2                          I need to return an item.   
3              I want to change my shipping address.   
4                   I have a question about my bill.   
..                                               ...   
69  How do I schedule a consultation or appointment?   
70                   Can I get a copy of my receipt?   
71    Can I use a competitor's coupon at your store?   
72                  Do you have a recycling program?   
73       How do I report a lost or stolen gift card?   

                                             response  
0   We apologize for the inconvenience. Can you pl...  
1   We apologize for the inconvenience. Can you pl...  
2   Certainly. Please provide your order number an...  
3   No problem. Can you please provide your order ...  
4   We'd be happy to help. Can you please provi

# Data Preprocessing and tokenization

In [73]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['query'] = df['query'].apply(preprocess_text)
df['response'] = df['response'].apply(preprocess_text)

# Adding special tokens for clarity
df['text'] = df['query'] + " <|endoftext|> " + df['response']

# training and validation dataset, keeping 20% validation dataset
train_df, val_df = train_test_split(df[['text']], test_size=0.2, random_state=42)
train_texts = train_df['text'].tolist()
val_texts = val_df['text'].tolist()

# Load the pre-trained GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Add padding token
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the data
def tokenize_function(texts):
    return tokenizer(texts, padding='max_length', truncation=True, max_length=128, return_tensors="pt")

train_encodings = tokenize_function(train_texts)
val_encodings = tokenize_function(val_texts)

# Training the model

In [74]:
class CustomerDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = item['input_ids'].clone()
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

train_dataset = CustomerDataset(train_encodings)
val_dataset = CustomerDataset(val_encodings)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,
    save_steps=50,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Fine-tune the model
trainer.train()

Step,Training Loss
10,7.3571
20,6.5569
30,5.1772
40,3.2346
50,1.7672
60,1.1712
70,1.0249
80,0.8264
90,0.7213
100,0.6445


TrainOutput(global_step=150, training_loss=2.0498246399561566, metrics={'train_runtime': 409.2503, 'train_samples_per_second': 0.721, 'train_steps_per_second': 0.367, 'total_flos': 19270287360000.0, 'train_loss': 2.0498246399561566, 'epoch': 5.0})

# Evaluate the model

In [75]:
# Evaluate the model
eval_results = trainer.evaluate()
print(f"Perplexity: {torch.exp(torch.tensor(eval_results['eval_loss']))}")

# Function to generate responses with attention mask and adjusted parameters
def generate_response(model, tokenizer, query, max_length=128, max_new_tokens=50, temperature=0.7, top_k=50, top_p=0.9, do_sample=True):
    inputs = tokenizer.encode_plus(query + " <|endoftext|>", return_tensors='pt', padding='max_length', max_length=max_length, truncation=True)
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        eos_token_id=tokenizer.eos_token_id,
        do_sample=True
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response.split('<|endoftext|>')[-1].strip().replace(query,"")

# Generate responses for some sample queries
sample_queries = [
    "i have a question about my bill.",
    "can i get a replacement part for my product?",
    "can i place a bulk order?",
    "my order hasn't arrived yet.",
    "i can't log into my account.",
    "How do I report a problem with your website?",
    "I am not able to access your website"
]

for query in sample_queries:
    response = generate_response(model, tokenizer, query)
    print(f"Query: {query}\nResponse: {response}\n")

Perplexity: 1.4876656532287598
Query: i have a question about my bill.
Response:   we'd be happy to help. can you please provide your product name, sku, or zip code so we can check the availability?

Query: can i get a replacement part for my product?
Response:   we'd be happy to help. can you please provide the product name or sku so we can determine the best fit for you?

Query: can i place a bulk order?
Response:   we can't guarantee that your order will be processed or shipped. can you please provide the product name or sku so we can send you an estimate?

Query: my order hasn't arrived yet.
Response:   we apologize for the inconvenience. can you please provide the product name, sku, or sku so we can send you instructions on how to proceed?

Query: i can't log into my account.
Response:   we apologize for the inconvenience. can you please provide your email address so we can send you an update?

Query: How do I report a problem with your website?
Response:   we'd be happy to help. 

# Demo

In [76]:
# Jupyter notebook cell to create a simple demo
import ipywidgets as widgets
from IPython.display import display

#tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
#model = GPT2LMHeadModel.from_pretrained("gpt2")

# Define the function to handle the query and generate response
def on_button_click(b):
    query = query_input.value
    response = generate_response(model, tokenizer, query)
    response_output.value = f"Response: {response}"

# Create input and output widgets
query_input = widgets.Textarea(
    value='',
    placeholder='Type your query here',
    description='Query:',
    disabled=False,
    layout=widgets.Layout(width='100%', height='100px')
)

response_output = widgets.Textarea(
    value='',
    placeholder='Response will appear here',
    description='',
    disabled=True,
    layout=widgets.Layout(width='100%', height='100px')
)

# Create a button widget
button = widgets.Button(
    description='Get Response',
    disabled=False,
    button_style='info',  # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Click to get response',
    icon='check'  # (FontAwesome names without the `fa-` prefix)
)

# Assign the button click event
button.on_click(on_button_click)

# Display the widgets
display(query_input, button, response_output)


Textarea(value='', description='Query:', layout=Layout(height='100px', width='100%'), placeholder='Type your q…

Button(button_style='info', description='Get Response', icon='check', style=ButtonStyle(), tooltip='Click to g…

Textarea(value='', disabled=True, layout=Layout(height='100px', width='100%'), placeholder='Response will appe…