## Data Preparation

In [86]:
import pandas as pd
import json

# Load the Bitext customer-support sample dataset into a DataFrame.
# NOTE(review): the hf:// scheme presumably relies on the huggingface_hub fsspec backend - confirm it is installed.
dataset_url = "hf://datasets/bitext/Bitext-customer-support-llm-chatbot-training-dataset/Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv"
df = pd.read_csv(dataset_url)


In [87]:
# Tally how many rows belong to each support category.
category_counts = df['category'].value_counts()
category_counts

category
ACCOUNT         5986
ORDER           3988
REFUND          2992
CONTACT         1999
INVOICE         1999
PAYMENT         1998
FEEDBACK        1997
DELIVERY        1994
SHIPPING        1970
SUBSCRIPTION     999
CANCEL           950
Name: count, dtype: int64

In [88]:
# Draw a reproducible 10% random sample of the FULL dataset (seed fixed via random_state=1),

# then keep only the rows of that sample whose category is "ORDER".
# NOTE(review): df is overwritten in place, so re-running this cell samples the already-reduced frame again.
df = df.sample(frac=0.10, random_state=1)  # 10% sample of all categories
df = df[df['category'] == 'ORDER']  # filter is applied AFTER sampling



In [89]:
# Preview the first rows of the current df to sanity-check its contents.
df.head()

Unnamed: 0,flags,instruction,category,intent,response
676,BQZ,I'm rtying to cancel order {{Order Number}},ORDER,cancel_order,I'm following the urgency of canceling your or...
25873,BQZ,i need help trackng order {{Order Number}},ORDER,track_order,We're here to help! I take note that you need ...
1158,BILZ,hw can I modify purchase {{Order Number}}?,ORDER,change_order,We appreciate your inquiry regarding the modif...
1755,BLMZ,hep changing some products of an order,ORDER,change_order,Appreciate your message to us for assistance w...
24908,BKL,show me estimated time of arrival of order {{O...,ORDER,track_order,Honored to assist! I'm clued in that you would...


In [90]:
# Show the size of the current df as (rows, columns).
df.shape

(418, 5)

In [91]:
# Persist the current df as CSV; the DataFrame index is left out of the file.
csv_path = "../Data/data.csv"
df.to_csv(csv_path, index=False)

In [92]:
# Re-display the (rows, columns) size of df after the CSV export.
df.shape

(418, 5)

In [93]:
# Build a list with one entry per category; each entry carries that
# category's documents as {"intent", "question", "response"} records.
data = []

for category, group_df in df.groupby('category'):
    # Iterate over the rows of THIS group only. Walking the whole df here
    # would copy every row into every category as soon as df holds more
    # than one category.
    documents = [
        {
            "intent": intent,
            "question": instruction,
            "response": response,
        }
        for intent, instruction, response in zip(
            group_df['intent'], group_df['instruction'], group_df['response']
        )
    ]
    data.append(
        {
            "category": category,
            "documents": documents,
        }
    )

# Serialize the list of category entries. ensure_ascii=False keeps non-ASCII
# characters readable, so the file is written explicitly as UTF-8.
with open('../Data/documents.json', 'w', encoding='utf-8') as jsonf:
    json.dump(data, jsonf, indent=2, ensure_ascii=False)

print("JSON file saved as documents.json")

JSON file saved as documents.json


In [94]:
import json

# Read back the grouped documents. The file is produced with encoding='utf-8'
# and ensure_ascii=False, so it must be decoded as UTF-8 explicitly instead of
# relying on the platform's default text encoding.
with open('../Data/documents.json', 'rt', encoding='utf-8') as f_in:
    documents_raw = json.load(f_in)

In [95]:
# Flatten the per-category structure into a single list of documents,
# tagging each document (in place) with the category it came from.
documents = []

for entry in documents_raw:
    label = entry['category']
    for item in entry['documents']:
        item['category'] = label
    documents.extend(entry['documents'])

In [96]:
import hashlib

def generate_document_id(doc):
    """Return a short, deterministic identifier for a document.

    The identifier is the first 8 hex characters of the MD5 digest of
    "<category>-<question>-<first 10 characters of response>".

    Args:
        doc: mapping with 'category', 'question' and 'response' keys;
            'response' must support slicing.

    Returns:
        An 8-character lowercase hexadecimal string.
    """
    # Only the first 10 characters of the response take part in the key.
    key = "{}-{}-{}".format(doc['category'], doc['question'], doc['response'][:10])
    digest = hashlib.md5(key.encode()).hexdigest()
    return digest[:8]

In [97]:
# Attach a generated identifier to every document under the 'id' key.
for item in documents:
    item.update(id=generate_document_id(item))

In [98]:
# Spot-check one document (index 3) to see the fields it carries.
documents[3]

{'intent': 'change_order',
 'question': 'hep changing some products of an order',
 'response': "Appreciate your message to us for assistance with changing the products in your order. We understand that it's important for you to have the right items. To help you with this, could you please provide the order number and specify which products you would like to change? This way, we can ensure that your order is exactly as you want it. We're here to support you throughout the process, so please feel free to ask any questions or share any additional details that may help us fulfill your request.",
 'category': 'ORDER',
 'id': '85fcd90d'}

In [99]:
# Number of documents in the flattened list.
len(documents)

418

In [100]:
# Write the flattened, id-tagged documents to disk as indented JSON.
serialized = json.dumps(documents, indent=2)
with open('../Data/documents-with-ids.json', 'wt') as f_out:
    f_out.write(serialized)