# **Cell 1: Import thư viện**

In [1]:
import re
import pandas as pd
from pathlib import Path

# **Cell 2: Tiền xử lý Dataset - Consumer Reviews of Amazon Products**

In [2]:
# Load dataset
data = pd.read_csv('data/raw-data/1429_1.csv')  # Adjust path

# Step 1: Clean data
# Keep 'asins' column along with other relevant columns
data = data[['asins', 'name', 'categories', 'reviews.text', 'reviews.rating']].dropna()
data = data.drop_duplicates()

# Step 2: Extract categories (assuming 'categories' column exists)
# If categories are in a list format (e.g., "[Electronics, Headphones]"), parse them
data['categories'] = data['categories'].str.strip('[]').str.split(',').str[0]  # Take the first category
category_counts = data['categories'].value_counts()
print("Category counts:")
print(category_counts)

# Step 3: Select top 10 categories
top_10_categories = category_counts.head(10).index.tolist()
print("\nSelected 10 categories:")
print(top_10_categories)

# Filter data for selected categories
filtered_data = data[data['categories'].isin(top_10_categories)]

# Step 4: Save cleaned data
filtered_data.to_csv('data/preprocessed-data/cleaned_amazon_reviews.csv', index=False)
print("Cleaned data saved with 10 categories!")

  data = pd.read_csv('data/raw-data/1429_1.csv')  # Adjust path


Category counts:
categories
Fire Tablets                      11243
Stereos                            5948
Walmart for Business               3756
Electronics                        3071
Tablets                            1857
Computers/Tablets & Networking     1038
Amazon Devices & Accessories        401
Electronics Features                372
eBook Readers                        67
Computers & Tablets                  61
Kindle Store                         16
mazon.co.uk                          15
Categories                            8
Kindle E-readers                      6
Amazon Device Accessories             6
Name: count, dtype: int64

Selected 10 categories:
['Fire Tablets', 'Stereos', 'Walmart for Business', 'Electronics', 'Tablets', 'Computers/Tablets & Networking', 'Amazon Devices & Accessories', 'Electronics Features', 'eBook Readers', 'Computers & Tablets']
Cleaned data saved with 10 categories!


# **Cell 3: Tiền xử lý Dataset - Bitext Gen AI Chatbot Customer Support Dataset**

In [3]:
# Load Bitext dataset
bitext_data = pd.read_csv('data/raw-data/Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv')

# Step 1: Check unique values in 'category' and 'intent'
print("Categories:", bitext_data['category'].unique())
print("Intents:", bitext_data['intent'].unique())

# Step 2: Filter data
# Only keep rows where category is related to e-commerce (adjust based on actual categories)

# Drop rows with missing instruction, intent, or response
bitext_data = bitext_data.dropna(subset=['instruction', 'intent', 'response'])

# Drop the 'flags' column as it's not needed
bitext_data = bitext_data.drop(columns=['flags'])

# Step 3: Normalize data
# Convert 'intent' to lowercase to avoid duplicates
bitext_data['intent'] = bitext_data['intent'].str.lower()
# Ensure 'instruction' and 'response' are strings
bitext_data['instruction'] = bitext_data['instruction'].astype(str)
bitext_data['response'] = bitext_data['response'].astype(str)

# Step 4: Save the cleaned dataset
bitext_data.to_csv('data/preprocessed-data/bitext_cleaned.csv', index=False)
print("Cleaned Bitext dataset saved to:",'./data/preprocessed-data/bitext_cleaned.csv')

Categories: ['ORDER' 'SHIPPING' 'CANCEL' 'INVOICE' 'PAYMENT' 'REFUND' 'FEEDBACK'
 'CONTACT' 'ACCOUNT' 'DELIVERY' 'SUBSCRIPTION']
Intents: ['cancel_order' 'change_order' 'change_shipping_address'
 'check_cancellation_fee' 'check_invoice' 'check_payment_methods'
 'check_refund_policy' 'complaint' 'contact_customer_service'
 'contact_human_agent' 'create_account' 'delete_account'
 'delivery_options' 'delivery_period' 'edit_account' 'get_invoice'
 'get_refund' 'newsletter_subscription' 'payment_issue' 'place_order'
 'recover_password' 'registration_problems' 'review'
 'set_up_shipping_address' 'switch_account' 'track_order' 'track_refund']
Cleaned Bitext dataset saved to: ./data/preprocessed-data/bitext_cleaned.csv


# Xem chi tiết các Placeholder

In [4]:
# Define a regex pattern to match placeholders including the {{}}
pattern = r'{{[^}]+}}'

# Extract all unique placeholders from the 'response' column
placeholders = set()
for response in bitext_data['response']:
    if isinstance(response, str):  # Ensure the value is a string
        matches = re.findall(pattern, response)
        placeholders.update(matches)

# Convert to a sorted list for better readability
unique_placeholders = sorted(list(placeholders))

# Print the unique placeholders
print("Unique custom placeholders found:")
for placeholder in unique_placeholders:
    print(placeholder)

Unique custom placeholders found:
{{Access Key Recovery}}
{{Access Key Reset Page URL}}
{{Access Key Retrieval}}
{{Access Key}}
{{Account Access Key Reset}}
{{Account Category}}
{{Account Change}}
{{Account Closure Process}}
{{Account Closure Timeframe}}
{{Account Details}}
{{Account ID}}
{{Account Key Recovery}}
{{Account Management}}
{{Account Name}}
{{Account Number}}
{{Account Page}}
{{Account Plan}}
{{Account Recovery Page URL}}
{{Account Recovery Page}}
{{Account Recovery}}
{{Account Security}}
{{Account Type Switch}}
{{Account Type}}
{{Account Upgrade}}
{{Account}}
{{Add a New Address}}
{{Basic Account}}
{{Basic}}
{{Billing Category}}
{{Billing History}}
{{Billing}}
{{Business Hours}}
{{Business Name Anonymized}}
{{Cancel Purchase}}
{{Cancellation Policy}}
{{Cancellation Refund Time}}
{{Carrier Name}}
{{Case Number}}
{{Change Access Key}}
{{Change Account}}
{{Change Key}}
{{Change PIN}}
{{Change Password}}
{{Change Profile}}
{{Change User}}
{{Choose Account Type}}
{{Choose the {