In [None]:
"""
This script builds a HuggingFace dataset from the combined dataset.
"""

import csv
import json
import random
import unicodedata
from typing import Dict


# Translation table mapping typographic punctuation to plain ASCII equivalents.
# Applied in a single pass by str.translate instead of chained .replace() calls.
_AMBIGUOUS_CHAR_TABLE = str.maketrans(
    {
        "\u201c": '"',  # left double quotation mark
        "\u201d": '"',  # right double quotation mark
        "\u2018": "'",  # left single quotation mark
        "\u2019": "'",  # right single quotation mark
        "\u2014": "-",  # em dash
    }
)


def sanitize_row(row: Dict[str, str]) -> Dict[str, str]:
    """Return a copy of *row* with every string value normalized.

    Each string value is NFC-normalized, has curly quotes and em dashes
    replaced by their ASCII counterparts, and is round-tripped through UTF-8
    with the ``replace`` error handler so that unencodable code points become
    ``?``.

    Args:
        row: Mapping of column name to cell value, e.g. a row produced by
            ``csv.DictReader``.

    Returns:
        A new dict with the same keys. ``None`` values (which
        ``csv.DictReader`` yields for fields missing from a short row) become
        empty strings; any other non-string value is passed through
        unchanged.
    """
    sanitized_row = {}
    for key, value in row.items():
        if value is None:
            # Short CSV rows yield None; normalize() would raise TypeError.
            sanitized_row[key] = ""
            continue
        if not isinstance(value, str):
            # Nothing to normalize on non-text values; keep them as they are.
            sanitized_row[key] = value
            continue
        cleaned = unicodedata.normalize("NFC", value).translate(
            _AMBIGUOUS_CHAR_TABLE
        )
        # Round-trip through UTF-8 so unencodable code points are replaced.
        sanitized_row[key] = cleaned.encode("utf-8", "replace").decode("utf-8")
    return sanitized_row


# System prompt attached to every conversation; identical for all rows, so it
# is defined once instead of being rebuilt on each loop iteration.
_SYSTEM_PROMPT = "You are a smart assistant trained to detect buying intentions in a user's text. When given a user's input, you will respond in JSON format with three fields: \"buy_inten\", \"inten_level\", and \"cat\". \"buy_inten\" indicates whether there is a buying intention in the user's text. It should be 1 if there is a buying intention and 0 if there is not. \"inten_level\" represents the level of buying intention on a scale from 0 to 5, where 0 means no buying intention and 5 is the highest buying intention. \"cat\" specifies the category of the text, which could be any of the following: [\"Home & Kitchen\", \"Beauty & Personal Care\", \"Electronics\", \"Clothing, Shoes & Jewelry\", \"Toys & Games\", \"Health, Household & Baby Care\", \"Sports & Outdoors\", \"Pet Supplies\", \"Office Supplies\", \"Automotive\", \"General\"]. The category is selected as \"General\" when there is no exclusive category that can be found among others. For example, given the user's text: \"I'm looking for a new laptop,\" the JSON response should be: {\"buy_inten\": 1, \"inten_level\": 4, \"cat\": \"Electronics\"}. Ensure that your analysis is accurate and the JSON response strictly follows the specified format."


def read_and_transform_csv(input_file_path: str, output_json_path: str) -> None:
    """Convert a CSV of labelled texts into a shuffled conversation JSON file.

    Every CSV row is sanitized with ``sanitize_row`` and turned into one
    ``{"conversations": [...]}`` entry holding a system prompt, the row's
    ``text`` column as the human turn, and the row's ``json_response`` column
    (re-serialized) as the gpt turn. The entries are shuffled and written to
    *output_json_path* as an indented JSON array.

    Args:
        input_file_path: Path of the UTF-8 CSV file to read. It must provide
            ``text`` and ``json_response`` columns.
        output_json_path: Path of the JSON file to write.

    Raises:
        KeyError: If a row lacks the ``text`` or ``json_response`` column.
    """
    data = []
    # newline="" lets the csv module handle line endings itself, so newlines
    # embedded in quoted fields are read correctly.
    with open(input_file_path, mode="r", encoding="utf-8", newline="") as file:
        for row in csv.DictReader(file):
            sanitized_row = sanitize_row(row)
            try:
                # Parse and re-serialize so the stored answer is valid,
                # consistently escaped JSON.
                output_json_string = json.dumps(
                    json.loads(sanitized_row["json_response"])
                )
            except json.JSONDecodeError:
                # Invalid JSON in the CSV falls back to an empty JSON object.
                output_json_string = "{}"

            data.append(
                {
                    "conversations": [
                        {"from": "system", "value": _SYSTEM_PROMPT},
                        {"from": "human", "value": sanitized_row["text"]},
                        {"from": "gpt", "value": output_json_string},
                    ]
                }
            )

    # Randomize the order of data entries
    random.shuffle(data)

    with open(output_json_path, mode="w", encoding="utf-8") as json_file:
        json.dump(data, json_file, indent=4)


if __name__ == "__main__":
    # Source CSV (must already contain the json_response column) and the
    # destination file for the generated conversation dataset.
    input_csv_path = "wop_json.csv"
    output_json_path = "wop_aigot_hf_dataset.json"
    read_and_transform_csv(
        input_file_path=input_csv_path,
        output_json_path=output_json_path,
    )

In [None]:
import pandas as pd
import json

# Read the source dataset; point file_path at your local CSV file.
file_path = 'wop_dataset.csv'
df = pd.read_csv(file_path)
# Disabled rescaling step, kept for reference:
# df['inten_level'] = (df['inten_level']/2).round()

# Write the frame back to the dataset path (no columns have been changed yet).
output_file_path = 'wop_dataset.csv'
df.to_csv(output_file_path, index=False)

# The intention level is stored as an integer from here on.
df['inten_level'] = df['inten_level'].astype(int)


def _row_to_json(row):
    """Serialize one row's buy_inten, inten_level and cat fields as JSON."""
    payload = {
        "buy_inten": row['buy_inten'],
        "inten_level": row['inten_level'],
        "cat": row['cat'],
    }
    return json.dumps(payload)


# Add a column holding the JSON-encoded label for every row.
df['json_response'] = df.apply(_row_to_json, axis=1)

# Persist the extended frame; change the path below to your desired output.
output_file_path = 'wop_json.csv'
df.to_csv(output_file_path, index=False)

print("Updated CSV file created successfully.")

Updated CSV file created successfully.
