In [1]:
import re
import os
import json
import pandas as pd
from typing import Dict, Any

In [2]:
def sanitize_filename(filename: str) -> str:
    """
    Return a copy of the filename in which every reserved character
    (backslash, slash, asterisk, question mark, colon, double quote,
    angle brackets and pipe) is swapped for an underscore.

    All other characters, including spaces and dots, are left untouched.
    """
    reserved_chars = re.compile(r'[\\/*?:"<>|]')
    safe_name = reserved_chars.sub("_", filename)
    return safe_name

In [3]:
def normalize_text(text: str) -> str:
    """
    Normalize text by:
    1. Re-joining letter-spaced words ("s a m s u n g" -> "samsung")
    2. Collapsing every run of whitespace to one space and trimming the ends
    3. Converting to lowercase

    Only runs of three or more isolated single letters are joined. The gaps
    between ordinary words are preserved, so word-boundary based patterns
    applied to the result can still find whole words. Digits are never
    joined, because fusing separate numbers would change their value.
    This is a heuristic: a genuine list of one-letter tokens such as
    "s m l" is also joined.
    """
    def _join_letters(match):
        # Drop the whitespace inside one letter-spaced run.
        return re.sub(r'\s+', '', match.group(0))

    # [^\W\d_] is "a letter": a word character that is neither a digit nor
    # an underscore. The leading \b plus the mandatory following whitespace
    # ensure the first letter stands alone; the trailing \b does the same
    # for every subsequent letter.
    text = re.sub(r'\b[^\W\d_](?:\s+[^\W\d_]\b){2,}', _join_letters, text)

    # Clean up redundant spaces
    return re.sub(r'\s+', ' ', text).strip().lower()

In [4]:
def extract_attributes(description: str) -> Dict[str, Any]:
    """
    Extract structured attributes from the product description.

    The description is normalized with normalize_text() and then scanned
    with a fixed set of regular expressions and keyword lists.

    Args:
        description: Free-text product description. Any value that is not a
            non-blank string (None, NaN, numbers, ...) produces an empty dict.

    Returns:
        A dict containing only the attributes that were found. Empty or null
        entries, including nested dicts that end up empty, are removed.
        Possible keys:
          - "color": first color keyword found.
          - "material": list of distinct material keywords, in order of
            appearance.
          - "size": {"dimensions": {"length", "width", "height", "unit"}}.
          - "weight": {"value": float, "unit": str}.
          - "price": {"currency": str, "value": float}.
          - "specifications": "processor", "ram", "storage", "battery" and
            "display" (first captured group of the first match, as a
            string) plus "connectivity" (list of distinct keywords).
          - "features": list of feature keywords present in the text.
          - "warranty": numeric part of the warranty period, as a string.
    """
    if not isinstance(description, str) or not description.strip():
        return {}

    description = normalize_text(description)

    attributes = {
        "name": None,
        "brand": None,
        "category": None,
        "color": None,
        "material": [],
        "size": {
            "dimensions": None,
            "length": None,
            "width": None,
            "height": None
        },
        "weight": {"value": None, "unit": None},
        "price": {"currency": None, "value": None, "mrp": None},
        "specifications": {
            "processor": None,
            "ram": None,
            "storage": None,
            "battery": None,
            "display": None,
            "connectivity": []
        },
        "features": [],
        "rating": None,
        "reviews": None,
        "warranty": None
    }

    def first_match(pattern: str):
        return re.search(pattern, description, re.IGNORECASE)

    def distinct_matches(pattern: str) -> list:
        # dict.fromkeys de-duplicates while keeping first-seen order.
        hits = re.findall(pattern, description, re.IGNORECASE)
        return list(dict.fromkeys(hit.lower() for hit in hits))

    # Single-valued attributes: keep the first captured group of the first
    # match. NOTE(review): for storage, display and warranty this keeps only
    # the number and drops the unit (gb/tb, inch/cm, year/month).
    scalar_patterns = {
        "color": r"\b(black|white|red|blue|green|yellow|pink|gray|gold|brown|orange|beige|silver)\b",
        "processor": r"(snapdragon|exynos|mediatek|intel core i\d|apple m\d)",
        "ram": r"(\d+)\s*gb\s*ram",
        "storage": r"(\d+)\s*(gb|tb)\s*storage",
        "battery": r"(\d+)\s*mah",
        "display": r"(\d+\.\d+|\d+)\s*(inch|inches|cm)",
        "warranty": r"(\d+)\s*(year|month)s?\s*warranty",
    }
    for attr, pattern in scalar_patterns.items():
        found = first_match(pattern)
        if found is None:
            continue
        if attr in attributes["specifications"]:
            attributes["specifications"][attr] = found.group(1)
        else:
            attributes[attr] = found.group(1)

    # Multi-valued attributes are declared as lists, so collect every
    # distinct keyword instead of overwriting the list with one string.
    attributes["material"] = distinct_matches(
        r"\b(cotton|leather|plastic|metal|glass|stainless steel|polyester)\b"
    )
    # (?!b) keeps the "4g"/"5g" keywords from firing inside "4gb"/"64gb".
    attributes["specifications"]["connectivity"] = distinct_matches(
        r"(wifi|bluetooth|4g(?!b)|5g(?!b)|nfc|usb)"
    )

    dimensions = first_match(
        r"(\d+\.?\d*)\s*[x×]\s*(\d+\.?\d*)\s*[x×]\s*(\d+\.?\d*)\s*(cm|inch|inches)"
    )
    if dimensions is not None:
        length, width, height, unit = dimensions.groups()
        attributes["size"]["dimensions"] = {
            "length": float(length),
            "width": float(width),
            "height": float(height),
            "unit": unit.lower()
        }

    # g(?!b) stops "8 gb" from being read as a weight of 8 g. Because "g" is
    # tried before "grams", a weight written in "grams" is reported as "g".
    weight = first_match(r"(\d+\.?\d*)\s*(kg|g(?!b)|grams|lbs|pounds)")
    if weight is not None:
        attributes["weight"] = {
            "value": float(weight.group(1)),
            "unit": weight.group(2).lower()
        }

    # The number may be plain ("1500") or comma-grouped, western or Indian
    # style ("1,500", "1,00,000"); the whole number is captured either way.
    price_hits = re.findall(
        r"(₹|\$|€)?\s*(\d+(?:,\d{2,3})*(?:\.\d+)?)", description
    )
    if price_hits:
        # Prefer a number carrying a currency symbol. Otherwise fall back to
        # the first number in the text and default the currency to '₹'.
        currency, value = next(
            (hit for hit in price_hits if hit[0]), price_hits[0]
        )
        attributes["price"] = {
            "currency": currency or "₹",
            "value": float(value.replace(",", ""))
        }

    # Features are plain substring checks against the normalized text.
    features_keywords = ['self-cleaning', 'wireless', 'fast charging', 'bluetooth', 'touchscreen']
    attributes["features"] = [feature for feature in features_keywords if feature in description]

    def is_empty(value: Any) -> bool:
        return value is None or (isinstance(value, (str, dict, list)) and not value)

    # Clean children first and filter afterwards, so containers that become
    # empty through cleaning are removed as well.
    def clean_attributes(attr: Any) -> Any:
        if isinstance(attr, dict):
            cleaned = {k: clean_attributes(v) for k, v in attr.items()}
            return {k: v for k, v in cleaned.items() if not is_empty(v)}
        if isinstance(attr, list):
            cleaned_items = [clean_attributes(v) for v in attr]
            return [v for v in cleaned_items if not is_empty(v)]
        return attr

    return clean_attributes(attributes)


In [5]:
def process_json_files_in_folder(input_folder: str, output_folder: str):
    """
    Turn every '.json' file found directly in input_folder into an
    attribute file inside output_folder.

    For each input file the "product_name" and "description" fields are
    read, the description is passed to extract_attributes(), and the result
    (plus the product name) is written as JSON to a file named after the
    sanitized product name. A message is printed per file. Any exception
    raised while handling a file is printed and the loop continues with the
    next file.
    """
    # Create the destination up front; an already existing folder is fine.
    os.makedirs(output_folder, exist_ok=True)

    json_file_names = (
        entry for entry in os.listdir(input_folder) if entry.endswith('.json')
    )

    for file_name in json_file_names:
        source_path = os.path.join(input_folder, file_name)

        try:
            with open(source_path, 'r', encoding='utf-8') as source:
                record = json.load(source)

            # Files lacking a name share the "Unnamed Product" fallback.
            product_name = record.get("product_name", "Unnamed Product")

            extracted = extract_attributes(record.get("description", ""))
            extracted["product_name"] = product_name

            target_path = os.path.join(
                output_folder, f"{sanitize_filename(product_name)}.json"
            )
            with open(target_path, 'w', encoding='utf-8') as target:
                json.dump(extracted, target, ensure_ascii=False, indent=4)

            print(f"Processed and saved: {target_path}")
        except Exception as e:
            # Best-effort batch: report the failure and keep going.
            print(f"Error processing {file_name}: {e}")

In [6]:
# Example configuration for a batch run.
# Folder that holds the product JSON files to read; point it at the real data.
input_folder_path = "json_test_files"
# Folder that receives the extracted-attribute JSON files.
output_folder_path = "json_output_results"

In [7]:
# Run the batch extraction over the folders configured above.
process_json_files_in_folder(
    input_folder=input_folder_path,
    output_folder=output_folder_path,
)

Processed and saved: json_output_results\Air Max 270.json
Processed and saved: json_output_results\Anker PowerCore 20100mAh Portable Charger.json
Processed and saved: json_output_results\Apple iPad Air (2022).json
Processed and saved: json_output_results\Aqualens lens cleaner.json
Processed and saved: json_output_results\Artisan Stand Mixer.json
Processed and saved: json_output_results\Atomberg SL1 Smart Door Lock.json
Processed and saved: json_output_results\Baggit Men's Zip Around Wallet.json
Processed and saved: json_output_results\Bodylovin Vanilla Vibes Body Butter.json
Processed and saved: json_output_results\Bose QuietComfort 45 Headphones.json
Processed and saved: json_output_results\Boult 10000 mAh 22.5 W Power Bank.json
Processed and saved: json_output_results\Breville BES870XL Barista Express Espresso Machine.json
Processed and saved: json_output_results\Canon EOS R10 Mirrorless Camera Body.json
Processed and saved: json_output_results\Canon MF272dw Multi-function WiFi Monoc