In [1]:
import re
import pandas as pd
import spacy  
from typing import Dict, Any, List
import json
from tqdm import tqdm

In [2]:
# Numeric amount with optional thousands separators and decimals, e.g. "15,999.50".
_AMOUNT = r"(\d+(?:,\d+)*(?:\.\d+)?)"

# Keyword patterns searched once each against the lower-cased description.
# The full matched text (e.g. "20% off", "4.5 / 5") is stored under the
# top-level key of the same name.
_TOP_LEVEL_PATTERNS = {
    "color": re.compile(r"\b(black|white|red|blue|green|yellow|purple|pink|gray|silver|gold|brown|orange|navy|maroon|turquoise|beige)\b"),
    "brand": re.compile(r"\b(apple|samsung|lg|sony|dell|hp|nike|adidas|lenovo|canon|fujifilm|bose|jbl)\b"),
    "category": re.compile(r"\b(smartphone|laptop|camera|headphones|shoes|clothing|furniture|kitchen|electronics|home appliance|watch)\b"),
    "gender": re.compile(r"\b(men|women|unisex|kids|boys|girls)\b"),
    "stock_availability": re.compile(r"\b(in stock|out of stock|available|unavailable)\b"),
    "target_audience": re.compile(r"\b(adults|kids|children|teenagers|seniors|babies)\b"),
    "discount": re.compile(r"(\d{1,2})%\s+off"),
    "rating": re.compile(r"(\d\.\d)\s*\/\s*5"),
    "reviews": re.compile(r"(\d+)\s*reviews?"),
    "expiration_date": re.compile(r"(?:expires|exp)\s*on\s*(\d{1,2}\/\d{1,2}\/\d{2,4})"),
    "energy_rating": re.compile(r"(\d{1,2})\s*stars?"),
    "warranty": re.compile(r"(\d{1,2})\s*(year|month)s?\s*warranty"),
}

# Same idea, but the matched text is stored under attributes["specifications"].
_SPECIFICATION_PATTERNS = {
    "storage": re.compile(r"(\d+)\s*(gb|tb)\s*storage"),
    "battery": re.compile(r"(\d+)\s*mah"),
    "camera": re.compile(r"(\d+)\s*mp\s*camera"),
}

# Searched against the ORIGINAL text so the product name keeps its casing.
_NAME_PATTERN = re.compile(r"product name: (\w[\w\s]+)", re.IGNORECASE)

# Multi-valued attributes: every distinct hit is collected, not just the first.
_MATERIAL_PATTERN = re.compile(r"\b(cotton|polyester|leather|silk|wool|nylon|metal|plastic|glass|ceramic|wood|rubber|aluminum|stainless steel|faux leather|mesh)\b")
# Word boundaries stop "64gb" from being read as the connectivity token "4g".
_CONNECTIVITY_PATTERN = re.compile(r"\b(wifi|bluetooth|4g|5g|nfc|usb)\b")

_DISPLAY_SIZE_PATTERN = re.compile(r"(\d{1,2}\.?\d*)\s*(inch|inches)")

# A price must be anchored by a currency marker or, failing that, a price
# keyword; a bare number is never treated as a price.
_PRICE_WITH_CURRENCY = re.compile(r"(₹|\brs\.?|\$)\s*" + _AMOUNT)
_PRICE_WITH_KEYWORD = re.compile(r"\b(?:price|mrp|cost)\b\s*(?:is|:|-)?\s*" + _AMOUNT)

# The trailing \b keeps "128 gb" from matching as "128 g".
# NOTE: a token such as "5g" is still ambiguous (5 grams vs. 5G network) and
# is reported both as a weight and as connectivity.
_WEIGHT_PATTERN = re.compile(r"(\d+(?:\.\d+)?)\s*(kg|g|grams|lbs|pounds)\b")

# The trailing \b plus the explicit "mm" alternative keep "30 mm" from being
# captured with unit "m".
_DIMENSIONS_PATTERN = re.compile(
    r"(\d+(?:\.\d+)?)\s*(?:x|×)\s*(\d+(?:\.\d+)?)\s*(?:x|×)\s*(\d+(?:\.\d+)?)\s*"
    r"(cm|mm|m|inch(?:es)?|met(?:er|re)s?)\b"
)


def _is_empty(value: Any) -> bool:
    """Return True for None and for zero-length strings, lists and dicts (0 and False are kept)."""
    return value is None or (isinstance(value, (str, list, dict)) and len(value) == 0)


def _prune_empty(value: Any) -> Any:
    """Recursively drop empty entries, including containers that become empty after pruning."""
    if isinstance(value, dict):
        # Recurse first, then filter, so a dict holding only None values disappears.
        pruned = {key: _prune_empty(item) for key, item in value.items()}
        return {key: item for key, item in pruned.items() if not _is_empty(item)}
    if isinstance(value, list):
        pruned_items = [_prune_empty(item) for item in value]
        return [item for item in pruned_items if not _is_empty(item)]
    return value


def _unique_in_order(items: List[str]) -> List[str]:
    """Remove duplicates while keeping the order of first appearance."""
    return list(dict.fromkeys(items))


def extract_attributes(description: str) -> Dict[str, Any]:
    """
    Extract product attributes from a free-text product description.

    Matching is regex-based and, except for the product name, performed on the
    lower-cased text, so extracted keyword values are lower case.

    Args:
        description: The product description. Anything that is not a string
            (None, NaN, numbers, ...) is treated as a missing description.

    Returns:
        A nested dictionary containing only the attributes that were found;
        empty values and empty nested containers are removed, so a description
        with no recognisable attribute yields ``{}``. Possible entries:

        * ``name``: text following "product name: ", original casing.
        * ``brand``, ``category``, ``color``, ``gender``, ``target_audience``,
          ``stock_availability``, ``discount``, ``rating``, ``reviews``,
          ``expiration_date``, ``energy_rating``, ``warranty``: the first
          matching phrase as written (e.g. ``"20% off"``).
        * ``material``: list of distinct materials in order of appearance.
        * ``size``: ``{"dimensions": {"length", "width", "height", "unit"},
          "display_size": str}``.
        * ``weight``: ``{"value": float, "unit": str}``.
        * ``price``: ``{"currency": str, "value": float}``; the currency
          defaults to "₹" when only a price keyword anchors the amount.
        * ``specifications``: ``storage``, ``battery``, ``camera`` (matched
          phrases) and ``connectivity`` (list of distinct tokens).

        Template keys without an extractor (``type``, ``features``,
        ``price.mrp``, ``specifications.processor`` / ``ram`` / ``display``,
        the scalar ``size.length`` / ``width`` / ``height``) are never set.
    """
    if not isinstance(description, str):  # Handle missing / non-text descriptions
        return {}

    # Full attribute schema; unset entries are pruned before returning.
    attributes = {
        "name": None,
        "brand": None,
        "category": None,
        "type": None,
        "color": None,
        "material": [],
        "size": {
            "dimensions": None,
            "length": None,
            "width": None,
            "height": None,
            "display_size": None
        },
        "weight": {"value": None, "unit": None},
        "price": {"currency": None, "value": None, "mrp": None},
        "discount": None,
        "stock_availability": None,
        "target_audience": None,
        "gender": None,
        "specifications": {
            "processor": None,
            "ram": None,
            "storage": None,
            "battery": None,
            "camera": None,
            "display": None,
            "connectivity": []
        },
        "features": [],
        "rating": None,
        "reviews": None,
        "expiration_date": None,
        "energy_rating": None,
        "warranty": None
    }

    desc_lower = description.lower()

    # Name: take only the captured group so the "product name: " label is excluded.
    name_match = _NAME_PATTERN.search(description)
    if name_match:
        attributes["name"] = name_match.group(1).strip()

    # Single-valued keyword attributes.
    for key, pattern in _TOP_LEVEL_PATTERNS.items():
        found = pattern.search(desc_lower)
        if found:
            attributes[key] = found.group(0)

    for key, pattern in _SPECIFICATION_PATTERNS.items():
        found = pattern.search(desc_lower)
        if found:
            attributes["specifications"][key] = found.group(0)

    # Multi-valued attributes.
    attributes["material"] = _unique_in_order(_MATERIAL_PATTERN.findall(desc_lower))
    attributes["specifications"]["connectivity"] = _unique_in_order(
        _CONNECTIVITY_PATTERN.findall(desc_lower)
    )

    # Size information.
    display_match = _DISPLAY_SIZE_PATTERN.search(desc_lower)
    if display_match:
        attributes["size"]["display_size"] = display_match.group(0)

    dimensions_match = _DIMENSIONS_PATTERN.search(desc_lower)
    if dimensions_match:
        length, width, height, unit = dimensions_match.groups()
        attributes["size"]["dimensions"] = {
            "length": float(length),
            "width": float(width),
            "height": float(height),
            "unit": unit
        }

    weight_match = _WEIGHT_PATTERN.search(desc_lower)
    if weight_match:
        attributes["weight"]["value"] = float(weight_match.group(1))
        attributes["weight"]["unit"] = weight_match.group(2)

    # Price: prefer an amount carrying a currency marker, then one introduced
    # by a price keyword.
    currency, amount = None, None
    price_match = _PRICE_WITH_CURRENCY.search(desc_lower)
    if price_match:
        currency, amount = price_match.groups()
    else:
        price_match = _PRICE_WITH_KEYWORD.search(desc_lower)
        if price_match:
            amount = price_match.group(1)
    if amount is not None:
        attributes["price"]["currency"] = currency or "₹"
        attributes["price"]["value"] = float(amount.replace(",", ""))

    return _prune_empty(attributes)

In [3]:
def process_product_descriptions(description_list: List[str]) -> List[Dict[str, Any]]:
    """
    Run attribute extraction over every description, showing a progress bar.

    Args:
        description_list: Product descriptions, processed in the given order.

    Returns:
        One attribute dictionary per input description, in the same order.
    """
    extracted_records: List[Dict[str, Any]] = []
    # tqdm wraps the iterable only to report progress; it does not alter the items.
    for text in tqdm(description_list, desc="Processing descriptions"):
        extracted_records.append(extract_attributes(text))
    return extracted_records

In [4]:
def process_csv_to_json(input_csv_path: str, output_json_path: str):
    """
    Read product descriptions from a CSV file, extract their attributes and
    write the results to a JSON file.

    Args:
        input_csv_path: Path of the CSV file; its 'description' column is read.
        output_json_path: Path of the JSON file to create or overwrite.

    Rows whose description is missing are skipped, so the output list can be
    shorter than the CSV and is not aligned with the original row numbers.
    """
    source_frame = pd.read_csv(input_csv_path)

    # Drop missing descriptions before handing plain Python strings to the extractor.
    usable_descriptions = source_frame['description'].dropna()
    records = process_product_descriptions(usable_descriptions.tolist())

    # ensure_ascii=False keeps non-ASCII characters (e.g. currency symbols) readable in the file.
    with open(output_json_path, 'w', encoding='utf-8') as out_handle:
        json.dump(records, out_handle, ensure_ascii=False, indent=4)

In [5]:
# Example usage: paths consumed by process_csv_to_json below.
# The input CSV must contain a 'description' column; the output JSON file is
# created or overwritten. Both paths are relative to the kernel's working directory.
input_csv_path = 'text_data.csv'
output_json_path = 'output_data.json'

In [6]:
# Run the whole pipeline: read the CSV, extract attributes for every
# non-missing description, and write the results to the JSON file.
process_csv_to_json(input_csv_path, output_json_path)

Processing descriptions: 100%|█████████████████████████████████████████████████| 19998/19998 [00:03<00:00, 6309.93it/s]
