# Setup & Data Loading

In [1]:
# Setup & Data Loading
import json
import re
import html
from pathlib import Path
from typing import Any, Dict, List, Tuple

import pandas as pd
from bs4 import BeautifulSoup


def load_data(file_path: str | Path) -> pd.DataFrame:
    """loads the raw dataset into a DataFrame or list.
    Supports JSON array or JSONL inputs.
    """
    p = Path(file_path)
    with p.open('r', encoding='utf-8') as f:
        first = f.read(1)
        f.seek(0)
        if first == '[':
            data = json.load(f)
        else:
            data = [json.loads(line) for line in f if line.strip()]
    return pd.DataFrame(data)


def inspect_data(df: pd.DataFrame) -> None:
    """Print the frame's shape and column names, then render two previews.

    The previews are the first three rows and a random sample of up to
    three rows (fewer when the frame is shorter).
    """
    preview_rows = 3

    print('Shape:', df.shape)
    column_names = list(df.columns)
    print('Columns:', column_names)

    top_rows = df.head(preview_rows)
    display(top_rows)

    # Never ask for more sampled rows than the frame contains.
    sample_size = min(preview_rows, len(df))
    display(df.sample(sample_size))


# Text Cleaning and Normalization

In [2]:
def clean_text(text: str) -> str:
    """Normalize a raw text field into lowercase, letters-only text.

    Processing order:
    1. Decode HTML entities (``&amp;``, ``&lt;``, ...).
    2. Strip HTML tags, putting a space between text fragments.
    3. Lowercase.
    4. Replace URLs with a space.
    5. Replace every character that is not ``a-z`` or whitespace with a space
       (this removes punctuation, digits and symbols).
    6. Collapse whitespace runs and trim both ends.

    Anything that is not a string, and any blank string, yields "".
    """
    is_blank = not isinstance(text, str) or not text.strip()
    if is_blank:
        return ""

    decoded = html.unescape(text)
    plain = BeautifulSoup(decoded, "html.parser").get_text(separator=" ")
    result = plain.lower()

    # URLs are dropped first, while their punctuation is still intact;
    # afterwards everything outside a-z / whitespace is blanked out.
    for pattern in (r"http\S+|www\S+", r"[^a-z\s]"):
        result = re.sub(pattern, " ", result)

    return re.sub(r"\s+", " ", result).strip()

# Tokenization & Text Processing

In [3]:
# Part 3 – Tokenization & Text Processing
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

# Make sure the NLTK resources are available (packages that are already
# downloaded are reported as up to date).
# 'punkt_tab' is the tokenizer data that word_tokenize loads on recent NLTK
# releases (3.9+); 'punkt' is kept for older releases.
for _resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_resource)

# English stopword set and stemmer shared by the token-processing helpers.
STOP_WORDS = set(stopwords.words('english'))
STEMMER = SnowballStemmer('english')


def tokenize_text(text: str) -> List[str]:
    """Split text into word tokens with NLTK's ``word_tokenize``.

    Non-string or blank input produces an empty list.
    """
    has_content = isinstance(text, str) and bool(text.strip())
    return word_tokenize(text) if has_content else []


def remove_stopwords(tokens: List[str]) -> List[str]:
    """Return the tokens that are not in STOP_WORDS, in their original order."""
    kept: List[str] = []
    for token in tokens:
        if token in STOP_WORDS:
            continue
        kept.append(token)
    return kept


def stem_tokens(tokens: List[str]) -> List[str]:
    """Return the stem of every token, as produced by the module-level STEMMER."""
    stems: List[str] = []
    for token in tokens:
        stems.append(STEMMER.stem(token))
    return stems


def preprocess_text(text: str) -> List[str]:
    """Run the full text pipeline and return the processed tokens.

    The stages are applied innermost-first:
    clean_text -> tokenize_text -> remove_stopwords -> stem_tokens.
    """
    return stem_tokens(remove_stopwords(tokenize_text(clean_text(text))))


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/taniapazospuig/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/taniapazospuig/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Record-Level Preprocessing

In [None]:
# Record-Level Preprocessing

def preprocess_product_details(details: Any) -> str:
    """Flatten the product attributes into one cleaned text string.

    Args:
        details: Usually a list of dicts such as
            ``[{'Fabric': 'Cotton'}, {'Pattern': 'Solid'}]``. A bare dict,
            a list of plain values, or any other value is also accepted.

    Returns:
        The attributes rendered as "key: value" fragments, joined by spaces
        and passed through clean_text. Missing values (None or NaN) give ""
        instead of being stringified into the tokens "none" / "nan".
    """
    import math

    if details is None or (isinstance(details, float) and math.isnan(details)):
        return ""

    # Treat a single attribute dict like a one-element list of attributes.
    if isinstance(details, dict):
        details = [details]

    if isinstance(details, list):
        parts = []
        for item in details:
            if isinstance(item, dict):
                parts.extend(f"{k}: {v}" for k, v in item.items())
            else:
                parts.append(str(item))
        return clean_text(' '.join(parts))
    return clean_text(str(details))

# TODO(review): this may not be right; check that the fields handled by
# normalize_numeric_fields are the ones that actually appear in the dataset.
# `num_reviews` is not asked for in the project.
def normalize_numeric_fields(record: Dict[str, Any]) -> Dict[str, Any]:
    """converts numeric string fields to proper types for search and ranking."""
    rec = dict(record)
    
    # Convert price fields
    for key in ('selling_price', 'actual_price'):
        if key in rec and rec[key] is not None:
            val = rec[key]
            if isinstance(val, (int, float)):
                continue
            s = str(val).replace(',', '').strip()  # remove thousands separators
            s = re.sub(r"[^0-9.\-]", "", s)       # keep only digits, dot, minus
            try:
                rec[key] = float(s) if '.' in s else int(s)
            except Exception:
                rec[key] = None
    
    # Convert discount field (e.g., "69% off" → 69)
    if 'discount' in rec and rec['discount'] is not None:
        s = str(rec['discount'])
        s = re.sub(r"[^0-9]", "", s)
        try:
            rec['discount'] = int(s)
        except Exception:
            rec['discount'] = None
    
    # Convert average_rating field
    if 'average_rating' in rec and rec['average_rating'] is not None:
        val = rec['average_rating']
        try:
            rec['average_rating'] = float(str(val).replace(',', '.').strip())
        except Exception:
            rec['average_rating'] = None
    
    # Ensure out_of_stock is boolean
    if 'out_of_stock' in rec:
        rec['out_of_stock'] = bool(rec['out_of_stock'])
    
    return rec

# TODO: what about seller, brand, pid, out_of_stock and url? They are not part of the cleaned text or tokens built below.
def preprocess_record(record: Dict[str, Any]) -> Dict[str, Any]:
    """Apply all cleaning steps to one product entry.

    The record is first passed through normalize_numeric_fields, then the
    text fields are cleaned and combined into one token list.

    Args:
        record: One raw product entry.

    Returns:
        A new dict with the normalized original fields plus
        ``title_clean``, ``description_clean``, ``category_clean``,
        ``subcategory_clean``, ``details_clean`` (cleaned strings) and
        ``tokens`` (processed tokens of all of them combined).
    """
    rec = normalize_numeric_fields(record)
    title = clean_text(rec.get('title', ''))
    description = clean_text(rec.get('description', ''))
    category = clean_text(rec.get('category', ''))
    # The dataset names this column 'sub_category'; 'subcategory' is kept as
    # a fallback for records that use the other spelling.
    subcategory = clean_text(rec.get('sub_category') or rec.get('subcategory') or '')
    details = preprocess_product_details(rec.get('details') or rec.get('product_details') or [])

    full_text = ' '.join([title, description, category, subcategory, details]).strip()
    tokens = preprocess_text(full_text)

    rec['title_clean'] = title
    rec['description_clean'] = description
    rec['category_clean'] = category
    rec['subcategory_clean'] = subcategory
    rec['details_clean'] = details
    # TODO: consider separate title_tokens / description_tokens if field-level
    # retrieval weighting is wanted.
    rec['tokens'] = tokens
    return rec


def preprocess_corpus(df: pd.DataFrame) -> pd.DataFrame:
    """Apply preprocess_record to every row of the dataset.

    Rows are processed as plain dicts and the result is assembled in a single
    DataFrame construction. This avoids ``DataFrame.apply(axis=1)`` with a
    ``pd.Series`` built per row, which is slow on large frames.

    Args:
        df: Raw dataset, one product per row.

    Returns:
        A new DataFrame with the same index, holding the fields returned by
        preprocess_record. An empty input is returned as an unchanged copy.
    """
    if df.empty:
        return df.copy()
    processed = [preprocess_record(row) for row in df.to_dict(orient='records')]
    return pd.DataFrame(processed, index=df.index)


# Export & Summary

In [6]:
# Export & Summary

def save_processed_data(df: pd.DataFrame, path: str | Path) -> None:
    """saves the cleaned dataset to JSON or CSV."""
    p = Path(path)
    p.parent.mkdir(parents=True, exist_ok=True)
    if p.suffix.lower() == '.csv':
        df.to_csv(p, index=False)
    else:
        df.to_json(p, orient='records', force_ascii=False)


def summarize_preprocessing(df: pd.DataFrame) -> None:
    """Print row/column counts and render the first five cleaned titles with their tokens.

    Expects the ``title_clean`` and ``tokens`` columns to be present.
    """
    row_count, column_count = len(df), len(df.columns)
    print('Rows:', row_count)
    print('Columns:', column_count)
    print('Sample tokens:')
    preview = df[['title_clean', 'tokens']].head(5)
    display(preview)


# Run Processing Pipeline

In [4]:
# 1. Load the raw dataset (the path is relative to the current working directory)
df_raw = load_data("../../data/fashion_products_dataset.json")

# 2. Inspect a few entries: shape, column names, first rows and a random sample
inspect_data(df_raw)

# Steps 3-5 are currently disabled; uncomment them to run the full pipeline.

# # 3. Preprocess the corpus
# df_processed = preprocess_corpus(df_raw)

# # 4. Summarize to verify everything looks fine
# summarize_preprocessing(df_processed)

# # 5. Save the processed dataset for future use
# save_processed_data(df_processed, "data/processed_corpus.json")

Shape: (28080, 17)
Columns: ['_id', 'actual_price', 'average_rating', 'brand', 'category', 'crawled_at', 'description', 'discount', 'images', 'out_of_stock', 'pid', 'product_details', 'seller', 'selling_price', 'sub_category', 'title', 'url']


Unnamed: 0,_id,actual_price,average_rating,brand,category,crawled_at,description,discount,images,out_of_stock,pid,product_details,seller,selling_price,sub_category,title,url
0,fa8e22d6-c0b6-5229-bb9e-ad52eda39a0a,2999,3.9,York,Clothing and Accessories,1612987911000,Yorker trackpants made from 100% rich combed c...,69% off,[https://rukminim1.flixcart.com/image/128/128/...,False,TKPFCZ9EA7H5FYZH,"[{'Style Code': '1005COMBO2'}, {'Closure': 'El...",Shyam Enterprises,921,Bottomwear,Solid Women Multicolor Track Pants,https://www.flipkart.com/yorker-solid-men-mult...
1,893e6980-f2a0-531f-b056-34dd63fe912c,1499,3.9,York,Clothing and Accessories,1612987912000,Yorker trackpants made from 100% rich combed c...,66% off,[https://rukminim1.flixcart.com/image/128/128/...,False,TKPFCZ9EJZV2UVRZ,"[{'Style Code': '1005BLUE'}, {'Closure': 'Draw...",Shyam Enterprises,499,Bottomwear,Solid Men Blue Track Pants,https://www.flipkart.com/yorker-solid-men-blue...
2,eb4c8eab-8206-59d0-bcd1-a724d96bf74f,2999,3.9,York,Clothing and Accessories,1612987912000,Yorker trackpants made from 100% rich combed c...,68% off,[https://rukminim1.flixcart.com/image/128/128/...,False,TKPFCZ9EHFCY5Z4Y,"[{'Style Code': '1005COMBO4'}, {'Closure': 'El...",Shyam Enterprises,931,Bottomwear,Solid Men Multicolor Track Pants,https://www.flipkart.com/yorker-solid-men-mult...


Unnamed: 0,_id,actual_price,average_rating,brand,category,crawled_at,description,discount,images,out_of_stock,pid,product_details,seller,selling_price,sub_category,title,url
10595,829713da-8bab-5942-b07e-a074297dc5bd,3199,3.6,True Bl,Clothing and Accessories,1612995368000,,60% off,[https://rukminim1.flixcart.com/image/128/128/...,False,SHTFEKS4JMHUSY56,"[{'Pack of': '1'}, {'Style Code': '20318324'},...",KAPSONSRETAILPVTLTD,1279,Topwear,Men Slim Fit Checkered Spread Collar Casual Shirt,https://www.flipkart.com/true-blue-men-checker...
11197,37fc4cdc-c5ad-59f5-8e5c-81f4a9f56ec5,1199,3.7,Steenb,Clothing and Accessories,1612995885000,good quality product for you.,45% off,[https://rukminim1.flixcart.com/image/128/128/...,False,SRTFYZCGHHFTRQBA,"[{'Fabric': 'Pure Cotton'}, {'Pattern': 'Solid...",BrandFortunes,649,Bottomwear,Solid Men Grey Regular Shorts,https://www.flipkart.com/steenbok-solid-men-gr...
5808,245f619d-a3e9-5086-affb-4848e967eba7,1199,3.0,Wab,Clothing and Accessories,1612991814000,"Wabba is built on the pillars of quality, reli...",25% off,[https://rukminim1.flixcart.com/image/128/128/...,False,SHTFV7KZPMCHXJYF,"[{'Pack of': '1'}, {'Model Name': 'WC-04-BLACK...",WabbaJea,899,Topwear,Men Tailored Fit Checkered Spread Collar Casua...,https://www.flipkart.com/wabba-men-checkered-c...
