# Setup & Data loading

In [1]:
import json
import re
import html
from pathlib import Path
from typing import Any, Dict, List, Tuple
import unicodedata

import pandas as pd
from bs4 import BeautifulSoup


def load_data(file_path: str | Path) -> pd.DataFrame:
    p = Path(file_path)
    with p.open('r', encoding='utf-8') as f:
        first = f.read(1)
        f.seek(0)
        if first == '[':
            data = json.load(f)
        else:
            data = [json.loads(line) for line in f if line.strip()]
    return pd.DataFrame(data)


def inspect_data(df: pd.DataFrame) -> None:
    print('Shape:', df.shape)
    print('Columns:', list(df.columns))
    display(df.head(3))
    display(df.sample(min(3, len(df))))


# Text Cleaning and Normalization

In [2]:
def clean_text(text: str) -> str:
    if not isinstance(text, str) or not text.strip():
        return ""

    # Unescape HTML and strip tags
    text = html.unescape(text)
    text = BeautifulSoup(text, "html.parser").get_text(separator=" ")

    # Normalize unicode (remove accents, etc.)
    text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("utf-8", "ignore")

    # Lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r"http\S+|www\S+", " ", text)

    # Remove punctuation, digits, and non-alphabetic characters
    text = re.sub(r"[^a-z\s]", " ", text)

    # Collapse multiple spaces
    text = re.sub(r"\s+", " ", text).strip()

    return text

# Tokenization & Text Processing

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

# Run only once if it isn't downloaded already
#nltk.download('punkt')
#nltk.download('stopwords')

STOP_WORDS = set(stopwords.words('english'))
STEMMER = SnowballStemmer('english')


def tokenize_text(text: str) -> List[str]:
    if not isinstance(text, str) or not text.strip():
        return []
    return word_tokenize(text)


def remove_stopwords(tokens: List[str]) -> List[str]:
    return [t for t in tokens if t not in STOP_WORDS]


def stem_tokens(tokens: List[str]) -> List[str]:
    return [STEMMER.stem(t) for t in tokens if len(t) > 2] # We skip short tokens


def preprocess_text(text: str) -> List[str]:
    cleaned = clean_text(text)
    tokens = tokenize_text(cleaned)
    tokens = remove_stopwords(tokens)
    tokens = stem_tokens(tokens)
    return tokens


# Record-Level Preprocessing

In [None]:
def preprocess_product_details(details: Any) -> str:
    if isinstance(details, list):
        parts = []
        for item in details:
            if isinstance(item, dict):
                for k, v in item.items():
                    parts.append(f"{k}: {v}")
            else:
                parts.append(str(item))
        return clean_text(' '.join(parts))
    return clean_text(str(details))


def normalize_numeric_fields(record: Dict[str, Any]) -> Dict[str, Any]:
    rec = dict(record)
    
    # Convert price fields
    for key in ('selling_price', 'actual_price'):
        if key in rec and rec[key] is not None:
            val = rec[key]
            if isinstance(val, (int, float)):
                continue
            s = str(val).replace(',', '').strip()  # remove thousands separators
            s = re.sub(r"[^0-9.\-]", "", s)       # keep only digits, dot, minus
            try:
                rec[key] = float(s) if '.' in s else int(s)
            except Exception:
                rec[key] = None
    
    # Convert discount field (e.g., "69% off" to 69)
    if 'discount' in rec and rec['discount'] is not None:
        s = str(rec['discount'])
        s = re.sub(r"[^0-9]", "", s)
        try:
            rec['discount'] = int(s)
        except Exception:
            rec['discount'] = None
    
    # Convert average_rating field
    if 'average_rating' in rec and rec['average_rating'] is not None:
        val = rec['average_rating']
        try:
            rec['average_rating'] = float(str(val).replace(',', '.').strip())
        except Exception:
            rec['average_rating'] = None
    
    # Ensure out_of_stock is boolean
    if 'out_of_stock' in rec:
        rec['out_of_stock'] = bool(rec['out_of_stock'])
    
    return rec

# Added seller and brand. PID does not have to be cleaned IMO. Out_of_stock is boolean, same. url is not cleaned because it is not a text field. Seems right??
def preprocess_record(record: Dict[str, Any]) -> Dict[str, Any]:
    rec = normalize_numeric_fields(record)
    title = clean_text(rec.get('title', ''))
    description = clean_text(rec.get('description', ''))
    category = clean_text(rec.get('category', ''))
    subcategory = clean_text(rec.get('subcategory', ''))
    brand = clean_text(rec.get("brand", ""))
    seller = clean_text(rec.get("seller", ""))
    # Handle different field names for product details across records
    details = preprocess_product_details(rec.get('details') or rec.get('product_details') or [])

    # Tokenize each field separately for flexible weighting in IR
    title_tokens = preprocess_text(title)
    brand_tokens = preprocess_text(brand)
    category_tokens = preprocess_text(category)
    subcategory_tokens = preprocess_text(subcategory)
    details_tokens = preprocess_text(details)
    # Combine description and seller as "other" text
    other_text = ' '.join([description, seller]).strip()
    description_tokens = preprocess_text(other_text)
    
    # Also keep combined tokens for convenience
    full_text = ' '.join([title, description, category, subcategory, brand, seller, details]).strip()
    tokens = preprocess_text(full_text)

    # Here we opt to update the original fields instead of creating new ones (both are valid approaches)
    rec.update({
        "title": title,
        "description": description,
        "category": category,
        "sub_category": subcategory,
        "brand": brand,
        "seller": seller,
        "product_details": details,
        "title_tokens": title_tokens,
        "brand_tokens": brand_tokens,
        "category_tokens": category_tokens,
        "subcategory_tokens": subcategory_tokens,
        "details_tokens": details_tokens,
        "description_tokens": description_tokens,
        "tokens": tokens,
        "full_text": full_text
    })
    return rec


def preprocess_corpus(df: pd.DataFrame) -> pd.DataFrame:
    return df.apply(lambda row: pd.Series(preprocess_record(row.to_dict())), axis=1)


# Export & Summary

In [5]:
def save_processed_data(df: pd.DataFrame, path: str | Path) -> None:
    p = Path(path)
    p.parent.mkdir(parents=True, exist_ok=True)
    if p.suffix.lower() == '.csv':
        df.to_csv(p, index=False)
    else:
        df.to_json(p, orient='records', force_ascii=False)


def summarize_preprocessing(df: pd.DataFrame) -> None:
    print(f"Total rows: {len(df)}")
    print(f"Total columns: {len(df.columns)}")
    print(df.columns)

    if "tokens" in df.columns:
        token_lengths = df["tokens"].apply(lambda x: len(x) if isinstance(x, list) else 0)
        print(f"Average tokens per document: {token_lengths.mean():.2f}")
        unique_token_count = len(set(t for lst in df["tokens"] if isinstance(lst, list) for t in lst))
        print(f"Total unique tokens in corpus: {unique_token_count}")

    print("\nSample processed entries with separate token fields:")
    display(df[["pid", "title", "title_tokens", "brand_tokens", "category_tokens"]].head(5))


# Run Processing Pipeline

In [6]:
# 1. Load the raw dataset
df_raw = load_data("../../data/fashion_products_dataset.json")

# 2. Inspect a few entries
inspect_data(df_raw)

# 3. Preprocess the corpus
df_processed = preprocess_corpus(df_raw)

# 4. Summarize to verify everything looks fine
summarize_preprocessing(df_processed)

# 5. Save the processed dataset for future use
save_processed_data(df_processed, "data/processed_corpus.json")

Shape: (28080, 17)
Columns: ['_id', 'actual_price', 'average_rating', 'brand', 'category', 'crawled_at', 'description', 'discount', 'images', 'out_of_stock', 'pid', 'product_details', 'seller', 'selling_price', 'sub_category', 'title', 'url']


Unnamed: 0,_id,actual_price,average_rating,brand,category,crawled_at,description,discount,images,out_of_stock,pid,product_details,seller,selling_price,sub_category,title,url
0,fa8e22d6-c0b6-5229-bb9e-ad52eda39a0a,2999,3.9,York,Clothing and Accessories,1612987911000,Yorker trackpants made from 100% rich combed c...,69% off,[https://rukminim1.flixcart.com/image/128/128/...,False,TKPFCZ9EA7H5FYZH,"[{'Style Code': '1005COMBO2'}, {'Closure': 'El...",Shyam Enterprises,921,Bottomwear,Solid Women Multicolor Track Pants,https://www.flipkart.com/yorker-solid-men-mult...
1,893e6980-f2a0-531f-b056-34dd63fe912c,1499,3.9,York,Clothing and Accessories,1612987912000,Yorker trackpants made from 100% rich combed c...,66% off,[https://rukminim1.flixcart.com/image/128/128/...,False,TKPFCZ9EJZV2UVRZ,"[{'Style Code': '1005BLUE'}, {'Closure': 'Draw...",Shyam Enterprises,499,Bottomwear,Solid Men Blue Track Pants,https://www.flipkart.com/yorker-solid-men-blue...
2,eb4c8eab-8206-59d0-bcd1-a724d96bf74f,2999,3.9,York,Clothing and Accessories,1612987912000,Yorker trackpants made from 100% rich combed c...,68% off,[https://rukminim1.flixcart.com/image/128/128/...,False,TKPFCZ9EHFCY5Z4Y,"[{'Style Code': '1005COMBO4'}, {'Closure': 'El...",Shyam Enterprises,931,Bottomwear,Solid Men Multicolor Track Pants,https://www.flipkart.com/yorker-solid-men-mult...


Unnamed: 0,_id,actual_price,average_rating,brand,category,crawled_at,description,discount,images,out_of_stock,pid,product_details,seller,selling_price,sub_category,title,url
5874,da98f30d-4ad6-5969-91a9-4255ed70343a,849,3.2,East I,Clothing and Accessories,1612991852000,"Green Printed Boxers, Has An Inner Elastica...",28% off,[https://rukminim1.flixcart.com/image/128/128/...,False,BXRFTZCG8BGSBXES,"[{'Color': 'Green'}, {'Fabric': 'Pure Cotton'}...",ZIYAA,610,Innerwear and Swimwear,Printed Women Boxer (Pack of 1),https://www.flipkart.com/east-ink-printed-men-...
12010,e3645168-c57b-53ef-b209-39db7961960f,1199,2.5,ECKO Unl,Clothing and Accessories,1612996513000,ECKO Unltd Slim Fit Cotton Woven BLUE/NAVY BL...,33% off,[https://rukminim1.flixcart.com/image/128/128/...,False,SHTFVY2MNEEAFXVT,"[{'Pack of': '1'}, {'Style Code': 'EKSH000427'...",SandSMarketing,803,Topwear,Women Slim Fit Checkered Cut Away Collar Casua...,https://www.flipkart.com/ecko-unltd-men-checke...
8113,dda213e6-f41f-5c51-9ab3-c588b978035c,298,4.0,,Clothing and Accessories,1612993582000,All day comfort and support from work to play ...,10% off,[https://rukminim1.flixcart.com/image/128/128/...,False,VESFN5RUPZBH5ZQJ,"[{'Neck': 'Round Neck'}, {'Pattern': 'Solid'},...",SONUMONUGARMENT,267,Innerwear and Swimwear,VIP Women Vest (Pack of 3),https://www.flipkart.com/vip-men-vest/p/itm21f...


  text = BeautifulSoup(text, "html.parser").get_text(separator=" ")


Total rows: 28080
Total columns: 25
Index(['_id', 'actual_price', 'average_rating', 'brand', 'category',
       'crawled_at', 'description', 'discount', 'images', 'out_of_stock',
       'pid', 'product_details', 'seller', 'selling_price', 'sub_category',
       'title', 'url', 'title_tokens', 'brand_tokens', 'category_tokens',
       'subcategory_tokens', 'details_tokens', 'description_tokens', 'tokens',
       'full_text'],
      dtype='object')
Average tokens per document: 68.94
Total unique tokens in corpus: 8668

Sample processed entries with separate token fields:


Unnamed: 0,pid,title,title_tokens,brand_tokens,category_tokens
0,TKPFCZ9EA7H5FYZH,solid women multicolor track pants,"[solid, women, multicolor, track, pant]",[york],"[cloth, accessori]"
1,TKPFCZ9EJZV2UVRZ,solid men blue track pants,"[solid, men, blue, track, pant]",[york],"[cloth, accessori]"
2,TKPFCZ9EHFCY5Z4Y,solid men multicolor track pants,"[solid, men, multicolor, track, pant]",[york],"[cloth, accessori]"
3,TKPFCZ9ESZZ7YWEF,solid women multicolor track pants,"[solid, women, multicolor, track, pant]",[york],"[cloth, accessori]"
4,TKPFCZ9EVXKBSUD7,solid women brown grey track pants,"[solid, women, brown, grey, track, pant]",[york],"[cloth, accessori]"
