# Setup & Data loading

In [1]:
import json
import re
import html
from pathlib import Path
from typing import Any, Dict, List, Tuple
import unicodedata

import pandas as pd
from bs4 import BeautifulSoup


def load_data(file_path: str | Path) -> pd.DataFrame:
    p = Path(file_path)
    with p.open('r', encoding='utf-8') as f:
        first = f.read(1)
        f.seek(0)
        if first == '[':
            data = json.load(f)
        else:
            data = [json.loads(line) for line in f if line.strip()]
    return pd.DataFrame(data)


def inspect_data(df: pd.DataFrame) -> None:
    """Print the frame's shape and column names, then show a few rows.

    The first three rows are displayed, followed by a random sample of up to
    three rows (fewer when the frame has fewer rows).
    """
    column_names = list(df.columns)
    n_random = min(3, len(df))

    print('Shape:', df.shape)
    print('Columns:', column_names)

    first_rows = df.head(3)
    display(first_rows)

    random_rows = df.sample(n_random)
    display(random_rows)


# Text Cleaning and Normalization

In [8]:
def clean_text(text: str) -> str:
    """Normalize raw product text to lowercase ASCII words.

    Steps, in order: unescape HTML entities and strip tags, fold accented
    characters to ASCII, lowercase, join a few clothing terms with a hyphen
    ("t shirt" -> "t-shirt"), remove URLs, blank out everything except
    letters, whitespace and hyphens, drop hyphens that do not join two
    letters, and collapse whitespace.

    Args:
        text: Raw text. Non-string or blank input is accepted.

    Returns:
        The cleaned text, or an empty string for non-string/blank input.
    """
    if not isinstance(text, str) or not text.strip():
        return ""

    # Unescape HTML and strip tags
    text = html.unescape(text)
    text = BeautifulSoup(text, "html.parser").get_text(separator=" ")

    # Normalize unicode: decompose, then drop everything that is not ASCII
    # (removes accents; characters with no ASCII decomposition are dropped).
    text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("utf-8", "ignore")

    # Lowercase
    text = text.lower()

    # Normalize common clothing terms: "t shirt" -> "t-shirt", "v neck" -> "v-neck", etc.
    text = re.sub(r"\bt\s+shirt\b", "t-shirt", text)
    text = re.sub(r"\bv\s+neck\b", "v-neck", text)
    text = re.sub(r"\bround\s+neck\b", "round-neck", text)
    text = re.sub(r"\bpolo\s+neck\b", "polo-neck", text)

    # Remove URLs
    text = re.sub(r"http\S+|www\S+", " ", text)

    # Remove punctuation, digits, and non-alphabetic characters (but preserve hyphens)
    text = re.sub(r"[^a-z\s-]", " ", text)

    # Only hyphens that join two letters are meaningful ("t-shirt"). Removing
    # digits/punctuation above can leave dangling ones ("pack-of-3" ->
    # "pack-of-", "men - blue"), which would otherwise end up inside tokens.
    text = re.sub(r"(?<![a-z])-+|-+(?![a-z])", " ", text)

    # Collapse multiple spaces
    return re.sub(r"\s+", " ", text).strip()

# Tokenization & Text Processing

In [9]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

# Uncomment and run once if these NLTK resources have not been downloaded yet
# nltk.download('punkt')
# nltk.download('punkt_tab')  
# nltk.download('stopwords')

# English stopword list from the NLTK corpus, stored as a set.
# Needs the NLTK 'stopwords' resource (see the commented-out nltk.download calls).
STOP_WORDS = set(stopwords.words('english'))
# Snowball stemmer for English, created once and reused by stem_tokens.
STEMMER = SnowballStemmer('english')


def tokenize_text(text: str) -> List[str]:
    """Split text into tokens with NLTK's ``word_tokenize``.

    Returns an empty list when ``text`` is not a string or contains only
    whitespace.
    """
    has_content = isinstance(text, str) and bool(text.strip())
    if has_content:
        return word_tokenize(text)
    return []


def remove_stopwords(tokens: List[str]) -> List[str]:
    """Return the tokens that are not in ``STOP_WORDS``, keeping their order."""
    kept: List[str] = []
    for token in tokens:
        if token in STOP_WORDS:
            continue
        kept.append(token)
    return kept


def stem_tokens(tokens: List[str]) -> List[str]:
    """Stem every token longer than two characters with ``STEMMER``.

    Tokens of length two or less are skipped entirely (not stemmed and not
    returned).
    """
    stems: List[str] = []
    for token in tokens:
        if len(token) <= 2:
            # Short tokens are dropped rather than stemmed.
            continue
        stems.append(STEMMER.stem(token))
    return stems


def preprocess_text(text: str) -> List[str]:
    """Run the full text pipeline: clean, tokenize, drop stopwords, stem.

    Returns the list of stemmed tokens produced by applying ``clean_text``,
    ``tokenize_text``, ``remove_stopwords`` and ``stem_tokens`` in that order.
    """
    return stem_tokens(remove_stopwords(tokenize_text(clean_text(text))))


# Record-Level Preprocessing

In [10]:
# Demonstration on real dataset examples
# df_raw is created here and reused by the pipeline cell further down.
df_raw = load_data("../../data/fashion_products_dataset.json")

# Pick two random entries from the dataset
# No random_state is passed, so a different pair is drawn on every run.
sample = df_raw.sample(2)[["pid", "title", "description"]]

for _, row in sample.iterrows():
    pid = row.get("pid", "")
    # NOTE(review): str() turns a missing value into the literal "None"/"nan";
    # confirm title/description are always present in the dataset.
    title = str(row.get("title", ""))
    desc = str(row.get("description", ""))
    # Title and description are joined into one text, separated by ". ".
    text = f"{title}. {desc}".strip()
    cleaned = clean_text(text)
    tokens = preprocess_text(text)
    print(f"PID: {pid}")
    # Long values are cut to 250 characters / 25 tokens to keep the output short.
    print("Raw:       ", text[:250] + ("..." if len(text) > 250 else ""))
    print("Cleaned:   ", cleaned[:250] + ("..." if len(cleaned) > 250 else ""))
    print("Tokens:    ", tokens[:25], ("... (truncated)" if len(tokens) > 25 else ""))
    print("-")


PID: TSHFUF2ZYHVM2GWT
Raw:        Printed Men Round Neck Grey T-Shirt. Look trendy and feel comfortable with this Crew neck Tshirt featuring MULAN. Crafted out of 100% Cotton, which is biowashed for smooth feel and befriend to skin, this featuring can be worn for any occasion, a casu...
Cleaned:    printed men round-neck grey t-shirt look trendy and feel comfortable with this crew neck tshirt featuring mulan crafted out of cotton which is biowashed for smooth feel and befriend to skin this featuring can be worn for any occasion a casual day at ...
Tokens:     ['print', 'men', 'round-neck', 'grey', 't-shirt', 'look', 'trendi', 'feel', 'comfort', 'crew', 'neck', 'tshirt', 'featur', 'mulan', 'craft', 'cotton', 'biowash', 'smooth', 'feel', 'befriend', 'skin', 'featur', 'worn', 'occas', 'casual'] ... (truncated)
-
PID: BXRFT7BGGMTXK33K
Raw:        Checkered Women Boxer  (Pack of 3).
Cleaned:    checkered women boxer pack of
Tokens:     ['checker', 'women', 'boxer', 'pack'] 
-


In [11]:

def preprocess_product_details(details: Any) -> str:
    """Flatten a product-details value into one cleaned text string.

    Args:
        details: Usually a list whose items are dicts of attribute -> value
            (other item types are stringified). A single dict is treated as a
            one-item list. Any other value is stringified as a whole.

    Returns:
        The text passed through ``clean_text``. Missing input (``None`` or a
        float NaN) and empty lists yield an empty string, so a missing field
        no longer contributes the literal word "none"/"nan".
    """
    # NaN is the only float that is not equal to itself.
    if details is None or (isinstance(details, float) and details != details):
        return ""

    if isinstance(details, dict):
        details = [details]

    if isinstance(details, list):
        parts = []
        for item in details:
            if item is None:
                continue
            if isinstance(item, dict):
                for k, v in item.items():
                    # Keep the attribute name even when its value is missing.
                    parts.append(f"{k}: {v}" if v is not None else str(k))
            else:
                parts.append(str(item))
        if not parts:
            return ""
        return clean_text(' '.join(parts))

    return clean_text(str(details))


def normalize_numeric_fields(record: Dict[str, Any]) -> Dict[str, Any]:
    rec = dict(record)
    
    # Convert price fields
    for key in ('selling_price', 'actual_price'):
        if key in rec and rec[key] is not None:
            val = rec[key]  
            if isinstance(val, (int, float)):
                continue
            s = str(val).replace(',', '').strip()  # remove thousands separators
            s = re.sub(r"[^0-9.\-]", "", s)       # keep only digits, dot, minus
            try:
                rec[key] = float(s) if '.' in s else int(s)
            except Exception:
                rec[key] = None
    
    # Convert discount field ("69% off" to 69)
    if 'discount' in rec and rec['discount'] is not None:
        s = str(rec['discount'])
        s = re.sub(r"[^0-9]", "", s)
        try:
            rec['discount'] = int(s)
        except Exception:
            rec['discount'] = None
    
    # Convert average_rating field
    if 'average_rating' in rec and rec['average_rating'] is not None:
        val = rec['average_rating']
        try:
            rec['average_rating'] = float(str(val).replace(',', '.').strip())
        except Exception:
            rec['average_rating'] = None
    
    # Ensure out_of_stock is boolean
    if 'out_of_stock' in rec:
        rec['out_of_stock'] = bool(rec['out_of_stock'])
    
    return rec

# Seller and brand are cleaned along with the other text fields. `pid` and `out_of_stock` (a boolean) need no text cleaning, and `url` is left as-is because it is not a free-text field.
def preprocess_record(record: Dict[str, Any]) -> Dict[str, Any]:
    """Clean and tokenize one product record.

    Numeric fields are normalized with ``normalize_numeric_fields``. The text
    fields (title, description, category, sub_category, brand, seller,
    product_details) are replaced by their cleaned versions, and token lists
    are added per field plus for the combined text.

    Args:
        record: A raw product record. Missing text fields are treated as empty.

    Returns:
        A new dict containing the original keys (text fields overwritten with
        cleaned text) plus ``title_tokens``, ``brand_tokens``,
        ``category_tokens``, ``subcategory_tokens``, ``details_tokens``,
        ``seller_tokens``, ``description_tokens``, ``tokens`` and
        ``full_text``.
    """
    rec = normalize_numeric_fields(record)

    title = clean_text(rec.get('title', ''))
    description = clean_text(rec.get('description', ''))
    category = clean_text(rec.get('category', ''))
    subcategory = clean_text(rec.get('sub_category', ''))  # dataset key is 'sub_category'
    brand = clean_text(rec.get("brand", ""))
    seller = clean_text(rec.get("seller", ""))

    # Product details are a list of attribute dicts; flatten them to text
    details = preprocess_product_details(rec.get('product_details'))

    # Tokenize each field separately for flexible weighting in IR
    title_tokens = preprocess_text(title)
    brand_tokens = preprocess_text(brand)
    category_tokens = preprocess_text(category)
    subcategory_tokens = preprocess_text(subcategory)
    details_tokens = preprocess_text(details)
    seller_tokens = preprocess_text(seller)
    description_tokens = preprocess_text(description)

    # Also keep combined tokens for convenience. Empty fields are skipped so
    # the combined text has no runs of spaces where a field was missing.
    text_fields = (title, description, category, subcategory, brand, seller, details)
    full_text = ' '.join(part for part in text_fields if part)
    tokens = preprocess_text(full_text)

    # Here we opt to update the original fields instead of creating new ones
    rec.update({
        "title": title,
        "description": description,
        "category": category,
        "sub_category": subcategory,
        "brand": brand,
        "seller": seller,
        "product_details": details,
        "title_tokens": title_tokens,
        "brand_tokens": brand_tokens,
        "category_tokens": category_tokens,
        "subcategory_tokens": subcategory_tokens,
        "details_tokens": details_tokens,
        "seller_tokens": seller_tokens,
        "description_tokens": description_tokens,
        "tokens": tokens,
        "full_text": full_text
    })
    return rec


def preprocess_corpus(df: pd.DataFrame) -> pd.DataFrame:
    """Apply ``preprocess_record`` to every row and return the results as a DataFrame."""

    def _process_row(row: pd.Series) -> pd.Series:
        # Each row is handed over as a plain dict and the result is turned
        # back into a Series so that apply() assembles a DataFrame.
        processed = preprocess_record(row.to_dict())
        return pd.Series(processed)

    return df.apply(_process_row, axis=1)


# Export & Summary

In [12]:
def save_processed_data(df: pd.DataFrame, path: str | Path) -> None:
    p = Path(path)
    p.parent.mkdir(parents=True, exist_ok=True)
    if p.suffix.lower() == '.csv':
        df.to_csv(p, index=False)
    else:
        df.to_json(p, orient='records', force_ascii=False)


def summarize_preprocessing(df: pd.DataFrame) -> None:
    """Print basic statistics about a preprocessed corpus and show a preview.

    Prints the row and column counts and the column index. When a ``tokens``
    column exists, also prints the average number of tokens per row and the
    number of distinct tokens (cells that are not lists count as empty).
    Finally displays the first five rows of the preview columns that are
    present in ``df``; columns missing from the frame are skipped instead of
    raising ``KeyError``.

    Args:
        df: Frame produced by the preprocessing step.
    """
    print(f"Total rows: {len(df)}")
    print(f"Total columns: {len(df.columns)}")
    print(df.columns)

    if "tokens" in df.columns:
        token_lengths = df["tokens"].apply(lambda x: len(x) if isinstance(x, list) else 0)
        print(f"Average tokens per document: {token_lengths.mean():.2f}")
        vocabulary = {t for lst in df["tokens"] if isinstance(lst, list) for t in lst}
        print(f"Total unique tokens in corpus: {len(vocabulary)}")

    print("\nSample processed entries with separate token fields:")
    wanted = ("pid", "title", "title_tokens", "brand_tokens", "category_tokens")
    preview_cols = [col for col in wanted if col in df.columns]
    if preview_cols:
        display(df[preview_cols].head(5))


# Run Processing Pipeline

In [None]:
# Inspect a few entries
# df_raw comes from the load_data call in the demonstration cell above, so
# that cell has to run first on a fresh kernel.
inspect_data(df_raw)

# Preprocess the corpus
df_processed = preprocess_corpus(df_raw)

# Summarize to verify everything looks fine
summarize_preprocessing(df_processed)

# Save the processed dataset for future use
# NOTE(review): this path is relative to the current working directory, while
# the input was read from "../../data/..." — confirm the output location is intended.
save_processed_data(df_processed, "data/processed_corpus.json")

Shape: (28080, 17)
Columns: ['_id', 'actual_price', 'average_rating', 'brand', 'category', 'crawled_at', 'description', 'discount', 'images', 'out_of_stock', 'pid', 'product_details', 'seller', 'selling_price', 'sub_category', 'title', 'url']


Unnamed: 0,_id,actual_price,average_rating,brand,category,crawled_at,description,discount,images,out_of_stock,pid,product_details,seller,selling_price,sub_category,title,url
0,fa8e22d6-c0b6-5229-bb9e-ad52eda39a0a,2999,3.9,York,Clothing and Accessories,1612987911000,Yorker trackpants made from 100% rich combed c...,69% off,[https://rukminim1.flixcart.com/image/128/128/...,False,TKPFCZ9EA7H5FYZH,"[{'Style Code': '1005COMBO2'}, {'Closure': 'El...",Shyam Enterprises,921,Bottomwear,Solid Women Multicolor Track Pants,https://www.flipkart.com/yorker-solid-men-mult...
1,893e6980-f2a0-531f-b056-34dd63fe912c,1499,3.9,York,Clothing and Accessories,1612987912000,Yorker trackpants made from 100% rich combed c...,66% off,[https://rukminim1.flixcart.com/image/128/128/...,False,TKPFCZ9EJZV2UVRZ,"[{'Style Code': '1005BLUE'}, {'Closure': 'Draw...",Shyam Enterprises,499,Bottomwear,Solid Men Blue Track Pants,https://www.flipkart.com/yorker-solid-men-blue...
2,eb4c8eab-8206-59d0-bcd1-a724d96bf74f,2999,3.9,York,Clothing and Accessories,1612987912000,Yorker trackpants made from 100% rich combed c...,68% off,[https://rukminim1.flixcart.com/image/128/128/...,False,TKPFCZ9EHFCY5Z4Y,"[{'Style Code': '1005COMBO4'}, {'Closure': 'El...",Shyam Enterprises,931,Bottomwear,Solid Men Multicolor Track Pants,https://www.flipkart.com/yorker-solid-men-mult...


Unnamed: 0,_id,actual_price,average_rating,brand,category,crawled_at,description,discount,images,out_of_stock,pid,product_details,seller,selling_price,sub_category,title,url
8437,83fd059b-ec55-5aa0-9c94-c09fec73d1d9,235,2.3,,Clothing and Accessories,1612993781000,Good quality product,14% off,[https://rukminim1.flixcart.com/image/128/128/...,False,VESFR7AV2HNHS564,"[{'Pattern': 'Solid'}, {'Sleeve': 'Sleeveless'...",Bigretail NXT2.3Seller changed. Check for any ...,200,Innerwear and Swimwear,VIP Men Vest,https://www.flipkart.com/vip-men-vest/p/itm52c...
21016,3a5ec2b8-304e-5fb3-afdd-88414440bc62,419,4.1,,Clothing and Accessories,1613002492000,EXPERIENCE YOURSELF THE MOST COMFORTABLE PRODU...,14% off,[https://rukminim1.flixcart.com/image/128/128/...,False,TKPFHPFKVD69PVE4,"[{'Style Code': 'COOL-TS134_AML_M'}, {'Closure...",T T Limited,357,Bottomwear,Solid Women Grey Track Pants,https://www.flipkart.com/tt-solid-men-grey-tra...
7033,849683cb-4659-5e22-9222-5c95c14e5582,2599,3.3,Lafant,Clothing and Accessories,1612992694000,Style and comfort blend well in this Jacket fr...,65% off,[https://rukminim1.flixcart.com/image/128/128/...,True,JCKFCVY8PCH3ZFAF,"[{'Color': 'Red, Blue'}, {'Fabric': 'Nylon'}, ...",,890,Winter Wear,Full Sleeve Color Block Women Casual Jacket,https://www.flipkart.com/lafantar-full-sleeve-...
