In [2]:
import pandas as pd
import numpy as np
import re

from collections import defaultdict

In [3]:
# Load the items table (items_2025.csv) from the project data folder
items_25 = pd.read_csv(filepath_or_buffer="../Final Project/items_2025.csv")

In [4]:
# Append product title to the end of breadcrumb as final breadcrumb.
# Missing values are replaced with empty strings first: concatenating a string
# with NaN yields NaN, which would discard whichever half was present and make
# the value unusable for the string processing that follows.
items_25['breadcrumb_title'] = (
    items_25['product_breadcrumb'].fillna('').astype(str)
    + ','
    + items_25['product_title'].fillna('').astype(str)
)

In [5]:
# Create a function to split breadcrumbs into list
def breadcrumbs(product_breadcrumb, store_id, product_id):
    """Split a raw breadcrumb string into a list of cleaned parts.

    Parameters
    ----------
    product_breadcrumb : str
        Raw breadcrumb text. Square brackets and quote characters are removed,
        then the text is split on ',', '/', '>' and '|'. A missing value
        (None / NaN) yields an empty list instead of raising.
    store_id
        Not used by this function; kept so existing callers keep working.
    product_id
        Key of the single-entry dictionary that is returned.

    Returns
    -------
    dict
        ``{product_id: [part, ...]}`` where each part has surrounding
        whitespace stripped and empty parts are dropped.
    """
    if not isinstance(product_breadcrumb, str):
        # Missing breadcrumb: return an empty list rather than failing in re.sub
        if pd.isna(product_breadcrumb):
            return {product_id: []}
        product_breadcrumb = str(product_breadcrumb)

    # Remove unwanted characters (e.g., [ ] " ')
    clean_breadcrumb = re.sub(r'[\]\[\"\']', '', product_breadcrumb)

    # Split on any of the possible separators in one pass; '|' is included
    # because it was the common delimiter the separators were mapped to.
    parts = re.split(r'[,/>|]', clean_breadcrumb)

    # Strip spaces and remove empty parts
    return {product_id: [part.strip() for part in parts if part.strip()]}


# Dictionary to store breadcrumbs structured by store_id:
# {store_id: {product_id: [breadcrumb part, ...]}}
breadcrumbs_dict = {}

def breadcrumb_run(df):
    """Fill ``breadcrumbs_dict`` with the split breadcrumbs of every row in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'store_id', 'product_id' and
        'breadcrumb_title'.

    Returns
    -------
    None
        Results are written into the module-level ``breadcrumbs_dict``; a
        product that appears again overwrites its earlier entry.
    """
    # Iterate over the column values directly instead of df.loc[i, ...] with
    # i in range(len(df)): label-based lookup only works when the index is the
    # default 0..n-1 range and breaks on filtered or re-indexed frames.
    rows = zip(df['store_id'], df['product_id'], df['breadcrumb_title'])
    for store_id, product_id, product_breadcrumb in rows:
        # setdefault ensures the store_id entry exists before updating it
        breadcrumbs_dict.setdefault(store_id, {}).update(
            breadcrumbs(product_breadcrumb, store_id, product_id)
        )

In [6]:
# Run the breadcrumb split over the items table
breadcrumb_run(df=items_25)

In [17]:
# Define keyword categories.
# Order matters: categories are checked top to bottom and the first category
# with a matching keyword wins, so "Other" works as an exclusion list (e.g.
# "unperfumed" is caught there before "perfume" can match).
# NOTE(review): "Razors" is checked before "Shaving Cream", so the keyword
# "razor cream" can never win (it always contains "razor") — confirm that
# this ordering is intended.
keyword_sets = {
    "Other": {"fabric", "shaver adapter", "beard oil", "food", "candle", "candles", "birthday card", "unperfumed"},
    "Body Wash": {"body wash", "shower gel", "body soap"},
    "Shampoo/Conditioner": {"shampoo", "hair wash", "hair cleanser", "conditioner", "hair softener", "hair mask", "hydrating mask", "hair moisturizer"},
    "Razors": {"razor", "shaver", "trimmer", "epilator"},
    "Shaving Cream": {"shaving cream", "shaving foam", "shaving gel", "shaving lotion", "shave cream", "razor cream", "shaving butter", "pre-shave cream", "lathering cream", "shaving soap"},
    "Perfume/Cologne": {"perfume", "cologne", "eau de toilette", "eau de parfum", "parfum", "aftershave", "body mist"},
    "Deodorant": {"deodorant", "antiperspirant", "deodorizing spray", "body spray"}
}

# Dictionary to store classification results: {store_id: {product_id: "Category"}}
categorized_products = defaultdict(dict)


def _searchable_breadcrumbs(store_id, breadcrumb_list):
    """Return the breadcrumbs that should be scanned for keywords.

    In store 1 (TESCO, per the original author's note) some non-perfume items
    sit under "Perfumes" > "Aftershaves & Gift Sets". For those products the
    second and third breadcrumbs are left out of the search so they cannot
    trigger a perfume match. A new list is returned; the input list is never
    modified, so the stored breadcrumbs stay intact for later cells.
    """
    if (
        store_id == 1
        and len(breadcrumb_list) > 2
        and breadcrumb_list[1].lower() == "perfumes"
        and breadcrumb_list[2].lower() == "aftershaves & gift sets"
    ):
        return breadcrumb_list[:1] + breadcrumb_list[3:]
    return breadcrumb_list


def _match_category(breadcrumb_list):
    """Return the first matching category, scanning from the last breadcrumb backwards.

    Returns None when no keyword of any category occurs in any breadcrumb.
    """
    for breadcrumb in reversed(breadcrumb_list):
        breadcrumb_lower = breadcrumb.lower()
        for category, keywords in keyword_sets.items():
            if any(keyword in breadcrumb_lower for keyword in keywords):
                return category
    return None


for store_id, products in breadcrumbs_dict.items():
    for product_id, breadcrumb_list in products.items():
        if not breadcrumb_list:  # Skip empty breadcrumbs
            continue

        category = _match_category(_searchable_breadcrumbs(store_id, breadcrumb_list))

        # If no match was found, classify as "Other"
        categorized_products[store_id][product_id] = category if category is not None else "Other"


In [21]:
# View breadcrumb categorization for "eye test" validation.
# The results are kept under their own name so the `breadcrumbs()` helper
# function defined earlier is not overwritten by this cell.
category_to_view = "Razors"
category_breadcrumbs = defaultdict(dict)

for store_id, products in breadcrumbs_dict.items():
    for product_id, breadcrumb_list in products.items():
        # .get() is used because products with empty breadcrumbs were skipped
        # during classification and have no entry in categorized_products.
        if categorized_products.get(store_id, {}).get(product_id) == category_to_view:
            category_breadcrumbs[store_id][product_id] = breadcrumb_list

# Print results
for store_id, products in category_breadcrumbs.items():
    print(f"Store {store_id} ({category_to_view} Items):")
    for product_id, breadcrumb_list in products.items():
        print(f"  Product {product_id}: {' | '.join(breadcrumb_list)}")
    print()

Store 3 (Razors Items):
  Product 910001747207: Toiletries & Beauty | Womens Toiletries | Womens Shaving & Hair Removal | Womens Razors & Blades | Venus Extra Smooth 5 Blade Razor
  Product 910001747818: Toiletries & Beauty | Womens Toiletries | Womens Shaving & Hair Removal | Womens Razors & Blades | Venus Embrace 5 Blade Razor Blades
  Product 910002269927: Toiletries & Beauty | Womens Toiletries | Womens Shaving & Hair Removal | Womens Razors & Blades | Venus Swirl Flexiball Womens Razor
  Product 910002269859: Toiletries & Beauty | Womens Toiletries | Womens Shaving & Hair Removal | Womens Razors & Blades | Venus Swirl Womens 5 Blade Razor Blades Refill
  Product 910001981410: Home & Entertainment | Technology & Electricals | Beauty Electricals | Womens Electrical Grooming | TrueSmooth Wet & Dry Battery Lady Shaver - 8771BU
  Product 910001295856: Toiletries & Beauty | Womens Toiletries | Womens Shaving & Hair Removal | Womens Razors & Blades | Venus 2 Blade Disposable Razors 4 Pac

In [42]:
# Convert the categorized dictionary to a DataFrame: one record per product
items = [
    {
        "store_id": store_id,
        "product_id": product_id,
        # Combine the breadcrumb list into a single string
        "breadcrumb_clean": " | ".join(breadcrumbs_dict.get(store_id, {}).get(product_id, [])),
        "category": category,
    }
    for store_id, products in categorized_products.items()
    for product_id, category in products.items()
]

# Create the pandas DataFrame. The columns are named explicitly so the frame
# still has them when no product was categorized; otherwise grouping on
# 'category' would raise a KeyError on the column-less empty frame.
items_bc = pd.DataFrame(
    items,
    columns=["store_id", "product_id", "breadcrumb_clean", "category"],
)

In [43]:
# Number of non-null product_id values per assigned category
items_bc.groupby(by='category')['product_id'].count()

category
Body Wash                 789
Deodorant                 773
Other                  113590
Perfume/Cologne           139
Razors                    567
Shampoo/Conditioner      1616
Shaving Cream             106
Name: product_id, dtype: int64

In [40]:
# Import item descriptions.
# Read from the same relative project folder as items_2025.csv instead of a
# machine-specific absolute path, so the notebook also runs on other machines.
# NOTE(review): assumes "../Final Project" resolves to the folder the absolute
# path pointed to — confirm. The saved output of this cell echoes this
# statement, which suggests pandas emitted a warning here; if it is a
# mixed-dtype warning, pass an explicit dtype= for the affected columns.
items_desc = pd.read_csv("../Final Project/all_desc_processed.csv")

  items_desc = pd.read_csv('/Users/sambickel-barlow/Desktop/PP422/Final Project/all_desc_processed.csv')


In [52]:
# Left-join the breadcrumb-based product categories onto the item descriptions
items_merge = pd.merge(items_desc, items_bc, how='left', on=['store_id', 'product_id'])

In [59]:
# Keep only the rows that received a category other than "Other"
items_limit = items_merge[~(items_merge['category'].isna() | items_merge['category'].eq('Other'))]

In [60]:
# Number of non-null product_id values per category after filtering
items_limit.groupby(by='category')['product_id'].count()

category
Body Wash              351
Deodorant              404
Perfume/Cologne         72
Razors                 274
Shampoo/Conditioner    741
Shaving Cream           63
Name: product_id, dtype: int64