# Phase 1: Exploratory Data Analysis (EDA)
## Torob Product Retrieval RAG System

**Objective**: Understand the dataset structure, identify data quality issues, and gain insights for preprocessing.

**Dataset**: Torob E-commerce Platform
- 9 relational tables
- ~1M product records
- User interaction logs (searches, views, clicks)

## 1. Import Required Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path
import json

# Silence all library warnings so the notebook output stays readable
warnings.filterwarnings('ignore')

# pandas display: no column cap, at most 100 rows, width left unset (None)
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('display.width', None),
):
    pd.set_option(option_name, option_value)

# Global plot look: matplotlib style sheet first, then the seaborn palette
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Record the library versions this notebook was run with
print("‚úÖ Libraries imported successfully!")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")

✅ Libraries imported successfully!
Pandas version: 2.3.2
NumPy version: 2.2.6


## 2. Load All Datasets

Loading the 9 Torob dataset tables from parquet files.

In [2]:
# Folder that holds the parquet exports, relative to this notebook
DATA_PATH = Path('../Data/')

print("Loading datasets...")
print("-" * 60)

# Read every table once, in a fixed order, keyed by table name.
datasets = {
    table: pd.read_parquet(DATA_PATH / filename)
    for table, filename in [
        # Core product tables
        ('base_products', 'base_products.parquet'),
        ('members', 'members.parquet'),
        # User interaction tables
        ('searches', 'searches.parquet'),
        ('base_views', 'base_views.parquet'),
        ('final_clicks', 'final_clicks.parquet'),
        # Reference tables
        ('shops', 'shops.parquet'),
        ('categories', 'categories.parquet'),
        ('brands', 'brands.parquet'),
        ('cities', 'cities.parquet'),
    ]
}

# Expose each table under its own name for the cells that follow
base_products = datasets['base_products']
members = datasets['members']
searches = datasets['searches']
base_views = datasets['base_views']
final_clicks = datasets['final_clicks']
shops = datasets['shops']
categories = datasets['categories']
brands = datasets['brands']
cities = datasets['cities']

print("‚úÖ All datasets loaded successfully!")
print("-" * 60)

# One summary line per table: name, row count, column count
for name, df in datasets.items():
    n_rows, n_cols = df.shape
    print(f"{name:20s}: {n_rows:>10,} rows √ó {n_cols:>3} columns")

Loading datasets...
------------------------------------------------------------
✅ All datasets loaded successfully!
------------------------------------------------------------
base_products       :  1,022,298 rows ×   8 columns
members             :  1,948,665 rows ×   4 columns
searches            :    588,347 rows ×   9 columns
base_views          :    199,916 rows ×   4 columns
final_clicks        :     17,371 rows ×   4 columns
shops               :     23,342 rows ×   4 columns
categories          :        746 rows ×   3 columns
brands              :      2,025 rows ×   2 columns
cities              :        651 rows ×   2 columns


## 3. Initial Data Inspection

### 3.1 Base Products - Core Product Information

### 3.2 Missing Values Analysis

### 3.3 Statistical Summary

## 4. Distinct Values Analysis - Key Columns

Checking uniqueness and cardinality of important columns.

### 4.1 Product Identity Columns

In [None]:
# Cell purpose: report key uniqueness, foreign-key coverage and id uniqueness
# for the loaded tables. Everything here is read-only; it only prints.
print("=" * 80)
print("UNIQUENESS CHECK")
print("=" * 80)

# Base Products
print("\n BASE PRODUCTS:")
print(f"   Total rows: {len(base_products):,}")
print(f"   Unique random_keys: {base_products['random_key'].nunique():,}")
print(f"   Unique persian_name: {base_products['persian_name'].nunique():,}")
print(f"   Unique english_name: {base_products['english_name'].nunique():,}")


# Members (Shop Products)
print("\n MEMBERS")
print(f"   Total rows: {len(members):,}")
print(f"   Unique random_keys: {members['random_key'].nunique():,}")
print(f"   Unique base_random_keys: {members['base_random_key'].nunique():,}")
print(f"   Unique shop_id: {members['shop_id'].nunique():,}")

# Check relationship integrity (both counts are over DISTINCT keys, not rows)
print("\n RELATIONSHIP CHECK:")
base_keys = set(base_products['random_key'])
member_base_keys = set(members['base_random_key'])
print(f"   Members pointing to non-existent base products: {len(member_base_keys - base_keys):,}")
print(f"   Base products with no members: {len(base_keys - member_base_keys):,}")
print("\n")
print("=" * 80)
print("SOME INFORMATION ABOUT OTHER TABLES")
print("=" * 80)

# Categories
product_category_ids = base_products['category_id']
print("\n CATEGORIES:")
print(f"   Total categories defined: {len(categories):,}")
print(f"   Categories used in base_products: {product_category_ids.nunique():,}")
print(f"   Categories used in searches: {searches['category_id'].nunique():,}")
print(f"   Products without category : {(product_category_ids.isnull() | (product_category_ids == 0)).sum():,}")

# Brands
product_brand_ids = base_products['brand_id']
print("\n  BRANDS:")
print(f"   Total brands defined: {len(brands):,}")
print(f"   Brands used in base_products: {product_brand_ids.nunique():,}")
print(f"   Products without brand : {(product_brand_ids.isnull() | (product_brand_ids == 0)).sum():,}")
# The null/0 test above misses the sentinel this notebook uses for an unknown
# brand (brand_id == -1, see the class-imbalance cell), so report it as well.
print(f"   Products with unknown brand (ID=-1): {(product_brand_ids == -1).sum():,}")

# Shops
print("\n SHOPS:")
print(f"   Total shops defined: {len(shops):,}")
print(f"   Shops used in members: {members['shop_id'].nunique():,}")
print(f"   Shops used in final_clicks: {final_clicks['shop_id'].nunique():,}")

# Cities
print("\n CITIES:")
print(f"   Total cities defined: {len(cities):,}")
print(f"   Cities used in shops: {shops['city_id'].nunique():,}")

# ID Uniqueness Check for all tables
print("\n")
print("=" * 80)
print("ID UNIQUENESS CHECK FOR ALL TABLES")
print("=" * 80)

# Every table listed here is checked on its 'id' column in the same way.
id_tables = [
    ("SEARCHES", searches),
    ("BASE_VIEWS", base_views),
    ("FINAL_CLICKS", final_clicks),
    ("SHOPS", shops),
    ("CATEGORIES", categories),
    ("BRANDS", brands),
    ("CITIES", cities),
]

for table_label, table in id_tables:
    ids = table['id']
    print(f"\nüìã {table_label}:")
    print(f"   Total rows: {len(table):,}")
    print(f"   Unique ids: {ids.nunique():,}")
    print(f"   Is unique? {ids.is_unique}")

UNIQUENESS CHECK

 BASE PRODUCTS:
   Total rows: 1,022,298
   Unique random_keys: 1,022,298
   Unique persian_name: 953,449
   Unique english_name: 53,122

 MEMBERS
   Total rows: 1,948,665
   Unique random_keys: 1,948,665
   Unique base_random_keys: 1,022,294
   Unique shop_id: 23,342

 RELATIONSHIP CHECK:
   Members pointing to non-existent base products: 0
   Base products with no members: 4


SOME INFORMATION ABOUT OTHER TABLES

 CATEGORIES:
   Total categories defined: 746
   Categories used in base_products: 687
   Categories used in searches: 658
   Products without category : 0

  BRANDS:
   Total brands defined: 2,025
   Brands used in base_products: 1,908
   Products without brand : 0

 SHOPS:
   Total shops defined: 23,342
   Shops used in members: 23,342
   Shops used in final_clicks: 5,542

 CITIES:
   Total cities defined: 651
   Cities used in shops: 651


ID UNIQUENESS CHECK FOR ALL TABLES

📋 SEARCHES:
   Total rows: 588,347
   Unique ids: 588,347
   Is unique? True

### 4.2 Classification Columns (Categories, Brands, Shops)

### 4.3 Text & Image Data Availability

In [None]:
# Cell purpose: summarise the `price` column of `members` -- descriptive
# statistics, data-quality counts, a bucketed distribution and three plots.
print("=" * 80)
print("PRICE ANALYSIS - MEMBERS TABLE")
print("=" * 80)

# Every figure below is computed from this one Series and this one row count.
prices = members['price']
n_rows = len(members)

# Basic statistics
print(f"\nüìä PRICE STATISTICS:")
print(f"   Total products: {n_rows:,}")
print(f"   Mean price: {prices.mean():,.0f} Toman")
print(f"   Median price: {prices.median():,.0f} Toman")
print(f"   Std deviation: {prices.std():,.0f} Toman")
print(f"   Min price: {prices.min():,.0f} Toman")
print(f"   Max price: {prices.max():,.0f} Toman")

# Quartiles
print(f"\nüìà QUARTILES:")
for pct in (25, 50, 75, 95, 99):
    print(f"   {pct}th percentile: {prices.quantile(pct / 100):,.0f} Toman")

# Zero and outlier analysis
zero_prices = (prices == 0).sum()
negative_prices = (prices < 0).sum()
very_high_prices = (prices > 100_000_000_000).sum()  # > 100 billion

print(f"\n‚ö†Ô∏è  DATA QUALITY ISSUES:")
print(f"   Zero prices: {zero_prices:,} ({zero_prices/n_rows*100:.2f}%)")
print(f"   Negative prices: {negative_prices:,} ({negative_prices/n_rows*100:.2f}%)")
print(f"   Very high prices (>100B): {very_high_prices:,} ({very_high_prices/n_rows*100:.2f}%)")

# Price buckets for the distribution chart. Buckets are half-open [low, high),
# except "< 100K", which is (0, 100K): zero has its own bucket, and starting
# at 0 rather than 1 keeps any price strictly between 0 and 1 from falling
# outside every bucket. Negative and missing prices belong to no bucket.
bucket_masks = {
    "Zero": prices == 0,
    "< 100K": (prices > 0) & (prices < 100_000),
    "100K - 1M": (prices >= 100_000) & (prices < 1_000_000),
    "1M - 10M": (prices >= 1_000_000) & (prices < 10_000_000),
    "10M - 100M": (prices >= 10_000_000) & (prices < 100_000_000),
    "100M - 1B": (prices >= 100_000_000) & (prices < 1_000_000_000),
    "> 1B": prices >= 1_000_000_000,
}

# Calculate counts and percentages (percentages are relative to ALL rows)
range_labels = list(bucket_masks)
range_counts = [mask.sum() for mask in bucket_masks.values()]
range_pcts = [count / n_rows * 100 for count in range_counts]

# Visualizations
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# The first two plots are cut at the 95th percentile so the long right tail
# does not flatten them; the subset is computed once and shared.
price_95 = prices.quantile(0.95)
prices_up_to_p95 = prices[prices <= price_95]

# 1. Price Distribution (without outliers - 95th percentile)
axes[0].hist(prices_up_to_p95, bins=100, edgecolor='black', alpha=0.7, color='orange')
axes[0].set_xlabel('Price (Toman)')
axes[0].set_ylabel('Frequency')
axes[0].set_title(f'Price Distribution (Up to 95th percentile: {price_95:,.0f})')
axes[0].ticklabel_format(style='plain', axis='x')

# 2. Box plot
axes[1].boxplot(prices_up_to_p95, vert=False)
axes[1].set_xlabel('Price (Toman)')
axes[1].set_title('Box Plot (Up to 95th percentile)')
axes[1].ticklabel_format(style='plain', axis='x')

# 3. Price Range Distribution Bar Chart
axes[2].bar(range_labels, range_counts, edgecolor='black', alpha=0.7, color='steelblue')
axes[2].set_xlabel('Price Range')
axes[2].set_ylabel('Count')
axes[2].set_title('Price Range Distribution')
axes[2].tick_params(axis='x', rotation=45)
axes[2].ticklabel_format(style='plain', axis='y')

# Add percentage labels on bars
for i, (count, pct) in enumerate(zip(range_counts, range_pcts)):
    axes[2].text(i, count, f'{pct:.1f}%', ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()

# Print price range distribution
print(f"\nüìä PRICE RANGE DISTRIBUTION:")
for label, count, pct in zip(range_labels, range_counts, range_pcts):
    print(f"   {label:20s}: {count:>10,} ({pct:>5.2f}%)")

### 4.4 User Interaction Data

In [None]:
# Cell purpose: quantify how unevenly products are spread across categories
# and brands (with and without the unknown-brand sentinel brand_id == -1),
# then plot the distributions. Everything here is read-only on the tables.
#
# NOTE: `fix_persian_text` is not defined in this cell; it must already exist
# in the notebook session before this cell runs.


def _title_lookup(reference_table):
    """Return an ``{id: title}`` dict built once from a reference table.

    When an id is repeated the first row wins, which matches picking the
    first match of a boolean-mask lookup on the table.
    """
    unique_rows = reference_table.drop_duplicates(subset='id')
    return dict(zip(unique_rows['id'], unique_rows['title']))


def _print_ranked(counts, titles, denominator):
    """Print one numbered line per entry of ``counts``.

    ``counts`` is a Series of product counts indexed by id, ``titles`` the
    matching ``{id: title}`` dict, and ``denominator`` the product total used
    for the percentage column. Ids missing from ``titles`` print as "Unknown".
    """
    for rank, (item_id, count) in enumerate(counts.items(), 1):
        share = count / denominator * 100
        title = titles.get(item_id, "Unknown")
        print(f"   {rank:2d}. {title[:40]:40s} | ID: {item_id:6.0f} | Count: {count:>8,} ({share:>5.2f}%)")


def _chart_labels(item_ids, titles, overrides=None):
    """Return axis labels for ``item_ids``, each passed through ``fix_persian_text``.

    Titles are cut to 30 characters; ids without a title become ``ID_<id>``.
    ``overrides`` maps specific ids to a fixed label that takes precedence.
    """
    overrides = overrides or {}
    labels = []
    for item_id in item_ids:
        if item_id in overrides:
            text = overrides[item_id]
        elif item_id in titles:
            text = titles[item_id][:30]
        else:
            text = f"ID_{item_id}"
        labels.append(fix_persian_text(text))
    return labels


def _plot_top_bar(ax, counts, labels, color, title):
    """Draw a horizontal bar chart of ``counts`` with the largest bar on top."""
    positions = range(len(counts))
    ax.barh(positions, counts.values, color=color, alpha=0.7)
    ax.set_yticks(positions)
    ax.set_yticklabels(labels, fontsize=9)
    ax.set_xlabel('Number of Products')
    ax.set_title(title)
    ax.invert_yaxis()
    ax.ticklabel_format(style='plain', axis='x')


def _plot_size_hist(ax, counts, color, xlabel, ylabel, title):
    """Draw a 50-bin histogram of group sizes with a log-scaled y axis."""
    ax.hist(counts.values, bins=50, edgecolor='black', alpha=0.7, color=color)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.set_yscale('log')
    ax.ticklabel_format(style='plain', axis='x')


print("=" * 80)
print("CLASS IMBALANCE ANALYSIS - CATEGORIES & BRANDS")
print("=" * 80)

# Build the id -> title lookups once instead of scanning the reference
# tables for every printed row and every chart label.
category_titles = _title_lookup(categories)
brand_titles = _title_lookup(brands)

# ===== CATEGORY IMBALANCE ANALYSIS =====
print(f"\n{'='*80}")
print("CATEGORY DISTRIBUTION ANALYSIS")
print("="*80)

# Get category distribution
category_counts = base_products['category_id'].value_counts().sort_values(ascending=False)
total_products = len(base_products)

print(f"\nüìä CATEGORY STATISTICS:")
print(f"   Total categories in use: {base_products['category_id'].nunique():,}")
print(f"   Total defined categories: {len(categories):,}")
print(f"   Products without category: {(base_products['category_id'].isnull() | (base_products['category_id'] == 0)).sum():,}")

# Top and bottom categories
print(f"\nüîù TOP 10 CATEGORIES:")
_print_ranked(category_counts.head(10), category_titles, total_products)

print(f"\nüîª BOTTOM 10 CATEGORIES:")
_print_ranked(category_counts.tail(10), category_titles, total_products)

# Calculate imbalance metrics
top_category_pct = (category_counts.iloc[0] / total_products) * 100
bottom_category_pct = (category_counts.iloc[-1] / total_products) * 100
imbalance_ratio = category_counts.iloc[0] / category_counts.iloc[-1]

print(f"\n‚öñÔ∏è  CATEGORY IMBALANCE METRICS:")
print(f"   Top category share: {top_category_pct:.2f}%")
print(f"   Bottom category share: {bottom_category_pct:.4f}%")
print(f"   Imbalance ratio (top/bottom): {imbalance_ratio:,.2f}x")
print(f"   Categories with <100 products: {(category_counts < 100).sum():,}")
print(f"   Categories with <10 products: {(category_counts < 10).sum():,}")

# ===== BRAND IMBALANCE ANALYSIS =====
print(f"\n{'='*80}")
print("BRAND DISTRIBUTION ANALYSIS")
print("="*80)

# Get brand distribution (including all brands)
brand_counts = base_products['brand_id'].value_counts().sort_values(ascending=False)

# brand_id == -1 is the unknown-brand sentinel; count it once and reuse.
unknown_brand_count = (base_products['brand_id'] == -1).sum()
unknown_brand_pct = unknown_brand_count / len(base_products) * 100

print(f"\nüìä BRAND STATISTICS (ALL):")
print(f"   Total brands in use: {base_products['brand_id'].nunique():,}")
print(f"   Total defined brands: {len(brands):,}")
print(f"   Products without brand: {(base_products['brand_id'].isnull() | (base_products['brand_id'] == 0)).sum():,}")
print(f"   Products with unknown brand (ID=-1): {unknown_brand_count:,} ({unknown_brand_pct:.2f}%)")

# Top and bottom brands
print(f"\nüîù TOP 10 BRANDS:")
_print_ranked(brand_counts.head(10), brand_titles, total_products)

print(f"\nüîª BOTTOM 10 BRANDS:")
_print_ranked(brand_counts.tail(10), brand_titles, total_products)

# Calculate imbalance metrics (all brands)
top_brand_pct = (brand_counts.iloc[0] / total_products) * 100
bottom_brand_pct = (brand_counts.iloc[-1] / total_products) * 100
brand_imbalance_ratio = brand_counts.iloc[0] / brand_counts.iloc[-1]

print(f"\n‚öñÔ∏è  BRAND IMBALANCE METRICS (ALL):")
print(f"   Top brand share: {top_brand_pct:.2f}%")
print(f"   Bottom brand share: {bottom_brand_pct:.4f}%")
print(f"   Imbalance ratio (top/bottom): {brand_imbalance_ratio:,.2f}x")
print(f"   Brands with <100 products: {(brand_counts < 100).sum():,}")
print(f"   Brands with <10 products: {(brand_counts < 10).sum():,}")

# ===== BRAND ANALYSIS (EXCLUDING ID = -1) =====
print(f"\n{'='*80}")
print("BRAND DISTRIBUTION ANALYSIS (EXCLUDING UNKNOWN BRAND ID=-1)")
print("="*80)

# Filter out brand_id == -1
base_products_known_brands = base_products[base_products['brand_id'] != -1]
brand_counts_filtered = base_products_known_brands['brand_id'].value_counts().sort_values(ascending=False)
total_products_filtered = len(base_products_known_brands)

print(f"\nüìä BRAND STATISTICS (EXCLUDING ID=-1):")
print(f"   Total products with known brands: {total_products_filtered:,}")
print(f"   Unique brands in use: {base_products_known_brands['brand_id'].nunique():,}")

# Top brands (excluding -1); shares are relative to known-brand products only
print(f"\nüîù TOP 10 BRANDS (EXCLUDING ID=-1):")
_print_ranked(brand_counts_filtered.head(10), brand_titles, total_products_filtered)

# Calculate imbalance metrics (excluding -1)
top_brand_pct_filtered = (brand_counts_filtered.iloc[0] / total_products_filtered) * 100
bottom_brand_pct_filtered = (brand_counts_filtered.iloc[-1] / total_products_filtered) * 100
brand_imbalance_ratio_filtered = brand_counts_filtered.iloc[0] / brand_counts_filtered.iloc[-1]

print(f"\n‚öñÔ∏è  BRAND IMBALANCE METRICS (EXCLUDING ID=-1):")
print(f"   Top brand share: {top_brand_pct_filtered:.2f}%")
print(f"   Bottom brand share: {bottom_brand_pct_filtered:.4f}%")
print(f"   Imbalance ratio (top/bottom): {brand_imbalance_ratio_filtered:,.2f}x")
print(f"   Brands with <100 products: {(brand_counts_filtered < 100).sum():,}")
print(f"   Brands with <10 products: {(brand_counts_filtered < 10).sum():,}")

# ===== VISUALIZATIONS =====
fig, axes = plt.subplots(2, 3, figsize=(22, 12))

# 1. Top 20 Categories Bar Chart
top_20_cats = category_counts.head(20)
cat_names = _chart_labels(top_20_cats.index, category_titles)
_plot_top_bar(axes[0, 0], top_20_cats, cat_names, 'steelblue',
              'Top 20 Categories by Product Count')

# 2. Category Distribution - Log Scale
_plot_size_hist(axes[0, 1], category_counts, 'coral',
                'Number of Products per Category', 'Number of Categories',
                'Category Size Distribution (Log Scale)')

# 3. Top 20 Brands Bar Chart (All brands including -1)
top_20_brands = brand_counts.head(20)
brand_names = _chart_labels(top_20_brands.index, brand_titles,
                            overrides={-1: "Unknown (ID=-1)"})
_plot_top_bar(axes[0, 2], top_20_brands, brand_names, 'darkgreen',
              'Top 20 Brands (Including ID=-1)')

# 4. Brand Distribution - Log Scale (All brands)
_plot_size_hist(axes[1, 0], brand_counts, 'purple',
                'Number of Products per Brand', 'Number of Brands',
                'Brand Size Distribution - All (Log Scale)')

# 5. Top 20 Brands Bar Chart (EXCLUDING ID=-1)
top_20_brands_filtered = brand_counts_filtered.head(20)
brand_names_filtered = _chart_labels(top_20_brands_filtered.index, brand_titles)
_plot_top_bar(axes[1, 1], top_20_brands_filtered, brand_names_filtered, 'teal',
              'Top 20 Brands (Excluding ID=-1)')

# 6. Brand Distribution - Log Scale (EXCLUDING ID=-1)
_plot_size_hist(axes[1, 2], brand_counts_filtered, 'orange',
                'Number of Products per Brand', 'Number of Brands',
                'Brand Size Distribution - Excluding ID=-1 (Log Scale)')

plt.tight_layout()
plt.show()

# Summary
print(f"\n{'='*80}")
print("SUMMARY - CLASS IMBALANCE")
print("="*80)
print(f"\nüéØ KEY FINDINGS:")
print(f"   ‚úì Categories show {imbalance_ratio:,.0f}x imbalance (top vs bottom)")
print(f"   ‚úì Brands (all) show {brand_imbalance_ratio:,.0f}x imbalance (top vs bottom)")
print(f"   ‚úì Brands (excl. ID=-1) show {brand_imbalance_ratio_filtered:,.0f}x imbalance")
print(f"   ‚úì {(category_counts < 100).sum():,} categories have < 100 products")
print(f"   ‚úì {(brand_counts < 100).sum():,} brands (all) have < 100 products")
print(f"   ‚úì {(brand_counts_filtered < 100).sum():,} brands (excl. ID=-1) have < 100 products")
print(f"\n‚ö†Ô∏è  IMPLICATIONS:")
print(f"   - Highly imbalanced dataset may require special handling")
print(f"   - Consider grouping rare categories/brands or using class weights")
print(f"   - May need stratified sampling for train/test split")
print(f"   - {unknown_brand_pct:.1f}% of products have unknown brand (ID=-1)")