# Smart Visual Commerce Platform - Complete Demo

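The platform's features are listed below. This notebook exercises items 1, 2, 6, and 7; items 3–5 are listed for completeness and are not run in the cells that follow: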
1. Data Loading & EDA
2. Visual Search
3. Attribute Extraction
4. Quality Assessment
5. Scene Understanding
6. Recommendations
7. Evaluation Metrics

In [None]:
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import warnings
warnings.filterwarnings('ignore')

# Import our modules
from src.data.loader import DatasetLoader
from src.data.preprocessor import ImagePreprocessor
from src.models.embeddings import CLIPEmbedder
from src.models.search import VisualSearchEngine
from src.models.vlm_client import VLMClient
from src.models.recommender import RecommendationEngine
from src.evaluation.metrics import SearchMetrics, evaluate_search_system

# Notebook-wide plot defaults: seaborn "whitegrid" style, 12x6 inch figures
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Load and Explore Dataset

In [None]:
# Create the loader and ask it for a 100-row sample dataset
loader = DatasetLoader()
df = loader.create_sample_dataset(num_samples=100)

# Print the frame's dimensions; the trailing expression renders the first rows
print("Dataset shape: {}".format(df.shape))
df.head()

In [None]:
# Summary statistics computed by the loader for the sample frame
stats = loader.get_statistics(df)

print("\nDataset Statistics:")
print(f"Total Products: {stats['total_products']}")

# Price figures live under the 'price_stats' key (min / max / mean)
price_stats = stats['price_stats']
print(f"\nPrice Range: ${price_stats['min']:.2f} - ${price_stats['max']:.2f}")
print(f"Average Price: ${price_stats['mean']:.2f}")

In [None]:
# 2x2 overview of the catalogue: category, colour, gender and price
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
category_ax, colour_ax = axes[0]
gender_ax, price_ax = axes[1]

# Top row: bar charts of value counts for two categorical columns
bar_panels = [
    (category_ax, 'category', 'Products by Category', 'Category'),
    (colour_ax, 'baseColour', 'Products by Color', 'Color'),
]
for panel_ax, column, panel_title, x_label in bar_panels:
    df[column].value_counts().plot(kind='bar', ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('Count')

# Bottom-left: gender share as a pie chart with one-decimal percentage labels
gender_counts = df['gender'].value_counts()
gender_counts.plot(kind='pie', ax=gender_ax, autopct='%1.1f%%')
gender_ax.set_title('Products by Gender')

# Bottom-right: price histogram with 20 bins
df['price'].hist(bins=20, ax=price_ax, edgecolor='black')
price_ax.set_title('Price Distribution')
price_ax.set_xlabel('Price ($)')
price_ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

## 2. Initialize Models

In [None]:
# Instantiate the CLIP embedder, bracketing the call with progress messages
clip_model_name = "ViT-B/32"

print("Loading CLIP model...")
embedder = CLIPEmbedder(model_name=clip_model_name)
print("✓ CLIP loaded")

In [None]:
# Optional step: the VLM client needs an API key, so it is left disabled here.
# Uncomment the next line once your API keys are set up.
# vlm_client = VLMClient(provider="openai", model="gpt-4o-mini")
vlm_setup_note = "Note: VLM client requires API keys. Set OPENAI_API_KEY in .env file."
print(vlm_setup_note)

## 3. Generate Embeddings for Visual Search

In [None]:
# Demo shortcut: embed a short text description of each product instead of
# its image. In production, you'd encode the actual product images.

print("Generating embeddings from product descriptions...")

# One "<category> <colour> <pattern> for <gender>" string per row, in row order
descriptions = [
    f"{category} {colour} {pattern} for {gender}"
    for category, colour, pattern, gender in zip(
        df['category'], df['baseColour'], df['pattern'], df['gender']
    )
]

embeddings = embedder.encode_text(descriptions)
print(f"✓ Generated {len(embeddings)} embeddings with shape {embeddings.shape}")

## 4. Build Visual Search Engine

In [None]:
# Size the search engine from the second axis of the embedding matrix
embedding_dim = embeddings.shape[1]
search_engine = VisualSearchEngine(embedding_dim=embedding_dim)

# Hand the engine the vectors, their product ids, and the metadata frame
product_ids = df['id'].tolist()
search_engine.build_index(embeddings=embeddings, product_ids=product_ids, metadata=df)

print("✓ Search engine ready!")
print(search_engine.get_statistics())

## 5. Test Visual Search

In [None]:
# Encode a free-text query with the same embedder used for the catalogue,
# then retrieve the five closest products
query = "red dress for women"
query_embedding = embedder.encode_text(query)
results = search_engine.search(query_embedding, top_k=5)

print(f"Search results for: '{query}'\n")
for rank, hit in enumerate(results, start=1):
    print(f"{rank}. {hit['productDisplayName']}")
    print(f"   Category: {hit['category']} | Color: {hit['baseColour']}")
    print(f"   Price: ${hit['price']:.2f} | Similarity: {hit['similarity']:.3f}\n")

In [None]:
# Re-run the same query embedding, restricted by metadata filters
search_filters = {'category': 'Dress', 'gender': 'Women'}
filtered_results = search_engine.search_with_filters(
    query_embedding, top_k=5, filters=search_filters
)

print("Filtered search results (Dresses for Women):\n")
for rank, hit in enumerate(filtered_results, start=1):
    print(f"{rank}. {hit['productDisplayName']} - ${hit['price']:.2f}")

## 6. Recommendation System

In [None]:
# Recommendation engine built over the full product frame
recommender = RecommendationEngine(df)

# Use the id of the first row as the source product for the cells below,
# then look that product up again by id
sample_product_id = df.iloc[0]['id']
id_matches = df.loc[df['id'] == sample_product_id]
sample_product = id_matches.iloc[0]

source_name = sample_product['productDisplayName']
source_category = sample_product['category']
source_colour = sample_product['baseColour']
print(f"Source Product: {source_name}")
print(f"Category: {source_category} | Color: {source_colour}\n")

In [None]:
# Content-based recommendations: pass the source id, the embedding matrix,
# and the product ids in the same row order as the embeddings
all_product_ids = df['id'].tolist()
content_recs = recommender.content_based_recommendations(
    sample_product_id, embeddings, all_product_ids, top_n=5
)

print("Content-Based Recommendations (Similar Products):\n")
for rank, item in enumerate(content_recs, start=1):
    print(f"{rank}. {item['productDisplayName']}")
    print(f"   Similarity: {item['similarity']:.3f} | Reason: {item['reason']}\n")

In [None]:
# Complementary recommendations need only the source product id
comp_recs = recommender.complementary_recommendations(sample_product_id, top_n=5)

print("Complementary Recommendations (Complete the Look):\n")
for rank, item in enumerate(comp_recs, start=1):
    print(f"{rank}. {item['productDisplayName']}")
    print(f"   Category: {item['category']} | Reason: {item['reason']}\n")

In [None]:
# Hybrid recommendations take the same inputs as the content-based call
all_product_ids = df['id'].tolist()
hybrid_recs = recommender.hybrid_recommendations(
    sample_product_id, embeddings, all_product_ids, top_n=5
)

print("Hybrid Recommendations (Best Overall):\n")
for rank, item in enumerate(hybrid_recs, start=1):
    print(f"{rank}. {item['productDisplayName']}")
    print(f"   Score: {item['recommendation_score']:.3f}")
    print(f"   Sources: {item['recommendation_sources']}\n")

## 7. Evaluation Metrics

In [None]:
# Create mock ground truth for evaluation.
# In production, relevance labels would come from user interactions/clicks.

NUM_EVAL_QUERIES = 10  # number of simulated queries
EVAL_TOP_K = 20        # results retrieved per query; the evaluation cell uses K up to 20
MAX_RELEVANT = 10      # cap on the number of relevant ids kept per query
EVAL_SEED = 42         # fixed seed so Restart & Run All reproduces the same queries

# Local seeded generator: the sampled categories (and therefore the metrics)
# are identical on every run and do not depend on global NumPy random state.
rng = np.random.default_rng(EVAL_SEED)

# Candidate query categories, computed once instead of on every iteration
categories = df['category'].unique()

relevance_dict = {}  # query_id -> set of product ids treated as relevant
results_dict = {}    # query_id -> ranked list of retrieved product ids

for query_idx in range(NUM_EVAL_QUERIES):
    query_id = f"query_{query_idx}"

    # Each simulated query is a category name drawn at random
    # (the same category may be drawn for more than one query)
    category = rng.choice(categories)
    query_text = f"{category}"
    query_emb = embedder.encode_text(query_text)

    # Use a cell-local name so the earlier `results` variable is not overwritten
    query_results = search_engine.search(query_emb, top_k=EVAL_TOP_K)
    results_dict[query_id] = [r['id'] for r in query_results]

    # Mock relevance: products in the queried category, truncated to the first
    # MAX_RELEVANT rows. Same-category hits beyond that cap count as
    # non-relevant, which lowers the reported precision.
    relevant_ids = df.loc[df['category'] == category, 'id'].tolist()[:MAX_RELEVANT]
    relevance_dict[query_id] = set(relevant_ids)

print(f"Created evaluation data for {len(results_dict)} queries")

In [None]:
# Evaluate the search system.
# One list drives both the metric computation and the report below, so the
# reporting loop can never ask for a K that was not evaluated.
k_values = [1, 5, 10, 20]

metrics = evaluate_search_system(
    relevance_dict,
    results_dict,
    k_values=k_values
)

print("Search System Evaluation Metrics:\n")
print(f"MAP (Mean Average Precision): {metrics['MAP']:.4f}")
print(f"MRR (Mean Reciprocal Rank): {metrics['MRR']:.4f}")
print("\nMetrics at Different K:")
for k in k_values:
    print(f"\nK={k}:")
    print(f"  Precision@{k}: {metrics[f'Precision@{k}']:.4f}")
    print(f"  Recall@{k}: {metrics[f'Recall@{k}']:.4f}")
    print(f"  NDCG@{k}: {metrics[f'NDCG@{k}']:.4f}")

In [None]:
# Plot Precision, Recall and NDCG against K on a single set of axes
k_values = [1, 5, 10, 20]

# Pull each metric's per-K scores out of the metrics dict before drawing
precision_values, recall_values, ndcg_values = (
    [metrics[f'{metric_name}@{k}'] for k in k_values]
    for metric_name in ('Precision', 'Recall', 'NDCG')
)

fig, ax = plt.subplots(figsize=(12, 6))

# (scores, marker, legend label) for each curve, drawn in this order
curves = [
    (precision_values, 'o', 'Precision@K'),
    (recall_values, 's', 'Recall@K'),
    (ndcg_values, '^', 'NDCG@K'),
]
for scores, marker_style, legend_label in curves:
    ax.plot(k_values, scores, marker=marker_style, label=legend_label, linewidth=2)

ax.set_title('Search Metrics at Different K Values', fontsize=14)
ax.set_xlabel('K', fontsize=12)
ax.set_ylabel('Score', fontsize=12)
ax.legend(fontsize=10)
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 8. Summary

This notebook demonstrated:

✅ **Data Loading & EDA**: Created sample dataset with 100 products  
✅ **Visual Search**: CLIP-based semantic search with FAISS  
✅ **Recommendations**: Content-based, complementary, and hybrid  
✅ **Evaluation**: Comprehensive metrics (MAP, MRR, Precision@K, Recall@K, NDCG@K)  

### Next Steps:

1. **Add VLM Integration**: Set up API keys (e.g. `OPENAI_API_KEY` in `.env`) for an OpenAI model (the commented-out example above uses `gpt-4o-mini`) or Gemini
2. **Real Images**: Use actual product images instead of text embeddings
3. **Scale Up**: Test with larger datasets (10K+ products)
4. **Deploy**: Use the Streamlit app or FastAPI for production
5. **Fine-tune**: Customize models for specific product categories