In [1]:
from pymongo import MongoClient
import pandas as pd

In [None]:
from prisma import Prisma

# Instantiate the Prisma client and open its connection.
# NOTE(review): `db` is not referenced by any other cell visible in this notebook — confirm it is still needed.
# The bare top-level `await` presumably relies on the notebook kernel allowing await at cell level — TODO confirm.
db = Prisma()
await db.connect()

In [None]:
def get_database(connection_string="mongodb://localhost:27017", db_name="eezeeDb"):
    """Return a handle to a MongoDB database.

    Args:
        connection_string: MongoDB URI to connect to. Defaults to the local
            instance on port 27017, which is what this notebook used before
            the value became configurable.
        db_name: Name of the database to return. Defaults to "eezeeDb".

    Returns:
        The object obtained by indexing a new ``MongoClient`` with ``db_name``.
    """
    # A new MongoClient is constructed on every call, so call this once and
    # reuse the returned handle rather than calling it per query.
    client = MongoClient(connection_string)
    return client[db_name]

In [3]:
# Open the database handle, then select the collection queried by the analysis cells
dbname = get_database()
product_collection = dbname.get_collection("products")

## Key Product Attributes:

- Title
- Category
- Brand
- Model
- Description
- Specifications

In [None]:
# Dataset scale: overall size, the active/visible subset, and the field names of one document
print("=== DATASET SCALE ===")

# Collection-wide count as reported by estimated_document_count()
total_products_all = product_collection.estimated_document_count()
print("Total products in database: {:,}".format(total_products_all))

# Documents flagged active that are not hidden from search engines
active_visible_filter = {
    "active": {"$eq": True},
    "hiddenFromSearchEngine": {"$eq": False},
}
total_active_visible = product_collection.count_documents(active_visible_filter)
print("Active and visible products: {:,}".format(total_active_visible))

# Fetch a single document so its keys can be listed as a first look at the schema
sample_doc = product_collection.find_one()
if sample_doc:
    sample_field_summary = list(sample_doc.keys())
else:
    sample_field_summary = 'No documents found'
print(f"\nSample document fields: {sample_field_summary}")

In [None]:
# Inspect the sampled document: id type, likely category/date fields, and the mix of value types
if sample_doc:
    print("=== DOCUMENT STRUCTURE ANALYSIS ===")
    print(f"Document ID type: {type(sample_doc.get('_id'))}")

    def _print_samples(names):
        """Print the value and Python type of each named field of sample_doc."""
        for name in names:
            sample_value = sample_doc.get(name)
            print(f"Sample {name}: {sample_value} (type: {type(sample_value)})")

    # Field names containing a category-like substring
    category_markers = ('categor', 'type', 'class')
    category_fields = [
        name for name in sample_doc
        if any(marker in name.lower() for marker in category_markers)
    ]
    print(f"Potential category fields: {category_fields}")
    _print_samples(category_fields[:3])  # only the first three

    # Field names containing a date/time-like substring
    date_markers = ['date', 'time', 'created', 'updated', 'modified']
    date_fields = [
        name for name in sample_doc
        if any(marker in name.lower() for marker in date_markers)
    ]
    print(f"\nPotential date/time fields: {date_fields}")
    _print_samples(date_fields[:3])  # only the first three

    # Map every field to the name of its Python type, then tally the types
    print("\n=== DATA FORMAT ANALYSIS ===")
    field_types = {name: type(val).__name__ for name, val in sample_doc.items()}

    type_counts = {}
    for type_name in field_types.values():
        type_counts.setdefault(type_name, 0)
        type_counts[type_name] += 1

    print("Field type distribution:")
    for type_name, n_fields in sorted(type_counts.items()):
        print(f"  {type_name}: {n_fields} fields")

In [None]:
# Domain Coverage Analysis - product categories and brands
def _top_values_pipeline(field_path, limit=10):
    """Build an aggregation pipeline that counts documents per value of a field.

    Args:
        field_path: Dotted path of the field to group on (without the leading "$").
        limit: Maximum number of groups to keep after sorting.

    Returns:
        A list of pipeline stages: group by the field, sort by descending
        count, keep the first ``limit`` groups.
    """
    return [
        {"$group": {"_id": f"${field_path}", "count": {"$sum": 1}}},
        {"$sort": {"count": -1}},
        {"$limit": limit},
    ]


print("=== DOMAIN COVERAGE ===")

# Main categories analysis
main_categories = product_collection.distinct("categories.main")
print(f"Number of main categories: {len(main_categories)}")
print("Main categories:")
# key=str leaves the ordering of plain strings unchanged but avoids a TypeError
# if the distinct values include None or other non-string entries.
for i, category in enumerate(sorted(main_categories, key=str)[:10], 1):  # Show first 10
    print(f"  {i}. {category}")
if len(main_categories) > 10:
    print(f"  ... and {len(main_categories) - 10} more")

# Sub-categories analysis
sub_categories = product_collection.distinct("categories.sub")
print(f"\nNumber of sub-categories: {len(sub_categories)}")

# Category distribution (top 10 main categories by product count)
print("\n=== TOP PRODUCT CATEGORIES ===")
pipeline = _top_values_pipeline("categories.main")
top_categories = list(product_collection.aggregate(pipeline))
for i, cat in enumerate(top_categories, 1):
    print(f"{i}. {cat['_id']}: {cat['count']:,} products")

# Brand analysis
print("\n=== BRAND COVERAGE ===")
total_brands = len(product_collection.distinct("brand.name"))
print(f"Total unique brands: {total_brands:,}")

# Top brands by product count (same pipeline shape as the category ranking)
brand_pipeline = _top_values_pipeline("brand.name")
top_brands = list(product_collection.aggregate(brand_pipeline))
print("Top 10 brands by product count:")
for i, brand in enumerate(top_brands, 1):
    print(f"  {i}. {brand['_id']}: {brand['count']:,} products")

In [None]:
# Time Span and Data Freshness Analysis
from datetime import datetime, timedelta, timezone


def _format_day(value):
    """Format a datetime as YYYY-MM-DD, or return 'n/a' when the value is None."""
    return value.strftime('%Y-%m-%d') if value is not None else 'n/a'


print("=== TIME SPAN & DATA FRESHNESS ===")

# Date range analysis: project the two date fields plus their years, then
# reduce the whole collection to a single document of min/max values.
date_pipeline = [
    {
        "$project": {
            "dateCreated": 1,
            "dateUpdated": 1,
            "year_created": {"$year": "$dateCreated"},
            "year_updated": {"$year": "$dateUpdated"}
        }
    },
    {
        "$group": {
            "_id": None,
            "earliest_created": {"$min": "$dateCreated"},
            "latest_created": {"$max": "$dateCreated"},
            "earliest_updated": {"$min": "$dateUpdated"},
            "latest_updated": {"$max": "$dateUpdated"},
            "min_year_created": {"$min": "$year_created"},
            "max_year_created": {"$max": "$year_created"},
            "min_year_updated": {"$min": "$year_updated"},
            "max_year_updated": {"$max": "$year_updated"}
        }
    }
]

date_stats = list(product_collection.aggregate(date_pipeline))
if date_stats:
    stats = date_stats[0]
    print("Data collection period:")
    # The aggregated values are None when no document carries the date field,
    # so format defensively instead of calling strftime on None.
    print(
        f"  Created: {_format_day(stats['earliest_created'])} to {_format_day(stats['latest_created'])}")
    print(
        f"  Updated: {_format_day(stats['earliest_updated'])} to {_format_day(stats['latest_updated'])}")
    if stats['min_year_created'] is not None and stats['max_year_created'] is not None:
        print(
            f"  Span: {stats['max_year_created'] - stats['min_year_created'] + 1} years of data creation")

# Products by creation year
year_pipeline = [
    {
        "$project": {
            "year_created": {"$year": "$dateCreated"}
        }
    },
    {
        "$group": {
            "_id": "$year_created",
            "count": {"$sum": 1}
        }
    },
    {
        "$sort": {"_id": 1}
    }
]

products_by_year = list(product_collection.aggregate(year_pipeline))
print("\nProducts created by year:")
for entry in products_by_year:
    if entry['_id']:  # Skip null years
        print(f"  {entry['_id']}: {entry['count']:,} products")

# Recent activity (products updated in last year).
# Use an aware UTC timestamp: PyMongo treats naive datetimes as UTC, so a naive
# local datetime.now() would shift the cutoff by the machine's UTC offset.
one_year_ago = datetime.now(timezone.utc) - timedelta(days=365)

recent_updates = product_collection.count_documents({
    "dateUpdated": {"$gte": one_year_ago}
})
print(f"\nProducts updated in last 365 days: {recent_updates:,}")

# Activity status
active_products = product_collection.count_documents({"active": True})
inactive_products = product_collection.count_documents({"active": False})
print("\nProduct status:")
print(f"  Active: {active_products:,}")
print(f"  Inactive: {inactive_products:,}")

In [None]:
# Data Format and Structure Analysis
print("=== DATA FORMAT ANALYSIS ===")

# Sample multiple documents to get better field coverage
sample_docs = list(product_collection.find().limit(100))
all_fields = set()
field_types = {}     # field name -> {python type name -> occurrences in the sample}
field_presence = {}  # field name -> number of sampled documents containing it

for doc in sample_docs:
    for key, value in doc.items():
        all_fields.add(key)
        field_type = type(value).__name__

        # Track field types
        per_field = field_types.setdefault(key, {})
        per_field[field_type] = per_field.get(field_type, 0) + 1

        # Track field presence
        field_presence[key] = field_presence.get(key, 0) + 1

print(
    f"Total unique fields across {len(sample_docs)} sample documents: {len(all_fields)}")

# Analyze complex data structures. When a field is complex in several of the
# inspected documents, the description from the last such document wins.
complex_fields = {}
for doc in sample_docs[:10]:  # Analyze first 10 docs in detail
    for key, value in doc.items():
        if isinstance(value, dict):
            complex_fields[key] = f"Object with {len(value)} keys"
        elif isinstance(value, list) and value:
            if isinstance(value[0], dict):
                complex_fields[key] = f"Array of objects ({len(value)} items)"
            else:
                complex_fields[key] = f"Array of {type(value[0]).__name__} ({len(value)} items)"

print("\nComplex data structures (JSON objects/arrays):")
for field, description in complex_fields.items():
    print(f"  {field}: {description}")

# Multi-language support analysis.
# A name ending in 'Translations' always contains 'translation' once lowercased,
# so the single substring test covers both cases. Sorting makes the printed
# order reproducible (set iteration order is not).
multilang_fields = sorted(
    field for field in all_fields if 'translation' in field.lower())
print(f"\nMulti-language fields: {len(multilang_fields)}")
for field in multilang_fields:
    print(f"  {field}")

# Currency and pricing structure.
# Guard against an empty sample and against a 'price' value that is not a dict.
sample_price = sample_docs[0].get('price', {}) if sample_docs else {}
if isinstance(sample_price, dict) and sample_price:
    # Keys other than these bookkeeping entries are reported as currencies
    # NOTE(review): presumably every remaining key is a currency code — confirm against the schema
    currencies = [key for key in sample_price.keys() if key not in [
        'original', 'po', 'baseCurrency', '_id']]
    print(f"\nSupported currencies: {len(currencies)}")
    print(f"  {', '.join(currencies)}")

# Field presence analysis (fields that appear in most documents)
print("\nCore fields (present in >90% of sample documents):")
core_fields = [field for field, count in field_presence.items()
               if count > len(sample_docs) * 0.9]
for field in sorted(core_fields):
    percentage = (field_presence[field] / len(sample_docs)) * 100
    print(f"  {field}: {percentage:.1f}%")

# NOTE(review): the summary below is fixed text; it is not derived from the values computed above.
print("\nData format summary:")
print("  - Semi-structured JSON documents")
print("  - Nested objects for complex data (categories, pricing, variants)")
print("  - Multi-language support with translation fields")
print("  - Rich metadata (images, specifications, shipping info)")
print("  - Multi-currency pricing structure")

In [None]:
# Find all MongoDB products that have specifications
# Based on the sample document, specifications are stored in the 'specs' field

print("=== PRODUCTS WITH SPECIFICATIONS ===")

# One shared filter for every query in this cell. Requiring element 0 to exist
# selects documents whose 'specs' has at least one entry; unlike
# {"$exists": True, "$ne": []} it does not match null or scalar values, which
# would break len() and the $size stage further down.
has_specs_filter = {"specs.0": {"$exists": True}}

# Count products with specifications
products_with_specs = product_collection.count_documents(has_specs_filter)

# NOTE: the total is an estimate while the count above is exact, so the
# percentage is approximate.
total_products = product_collection.estimated_document_count()
# Avoid ZeroDivisionError on an empty collection
percentage_with_specs = (
    (products_with_specs / total_products) * 100 if total_products else 0.0)

print(f"Products with specifications: {products_with_specs:,}")
print(f"Total products: {total_products:,}")
print(f"Percentage with specs: {percentage_with_specs:.1f}%")

# Get sample products with specifications
print("\n=== SAMPLE PRODUCTS WITH SPECIFICATIONS ===")
sample_products_with_specs = list(product_collection.find(
    has_specs_filter,
    {"_id": 1, "title": 1, "brand.name": 1, "categories.main": 1, "specs": 1}
).limit(5))

for i, product in enumerate(sample_products_with_specs, 1):
    specs = product.get('specs', [])
    print(f"\n{i}. {product.get('title', 'No title')}")
    print(f"   Brand: {product.get('brand', {}).get('name', 'No brand')}")
    print(f"   Category: {product.get('categories', {}).get('main', 'No category')}")
    print(f"   Specifications ({len(specs)}):")

    for spec in specs[:3]:  # Show first 3 specs
        print(f"     - {spec.get('name', 'No name')}: {spec.get('value', 'No value')}")

    if len(specs) > 3:
        print(f"     ... and {len(specs) - 3} more specifications")

# Analyze specification patterns
print("\n=== SPECIFICATION ANALYSIS ===")

# Rank specification names by how often they occur across all products
spec_names_pipeline = [
    {"$match": has_specs_filter},
    {"$unwind": "$specs"},
    {"$group": {"_id": "$specs.name", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},
    {"$limit": 10}
]

common_spec_names = list(product_collection.aggregate(spec_names_pipeline))
print("Most common specification names:")
for i, spec in enumerate(common_spec_names, 1):
    # NOTE(review): after $unwind this counts spec entries, which equals the
    # number of products only if a name appears at most once per product.
    print(f"  {i}. {spec['_id']}: {spec['count']:,} products")

# Categories with most specifications
categories_with_specs_pipeline = [
    {"$match": has_specs_filter},
    {"$project": {
        "categories.main": 1,
        "spec_count": {"$size": "$specs"}
    }},
    {"$group": {
        "_id": "$categories.main",
        "product_count": {"$sum": 1},
        "avg_specs_per_product": {"$avg": "$spec_count"}
    }},
    {"$sort": {"product_count": -1}},
    {"$limit": 10}
]

categories_with_specs = list(product_collection.aggregate(categories_with_specs_pipeline))
print("\nCategories with most specification-enabled products:")
for i, cat in enumerate(categories_with_specs, 1):
    print(f"  {i}. {cat['_id']}: {cat['product_count']:,} products (avg {cat['avg_specs_per_product']:.1f} specs/product)")