# Exploratory Data Analysis

This notebook provides an initial exploration of the e-commerce event data and funnel metrics.

In [None]:
import pandas as pd
import pathlib as p
import matplotlib.pyplot as plt
import seaborn as sns

# Set style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# Paths
ARTIFACTS = p.Path("../artifacts")
DATA_RAW = p.Path("../data/raw")

print("Setup complete!")

## 1. Data Profiling

Load and profile the funnel session data.

In [None]:
# Load funnel session data
funnel_session = pd.read_csv(ARTIFACTS / "funnel_session.csv")
print(f"Shape: {funnel_session.shape}")
print(f"\nColumns: {list(funnel_session.columns)}")
funnel_session.head()

In [None]:
# Summary statistics
funnel_session.describe()

## 2. Event Distribution and Conversion Rates

In [None]:
# Calculate event counts and conversion rates
total_views = funnel_session['has_view'].sum()
total_carts = funnel_session['has_cart'].sum()
total_purchases = funnel_session['has_purchase'].sum()

print(f"Total Sessions with Views: {total_views:,}")
print(f"Total Sessions with Carts: {total_carts:,}")
print(f"Total Sessions with Purchases: {total_purchases:,}")

if total_views > 0:
    view_to_cart = (total_carts / total_views) * 100
    print(f"\nView-to-Cart Rate: {view_to_cart:.2f}%")

if total_carts > 0:
    cart_to_purchase = (total_purchases / total_carts) * 100
    print(f"Cart-to-Purchase Rate: {cart_to_purchase:.2f}%")

In [None]:
# Visualize funnel
fig, ax = plt.subplots(figsize=(8, 6))
events = ['Views', 'Add to Cart', 'Purchase']
counts = [total_views, total_carts, total_purchases]
ax.bar(events, counts, color=['#3498db', '#e74c3c', '#2ecc71'])
ax.set_ylabel('Number of Sessions')
ax.set_title('Funnel Conversion')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 3. SKU Drop-off Analysis

In [None]:
# Load SKU drop-off data
try:
    sku_dropoff = pd.read_csv(ARTIFACTS / "sku_dropoff.csv")
    print(f"SKUs analyzed: {len(sku_dropoff)}")
    print(f"\nTop 10 lowest converting SKUs:")
    sku_dropoff.head(10)
except FileNotFoundError:
    print("SKU drop-off analysis not yet run. Execute: python src/run_analytics.py")

## 4. Cohort Retention Analysis

In [None]:
# Load cohort retention data
try:
    cohort_retention = pd.read_csv(ARTIFACTS / "cohort_retention.csv")
    print(f"Cohort retention data loaded: {len(cohort_retention)} rows")
    cohort_retention.head()
except FileNotFoundError:
    print("Cohort retention analysis not yet run. Execute: python src/run_analytics.py")