# ML Enhancement: In-Store Journey Analysis

Analyzes customer shopping paths using BLE beacon zone transitions to understand in-store behavior patterns.

## Data Flow
```
Silver (fact_zone_changes, fact_receipts) --> Gold (gold_journey_patterns, gold_zone_transitions, gold_zone_dwell_stats)
```

## Business Value
- Optimize store layout based on traffic patterns
- Understand customer engagement by zone
- Identify high-value path sequences
- Correlate paths with purchase outcomes

## Outputs
- **gold_journey_patterns**: Top N most common paths per store (N = `TOP_N_PATHS`, default 20) with conversion/basket metrics
- **gold_zone_transitions**: Zone transition probability matrix
- **gold_zone_dwell_stats**: Average dwell time statistics by zone

## Usage
Run this notebook **on-demand** or schedule periodically (e.g., daily) to update journey insights.

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import StringType, ArrayType
from pyspark.sql.utils import AnalysisException
from datetime import datetime, timezone, timedelta
import os

In [None]:
# =============================================================================
# PARAMETERS
# =============================================================================

def get_env(var_name, default=None):
    """Return the value of environment variable ``var_name``.

    Falls back to ``default`` (``None`` unless given) when the variable
    is not set.
    """
    return os.getenv(var_name, default)

# Source (Silver) and destination (Gold) database names; each can be
# overridden through an environment variable of the same name.
SILVER_DB = get_env("SILVER_DB", "ag")
GOLD_DB = get_env("GOLD_DB", "au")

# Analysis settings, read from the environment as strings and parsed as ints:
#   ANALYSIS_DAYS   - look-back window in days (default 30)
#   MIN_PATH_LENGTH - minimum number of transitions for a path to be kept (default 2)
#   TOP_N_PATHS     - number of ranked paths kept per store (default 20)
ANALYSIS_DAYS = int(get_env("ANALYSIS_DAYS", "30"))
MIN_PATH_LENGTH = int(get_env("MIN_PATH_LENGTH", "2"))
TOP_N_PATHS = int(get_env("TOP_N_PATHS", "20"))

# Echo the effective configuration, one setting per line.
print(
    "Configuration:",
    f"  SILVER_DB={SILVER_DB}",
    f"  GOLD_DB={GOLD_DB}",
    f"  ANALYSIS_DAYS={ANALYSIS_DAYS}",
    f"  MIN_PATH_LENGTH={MIN_PATH_LENGTH}",
    f"  TOP_N_PATHS={TOP_N_PATHS}",
    sep="\n",
)

In [None]:
# =============================================================================
# HELPER FUNCTIONS
# =============================================================================

def ensure_database(name):
    """Create database ``name`` through ``spark.sql`` unless it already exists."""
    create_stmt = f"CREATE DATABASE IF NOT EXISTS {name}"
    spark.sql(create_stmt)

def read_silver(table_name):
    """Look up ``table_name`` in the Silver database and return the result of ``spark.table``."""
    qualified_name = f"{SILVER_DB}.{table_name}"
    return spark.table(qualified_name)

def save_gold(df, table_name):
    """Overwrite a Gold Delta table with ``df`` and print how many rows it holds.

    Args:
        df: DataFrame to persist.
        table_name: Unqualified table name; it is created in ``GOLD_DB``.
    """
    full_name = f"{GOLD_DB}.{table_name}"
    df.write.format("delta").mode("overwrite").saveAsTable(full_name)
    # Count the table that was just written rather than calling df.count():
    # counting `df` would re-execute its entire lineage (windows, joins,
    # aggregations) a second time only to print a number.
    written_rows = spark.table(full_name).count()
    print(f"  {full_name}: {written_rows} rows")

def silver_exists(table_name):
    """Report whether ``table_name`` can be resolved in the Silver database.

    Returns:
        ``True`` when ``spark.table`` resolves the table, ``False`` when it
        raises ``AnalysisException``.
    """
    qualified_name = f"{SILVER_DB}.{table_name}"
    try:
        spark.table(qualified_name)
    except AnalysisException:
        return False
    else:
        return True

# Make sure the Gold database exists before any table is saved into it.
ensure_database(GOLD_DB)

In [None]:
# Banner marking the start of the analysis in the notebook output.
print("=" * 60, "IN-STORE JOURNEY ANALYSIS", "=" * 60, sep="\n")

## Step 1: Reconstruct Customer Paths

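Build sequential paths from `customer_zone_changed` events (read from `fact_zone_changes`), grouping each customer's events within a store into sessions; a new session starts whenever more than 30 minutes pass between consecutive events.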

In [None]:
# Fail fast: every later step depends on the zone-change fact table.
if not silver_exists("fact_zone_changes"):
    raise ValueError("fact_zone_changes table not found in Silver layer")

# Lower bound of the analysis window: ANALYSIS_DAYS before the current UTC time.
analysis_start = datetime.now(timezone.utc) - timedelta(days=ANALYSIS_DAYS)

# Keep only events inside the window, and only the columns selected here.
df_zone_changes = (
    read_silver("fact_zone_changes")
    .where(F.col("event_ts") >= F.lit(analysis_start))
    .select("store_id", "customer_ble_id", "from_zone", "to_zone", "event_ts", "trace_id")
)

print(f"Loaded {df_zone_changes.count()} zone change events from last {ANALYSIS_DAYS} days")

In [None]:
# Sessionization: events are ordered per (store, customer); a new session
# begins on the customer's first event or after a gap of more than 30 minutes.

window_by_customer = Window.partitionBy("store_id", "customer_ble_id").orderBy("event_ts")

# Frame from the partition's first row up to the current row, used for the
# running count of session starts.
running_frame = window_by_customer.rowsBetween(Window.unboundedPreceding, Window.currentRow)

# Minutes since the previous event in the same partition (null on the first event).
gap_minutes = (F.unix_timestamp("event_ts") - F.unix_timestamp("prev_event_ts")) / 60

# 1 when this event opens a session, otherwise 0.
starts_session = F.when(
    (F.col("time_gap_minutes") > 30) | F.col("prev_event_ts").isNull(),
    1
).otherwise(0)

# "<store_id>_<customer_ble_id>_<session_id>"
session_key_expr = F.concat(
    F.col("store_id").cast("string"),
    F.lit("_"),
    F.col("customer_ble_id"),
    F.lit("_"),
    F.col("session_id").cast("string")
)

df_with_sessions = (
    df_zone_changes
    .withColumn("prev_event_ts", F.lag("event_ts").over(window_by_customer))
    .withColumn("time_gap_minutes", gap_minutes)
    .withColumn("is_new_session", starts_session)
    # The running sum of session starts gives a per-customer session number.
    .withColumn("session_id", F.sum("is_new_session").over(running_frame))
    .withColumn("session_key", session_key_expr)
)

print(f"Identified {df_with_sessions.select('session_key').distinct().count()} unique customer sessions")

In [None]:
# Build paths: one row per session holding its zone transitions in time order.
#
# collect_list() on its own does NOT guarantee element order after the
# groupBy shuffle, so the collected structs are sorted explicitly.
# sort_array() orders structs field by field, i.e. by event_ts first
# (to_zone only breaks ties between identical timestamps).
df_paths = (
    df_with_sessions
    .groupBy("session_key", "store_id", "customer_ble_id")
    .agg(
        F.sort_array(
            F.collect_list(
                F.struct(
                    F.col("event_ts"),
                    F.col("to_zone")
                )
            )
        ).alias("transitions"),
        F.min("event_ts").alias("session_start"),
        F.max("event_ts").alias("session_end")
    )
    .withColumn(
        "session_duration_minutes",
        (F.unix_timestamp("session_end") - F.unix_timestamp("session_start")) / 60
    )
    # Path length is the number of transitions recorded in the session.
    .withColumn(
        "path_length",
        F.size("transitions")
    )
    .filter(F.col("path_length") >= MIN_PATH_LENGTH)
)

# Extract the zone sequence as an array and as a printable " -> " string.
# Only the destination zone (to_zone) of each transition is listed.
df_paths = df_paths.withColumn(
    "zone_path",
    F.expr("transform(transitions, x -> x.to_zone)")
).withColumn(
    "path_string",
    F.array_join("zone_path", " -> ")
)

print(f"Built {df_paths.count()} customer paths (min length: {MIN_PATH_LENGTH} zones)")

## Step 2: Calculate Zone Dwell Times

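Compute dwell-time statistics (average, median, minimum, maximum, and visit count) for each zone in each store, where dwell time is the time between consecutive zone transitions within a customer session.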

In [None]:
# Dwell time of a visit = seconds between a zone-change event and the next
# event of the same session; it is attributed to the event's `to_zone`.
session_timeline = Window.partitionBy("session_key").orderBy("event_ts")
seconds_until_next = F.unix_timestamp("next_event_ts") - F.unix_timestamp("event_ts")

df_dwell_calc = (
    df_with_sessions
    .withColumn("next_event_ts", F.lead("event_ts").over(session_timeline))
    .withColumn(
        "dwell_seconds",
        # Null for the last event of a session (no following event).
        F.when(F.col("next_event_ts").isNotNull(), seconds_until_next)
    )
    # Keep only defined, positive dwell times shorter than one hour (3600 s).
    .where(
        F.col("dwell_seconds").isNotNull()
        & (F.col("dwell_seconds") > 0)
        & (F.col("dwell_seconds") < 3600)
    )
)

# Per-store, per-zone summary statistics over the retained visits.
df_zone_dwell_stats = (
    df_dwell_calc
    .groupBy("store_id", F.col("to_zone").alias("zone"))
    .agg(
        F.avg("dwell_seconds").alias("avg_dwell_seconds"),
        F.expr("percentile_approx(dwell_seconds, 0.5)").alias("median_dwell_seconds"),
        F.min("dwell_seconds").alias("min_dwell_seconds"),
        F.max("dwell_seconds").alias("max_dwell_seconds"),
        F.count("*").alias("visit_count")
    )
    .withColumn("computed_at", F.lit(datetime.now(timezone.utc)))
)

print("Creating gold_zone_dwell_stats...")
save_gold(df_zone_dwell_stats, "gold_zone_dwell_stats")

## Step 3: Calculate Zone Transition Probabilities

Build a transition matrix showing the probability of moving from one zone to another.

In [None]:
# Columns identifying the origin of a transition.
origin_cols = ["store_id", "from_zone"]

# Number of observed moves for every (store, from_zone, to_zone) triple.
df_transitions = (
    df_zone_changes
    .groupBy(*origin_cols, "to_zone")
    .agg(F.count("*").alias("transition_count"))
)

# Number of moves leaving each (store, from_zone).
df_from_totals = (
    df_transitions
    .groupBy(*origin_cols)
    .agg(F.sum("transition_count").alias("total_from_zone"))
)

# Probability = moves to a given zone / all moves leaving the origin zone.
# NOTE(review): the inner equi-join drops rows whose from_zone (or store_id)
# is null, since null keys never match — confirm that is intended.
df_zone_transitions = (
    df_transitions
    .join(df_from_totals, on=origin_cols, how="inner")
    .select(
        "store_id",
        "from_zone",
        "to_zone",
        "transition_count",
        (F.col("transition_count") / F.col("total_from_zone")).alias("transition_probability"),
        F.lit(datetime.now(timezone.utc)).alias("computed_at")
    )
    .orderBy("store_id", "from_zone", F.desc("transition_probability"))
)

print("Creating gold_zone_transitions...")
save_gold(df_zone_transitions, "gold_zone_transitions")

## Step 4: Identify Common Path Patterns

Find the top N most frequent customer paths in each store.

In [None]:
# Count how often each distinct path occurs per store, together with the
# average duration of the sessions that followed it.
df_path_frequency = (
    df_paths
    .groupBy("store_id", "path_string", "path_length")
    .agg(
        F.count("*").alias("occurrence_count"),
        F.avg("session_duration_minutes").alias("avg_session_duration_minutes")
    )
)

# Rank paths by frequency per store. row_number() breaks ties arbitrarily
# when the ordering is not total, which would let the top-N set change from
# run to run; path_string and path_length are added as tie-breakers so the
# ranking is deterministic.
window_by_store = Window.partitionBy("store_id").orderBy(
    F.desc("occurrence_count"),
    F.asc("path_string"),
    F.asc("path_length")
)

df_top_paths = (
    df_path_frequency
    .withColumn("rank", F.row_number().over(window_by_store))
    .filter(F.col("rank") <= TOP_N_PATHS)
)

print(f"Identified top {TOP_N_PATHS} paths per store")

## Step 5: Correlate Paths with Purchase Outcomes

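Join customer paths with receipt data to understand conversion and basket size by journey. Because receipts carry `customer_id` while zone changes carry `customer_ble_id`, sessions are matched to receipts by store and time proximity: a receipt is attributed to a session when it falls between the session start and 5 minutes after the session end.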

In [None]:
# Correlate the top paths with purchases when receipt data is available;
# otherwise publish path frequencies with null purchase metrics.
if silver_exists("fact_receipts"):
    # Receipts inside the analysis window.
    df_receipts = (
        read_silver("fact_receipts")
        .filter(F.col("event_ts") >= F.lit(analysis_start))
        .select(
            "store_id",
            "customer_id",
            "event_ts",
            "total_cents",
            "receipt_id_ext"
        )
    )

    # Receipts use customer_id while zone changes use customer_ble_id, so the
    # two are correlated by store and time proximity: a receipt is attached to
    # a session when it falls between the session start and 5 minutes after
    # the session end.
    # NOTE(review): this heuristic can attach one receipt to several
    # overlapping sessions and several receipts to one session, so basket and
    # revenue figures are approximate.
    df_path_receipts = (
        df_paths
        .join(
            df_receipts,
            on=[
                (df_paths.store_id == df_receipts.store_id) &
                (df_receipts.event_ts >= df_paths.session_start) &
                (df_receipts.event_ts <= df_paths.session_end + F.expr("INTERVAL 5 MINUTES"))
            ],
            how="left"
        )
        .select(
            df_paths.store_id,
            df_paths.session_key,
            df_paths.path_string,
            df_paths.path_length,
            df_paths.session_duration_minutes,
            df_receipts.receipt_id_ext,
            df_receipts.total_cents
        )
    )

    # Aggregate purchase metrics per path.
    # The join above yields one row per (session, receipt) pair, so sessions
    # are counted with countDistinct: counting rows would count a session once
    # per matching receipt and could push conversion_rate above 1.
    # avg_session_duration_minutes is intentionally NOT recomputed here:
    # df_top_paths already carries it (computed once per session), and a
    # second column of the same name would make the select below ambiguous.
    df_path_metrics = (
        df_path_receipts
        .groupBy("store_id", "path_string", "path_length")
        .agg(
            F.countDistinct("session_key").alias("total_sessions"),
            F.countDistinct(
                F.when(F.col("receipt_id_ext").isNotNull(), F.col("session_key"))
            ).alias("sessions_with_purchase"),
            F.avg(
                F.when(F.col("total_cents").isNotNull(), F.col("total_cents")).otherwise(None)
            ).alias("avg_basket_cents"),
            F.sum(
                F.when(F.col("total_cents").isNotNull(), F.col("total_cents")).otherwise(0)
            ).alias("total_revenue_cents")
        )
        .withColumn(
            "conversion_rate",
            F.col("sessions_with_purchase") / F.col("total_sessions")
        )
    )

    # Attach the purchase metrics to the ranked top paths.
    df_journey_patterns = (
        df_top_paths
        .join(
            df_path_metrics,
            on=["store_id", "path_string", "path_length"],
            how="left"
        )
        .withColumn("computed_at", F.lit(datetime.now(timezone.utc)))
        .withColumn("analysis_period_days", F.lit(ANALYSIS_DAYS))
        .select(
            "store_id",
            "rank",
            "path_string",
            "path_length",
            "occurrence_count",
            "total_sessions",
            "sessions_with_purchase",
            "conversion_rate",
            "avg_session_duration_minutes",
            "avg_basket_cents",
            "total_revenue_cents",
            "analysis_period_days",
            "computed_at"
        )
        .orderBy("store_id", "rank")
    )

else:
    # No receipts available: output path frequency with the same columns,
    # leaving the purchase metrics null.
    print("Warning: fact_receipts not found, skipping conversion analysis")
    df_journey_patterns = (
        df_top_paths
        .withColumn("total_sessions", F.col("occurrence_count"))
        .withColumn("sessions_with_purchase", F.lit(None).cast("long"))
        .withColumn("conversion_rate", F.lit(None).cast("double"))
        .withColumn("avg_basket_cents", F.lit(None).cast("double"))
        .withColumn("total_revenue_cents", F.lit(None).cast("long"))
        .withColumn("computed_at", F.lit(datetime.now(timezone.utc)))
        .withColumn("analysis_period_days", F.lit(ANALYSIS_DAYS))
        .select(
            "store_id",
            "rank",
            "path_string",
            "path_length",
            "occurrence_count",
            "total_sessions",
            "sessions_with_purchase",
            "conversion_rate",
            "avg_session_duration_minutes",
            "avg_basket_cents",
            "total_revenue_cents",
            "analysis_period_days",
            "computed_at"
        )
        .orderBy("store_id", "rank")
    )

print("Creating gold_journey_patterns...")
save_gold(df_journey_patterns, "gold_journey_patterns")

## Summary

Display key insights from the journey analysis.

In [None]:
# Closing banner.
print("\n" + "=" * 60, "JOURNEY ANALYSIS COMPLETE", "=" * 60, sep="\n")

# List the Gold tables written by this notebook.
print(
    "\nOutput Tables:",
    f"  {GOLD_DB}.gold_journey_patterns - Top {TOP_N_PATHS} paths per store with conversion metrics",
    f"  {GOLD_DB}.gold_zone_transitions - Zone transition probability matrix",
    f"  {GOLD_DB}.gold_zone_dwell_stats - Zone dwell time statistics",
    sep="\n",
)

# Show the first five rows of the journey-pattern result without truncating columns.
print("\nSample Results (Top 5 Paths):")
df_journey_patterns.limit(5).show(truncate=False)

print(f"\nAnalysis Period: Last {ANALYSIS_DAYS} days")
print(f"Computed At: {datetime.now(timezone.utc)}")