In [32]:
import glob

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import yfinance as yf
from plotly.subplots import make_subplots


In [22]:
# Raw Consumer Edge extracts (plain or compressed CSVs).
# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
# the file order stable from one run to the next.
files = sorted(glob.glob("data/raw/consumer_edge/*.csv*"))

In [23]:
# Gather every distinct, non-null BRAND_NAME found in the raw files.
# Only that one column is read, 200k rows at a time, so memory stays bounded
# regardless of file size.
brands = set()
for f in files:
    reader = pd.read_csv(f, usecols=["BRAND_NAME"], chunksize=200_000, compression="infer")
    for chunk in reader:
        names = chunk["BRAND_NAME"].dropna()
        brands.update(names.unique())

In [24]:
# Turn the collected names into an alphabetical list, then display how many
# there are together with the first 50 as a sanity check.
brands = sorted(brands)
(len(brands), brands[:50])

(13519,
 ['& OTHER STORIES',
  '&PIZZA',
  '1 800 ANYLENS',
  '1 HOTELS',
  '1 STOP (EX FUEL)',
  '1 STOP (FUEL)',
  '1 STOP BEDROOMS',
  '1 STOP LIGHTING',
  '1&1 MAIL & MEDIA',
  '1-800 CONTACTS',
  '1-800-FLOWERS',
  '1-800-GOT-JUNK?',
  '1-800-PETMEDS',
  '1-800-PETSUPPLIES.COM',
  '1-800-PLUMBER+AIR',
  '1-800-RADIATOR',
  '100 PERCENT',
  '100 PERCENT PURE',
  '100 THIEVES',
  '1000BULBS.COM',
  '11 HONORE',
  '110 GRILL',
  '111 SKIN',
  '123 REG',
  '123HELPME.COM',
  '123INKJETS',
  '12TH TRIBE CLOTHING',
  '1771 LIVING',
  '17HATS',
  '1800BASKETS.COM',
  '1800LIGHTING.COM',
  '1800MATTRESS',
  '1822 DENIM',
  '1A AUTO',
  '1PASSWORD',
  '1ST JACKPOT CASINO',
  '1ST PHORM',
  '1STDIBS',
  '21 CLUB',
  '21ST CENTURY ENERGY GROUP',
  '21ST CENTURY INSURANCE',
  '23ANDME',
  '24 HOUR FITNESS',
  '27 DRESS',
  '2MODERN',
  '2ND AVE',
  '2ND SWING',
  '2ULAUNDRY',
  '2XIST',
  '3 DAY BLINDS'])

### Lead/lag correlation
- If series X "leads" series Y, changes in X tend to happen BEFORE changes in Y.
- If series X "lags" series Y, changes in X tend to happen AFTER changes in Y.
- Positive lag (e.g. +10): compare X (today) with Y (10 days in the FUTURE) --> if the correlation is strong, X might lead Y.
- Negative lag (e.g. -10): compare X (today) with Y (10 days in the PAST) --> if the correlation is strong, X might lag Y.

In [None]:
# Consumer Edge daily spend; `date` is parsed to datetime on load.
spend = pd.read_csv("data/clean_spend_daily.csv", parse_dates=["date"])

# Stock prices; the file's `Date` column is renamed to `date` so both frames
# share the same key name.
stocks = (
    pd.read_csv("data/market_data_stocks.csv", parse_dates=["Date"])
    .rename(columns={"Date": "date"})
)

In [38]:
# Preview the first rows of the daily spend table to check columns and dtypes.
spend.head()

Unnamed: 0,brand,date,spend,transactions,avg_ticket_size
0,BURGER KING,2018-01-01,929955.15,81548.31,11.403733
1,BURGER KING,2018-01-02,1203701.94,125098.16,9.62206
2,BURGER KING,2018-01-03,1287404.29,137120.19,9.388875
3,BURGER KING,2018-01-04,1237328.54,130696.37,9.467199
4,BURGER KING,2018-01-05,1492856.05,154003.06,9.693678


In [35]:
# Preview the first rows of the price table (a `date` column plus ticker columns).
stocks.head()

Unnamed: 0,date,CMG,DPZ,MCD,SBUX,TGT
0,2023-01-03,27.4226,324.958191,246.610352,93.758347,136.709137
1,2023-01-04,27.521,329.862274,246.666443,97.133751,137.601151
2,2023-01-05,27.283199,324.536774,244.585861,97.105865,138.988693
3,2023-01-06,28.041201,328.166931,251.405853,99.207359,144.295578
4,2023-01-09,28.934799,319.795624,249.334656,97.394104,140.87178


In [None]:
# Consumer Edge brand label -> stock ticker.
# Declared per ticker because one ticker can cover several brand labels
# (SBUX appears under two), then flattened into the brand-keyed lookup.
_TICKER_BRANDS = {
    "SBUX": ["STARBUCKS (MERCHANT)", "STARBUCKS CARD"],
    "MCD": ["MCDONALD'S"],
    "CMG": ["CHIPOTLE MEXICAN"],
    "DPZ": ["DOMINO'S PIZZA"],
}
BRAND_TICKER = {
    label: symbol
    for symbol, labels in _TICKER_BRANDS.items()
    for label in labels
}

In [None]:
# Only keep brand -> ticker pairs that exist in both datasets.
# The set of brands is built once up front: calling spend["brand"].unique()
# inside the comprehension would rescan the whole column for every entry.
brands_in_spend = set(spend["brand"].unique())
available = {
    b: t for b, t in BRAND_TICKER.items()
    if b in brands_in_spend and t in stocks.columns
}
available

{'STARBUCKS (MERCHANT)': 'SBUX',
 'STARBUCKS CARD': 'SBUX',
 "MCDONALD'S": 'MCD',
 'CHIPOTLE MEXICAN': 'CMG',
 "DOMINO'S PIZZA": 'DPZ'}

In [None]:
# 2. Build spend signal + stock returns

# Pick brand to analyze
brand = "MCDONALD'S"
ticker = available[brand]

# Filter spend data to just this brand, oldest first
brand_df = spend[spend["brand"] == brand].sort_values("date").copy()

# The calendar-based windows below need exactly one row per date.
assert brand_df["date"].is_unique, f"duplicate dates in spend data for {brand}"
spend_by_date = brand_df.set_index("date")["spend"]

# Smooth spend with a trailing 7-calendar-day sum (reduces day-to-day noise).
# The window is defined in time, not rows, and min_periods=7 yields NaN unless
# all 7 days are present - a missing date can no longer stretch the window.
spend_7d = spend_by_date.rolling("7D", min_periods=7).sum()

# Year-over-year spend change (YoY) = (this year / last year) - 1
# This is the "signal" we'll compare to stock returns.
# "Last year" is looked up by DATE (365 days earlier); a row-count shift only
# equals one year when no calendar day is missing. Dates without a year-ago
# observation get NaN.
year_ago_dates = spend_7d.index - pd.Timedelta(days=365)
spend_7d_year_ago = pd.Series(
    spend_7d.reindex(year_ago_dates).to_numpy(), index=spend_7d.index
)
spend_yoy = spend_7d / spend_7d_year_ago - 1

# Rows of brand_df are in the same (date-sorted) order as the series above.
brand_df["spend_7d"] = spend_7d.to_numpy()
brand_df["spend_yoy"] = spend_yoy.to_numpy()

# Build stock returns (periods are trading rows, not calendar days):
stock_df = stocks[["date", ticker]].sort_values("date").copy()
stock_df["ret_1d"] = stock_df[ticker].pct_change()   # 1-day return
stock_df["ret_5d"] = stock_df[ticker].pct_change(5)  # 5-day return

# Merge spend & stock data on DATE; the inner join keeps trading days only,
# and dropna removes rows where YoY or a return is not yet defined.
df = (
    brand_df[["date", "spend_yoy"]]
    .merge(stock_df[["date", "ret_1d", "ret_5d"]], on="date", how="inner")
    .dropna()
)
df.head()

Unnamed: 0,date,spend_yoy,ret_1d,ret_5d
5,2023-01-10,0.181027,0.01203,0.051274
6,2023-01-11,0.188034,0.00283,0.017614
7,2023-01-12,0.186998,-0.004233,0.013597
8,2023-01-13,0.180732,0.013037,0.005061
9,2023-01-17,0.187052,-0.004476,0.019191


In [None]:
# 3. Lead/lag correlation scan + plot
def lead_lag_corr(x, y, max_lag=60):
    """
    Scan the correlation between x and y over lags -max_lag..+max_lag.

    A lag is a row offset: for lag k, x at row i is paired with y at row i + k.
    lag = +10 means compare x today vs y 10 rows in the future
    lag = -10 means compare x today vs y 10 rows in the past

    Returns a DataFrame with columns "lag" and "corr", one row per lag in
    ascending lag order.
    """
    offsets = list(range(-max_lag, max_lag + 1))
    # y.shift(-k) pulls the value from k rows ahead onto the current row, so a
    # positive k lines up future y with today's x (and negative k, past y).
    values = [x.corr(y.shift(-k)) for k in offsets]
    return pd.DataFrame({"lag": offsets, "corr": values})

# Compare spend YoY to 5-day stock returns
corrs = lead_lag_corr(df["spend_yoy"], df["ret_5d"], max_lag=60)

# Find the lag with the strongest (absolute) correlation.
# idxmax() returns an index LABEL, so the row is fetched with .loc; .iloc is
# positional and only agrees with it while the index is the default 0..n-1.
best = corrs.loc[corrs["corr"].abs().idxmax()]
best


lag    -8.000000
corr   -0.088422
Name: 52, dtype: float64

In [None]:
# 4. Plot correlation by lag
best_lag = int(best["lag"])

fig = px.line(
    corrs,
    x="lag",
    y="corr",
    title=f"{brand} vs {ticker}: lead/lag correlation (spend_yoy vs 5d returns)",
)
# Dashed vertical marker at the lag with the largest |correlation|
fig.add_vline(x=best_lag, line_dash="dash")
fig.show()


In [None]:
# 5. scatter at best lag (positive lag = spend leads) (visual check)

lag = int(best["lag"])

# Pair each spend_yoy value with the 5-day return `lag` rows away
# (shift(-lag): later rows for a positive lag, earlier rows for a negative one),
# then discard the rows the shift left incomplete.
shifted = df.assign(ret_shifted=df["ret_5d"].shift(-lag))
scatter_df = shifted.dropna()

# Scatter plot: spend_yoy against the lag-aligned returns
px.scatter(
    scatter_df,
    x="spend_yoy",
    y="ret_shifted",
    title=f"Best lag = {lag} days (positive = spend leads)",
)


---
## Part 2: Deep Dive into Consumer Edge Data

Let's explore what's actually in this dataset and find actionable insights.

In [None]:
# LESSON 1: See what QSR brands we have in our cleaned data
# =========================================================
# The cleaned data only has QSR brands we filtered for.
# Let's see what we're working with.

print("Brands in our cleaned dataset:")
print("-" * 40)
# One grouped pass gives every brand's total (groupby returns the brands in
# sorted order), instead of re-filtering the whole frame once per brand.
spend_by_brand = spend.groupby("brand")["spend"].sum()
for b, total_spend in spend_by_brand.items():
    print(f"{b:30} ${total_spend/1e9:.2f}B total spend")

In [None]:
# LESSON 2: Spending Trends Over Time
# ====================================
# KEY INSIGHT: Look for inflection points - when did spending change direction?
# These often correlate with earnings surprises.

# Aggregate all QSR spend by week (daily is too noisy).
# Each date is labelled with the start of its weekly period.
spend["week"] = spend["date"].dt.to_period("W").dt.start_time

# Weekly spend and transaction totals per brand
weekly = (
    spend.groupby(["brand", "week"], as_index=False)
    .agg(spend=("spend", "sum"), transactions=("transactions", "sum"))
)

# Weekly ticket size = weekly spend / weekly transactions.
# Averaging the daily ticket sizes instead would weight a slow day the same as
# a busy one, and spend = transactions x ticket would no longer hold for the
# weekly figures. Weeks with zero transactions get NaN rather than inf.
weekly["avg_ticket_size"] = weekly["spend"] / weekly["transactions"].where(
    weekly["transactions"] != 0
)

# Calculate week-over-week growth within each brand
weekly["spend_wow"] = weekly.groupby("brand")["spend"].pct_change()

# Plot total QSR spend over time
total_weekly = spend.groupby("week")["spend"].sum().reset_index()

fig = px.line(
    total_weekly,
    x="week",
    y="spend",
    title="Total QSR Spend Over Time (All Brands Combined)",
    labels={"spend": "Weekly Spend ($)", "week": "Week"}
)
fig.update_layout(yaxis_tickformat="$,.0f")
fig.show()

In [None]:
# LESSON 3: Compare Brands Head-to-Head
# ======================================
# KEY INSIGHT: Relative performance matters! If McDonald's is up but Chipotle is up MORE,
# that's a signal about consumer preferences shifting.

# Normalize each brand's spend to start at 100 (indexed)
# This lets us compare brands of different sizes on the same scale

def normalize_series(group):
    """Rebase a brand's weekly spend so that its earliest week equals 100.

    Sorts `group` by "week" and returns the sorted copy with an added
    "spend_indexed" column (spend / first week's spend * 100). The input
    frame itself is not modified.
    """
    ordered = group.sort_values("week")
    base = ordered["spend"].iloc[0]
    return ordered.assign(spend_indexed=(ordered["spend"] / base) * 100)

# Index each brand separately and stack the results.
# Iterating over the groups hands normalize_series the complete frame, "brand"
# column included. groupby(...).apply() on the grouping column is deprecated
# (pandas 2.2) and is slated to exclude it, which would drop "brand" from the
# result and break the filter below.
weekly_indexed = pd.concat(
    [normalize_series(brand_weeks) for _, brand_weeks in weekly.groupby("brand")],
    ignore_index=True,
)

# Filter to main brands we care about
main_brands = ["MCDONALD'S", "CHIPOTLE MEXICAN", "STARBUCKS (MERCHANT)", "BURGER KING", "TACO BELL"]
main_data = weekly_indexed[weekly_indexed["brand"].isin(main_brands)]

fig = px.line(
    main_data,
    x="week",
    y="spend_indexed",
    color="brand",
    title="QSR Brand Performance (Indexed to 100)",
    labels={"spend_indexed": "Spend Index (100 = start)", "week": "Week"}
)
fig.show()

# INTERPRETATION: Brands above 100 are growing, below 100 are shrinking
# Steeper slopes = faster growth/decline

In [None]:
# LESSON 4: Traffic vs. Ticket Size Decomposition
# ================================================
# KEY INSIGHT: Revenue = Traffic × Ticket Size
# Understanding WHICH driver is moving tells you about pricing power vs demand.
#
# - Traffic UP, Ticket flat = More customers (demand increasing)
# - Traffic flat, Ticket UP = Pricing power (customers paying more)
# - Traffic DOWN, Ticket UP = Fewer customers but higher value (could be concerning)

brand_to_analyze = "MCDONALD'S"

brand_weekly = weekly[weekly["brand"] == brand_to_analyze].copy()
brand_weekly = brand_weekly.sort_values("week")

# Calculate YoY changes in percent, comparing each week with the row 52 weeks
# earlier (assumes one row per week with no gaps).
brand_weekly["transactions_yoy"] = brand_weekly["transactions"].pct_change(52) * 100
brand_weekly["ticket_yoy"] = brand_weekly["avg_ticket_size"].pct_change(52) * 100
brand_weekly["spend_yoy"] = brand_weekly["spend"].pct_change(52) * 100

# Recent data only (weeks from 2023-01-01 onward, for a cleaner chart)
recent = brand_weekly[brand_weekly["week"] >= "2023-01-01"]

fig = make_subplots(rows=2, cols=1, shared_xaxes=True,
                    subplot_titles=("YoY Spend Change (%)", "Decomposition: Traffic vs Ticket"))

# Top: Total spend YoY
fig.add_trace(
    go.Scatter(x=recent["week"], y=recent["spend_yoy"], name="Spend YoY %",
               line=dict(color="blue", width=2)),
    row=1, col=1
)
fig.add_hline(y=0, line_dash="dash", line_color="gray", row=1, col=1)

# Bottom: Traffic and Ticket decomposition
fig.add_trace(
    go.Scatter(x=recent["week"], y=recent["transactions_yoy"], name="Traffic YoY %",
               line=dict(color="green", width=2)),
    row=2, col=1
)
fig.add_trace(
    go.Scatter(x=recent["week"], y=recent["ticket_yoy"], name="Ticket YoY %",
               line=dict(color="orange", width=2)),
    row=2, col=1
)
fig.add_hline(y=0, line_dash="dash", line_color="gray", row=2, col=1)

fig.update_layout(height=600, title_text=f"{brand_to_analyze}: Revenue Decomposition")
fig.show()

# Look at latest values. An empty selection (unknown brand, or no data after
# the cutoff) is reported instead of failing on recent.iloc[-1].
if recent.empty:
    print(f"\nNo weekly data on or after 2023-01-01 for {brand_to_analyze}")
else:
    latest = recent.iloc[-1]
    print(f"\n📊 Latest Week ({latest['week'].strftime('%Y-%m-%d')}):")
    print(f"   Spend YoY:       {latest['spend_yoy']:+.1f}%")
    print(f"   Traffic YoY:     {latest['transactions_yoy']:+.1f}%")
    print(f"   Ticket Size YoY: {latest['ticket_yoy']:+.1f}%")

In [None]:
# LESSON 5: Seasonality Analysis
# ===============================
# KEY INSIGHT: QSR has strong day-of-week and month patterns.
# Knowing "normal" seasonality helps you spot ABNORMAL behavior.

spend["day_of_week"] = spend["date"].dt.day_name()
spend["month"] = spend["date"].dt.month_name()

# Day of week pattern
dow_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

# Add up all brands per calendar date first, then average those daily totals by
# weekday. Averaging the raw brand-level rows would give a per-brand figure
# (not the all-QSR total the chart title promises) and would be skewed if
# brands do not all cover the same dates.
daily_total = spend.groupby("date")["spend"].sum()
dow_spend = daily_total.groupby(daily_total.index.day_name()).mean().reindex(dow_order)

fig = px.bar(
    x=dow_order,
    y=dow_spend.values,
    title="Average Daily Spend by Day of Week (All QSR)",
    labels={"x": "Day", "y": "Average Spend ($)"}
)
fig.update_layout(yaxis_tickformat="$,.0f")
fig.show()

print("💡 INSIGHT: Weekends typically have higher spend. If a brand's weekend")
print("   spend drops relative to weekdays, that could signal trouble.")

In [24]:
# LESSON 6: Brand Performance Heatmap
# ====================================
# Create a summary view: How is each brand doing month-over-month?

# Calculate monthly spend by brand.
# The period is stored as a "YYYY-MM" string, which sorts and compares
# chronologically and can be used directly as an axis label.
spend["year_month"] = spend["date"].dt.to_period("M").astype(str)
monthly_brand = spend.groupby(["brand", "year_month"])["spend"].sum().reset_index()

# Calculate MoM growth in percent within each brand
# (a brand's first month has no predecessor and is NaN)
monthly_brand["spend_mom"] = monthly_brand.groupby("brand")["spend"].pct_change() * 100

# Pivot for heatmap (brands as rows, months as columns)
# Filter to recent months
recent_months = monthly_brand[monthly_brand["year_month"] >= "2024-01"]
pivot = recent_months.pivot(index="brand", columns="year_month", values="spend_mom")

# Plot heatmap
fig = px.imshow(
    pivot,
    labels=dict(x="Month", y="Brand", color="MoM %"),
    title="Month-over-Month Spend Growth by Brand (%)",
    color_continuous_scale="RdYlGn",  # Red = bad, Green = good
    color_continuous_midpoint=0,
    aspect="auto"
)
fig.update_layout(height=400)
fig.show()

print("💡 HOW TO READ: Green = growing, Red = shrinking")
print("   Look for patterns - is one brand consistently red while others are green?")

ðŸ’¡ HOW TO READ: Green = growing, Red = shrinking
   Look for patterns - is one brand consistently red while others are green?


In [26]:
# LESSON 7: Generate Trade Signals
# ==================================
# This is what hedge funds actually do: turn data into actionable signals.

def generate_signal(brand_name, lookback_days=30):
    """
    Generate a simple trade signal based on recent spend trends.

    Reads the module-level `spend` frame (columns "brand", "date", "spend").

    Signal logic:
    - MoM: average daily spend over the last `lookback_days` days vs the
      `lookback_days` days before that
    - YoY: the same recent average vs the equally long window ending 365 days
      before the latest date
    - BULLISH if YoY > +5% and MoM > 0; BEARISH if YoY < -5% and MoM < 0;
      otherwise NEUTRAL

    Parameters:
        brand_name: value to match in spend["brand"].
        lookback_days: length in days of each comparison window (default 30).

    Returns:
        dict with keys "brand", "signal", "mom_change", "yoy_change" (percent,
        rounded to 1 decimal), "recent_daily_spend" (formatted string) and
        "as_of" (YYYY-MM-DD). With too little history the signal is
        "INSUFFICIENT DATA", both changes are NaN, "recent_daily_spend" is
        "n/a" and "as_of" is None, so callers can format every key safely.

    Raises:
        ValueError: if lookback_days is less than 1.
    """
    if lookback_days < 1:
        raise ValueError("lookback_days must be at least 1")

    brand_data = spend[spend["brand"] == brand_name].sort_values("date")

    # Need a full year plus one window of history; never fewer than 400 rows.
    min_rows = max(400, 365 + lookback_days)
    if len(brand_data) < min_rows:
        return {
            "brand": brand_name,
            "signal": "INSUFFICIENT DATA",
            "mom_change": float("nan"),
            "yoy_change": float("nan"),
            "recent_daily_spend": "n/a",
            "as_of": None,
        }

    dates = brand_data["date"]
    latest_date = dates.max()
    window = pd.Timedelta(days=lookback_days)
    one_year = pd.Timedelta(days=365)

    # Each window is half-open: (end - lookback_days, end]
    last_n = brand_data[dates > latest_date - window]
    prior_n = brand_data[(dates > latest_date - 2 * window) &
                         (dates <= latest_date - window)]
    year_ago = brand_data[(dates > latest_date - one_year - window) &
                          (dates <= latest_date - one_year)]

    # Calculate metrics
    recent_avg = last_n["spend"].mean()
    prior_avg = prior_n["spend"].mean()
    # With no rows a year back, fall back to the recent average (YoY = 0%)
    yoy_avg = year_ago["spend"].mean() if len(year_ago) > 0 else recent_avg

    mom_change = (recent_avg / prior_avg - 1) * 100
    yoy_change = (recent_avg / yoy_avg - 1) * 100

    # Simple signal logic
    if yoy_change > 5 and mom_change > 0:
        signal = "🟢 BULLISH"
    elif yoy_change < -5 and mom_change < 0:
        signal = "🔴 BEARISH"
    else:
        signal = "🟡 NEUTRAL"

    return {
        "brand": brand_name,
        "signal": signal,
        "mom_change": round(mom_change, 1),
        "yoy_change": round(yoy_change, 1),
        "recent_daily_spend": f"${recent_avg:,.0f}",
        "as_of": latest_date.strftime("%Y-%m-%d")
    }

# Generate signals for all main brands
print("=" * 60)
print("📈 TRADE SIGNALS (Based on Consumer Edge Spend Data)")
print("=" * 60)

# Build the lookup once instead of rescanning the column per brand.
known_brands = set(spend["brand"].unique())

# The loop variable is deliberately NOT called `brand`: that name holds the
# brand chosen for the Part 1 lead/lag analysis, and overwriting it here would
# leave it out of sync with `ticker` if an earlier cell is re-run.
for brand_name in main_brands:
    if brand_name not in known_brands:
        continue
    sig = generate_signal(brand_name)
    print(f"\n{sig['brand']}")
    print(f"   Signal: {sig['signal']}")
    if sig["signal"] == "INSUFFICIENT DATA":
        # No metrics are available for a brand without enough history.
        continue
    print(f"   MoM Change: {sig['mom_change']:+.1f}%")
    print(f"   YoY Change: {sig['yoy_change']:+.1f}%")
    print(f"   Avg Daily Spend: {sig['recent_daily_spend']}")

ðŸ“ˆ TRADE SIGNALS (Based on Consumer Edge Spend Data)

MCDONALD'S
   Signal: ðŸŸ¡ NEUTRAL
   MoM Change: -2.6%
   YoY Change: +0.4%
   Avg Daily Spend: $9,852,347

CHIPOTLE MEXICAN
   Signal: ðŸŸ¡ NEUTRAL
   MoM Change: -0.9%
   YoY Change: +0.6%
   Avg Daily Spend: $2,576,818

STARBUCKS (MERCHANT)
   Signal: ðŸŸ¡ NEUTRAL
   MoM Change: +4.9%
   YoY Change: +3.4%
   Avg Daily Spend: $3,035,763

BURGER KING
   Signal: ðŸŸ¡ NEUTRAL
   MoM Change: -1.3%
   YoY Change: +1.1%
   Avg Daily Spend: $1,788,077

TACO BELL
   Signal: ðŸŸ¢ BULLISH
   MoM Change: +1.3%
   YoY Change: +7.9%
   Avg Daily Spend: $3,084,047


---
## Summary: What You've Learned

### Key Analysis Techniques:
1. **Lead/Lag Correlation** - Does spend data predict stock returns? (Lesson in Part 1)
2. **Indexed Comparisons** - Normalize to 100 to compare brands of different sizes
3. **Traffic vs Ticket Decomposition** - Understand *why* revenue is moving
4. **Seasonality** - Know what's "normal" to spot abnormal behavior
5. **Heatmaps** - Quick visual scan for winners/losers
6. **Signal Generation** - Turn data into actionable recommendations

### What Makes a Good Trade Signal:
- **YoY > +5% AND MoM positive** = Bullish (acceleration)
- **YoY < -5% AND MoM negative** = Bearish (deceleration)  
- **Ticket up, Traffic down** = Watch closely (pricing power but demand concern)

### Next Steps:
- [ ] Run this analysis weekly before earnings
- [ ] Add Advan foot traffic when available
- [ ] Backtest signals against actual earnings surprises
- [ ] Build alerts when signals change