# 01 Synthetic Data Generator
This notebook creates realistic synthetic campaign data for the **OnePlan — AI Media Mix Optimizer** project.
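It produces two CSV files (the code writes them to `../data/raw/`, relative to this notebook):
- `data/raw/campaign_daily.csv` — one row per day and channel: spend, CPM, impressions, clicks, conversions, estimated reach
- `data/raw/overlap_matrix.csv` — one row per channel pair with its overlap rate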

In [14]:
# 01 Synthetic Data Generator 
# Produces:
#   - ../data/raw/campaign_daily.csv
#   - ../data/raw/overlap_matrix.csv
#
# Notes:
# - Deterministic seeds for reproducibility
# - Channel-specific CPM, reach curve, CTR, and conversions response (a,b)
# - Light seasonality + weekend uplift
# - Schema stable for Parts 2–5

import numpy as np
import pandas as pd
from datetime import date, timedelta
from pathlib import Path

# Seed NumPy's legacy global RNG once, up front. Every np.random.* draw in the
# cells below consumes this single shared stream, so the generated values
# depend on the cells being run in order from here.
np.random.seed(42)

In [15]:
# Output directory for the generated CSVs (relative to this notebook);
# created on first run, left untouched if it already exists.
RAW_DIR = Path("..") / "data" / "raw"
RAW_DIR.mkdir(exist_ok=True, parents=True)

# Channels & time horizon
# One row is generated per (day, channel) for DAYS consecutive days starting
# at START_DATE. 2024 is a leap year, so 365 days ends on 2024-12-30.
CHANNELS = ["LinearTV", "Streaming", "YouTube", "Display", "Social"]
DAYS = 365
START_DATE = date(2024, 1, 1)

# Channel realism profile (one entry per channel in CHANNELS)
#   cpm   : cost per 1,000 impressions
#   A, k  : reach curve parameters, reach fraction = A * (1 - exp(-k * impressions))
#   ctr   : average CTR baseline
#   a, b  : conversions response, a * log1p(spend) + b * sqrt(spend) (diminishing returns)
#   noise : standard deviation of the Gaussian noise added to conversions
CHANNEL_PARAMS = {
    "LinearTV": dict(cpm=32, A=0.65, k=5e-8, ctr=0.010, a=0.75, b=0.04, noise=8.0),
    "Streaming": dict(cpm=20, A=0.80, k=7e-8, ctr=0.017, a=0.95, b=0.06, noise=6.0),
    "YouTube": dict(cpm=12, A=0.88, k=1.0e-7, ctr=0.025, a=1.10, b=0.09, noise=5.5),
    "Display": dict(cpm=9, A=0.83, k=1.3e-7, ctr=0.020, a=0.90, b=0.08, noise=4.5),
    "Social": dict(cpm=6, A=0.95, k=1.5e-7, ctr=0.033, a=1.20, b=0.11, noise=5.5),
}

# Channel mean daily spend (controls scale of synthetic data)
MEAN_SPEND = dict(
    LinearTV=60_000,
    Streaming=40_000,
    YouTube=25_000,
    Display=15_000,
    Social=12_000,
)

# Target audience size used to express reach in "people"
AUDIENCE_SIZE = 1_000_000

In [16]:
def month_seasonality(dt):
    """Return the seasonality multiplier for the calendar month of ``dt``.

    ``dt`` only needs a ``month`` attribute in 1..12 (e.g. a ``datetime.date``).
    Factors range from 0.98 (January) to 1.06 (July and December): a gentle
    uplift during summer and the holidays.
    """
    # January .. December
    monthly_factors = (
        0.98, 0.99, 1.00, 1.02, 1.03, 1.05,
        1.06, 1.04, 1.02, 1.01, 1.03, 1.06,
    )
    factor_by_month = dict(enumerate(monthly_factors, start=1))
    return factor_by_month[dt.month]

def weekend_multiplier(dow, channel):
    """Return the weekend adjustment factor for ``channel`` on weekday ``dow``.

    ``dow`` follows ``date.weekday()`` numbering (0=Mon .. 6=Sun). On Sat/Sun,
    Social and YouTube get a slight boost (1.06), LinearTV a slight dip (0.98),
    and every other channel 1.02. Weekdays always return 1.00.
    """
    if dow >= 5:  # Sat/Sun
        if channel == "LinearTV":
            return 0.98
        return 1.06 if channel in ("Social", "YouTube") else 1.02
    return 1.00

def reach_from_impressions(impressions, A, k):
    """Return the fraction of the audience reached for ``impressions``.

    Saturating curve ``A * (1 - exp(-k * impressions))``: 0 at zero
    impressions, approaching the ceiling ``A`` as impressions grow.
    """
    unreached_share = np.exp(-k * impressions)
    return A * (1.0 - unreached_share)

In [17]:
# Generate one row per (day, channel): spend, impressions, clicks,
# conversions and estimated reach.
#
# Re-seed at the top of this cell so that re-running it on its own reproduces
# the same data as Restart & Run All. Nothing between the setup cell's
# np.random.seed(42) and this loop draws random numbers, so a clean
# top-to-bottom run generates exactly the same values as before.
np.random.seed(42)

rows = []
for d in range(DAYS):
    curr_date = START_DATE + timedelta(days=d)
    dow = curr_date.weekday()  # 0=Mon..6=Sun
    is_weekend = 1 if dow >= 5 else 0
    m_mult = month_seasonality(curr_date)
    # Conversion day effect: +3% on Sat/Sun, identical for every channel,
    # so it is computed once per day rather than once per channel.
    day_mult = 1.03 if is_weekend else 1.0

    for ch in CHANNELS:
        p = CHANNEL_PARAMS[ch]

        # NOTE: the three np.random.normal draws below (spend, CTR, conversion
        # noise) share one global stream; reordering them changes the data.

        # Spend ~ Normal(mean, 25% of mean), floored at zero
        spend = np.random.normal(MEAN_SPEND[ch], MEAN_SPEND[ch] * 0.25)
        spend = float(max(0.0, spend))

        # CPM is cost per 1,000 impressions, so impressions = spend / cpm * 1000
        cpm = float(p["cpm"])
        impressions = (spend / cpm) * 1000.0

        # CTR: 10% relative noise around the channel baseline, scaled by month
        # seasonality and the channel's weekend factor, clipped to [0.005, 0.06]
        ctr = np.random.normal(p["ctr"], p["ctr"] * 0.10)
        ctr *= m_mult * weekend_multiplier(dow, ch)
        ctr = float(np.clip(ctr, 0.005, 0.06))
        clicks = float(impressions * ctr)

        # Conversions: diminishing-returns response in spend, scaled by
        # seasonality/day effect, plus Gaussian noise; floored at zero
        a, b = float(p["a"]), float(p["b"])
        base_conv = a * np.log1p(spend) + b * np.sqrt(spend)
        conv = base_conv * m_mult * day_mult + np.random.normal(0.0, p["noise"])
        conversions = float(max(0.0, conv))

        # Estimated reach (people) via channel-specific A,k
        A, k = float(p["A"]), float(p["k"])
        reach_pct = reach_from_impressions(impressions, A=A, k=k)  # 0..A
        est_reach_people = reach_pct * AUDIENCE_SIZE

        rows.append([
            curr_date, ch, spend, cpm, impressions,
            clicks, conversions, est_reach_people, dow, is_weekend
        ])

df = pd.DataFrame(rows, columns=[
    "date","channel","spend","cpm","impressions",
    "clicks","conversions","est_reach","dow","is_weekend"
])

# Types and sorting
df["date"] = pd.to_datetime(df["date"])
df = df.sort_values(["channel","date"]).reset_index(drop=True)
df.head()

Unnamed: 0,date,channel,spend,cpm,impressions,clicks,conversions,est_reach,dow,is_weekend
0,2024-01-01,Display,17034.600163,9.0,1892733.0,35378.406488,16.730102,181040.464932,0,0
1,2024-01-02,Display,12958.564783,9.0,1439841.0,28533.907664,12.097425,141685.103445,1,0
2,2024-01-03,Display,15738.229635,9.0,1748692.0,36805.4142,19.130172,168773.980452,2,0
3,2024-01-04,Display,18866.248209,9.0,2096250.0,44912.799799,15.675562,197984.937675,3,0
4,2024-01-05,Display,12580.80092,9.0,1397867.0,28388.347169,24.040921,137918.98613,4,0


In [18]:
# Persist the daily campaign table (row index is not written to the CSV)
out_campaign = RAW_DIR / "campaign_daily.csv"
df.to_csv(out_campaign, index=False)
print("Saved:", out_campaign)

# Quick sanity check: per-channel means of the numeric metrics
metric_cols = ["spend","cpm","impressions","clicks","conversions","est_reach"]
channel_means = df.groupby("channel")[metric_cols].mean().round(2)
display(channel_means)

Saved: ../data/raw/campaign_daily.csv


Unnamed: 0_level_0,spend,cpm,impressions,clicks,conversions,est_reach
channel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Display,15275.42,9.0,1697269.3,35138.36,19.42,163372.4
LinearTV,58973.81,32.0,1842931.53,18545.04,18.51,57054.45
Social,11970.68,6.0,1995113.02,68263.52,24.44,243743.79
Streaming,40018.93,20.0,2000946.73,34829.8,22.91,104191.62
YouTube,24662.13,12.0,2055177.59,53408.5,25.84,162490.91


In [19]:
# Higher overlaps among Streaming–YouTube–Social; lower with LinearTV.
# Baseline overlap per unordered channel pair; pairs not listed use 0.10.
base_by_pair = {
    frozenset(("YouTube", "Social")): 0.30,
    frozenset(("Streaming", "YouTube")): 0.25,
    frozenset(("Streaming", "Social")): 0.22,
    frozenset(("LinearTV", "Streaming")): 0.15,
    frozenset(("LinearTV", "YouTube")): 0.12,
    frozenset(("LinearTV", "Social")): 0.10,
}

pairs = []
# Visit each unordered pair once, in CHANNELS order (first channel before second)
for pos, ch_first in enumerate(CHANNELS):
    for ch_second in CHANNELS[pos + 1:]:
        base_rate = base_by_pair.get(frozenset((ch_first, ch_second)), 0.10)
        # Jitter around the baseline (sd 0.03), clipped to [0.05, 0.40]
        noisy_rate = np.random.normal(base_rate, 0.03)
        clipped_rate = float(np.clip(noisy_rate, 0.05, 0.40))
        pairs.append([ch_first, ch_second, round(clipped_rate, 3)])

overlap_df = pd.DataFrame(pairs, columns=["ch_i","ch_j","overlap_rate"])
out_overlap = RAW_DIR / "overlap_matrix.csv"
overlap_df.to_csv(out_overlap, index=False)
print("Saved:", out_overlap)
overlap_df.head()

Saved: ../data/raw/overlap_matrix.csv


Unnamed: 0,ch_i,ch_j,overlap_rate
0,LinearTV,Streaming,0.199
1,LinearTV,YouTube,0.115
2,LinearTV,Display,0.099
3,LinearTV,Social,0.086
4,Streaming,YouTube,0.283


In [20]:
# Schema and sanity checks on both outputs. Any failed assertion stops the
# notebook here, before downstream notebooks read the files.

# --- campaign_daily ---
expected_cols = ["date","channel","spend","cpm","impressions","clicks","conversions","est_reach","dow","is_weekend"]
assert list(df.columns) == expected_cols, f"Unexpected columns: {list(df.columns)}"
# Exactly one row per (day, channel)
assert len(df) == DAYS * len(CHANNELS), f"Expected {DAYS * len(CHANNELS)} rows, got {len(df)}"
assert not df.duplicated(subset=["date", "channel"]).any(), "Duplicate (date, channel) rows found"
assert df.notna().all().all(), "Missing values found in campaign data"
assert df["spend"].ge(0).all(), "Negative spend found"
assert df["conversions"].ge(0).all(), "Negative conversions found"

# --- overlap_matrix ---
assert set(overlap_df.columns) == {"ch_i","ch_j","overlap_rate"}, "Overlap schema mismatch"
# One row per unordered channel pair: n * (n - 1) / 2
expected_pairs = len(CHANNELS) * (len(CHANNELS) - 1) // 2
assert len(overlap_df) == expected_pairs, f"Expected {expected_pairs} overlap pairs, got {len(overlap_df)}"
# Rates are clipped to [0.05, 0.40] when generated
assert overlap_df["overlap_rate"].between(0.05, 0.40).all(), "Overlap rate outside [0.05, 0.40]"

print("Schema checks passed. Rows:", len(df), "| Overlap pairs:", len(overlap_df))

Schema checks passed. Rows: 1825 | Overlap pairs: 10
