# 01 Synthetic Data Generator
This notebook creates realistic synthetic campaign data for the **OnePlan — AI Media Mix Optimizer** project.
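It produces two CSV files (written by the cells below to `../data/raw/`, relative to the kernel's working directory):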
- `data/raw/campaign_daily.csv`
- `data/raw/overlap_matrix.csv`

In [4]:
import random
from datetime import date, timedelta
from pathlib import Path

import numpy as np
import pandas as pd

# ----- Config -----
# Seed NumPy's legacy global RNG so a fresh Restart & Run All regenerates the
# same dataset. Every random draw below uses this global stream.
np.random.seed(42)
channels = ["LinearTV", "Streaming", "YouTube", "Display", "Social"]
days = 365

# Base parameters per channel
# Mean of the daily spend distribution for each channel.
mean_spend = {
    "LinearTV": 60000,
    "Streaming": 40000,
    "YouTube": 25000,
    "Display": 15000,
    "Social": 12000
}
# Cost per 1,000 impressions; impressions are derived as spend / cpm * 1000.
cpm = {
    "LinearTV": 25,
    "Streaming": 18,
    "YouTube": 12,
    "Display": 8,
    "Social": 6
}

# Standard deviation of daily spend, as a fraction of the channel's mean spend.
spend_rel_std = 0.25
# Saturating reach curve: est_reach = reach_ceiling * (1 - exp(-impressions / reach_scale)).
reach_ceiling = 1e5
reach_scale = 5e6

# Output location; the parent directory is created on demand so the notebook
# also runs on a fresh checkout where ../data/raw does not exist yet.
campaign_csv = Path("../data/raw/campaign_daily.csv")

rows = []
start = date(2024, 1, 1)

for d in range(days):
    curr_date = start + timedelta(days=d)
    dow = curr_date.weekday()  # Monday == 0 ... Sunday == 6
    is_weekend = 1 if dow >= 5 else 0

    for ch in channels:
        # NOTE: the order of the random draws below is kept fixed (spend, ctr,
        # a, b, noise) so the seeded stream yields the same values as before.
        spend = max(0, np.random.normal(mean_spend[ch], mean_spend[ch] * spend_rel_std))
        impressions = (spend / cpm[ch]) * 1000
        ctr = np.random.uniform(0.01, 0.05)
        clicks = impressions * ctr

        # Diminishing returns for conversions: log + sqrt response to spend
        # with per-row random coefficients and additive Gaussian noise.
        a, b = np.random.uniform(0.8, 1.2), np.random.uniform(0.05, 0.1)
        noise = np.random.normal(0, 5)
        # The noise term can push the total below zero on low-spend rows;
        # clamp it, since a negative conversion count is not meaningful.
        conversions = max(0.0, a * np.log1p(spend) + b * np.sqrt(spend) + noise)

        est_reach = reach_ceiling * (1 - np.exp(-impressions / reach_scale))  # saturating

        rows.append([curr_date, ch, spend, cpm[ch], impressions,
                     clicks, conversions, est_reach, dow, is_weekend])

# One row per (date, channel) pair.
df = pd.DataFrame(rows, columns=[
    "date", "channel", "spend", "cpm", "impressions",
    "clicks", "conversions", "est_reach", "dow", "is_weekend"
])

campaign_csv.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(campaign_csv, index=False)
print("✅  Saved campaign_daily.csv", df.shape)
df.head()


✅  Saved campaign_daily.csv (1825, 10)


Unnamed: 0,date,channel,spend,cpm,impressions,clicks,conversions,est_reach,dow,is_weekend
0,2024-01-01,LinearTV,67450.712295,25,2698028.0,105977.905352,25.878285,41702.192373,0,0
1,2024-01-01,Streaming,42790.412922,18,2377245.0,91102.936029,34.04628,37839.412743,0,0
2,2024-01-01,YouTube,22065.785088,12,1838815.0,31761.856407,21.134986,30771.88258,0,0
3,2024-01-01,Display,5202.941202,8,650367.7,14079.916967,17.801543,12196.913312,0,0
4,2024-01-01,Social,9275.927773,6,1545988.0,43663.027844,8.892959,26596.428422,0,0


In [5]:
# Draw one random overlap rate for every unordered pair of channels. Each pair
# appears once, with the earlier channel in `channels` as ch_i. The draws come
# from NumPy's global RNG, so the values depend on the seed set in the config
# cell and on how many draws earlier cells have already consumed.
overlaps = [
    [first, second, round(np.random.uniform(0.05, 0.25), 3)]
    for pos, first in enumerate(channels)
    for second in channels[pos + 1:]
]
overlap_df = pd.DataFrame(overlaps, columns=["ch_i", "ch_j", "overlap_rate"])
overlap_df.to_csv("../data/raw/overlap_matrix.csv", index=False)
print("✅  Saved overlap_matrix.csv", overlap_df.shape)
overlap_df.head()

✅  Saved overlap_matrix.csv (10, 3)


Unnamed: 0,ch_i,ch_j,overlap_rate
0,LinearTV,Streaming,0.1
1,LinearTV,YouTube,0.228
2,LinearTV,Display,0.194
3,LinearTV,Social,0.065
4,Streaming,YouTube,0.214
