In [14]:
# ─── Cell 1: Imports & Environment Setup ────────────────────────────────────────
import os, json
from pathlib import Path
import pandas as pd
from dotenv import load_dotenv
from IPython.display import display

load_dotenv()
PROJECT_ROOT = Path(os.getenv("PROJECT_ROOT"))
RAW      = PROJECT_ROOT / os.getenv("RAW_ZONE",     "raw")
STAGING  = PROJECT_ROOT / os.getenv("STAGING_ZONE", "staging")


In [15]:
# ─── Cell 2: Union ALL raw dumps into consolidated DataFrames ──────────────────
raw_meta = RAW / "metaads"
folders  = [d for d in raw_meta.iterdir() if d.is_dir()]
assert folders, f"No raw dumps found in {raw_meta}"

def stack(folders, fname):
    """Load a JSON file type from every folder, concat, and deduplicate."""
    dfs = [pd.read_json(f / fname, dtype_backend="pyarrow") for f in folders if (f / fname).exists()]
    if not dfs:
        return pd.DataFrame()

    df = pd.concat(dfs, ignore_index=True)

    if fname != "insights.json":                       # ads / adsets / campaigns
        return df.drop_duplicates(subset="id")
    # insights → dedupe on minimal primary key
    pk = [c for c in ["date_start","ad_id","adset_id","campaign_id"] if c in df.columns]
    return df.drop_duplicates(subset=pk) if pk else df.drop_duplicates()

ads       = stack(folders, "ads.json")
adsets    = stack(folders, "adsets.json")
campaigns = stack(folders, "campaigns.json")
insights  = stack(folders, "insights.json")

for col in ("spend","impressions","clicks","reach","cpc","ctr","frequency"):
    if col in insights.columns:
        insights[col] = pd.to_numeric(insights[col], errors="coerce")

print(f"Rows → campaigns {len(campaigns)}, adsets {len(adsets)}, ads {len(ads)}, insights {len(insights)}")


Rows → campaigns 13, adsets 31, ads 54, insights 31


In [16]:
# ─── Cell 3: Flatten ads.json (include campaign_id & adset_id) ─────────────────
import json

ads_flat = ads.copy()
ads_flat["creative_id"]    = ads_flat["creative"].apply(
    lambda c: c.get("id") if isinstance(c, dict) else None)
ads_flat["tracking_specs"] = ads_flat["tracking_specs"].apply(json.dumps)

ads_flat = ads_flat[[
    "id","campaign_id","adset_id","name","status","effective_status",
    "created_time","updated_time","creative_id","tracking_specs"
]].rename(columns={
    "id":   "ad_id",
    "name": "ad_name"
})

print("ads_flat cols:", ads_flat.columns.tolist())


ads_flat cols: ['ad_id', 'campaign_id', 'adset_id', 'ad_name', 'status', 'effective_status', 'created_time', 'updated_time', 'creative_id', 'tracking_specs']


In [12]:
# ─── Cell 4: Prepare campaigns_ & adsets_ for merge ────────────────────────────
campaigns_ = campaigns.rename(columns={
    "id":        "campaign_id",
    "name":      "campaign_name",
    "status":    "campaign_status",
    "objective": "campaign_objective"
})

adsets_ = (adsets.rename(columns={
    "id":              "adset_id",
    "name":            "adset_name",
    "status":          "adset_status",
    "daily_budget":    "adset_daily_budget",
    "lifetime_budget": "adset_lifetime_budget"
})
.drop(columns=["campaign_id"], errors="ignore"))

print("campaigns_ cols:", campaigns_.columns.tolist())
print("adsets_ cols:",    adsets_.columns.tolist())


campaigns_ cols: ['campaign_id', 'campaign_name', 'campaign_status', 'campaign_objective', 'start_time', 'created_time', 'updated_time']
adsets_ cols: ['adset_id', 'adset_name', 'adset_status', 'adset_daily_budget', 'adset_lifetime_budget', 'bid_strategy', 'targeting', 'optimization_goal', 'start_time', 'pacing_type', 'created_time', 'updated_time']


In [13]:
# ─── Cell 5: Merge to AD-level tidy & write to STAGING ─────────────────────────
tidy = (insights
        .merge(adsets_,    on="adset_id",    how="left")
        .merge(campaigns_, on="campaign_id", how="left")
        .merge(ads_flat,   on=["campaign_id","adset_id"], how="left")
        .loc[:, lambda d: ~d.columns.duplicated()])

STAGING.mkdir(parents=True, exist_ok=True)
out_csv = STAGING / "tidy_metaads.csv"
tidy.to_csv(out_csv, index=False)

print(f"✅ tidy_metaads → {out_csv}  (rows: {len(tidy)})")
display(tidy.head(5))


✅ tidy_metaads → C:\Users\Earth\BEDROT PRODUCTIONS\BEDROT DATA LAKE\data_lake\staging\tidy_metaads.csv  (rows: 54)


Unnamed: 0,campaign_id,campaign_name_x,adset_id,adset_name_x,spend,impressions,clicks,cpc,ctr,reach,...,created_time_y,updated_time_y,ad_id,ad_name,status,effective_status,created_time,updated_time,creative_id,tracking_specs
0,120224907266750075,PIG1987 - THE STATE OF THE WORLD - BROAD,120224907266740075,THE STATE OF THE WORLD - AD 1,3.41,1321,8,0.42625,0.605602,1299,...,2025-05-23T06:57:55-0700,2025-05-24T17:04:26-0700,120224907266800075,AD1,ACTIVE,ADSET_PAUSED,2025-05-23T06:58:13-0700,2025-05-23T07:01:41-0700,24188788294059211,"[{""action.type"": [""offsite_conversion""], ""fb_p..."
1,120224907266750075,PIG1987 - THE STATE OF THE WORLD - BROAD,120224907266810075,THE STATE OF THE WORLD - AD 3,29.1,17341,226,0.128761,1.30327,14102,...,2025-05-23T06:57:55-0700,2025-05-24T17:04:26-0700,120224907266760075,AD3,ACTIVE,ACTIVE,2025-05-23T06:58:02-0700,2025-05-23T07:02:14-0700,566770032783189,"[{""action.type"": [""offsite_conversion""], ""fb_p..."
2,120224907266750075,PIG1987 - THE STATE OF THE WORLD - BROAD,120224907266820075,THE STATE OF THE WORLD - AD 4,8.33,4238,51,0.163333,1.203398,4082,...,2025-05-23T06:57:55-0700,2025-05-24T17:04:26-0700,120224907266770075,AD4,ACTIVE,ADSET_PAUSED,2025-05-23T06:58:17-0700,2025-05-23T07:05:35-0700,2178986529195170,"[{""action.type"": [""offsite_conversion""], ""fb_p..."
3,120224907266750075,PIG1987 - THE STATE OF THE WORLD - BROAD,120224907266830075,THE STATE OF THE WORLD - AD 2,3.14,1380,13,0.241538,0.942029,1331,...,2025-05-23T06:57:55-0700,2025-05-24T17:04:26-0700,120224907266780075,AD2,ACTIVE,ADSET_PAUSED,2025-05-23T06:58:07-0700,2025-05-23T07:02:43-0700,2548260048843291,"[{""action.type"": [""offsite_conversion""], ""fb_p..."
4,120224907266750075,PIG1987 - THE STATE OF THE WORLD - BROAD,120224907266840075,THE STATE OF THE WORLD - AD 5,6.15,3822,27,0.227778,0.706436,3572,...,2025-05-23T06:57:55-0700,2025-05-24T17:04:26-0700,120224907266790075,AD5,ACTIVE,ADSET_PAUSED,2025-05-23T06:57:57-0700,2025-05-23T07:01:17-0700,1732602354028075,"[{""action.type"": [""offsite_conversion""], ""fb_p..."
