# %% [markdown]
# # Cell 0 – Notebook preamble  
# Auto-generates the **Linktree Staging→Curated cleaner** script.  
# *Guided by `LLM_cleaner_guidelines.md` (Change-ID 2025-06-12-linktree_cleaner_guidelines).*


In [1]:
# %%  
# Cell 1 – Imports & constants
import os, json, argparse
from datetime import datetime
from pathlib import Path
import pandas as pd

# Platform identifier; also the name of the per-platform sub-folder in each zone.
PLATFORM = "linktree"
# Data-lake root, taken from the environment (KeyError if PROJECT_ROOT is unset).
PROJECT_ROOT = Path(os.environ["PROJECT_ROOT"])

# Input zone (CSV from the previous step) and output zone for this cleaner.
STAGING_DIR = PROJECT_ROOT.joinpath("staging", PLATFORM)
CURATED_DIR = PROJECT_ROOT.joinpath("curated", PLATFORM)

# Create both zone folders up front; already-existing folders are fine.
for _d in (STAGING_DIR, CURATED_DIR):
    _d.mkdir(parents=True, exist_ok=True)

print(f"[INFO] Staging dir:  {STAGING_DIR}")
print(f"[INFO] Curated dir:  {CURATED_DIR}")


[INFO] Staging dir:  C:\Users\Earth\BEDROT PRODUCTIONS\BEDROT DATA LAKE\data_lake\staging\linktree
[INFO] Curated dir:  C:\Users\Earth\BEDROT PRODUCTIONS\BEDROT DATA LAKE\data_lake\curated\linktree


In [2]:
# %%  
# Cell 2 – Helper: business-rule curation
def curate_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Apply Linktree-specific business rules to a staging frame.

    1. Drop the '__typename' column when present.
    2. Fill 'clickThroughRate' wherever it is absent, NaN or infinite with
       totalClicks / totalViews (left as NaN when totalViews is 0), then
       round the column to 4 decimals. Existing finite values are kept.
    3. Deduplicate on 'date', keeping the last row in input order.

    Args:
        df: Frame with at least 'date', 'totalClicks' and 'totalViews'
            columns. It is not modified.

    Returns:
        A new frame sorted by 'date' with a fresh 0..n-1 index.

    Raises:
        KeyError: If 'date', 'totalClicks' or 'totalViews' is missing.
    """
    # Always work on a private copy so the caller's frame is never mutated,
    # regardless of whether '__typename' was present.
    curated = df.drop(columns="__typename", errors="ignore").copy()

    # Mask zero-view rows to NaN before dividing: this keeps the result in a
    # numeric dtype and means the division can never produce +/-inf, so the
    # 'mode.use_inf_as_na' option (deprecated in pandas 2.1) is not needed.
    views = curated["totalViews"]
    computed_ctr = curated["totalClicks"] / views.where(views != 0)

    if "clickThroughRate" in curated.columns:
        # Treat infinite reported values as missing so they get re-calculated.
        reported_ctr = curated["clickThroughRate"].replace(
            [float("inf"), float("-inf")], float("nan")
        )
        final_ctr = reported_ctr.fillna(computed_ctr)
    else:
        final_ctr = computed_ctr
    curated["clickThroughRate"] = final_ctr.round(4)

    # A stable sort preserves input order among equal dates, so keep="last"
    # deterministically retains the most recently loaded row for each date.
    return (
        curated.sort_values("date", kind="mergesort")
        .drop_duplicates(subset=["date"], keep="last")
        .reset_index(drop=True)
    )


In [3]:
# %%  
# Cell 3 – Load & merge staging CSV files
def load_staging(files: list[Path]) -> pd.DataFrame:
    """
    Read every staging CSV in *files* and stack them into one frame.

    Each file is parsed with its 'date' column as datetimes. A file that
    cannot be read is reported on stdout and skipped (best effort).

    Raises:
        RuntimeError: If none of the files could be read.
    """
    loaded = []
    for csv_path in files:
        try:
            frame = pd.read_csv(csv_path, parse_dates=["date"])
        except Exception as exc:
            # Deliberately broad: one bad file must not abort the whole run.
            print(f"[ERROR] {csv_path.name}: {exc}")
        else:
            loaded.append(frame)

    if len(loaded) == 0:
        raise RuntimeError("No staging CSV files read successfully")

    return pd.concat(loaded, ignore_index=True)


In [5]:
# %%  
# Cell 4 – Cleaner script template (writes CSV + Parquet)  ★ FIXED ★
# Source text of the standalone cleaner script that is written to disk.
# This is a plain (non-f) string on purpose: nothing is interpolated, so every
# brace below is literal and appears in the generated file exactly as written.
# The embedded curate_dataframe avoids the 'mode.use_inf_as_na' option
# (deprecated in pandas 2.1), never mutates its input, tolerates a missing
# 'clickThroughRate' column and uses a stable sort before de-duplicating.
CLEANER_TEMPLATE = '''"""
linktree_staging2curated.py
Staging → Curated cleaner for Linktree analytics.

Guided by `LLM_cleaner_guidelines.md`.
Reads **CSV** from staging, writes **CSV and Parquet** to curated.
"""

import os, argparse
from datetime import datetime
from pathlib import Path
import pandas as pd

PLATFORM      = "linktree"
PROJECT_ROOT  = Path(os.environ["PROJECT_ROOT"])
STAGING_DIR   = PROJECT_ROOT / "staging" / PLATFORM
CURATED_DIR   = PROJECT_ROOT / "curated" / PLATFORM

for _d in (STAGING_DIR, CURATED_DIR):
    _d.mkdir(parents=True, exist_ok=True)

def curate_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    # Work on a private copy so the caller's frame is never mutated
    df = df.drop(columns="__typename", errors="ignore").copy()

    # Re-calculate CTR where it is missing / NaN / infinite.
    # Zero-view rows are masked to NaN first, so the division never yields inf.
    views = df["totalViews"]
    ctr = df["totalClicks"] / views.where(views != 0)
    if "clickThroughRate" in df.columns:
        reported = df["clickThroughRate"].replace(
            [float("inf"), float("-inf")], float("nan")
        )
        ctr = reported.fillna(ctr)
    df["clickThroughRate"] = ctr.round(4)

    # Keep last record per date (stable sort preserves load order among ties)
    df = (
        df.sort_values("date", kind="mergesort")
          .drop_duplicates(subset=["date"], keep="last")
          .reset_index(drop=True)
    )
    return df

def load_staging(files: list[Path]) -> pd.DataFrame:
    frames = []
    for fp in files:
        try:
            frames.append(pd.read_csv(fp, parse_dates=["date"]))
        except Exception as e:
            print(f"[ERROR] {fp.name}: {e}")
    if not frames:
        raise RuntimeError("No staging CSV files read")
    return pd.concat(frames, ignore_index=True)

def main():
    parser = argparse.ArgumentParser(description="Linktree Staging→Curated cleaner")
    parser.add_argument("--input", help="Specific staging CSV file", default=None)
    args = parser.parse_args()

    files = [Path(args.input)] if args.input else sorted(STAGING_DIR.glob("*.csv"))
    if not files:
        raise RuntimeError(f"No staging CSV files found in {STAGING_DIR}")

    df_raw = load_staging(files)
    df_cur = curate_dataframe(df_raw)

    ts   = datetime.now().strftime("%Y%m%d_%H%M%S")
    stem = f"linktree_analytics_curated_{ts}"

    # CSV
    csv_path = CURATED_DIR / f"{stem}.csv"
    df_cur.to_csv(csv_path, index=False, encoding="utf-8")
    print(f"[CURATED] CSV → {csv_path.name}  ({len(df_cur)} rows)")

    # Parquet (optional)
    try:
        pq_path = CURATED_DIR / f"{stem}.parquet"
        df_cur.to_parquet(pq_path, index=False)
        print(f"[CURATED] Parquet → {pq_path.name}")
    except Exception as e:
        print(f"[ERROR] Parquet write failed: {e} (CSV still produced)")

if __name__ == "__main__":
    main()
'''


In [6]:
# %%  
# Cell 5 – Write the cleaner script to disk
# Destination package for generated cleaners; created on demand.
cleaner_dir = PROJECT_ROOT.joinpath("src", "linktree", "cleaners")
cleaner_dir.mkdir(parents=True, exist_ok=True)

# Overwrite any previous copy of the script with the current template text.
outfile = cleaner_dir.joinpath("linktree_staging2curated.py")
outfile.write_text(CLEANER_TEMPLATE, encoding="utf-8")

# Report the location relative to the project root.
print(f"[INFO] Cleaner written to {outfile.relative_to(PROJECT_ROOT)}")


[INFO] Cleaner written to src\linktree\cleaners\linktree_staging2curated.py
