# %% [markdown]
# # Cell 0 – Notebook preamble  
# Autogenerate **Linktree Raw→Staging cleaner (CSV output)**.  
# *Adapted from `LLM_cleaner_guidelines.md` per user override to use CSV.*


In [6]:
# %%  
# Cell 1 – Imports & constants
import os, json, argparse
from datetime import datetime
from pathlib import Path
import pandas as pd

PLATFORM = "linktree"
# Taken from the environment; an unset PROJECT_ROOT raises KeyError here.
PROJECT_ROOT = Path(os.environ["PROJECT_ROOT"])

# Per-platform raw and staging zones under the project root.
RAW_DIR = PROJECT_ROOT.joinpath("raw", PLATFORM)
STAGING_DIR = PROJECT_ROOT.joinpath("staging", PLATFORM)  # will now hold CSV

# Create both zones up front (no-op when they already exist).
for _dir in [RAW_DIR, STAGING_DIR]:
    _dir.mkdir(parents=True, exist_ok=True)

print(f"[INFO] Raw dir:     {RAW_DIR}")
print(f"[INFO] Staging dir: {STAGING_DIR}")


[INFO] Raw dir:     C:\Users\Earth\BEDROT PRODUCTIONS\BEDROT DATA LAKE\data_lake\raw\linktree
[INFO] Staging dir: C:\Users\Earth\BEDROT PRODUCTIONS\BEDROT DATA LAKE\data_lake\staging\linktree


In [7]:
# %%  
# Cell 2 – Helper: convert one NDJSON record to a clean dict
# Metric fields copied from every raw record, in output column order.
METRIC_COLS = [
    "totalViews",
    "uniqueViews",
    "totalClicks",
    "uniqueClicks",
    "clickThroughRate",
]


def record_to_row(rec: dict) -> dict:
    """Validate one decoded record and coerce its fields to typed values.

    Args:
        rec: One decoded NDJSON record; must be a JSON object (``dict``).

    Returns:
        A dict whose ``date`` is ``rec["date"]`` passed through
        ``pd.to_datetime`` and whose remaining keys are the names in
        ``METRIC_COLS`` passed through ``pd.to_numeric``.  Both conversions
        use ``errors="coerce"``, so unconvertible values become missing
        instead of raising; metrics absent from ``rec`` are looked up with
        ``rec.get`` and therefore become missing as well.

    Raises:
        ValueError: If ``rec`` is not a dict or has no ``date`` key.
    """
    # A syntactically valid JSON line can decode to a list, number, string
    # or null.  Reject those explicitly so the caller sees one clear
    # ValueError rather than an incidental TypeError/AttributeError.
    if not isinstance(rec, dict):
        raise ValueError(
            f"Expected a JSON object, got {type(rec).__name__}"
        )
    if "date" not in rec:
        raise ValueError("Missing 'date' field")

    row = {"date": pd.to_datetime(rec["date"], errors="coerce")}
    row.update(
        (col, pd.to_numeric(rec.get(col), errors="coerce"))
        for col in METRIC_COLS
    )
    return row


In [8]:
# %%  
# Cell 3 – Build a DataFrame from *.ndjson files
def build_dataframe(files: list[Path]) -> pd.DataFrame:
    """Load NDJSON files into one de-duplicated DataFrame sorted by date.

    Every non-blank line of every file is decoded with ``json.loads`` and
    converted with ``record_to_row``.  A line that fails to decode or
    validate is reported with its line number and skipped; the remaining
    lines of that file are still processed.  A file that cannot be opened
    or read is reported and skipped as a whole.

    Args:
        files: Paths of the NDJSON files to read (UTF-8).

    Returns:
        A DataFrame with rows lacking a ``date`` removed, sorted by
        ``date``, exact duplicate rows dropped, and a fresh 0..n-1 index.

    Raises:
        RuntimeError: If no row could be extracted from any file.
    """
    rows = []
    for fp in files:
        try:
            with fp.open("r", encoding="utf-8") as f:
                for line_no, line in enumerate(f, start=1):
                    # Blank lines (e.g. a trailing newline) are not records.
                    if not line.strip():
                        continue
                    # Handle failures per line: one bad record must not
                    # discard every record that follows it in the file.
                    try:
                        rows.append(record_to_row(json.loads(line)))
                    except Exception as e:  # best-effort: report and go on
                        print(f"[ERROR] {fp.name}:{line_no}: {e}")
        except Exception as e:  # file-level failure (open / decode)
            print(f"[ERROR] {fp.name}: {e}")

    if not rows:
        raise RuntimeError("No valid rows extracted from raw NDJSON")

    return (
        pd.DataFrame(rows)
        .dropna(subset=["date"])
        .sort_values("date")
        .drop_duplicates()
        .reset_index(drop=True)
    )


In [None]:
# %%  
# Cell 4 – CLI entry-point (as a string template)
# Source text of the generated Raw → Staging cleaner script.
# Deliberately a plain (non-f) string: nothing is interpolated at notebook
# time, so the generated script's own f-string braces are written literally.
CLEANER_TEMPLATE = '''"""
linktree_raw2staging.py
Raw → Staging cleaner for Linktree data (CSV output).

Guided by `LLM_cleaner_guidelines.md`.
"""

import argparse
import json
import os
from datetime import datetime
from pathlib import Path

import pandas as pd

PLATFORM = "linktree"
PROJECT_ROOT = Path(os.environ["PROJECT_ROOT"])

RAW_DIR     = PROJECT_ROOT / "raw"     / PLATFORM
STAGING_DIR = PROJECT_ROOT / "staging" / PLATFORM

for _dir in (RAW_DIR, STAGING_DIR):
    _dir.mkdir(parents=True, exist_ok=True)

METRIC_COLS = [
    "totalViews",
    "uniqueViews",
    "totalClicks",
    "uniqueClicks",
    "clickThroughRate",
]


def record_to_row(rec: dict) -> dict:
    """Validate & coerce types for one record."""
    if not isinstance(rec, dict):
        raise ValueError(f"Expected a JSON object, got {type(rec).__name__}")
    if "date" not in rec:
        raise ValueError("Missing 'date' field")
    row = {"date": pd.to_datetime(rec["date"], errors="coerce")}
    for col in METRIC_COLS:
        row[col] = pd.to_numeric(rec.get(col), errors="coerce")
    return row


def build_dataframe(files: list[Path]) -> pd.DataFrame:
    """Read NDJSON files into one de-duplicated frame sorted by date."""
    rows = []
    for fp in files:
        try:
            with fp.open("r", encoding="utf-8") as f:
                for line_no, line in enumerate(f, start=1):
                    if not line.strip():
                        continue
                    # Per-line handling: one bad record must not drop the
                    # rest of the file.
                    try:
                        rows.append(record_to_row(json.loads(line)))
                    except Exception as e:
                        print(f"[ERROR] {fp.name}:{line_no}: {e}")
        except Exception as e:
            print(f"[ERROR] {fp.name}: {e}")

    if not rows:
        raise RuntimeError("No valid rows extracted from raw NDJSON")

    df = pd.DataFrame(rows)
    df = df.dropna(subset=["date"])
    df = df.sort_values("date").drop_duplicates()
    df.reset_index(drop=True, inplace=True)
    return df


def main():
    parser = argparse.ArgumentParser(description="Linktree Raw→Staging cleaner")
    parser.add_argument(
        "--file", help="Process a single raw NDJSON file", default=None
    )
    args = parser.parse_args()

    files = (
        [Path(args.file)]
        if args.file else
        sorted(RAW_DIR.glob("*.ndjson"))
    )

    df = build_dataframe(files)

    # NOTE(review): output file naming is a convention chosen here;
    # confirm against LLM_cleaner_guidelines.md.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    out_path = STAGING_DIR / f"{PLATFORM}_analytics_{stamp}.csv"
    df.to_csv(out_path, index=False)
    print(f"[STAGING] {len(files)} file(s) → {out_path.name} ({len(df)} rows)")

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"[INFO] Completed run at {timestamp} – {len(df)} rows total.")


if __name__ == "__main__":
    main()
'''


In [None]:
# %%  
# Cell 5 – Write the cleaner script to disk
# Target package for the generated cleaner; created if it does not exist.
cleaner_dir = PROJECT_ROOT.joinpath("src", "linktree", "cleaners")
cleaner_dir.mkdir(parents=True, exist_ok=True)

# Write the template text verbatim as the cleaner module (UTF-8).
outfile = cleaner_dir.joinpath("linktree_raw2staging.py")
outfile.write_text(CLEANER_TEMPLATE, encoding="utf-8")

# Report the location relative to the project root for readability.
print(f"[INFO] Cleaner written to {outfile.relative_to(PROJECT_ROOT)}")


[INFO] Cleaner written to src\linktree\cleaners\linktree_raw2staging.py
