In [1]:
# 📦 Cell 1 – Imports
# ------------------------------------------------------------
from pathlib import Path
import re, json, pandas as pd

In [2]:
# 📂 Cell 2 – Source & target paths
# ------------------------------------------------------------
# Root of the data lake; every other path in this cell is derived from it,
# so relocating the lake is a one-line change.
DATA_LAKE_ROOT = Path(r"C:\Users\Earth\BEDROT PRODUCTIONS\BEDROT DATA LAKE\data_lake")

# Timestamp suffix shared by both HTML snapshots.
# Change it here (and only here) to process a different snapshot.
SNAPSHOT_STAMP = "20250522_085156"

# NOTE(review): the Apple Music snapshot is read from the same
# landing\distrokid\streams folder as the DistroKid one, exactly as before —
# confirm that this is where the scraper writes it.
landing_dir    = DATA_LAKE_ROOT / "landing" / "distrokid" / "streams"
distrokid_html = landing_dir / f"streams_stats_{SNAPSHOT_STAMP}.html"
apple_html     = landing_dir / f"applemusic_stats_{SNAPSHOT_STAMP}.html"

# Output folder is created on demand so a fresh checkout can run end to end.
curated_dir    = DATA_LAKE_ROOT / "curated"
curated_dir.mkdir(parents=True, exist_ok=True)

out_csv        = curated_dir / "daily_streams_distrokid.csv"
print("CSV will be saved to:", out_csv)


CSV will be saved to: C:\Users\Earth\BEDROT PRODUCTIONS\BEDROT DATA LAKE\data_lake\curated\daily_streams_distrokid.csv


In [3]:
# 🔎 Cell 3 – DistroKid daily extractor
def extract_distrokid_daily(html_path: Path) -> pd.DataFrame:
    """Extract the ``trend365day`` chart data from a saved DistroKid stats page.

    The function searches the page text for a chart whose ``"id"`` is
    ``"trend365day"``, parses the first ``"dataProvider"`` array that follows
    it as JSON, and returns it as a two-column frame.

    Parameters
    ----------
    html_path : Path
        Saved HTML file. It is decoded as UTF-8; undecodable bytes are dropped.

    Returns
    -------
    pd.DataFrame
        Columns ``date`` (parsed with ``pd.to_datetime`` from the chart's
        ``category`` key) and ``spotify_streams`` (the chart's ``column-1``
        key, values as parsed from JSON).

    Raises
    ------
    ValueError
        If the chart is not found, its ``dataProvider`` array is empty, or
        the rows lack the ``category`` / ``column-1`` keys. Malformed array
        text raises ``json.JSONDecodeError``, which is a ``ValueError`` too.
    """
    text = html_path.read_text(encoding="utf-8", errors="ignore")

    # `[^\]]*` rather than `[^\]]+`: an empty array must match right here.
    # With `+`, the lazy `.+?` would skip past an empty `[]` and silently
    # capture the dataProvider of whichever chart comes next in the page.
    m = re.search(r'"id"\s*:\s*"trend365day".+?"dataProvider"\s*:\s*\[([^\]]*)\]',
                  text, flags=re.DOTALL)
    if not m:
        raise ValueError("trend365day chart not found in DistroKid HTML.")

    arr_text = "[" + m.group(1).strip() + "]"
    # The embedded array may end with a trailing comma, which JSON forbids.
    arr_text = re.sub(r',\s*\]', ']', arr_text)
    data     = json.loads(arr_text)
    if not data:
        raise ValueError("trend365day chart has an empty dataProvider in DistroKid HTML.")

    df = pd.DataFrame(data)
    missing = {"category", "column-1"} - set(df.columns)
    if missing:
        # Fail with a readable message instead of a KeyError further down.
        raise ValueError(
            f"trend365day data is missing expected keys: {sorted(missing)}"
        )

    df = df.rename(columns={"category": "date", "column-1": "spotify_streams"})
    df["date"] = pd.to_datetime(df["date"])
    return df[["date", "spotify_streams"]]


In [4]:
# 🍏 Cell 4 – Apple Music daily extractor
def extract_apple_daily(html_path: Path, min_points: int = 50) -> pd.DataFrame:
    """Extract the longest ``dataProvider`` series from a saved Apple Music page.

    Every ``"dataProvider": [...]`` array in the page is parsed as JSON.
    The longest array of objects is assumed to be the daily series (a
    heuristic, not something the page states).

    Parameters
    ----------
    html_path : Path
        Saved HTML file. It is decoded as UTF-8; undecodable bytes are dropped.
    min_points : int, default 50
        Minimum number of rows the chosen array must have; shorter arrays
        are rejected as "probably not the daily chart".

    Returns
    -------
    pd.DataFrame
        Columns ``date`` (parsed with ``pd.to_datetime``) and
        ``apple_streams`` (values as parsed from JSON).

    Raises
    ------
    ValueError
        If no usable array is found, the longest one is shorter than
        ``min_points``, or its rows have no recognisable date/value key.
    """
    text = html_path.read_text(encoding="utf-8", errors="ignore")

    providers = []
    for m in re.finditer(r'"dataProvider"\s*:\s*\[([^\]]+)\]', text, re.DOTALL):
        # Drop a trailing comma before the closing bracket; JSON forbids it.
        array_txt = re.sub(r',\s*\]', ']', "[" + m.group(1) + "]")
        try:
            parsed = json.loads(array_txt)
        except json.JSONDecodeError:
            # Best effort: arrays that are not strict JSON are skipped.
            continue
        # Keep only non-empty arrays of objects. A long array of bare numbers
        # or strings would otherwise win the "longest" contest and crash the
        # key lookups below.
        if parsed and all(isinstance(row, dict) for row in parsed):
            providers.append(parsed)
    if not providers:
        raise ValueError("No dataProvider arrays found in Apple Music HTML.")

    data = max(providers, key=len)           # assume longest = daily
    if len(data) < min_points:
        raise ValueError("Daily data array looks too short; check HTML.")

    first = data[0]

    # "date" is accepted too: rows that already use that key need no rename.
    date_key = next((k for k in ("field", "category", "date") if k in first), None)
    if date_key is None:
        raise ValueError(
            f"No date key ('field', 'category' or 'date') in Apple Music data; "
            f"found keys: {list(first)}"
        )

    if "value" in first:
        value_key = "value"
    elif "column-1" in first:
        value_key = "column-1"
    else:
        keys = list(first)
        others = [k for k in keys if k != date_key]
        if not others:
            raise ValueError(
                f"No value key in Apple Music data; found keys: {keys}"
            )
        # Historical fallback is "the second key". Keep it when it is a real
        # value column, but never pick the date key itself: that collapsed
        # the rename mapping and left the frame without a date column.
        value_key = keys[1] if len(keys) > 1 and keys[1] != date_key else others[0]

    df = pd.DataFrame(data).rename(
        columns={date_key: "date", value_key: "apple_streams"}
    )
    df["date"] = pd.to_datetime(df["date"])
    return df[["date", "apple_streams"]]


In [5]:
# 🏗️ Cell 5 – Combine the two sources
# Parse each saved page into its own per-day frame.
dk_df    = extract_distrokid_daily(distrokid_html)
apple_df = extract_apple_daily(apple_html)

# Outer join keeps every date seen in either source; a date missing from one
# source becomes NaN there and is then counted as 0.
merged_by_date = pd.merge(dk_df, apple_df, on="date", how="outer")
combined = merged_by_date.sort_values("date").fillna(0)

# fillna leaves floats behind wherever NaN appeared, so cast back to int.
combined = combined.astype({"spotify_streams": int, "apple_streams": int})
combined["combined_streams"] = combined["spotify_streams"] + combined["apple_streams"]

combined.head()


Unnamed: 0,date,spotify_streams,apple_streams,combined_streams
0,2024-08-23,40,0,40
1,2024-08-24,17,0,17
2,2024-08-25,6,0,6
3,2024-08-26,40,0,40
4,2024-08-27,31,0,31


In [6]:
# 💾 Cell 6 – Write CSV & confirm
# Persist the merged frame without the pandas index column.
combined.to_csv(out_csv, index=False)
print(f"✅  Saved merged CSV to: {out_csv}")

# Report row count and the first/last calendar day that was written.
first_day = combined["date"].min().date()
last_day  = combined["date"].max().date()
print(f"Rows: {len(combined)}, Date range: {first_day} → {last_day}")

combined.tail()


✅  Saved merged CSV to: C:\Users\Earth\BEDROT PRODUCTIONS\BEDROT DATA LAKE\data_lake\curated\daily_streams_distrokid.csv
Rows: 272, Date range: 2024-08-23 → 2025-05-21


Unnamed: 0,date,spotify_streams,apple_streams,combined_streams
267,2025-05-17,2539,74,2613
268,2025-05-18,2891,49,2940
269,2025-05-19,2843,55,2898
270,2025-05-20,1795,59,1854
271,2025-05-21,0,13,13


In [8]:
# 🔍 Cell 7 – Post-validation check (absolute path)
# ------------------------------------------------------------
# Re-reads the CSV from disk and checks that the per-source column totals
# add up to the combined column total.
from pathlib import Path
import pandas as pd

csv_to_validate = Path(r"C:\Users\Earth\BEDROT PRODUCTIONS\BEDROT DATA LAKE\data_lake\curated\daily_streams_distrokid.csv")

df = pd.read_csv(csv_to_validate)

# One total per stream column, in the order they are reported below.
sum_spotify, sum_apple, sum_combined = (
    df[column].sum()
    for column in ("spotify_streams", "apple_streams", "combined_streams")
)

print(f"Spotify total  : {sum_spotify:,}")
print(f"Apple total    : {sum_apple:,}")
print(f"Combined total : {sum_combined:,}")

totals_agree = (sum_spotify + sum_apple) == sum_combined
if not totals_agree:
    print("\n❌  Validation FAILED — combined total mismatch.")
else:
    print("\n✅  Validation passed — sums line up.")


Spotify total  : 660,121
Apple total    : 19,874
Combined total : 679,995

✅  Validation passed — sums line up.
