In [None]:
# ==============================================================================
# CELL 1: Environment Setup & Data Loading (WITH ZIP EXTRACTION)
# ==============================================================================

import os
import glob
from pathlib import Path
from google.colab import drive

print("="*70)
print("CELL 1: Environment Setup & Data Loading")
print("="*70)

# Mount Drive
print("\n[1/5] Mounting Google Drive...")
drive.mount('/content/drive', force_remount=False)
print("✓ Drive mounted")

# Check if dataset needs extraction
print("\n[2/5] Checking dataset status...")

ZIP_PATH = "/content/drive/MyDrive/STA 160/dataset/aivideo-dataset.zip"
EXTRACT_TO = "/content"
ROOT_DIR = Path("/content/aivideo-dataset")

# Check if already extracted
if ROOT_DIR.exists() and any(ROOT_DIR.glob("**/*.npz")):
    print(f"✓ Dataset already extracted at: {ROOT_DIR}")
    skip_extraction = True
else:
    print(f"Dataset not found, will extract from: {ZIP_PATH}")
    skip_extraction = False

# Extract if needed
if not skip_extraction:
    print("\n[3/5] Extracting dataset...")

    if not Path(ZIP_PATH).exists():
        print(f"❌ ERROR: ZIP file not found at: {ZIP_PATH}")
        raise FileNotFoundError(f"ZIP not found: {ZIP_PATH}")

    print("  Installing 7z...")
    !apt-get -yq install p7zip-full > /dev/null 2>&1

    print("  Creating extraction directory...")
    !mkdir -p {EXTRACT_TO}/aivideo-dataset

    print("  Extracting features and META files...")
    print("  (This may take 2-3 minutes...)")

    # Extract
    !7z x "{ZIP_PATH}" -o{EXTRACT_TO} -y \
      -ir!*/META*.parquet \
      -ir!*/features_logmel_sr16k_v1/* \
      -ir!*/features_logmel_sr16k_v1_canonical/* \
      > /dev/null 2>&1

    print("  ✓ Extraction complete")
else:
    print("\n[3/5] Skipping extraction (already done)")

# Find features directory
print("\n[4/5] Locating features...")

# Search for features
feature_search = !find /content -maxdepth 3 -type d -name "features_logmel_sr16k_v1*" -print

if feature_search:
    # Take first result
    FEATURE_DIR = Path(feature_search[0])
    print(f"✓ Found features at: {FEATURE_DIR}")

    # Update ROOT_DIR to parent
    ROOT_DIR = FEATURE_DIR.parent
    print(f"✓ Root directory: {ROOT_DIR}")
else:
    print("❌ Features directory not found after extraction!")
    print("\nSearching for any .npz files...")
    npz_files = !find /content -name "*.npz" -print 2>/dev/null | head -5

    if npz_files and npz_files[0]:
        print(f"Found .npz files at: {npz_files[0]}")
        FEATURE_DIR = Path(npz_files[0]).parent
        ROOT_DIR = FEATURE_DIR.parent
        print(f"✓ Using FEATURE_DIR: {FEATURE_DIR}")
    else:
        print("\n❌ ERROR: No .npz files found!")
        print("\nShowing /content structure:")
        !ls -la /content/

        if Path("/content/aivideo-dataset").exists():
            print("\nShowing /content/aivideo-dataset:")
            !ls -la /content/aivideo-dataset/

        raise FileNotFoundError("No features found after extraction")

# Count files
print("\n[5/5] Counting files...")
all_npz_files = sorted(glob.glob(str(FEATURE_DIR / "*.npz")))
total_files = len(all_npz_files)

if total_files == 0:
    print("❌ No .npz files in features directory!")
    print(f"Directory: {FEATURE_DIR}")
    print("\nContents:")
    !ls -la {FEATURE_DIR}
    raise FileNotFoundError("No .npz files found")

print(f"✓ Found {total_files:,} .npz files")

# Check file sizes
sample_sizes = [Path(f).stat().st_size for f in all_npz_files[:10]]
avg_size = sum(sample_sizes) / len(sample_sizes)
print(f"  Average file size: {avg_size/1024:.1f} KB")

# Create output directory
OUTPUT_DIR = Path("/content/models/run_01")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Save config
import json
config = {
    'ROOT_DIR': str(ROOT_DIR),
    'FEATURE_DIR': str(FEATURE_DIR),
    'OUTPUT_DIR': str(OUTPUT_DIR),
    'total_files': total_files,
    'zip_path': ZIP_PATH
}

config_path = OUTPUT_DIR / "config.json"
with open(config_path, 'w') as f:
    json.dump(config, f, indent=2)

# Summary
print("\n" + "="*70)
print("SETUP COMPLETE")
print("="*70)
print(f"ZIP extracted from: {ZIP_PATH}")
print(f"ROOT_DIR:           {ROOT_DIR}")
print(f"FEATURE_DIR:        {FEATURE_DIR}")
print(f"OUTPUT_DIR:         {OUTPUT_DIR}")
print(f"Total .npz files:   {total_files:,}")
print("="*70)

print("\n✅ Cell 1 Complete - Ready for Cell 2")

CELL 1: Environment Setup & Data Loading

[1/5] Mounting Google Drive...
Mounted at /content/drive
✓ Drive mounted

[2/5] Checking dataset status...
Dataset not found, will extract from: /content/drive/MyDrive/STA 160/dataset/aivideo-dataset.zip

[3/5] Extracting dataset...
  Installing 7z...
  Creating extraction directory...
  Extracting features and META files...
  (This may take 2-3 minutes...)
  ✓ Extraction complete

[4/5] Locating features...
✓ Found features at: /content/aivideo-dataset/features_logmel_sr16k_v1
✓ Root directory: /content/aivideo-dataset

[5/5] Counting files...
✓ Found 9,565 .npz files
  Average file size: 5130.3 KB

SETUP COMPLETE
ZIP extracted from: /content/drive/MyDrive/STA 160/dataset/aivideo-dataset.zip
ROOT_DIR:           /content/aivideo-dataset
FEATURE_DIR:        /content/aivideo-dataset/features_logmel_sr16k_v1
OUTPUT_DIR:         /content/models/run_01
Total .npz files:   9,565

✅ Cell 1 Complete - Ready for Cell 2


In [None]:
#!/usr/bin/env python3
"""
CELL 2: Clean Orphaned Files & Select 4,000 Samples
====================================================

Purpose:
- Load configuration from Cell 1
- Validate all feature files
- Remove corrupted/orphaned files
- Select exactly 4,000 good samples
- Save list of valid files

Output:
- feature_list_4k.txt: List of 4,000 valid feature paths
- Cleaned feature directory
"""

import os
import json
import glob
import numpy as np
from pathlib import Path
from tqdm import tqdm

print("="*70)
print("CELL 2: Clean Orphaned Files & Select 4,000 Samples")
print("="*70)

# ==================== LOAD CONFIG ====================
print("\n[1/6] Loading configuration...")

OUTPUT_DIR = Path("/content/models/run_01")
config_path = OUTPUT_DIR / "config.json"

if not config_path.exists():
    raise FileNotFoundError("Config not found. Run Cell 1 first!")

with open(config_path, 'r') as f:
    config = json.load(f)

ROOT_DIR = Path(config['ROOT_DIR'])
FEATURE_DIR = Path(config['FEATURE_DIR'])

print(f"✓ FEATURE_DIR: {FEATURE_DIR}")
print(f"✓ Total files: {config['total_files']:,}")

# ==================== VALIDATE FEATURES ====================
print("\n[2/6] Validating feature files...")

MEL_BINS = 128
MIN_FRAMES = 32

def validate_feature_file(filepath):
    """
    Validate a feature file.

    Returns:
        (is_valid, reason, shape) tuple
    """
    try:
        # Check file size first
        size = Path(filepath).stat().st_size
        if size < 1024:  # Less than 1KB
            return False, "too_small", None

        # Try to load
        if str(filepath).endswith('.npy'):
            arr = np.load(filepath, allow_pickle=False)
        else:
            with np.load(filepath, allow_pickle=False) as data:
                # Try common key names
                arr = None
                for key in ['logmel', 'log_mel', 'mel', 'features', 'x', 'S']:
                    if key in data:
                        arr = data[key]
                        break

                # If no common key, take first array
                if arr is None:
                    for key in data.files:
                        if isinstance(data[key], np.ndarray):
                            arr = data[key]
                            break

        if arr is None:
            return False, "no_array", None

        # Check dimensionality
        if arr.ndim == 3 and 1 in arr.shape:
            arr = arr.squeeze()

        if arr.ndim != 2:
            return False, f"wrong_dims_{arr.ndim}D", arr.shape

        # Check mel bins
        if arr.shape[0] != MEL_BINS and arr.shape[1] == MEL_BINS:
            arr = arr.T

        if arr.shape[0] != MEL_BINS:
            return False, f"wrong_mels_{arr.shape[0]}", arr.shape

        # Check time frames
        if arr.shape[1] < MIN_FRAMES:
            return False, f"too_short_{arr.shape[1]}", arr.shape

        # Check for NaN/Inf
        if not np.isfinite(arr).all():
            return False, "nan_inf", arr.shape

        return True, "valid", arr.shape

    except Exception as e:
        return False, f"error_{type(e).__name__}", None

# Get all files
all_files = sorted(glob.glob(str(FEATURE_DIR / "*.npz")))
print(f"  Found {len(all_files):,} .npz files")

# Validate each file
print("  Validating files...")
valid_files = []
invalid_files = []
reasons = {}

for filepath in tqdm(all_files, desc="Validating"):
    is_valid, reason, shape = validate_feature_file(filepath)

    if is_valid:
        valid_files.append(filepath)
    else:
        invalid_files.append((filepath, reason, shape))
        reasons[reason] = reasons.get(reason, 0) + 1

print(f"\n✓ Valid files: {len(valid_files):,}")
print(f"✗ Invalid files: {len(invalid_files):,}")

if reasons:
    print("\nInvalid file reasons:")
    for reason, count in sorted(reasons.items(), key=lambda x: -x[1]):
        print(f"  {reason}: {count}")

# ==================== REMOVE ORPHANED FILES ====================
print(f"\n[3/6] Removing orphaned/corrupted files...")

if len(invalid_files) > 0:
    # Create backup directory
    backup_dir = FEATURE_DIR.parent / "features_backup_bad"
    backup_dir.mkdir(exist_ok=True)

    removed_count = 0
    for filepath, reason, shape in invalid_files[:100]:  # Limit to 100 for safety
        try:
            # Move to backup instead of deleting
            filename = Path(filepath).name
            backup_path = backup_dir / filename
            os.rename(filepath, backup_path)
            removed_count += 1
        except Exception as e:
            print(f"  Warning: Could not move {filename}: {e}")

    print(f"✓ Moved {removed_count} bad files to backup")
    print(f"  Backup location: {backup_dir}")
else:
    print("✓ No orphaned files to remove")

# ==================== SELECT 4,000 SAMPLES ====================
print(f"\n[4/6] Selecting 4,000 samples...")

TARGET_COUNT = 4000

if len(valid_files) < TARGET_COUNT:
    print(f"⚠️  Warning: Only {len(valid_files):,} valid files available")
    print(f"   Using all {len(valid_files):,} files instead of {TARGET_COUNT}")
    selected_files = valid_files
else:
    # Random sampling for diversity
    import random
    random.seed(2025)
    selected_files = random.sample(valid_files, TARGET_COUNT)
    selected_files.sort()  # Sort for reproducibility
    print(f"✓ Selected {len(selected_files):,} files randomly")

# ==================== VERIFY SELECTION ====================
print(f"\n[5/6] Verifying selection...")

# Check distribution
sample_shapes = []
for filepath in selected_files[:100]:  # Check first 100
    _, _, shape = validate_feature_file(filepath)
    if shape:
        sample_shapes.append(shape[1])  # Time frames

if sample_shapes:
    import numpy as np
    print(f"  Time frames - Min: {min(sample_shapes)}, Max: {max(sample_shapes)}, Mean: {np.mean(sample_shapes):.0f}")

print(f"✓ Verification complete")

# ==================== SAVE FILE LIST ====================
print(f"\n[6/6] Saving file list...")

output_file = OUTPUT_DIR / "feature_list_4k.txt"
with open(output_file, 'w') as f:
    for filepath in selected_files:
        f.write(filepath + '\n')

print(f"✓ Saved to: {output_file}")

# Update config
config['valid_files'] = len(valid_files)
config['selected_files'] = len(selected_files)
config['feature_list'] = str(output_file)

with open(OUTPUT_DIR / "config.json", 'w') as f:
    json.dump(config, f, indent=2)

# ==================== SUMMARY ====================
print("\n" + "="*70)
print("CLEANING COMPLETE")
print("="*70)
print(f"Total files found:    {len(all_files):,}")
print(f"Valid files:          {len(valid_files):,}")
print(f"Invalid files:        {len(invalid_files):,}")
print(f"Selected for dataset: {len(selected_files):,}")
print(f"File list saved:      {output_file}")
print("="*70)

print("\n✅ Cell 2 Complete - Ready for Cell 3")



CELL 2: Clean Orphaned Files & Select 4,000 Samples

[1/6] Loading configuration...
✓ FEATURE_DIR: /content/aivideo-dataset/features_logmel_sr16k_v1
✓ Total files: 9,565

[2/6] Validating feature files...
  Found 9,565 .npz files
  Validating files...


Validating: 100%|██████████| 9565/9565 [08:31<00:00, 18.71it/s]



✓ Valid files: 9,565
✗ Invalid files: 0

[3/6] Removing orphaned/corrupted files...
✓ No orphaned files to remove

[4/6] Selecting 4,000 samples...
✓ Selected 4,000 files randomly

[5/6] Verifying selection...
  Time frames - Min: 9005, Max: 64989, Mean: 24086
✓ Verification complete

[6/6] Saving file list...
✓ Saved to: /content/models/run_01/feature_list_4k.txt

CLEANING COMPLETE
Total files found:    9,565
Valid files:          9,565
Invalid files:        0
Selected for dataset: 4,000
File list saved:      /content/models/run_01/feature_list_4k.txt

✅ Cell 2 Complete - Ready for Cell 3


In [None]:
#!/usr/bin/env python3
"""
CELL 3: Build Master with Quality Signals (case-safe, META+Kaggle)
------------------------------------------------------------------
- Loads feature list produced earlier (4k paths)
- Extracts video_id from filenames (PRESERVE ORIGINAL CASE)
- Loads META.parquet and Kaggle CSV
- Joins on a case-insensitive helper key (video_id_upper) but keeps original case
- Computes: user_engagement_signal, platform_quality_signal, quality_final
"""

import os, json, re
from pathlib import Path
import pandas as pd
import numpy as np
from urllib.parse import urlparse, parse_qs

print("="*70)
print("CELL 3: Build Master with Quality Signals")
print("="*70)

# ---------------- [1/6] Load configuration ----------------
print("\n[1/6] Loading configuration...")
OUTPUT_DIR = Path("/content/models/run_01")
CONFIG_PATH = OUTPUT_DIR / "config.json"
cfg = json.loads(CONFIG_PATH.read_text())

ROOT_DIR = Path(cfg.get("ROOT_DIR", "/content/aivideo-dataset"))
FEATURE_DIR = Path(cfg.get("feature_dir", str(ROOT_DIR / "features_logmel_sr16k_v1")))
feature_list_file = cfg["feature_list"]   # created by your previous cell

# ---------------- [2/6] Load feature list ----------------
print("\n[2/6] Loading feature list...")
with open(feature_list_file, "r") as f:
    feature_paths = [line.strip() for line in f if line.strip()]

df_features = pd.DataFrame({"feature_path": feature_paths})
df_features["filename"] = df_features["feature_path"].apply(lambda x: Path(x).name)
# DO NOT upper-case here (YouTube IDs are case-sensitive)
df_features["video_id"] = df_features["filename"].str.replace(".npz", "", regex=False)

print(f"✓ Loaded {len(df_features):,} features")

# ---------------- [3/6] Load META & Kaggle ----------------
print("\n[3/6] Loading META.parquet & Kaggle CSV...")

# Locate META
meta_candidates = [
    ROOT_DIR / "META.parquet",
    Path("/content/aivideo-dataset/META.parquet"),
    Path("/content/drive/MyDrive/STA 160/dataset/META.parquet"),
]
meta_path = next((p for p in meta_candidates if p.exists()), None)
if meta_path is None:
    # fallback: search
    found = list(ROOT_DIR.glob("**/META*.parquet"))
    meta_path = found[0] if found else None
if meta_path is None:
    raise FileNotFoundError("META.parquet not found")

meta = pd.read_parquet(meta_path)
print(f"✓ META rows: {len(meta):,}")

# Normalize META IDs (create helper key, keep original column untouched)
if "video_id" not in meta.columns:
    for alt in ["id", "videoId", "yt_video_id", "youtube_id"]:
        if alt in meta.columns:
            meta = meta.rename(columns={alt: "video_id"})
            print(f"  Using '{alt}' as video_id")
            break
if "video_id" not in meta.columns:
    meta["video_id"] = pd.NA

meta["video_id"] = meta["video_id"].astype(str)
meta["video_id_upper"] = meta["video_id"].str.upper()

# Ensure META count columns exist (rename common aliases)
alias_map = {
    "views": ["views", "view_count", "yt_views", "Views"],
    "likes": ["likes", "like_count", "yt_likes", "Likes"],
    "comments": ["comments", "comment_count", "yt_comments", "Comments"],
}
for tgt, aliases in alias_map.items():
    if tgt not in meta.columns:
        for a in aliases:
            if a in meta.columns:
                meta = meta.rename(columns={a: tgt})
                break
for col in ["views","likes","comments"]:
    if col in meta.columns:
        meta[col] = pd.to_numeric(meta[col], errors="coerce")

# Load Kaggle (optional enrichment)
kag_path = Path("/content/drive/My Drive/STA 160/Spotify Youtube Dataset.csv")
if kag_path.exists():
    kag = pd.read_csv(kag_path)
    def extract_vid(u):
        if pd.isna(u): return None
        u = str(u)
        if "youtube.com/watch" in u:
            return parse_qs(urlparse(u).query).get("v", [None])[0]
        if "youtu.be/" in u:
            return u.split("youtu.be/")[-1].split("?")[0]
        m = re.search(r"[?&]v=([^&]+)", u)
        return m.group(1) if m else None
    kag["video_id"] = kag["Url_youtube"].apply(extract_vid).astype(str)
    kag["video_id_upper"] = kag["video_id"].str.upper()

    # standardize columns
    for src, dst in [("Views","views"),("Likes","likes"),("Comments","comments"),
                     ("Artist","artist"),("Track","track")]:
        if src in kag.columns:
            kag = kag.rename(columns={src: dst})
    for c in ["views","likes","comments"]:
        if c in kag.columns:
            kag[c] = pd.to_numeric(kag[c], errors="coerce")
else:
    kag = pd.DataFrame(columns=["video_id","video_id_upper","views","likes","comments","artist","track"])

# ---------------- [4/6] Match features to META/Kaggle ----------------
print("\n[4/6] Matching features to metadata...")

df_features["video_id_upper"] = df_features["video_id"].astype(str).str.upper()

keep_meta = ["video_id","video_id_upper","views","likes","comments","channel","yt_channel","published_date"]
keep_meta = [c for c in keep_meta if c in meta.columns]
m = df_features.merge(meta[keep_meta], on="video_id_upper", how="left", suffixes=("","_meta"))

# Fill from Kaggle where META missing
if not kag.empty:
    keep_kag = ["video_id_upper","views","likes","comments","artist","track"]
    keep_kag = [c for c in keep_kag if c in kag.columns]
    m = m.merge(kag[keep_kag], on="video_id_upper", how="left", suffixes=("","_kag"))
    # prefer META first, then Kaggle
    for col in ["views","likes","comments"]:
        src_meta = col
        src_kag  = f"{col}_kag"
        if src_meta not in m.columns and src_kag in m.columns:
            m[src_meta] = np.nan
        if src_meta in m.columns and src_kag in m.columns:
            m[src_meta] = m[src_meta].where(m[src_meta].notna(), m[src_kag])

matched = int(m["views"].notna().sum())
print(f"✓ Matched {matched:,} / {len(m):,} ({matched/len(m)*100:.1f}%)")

# Keep only rows with all three counts
for c in ["likes","comments"]:
    if c not in m.columns:
        m[c] = np.nan
master = m.dropna(subset=["views","likes","comments"], how="any").reset_index(drop=True)
print(f"✓ After filtering: {len(master):,} rows")

# ---------------- [5/6] Compute quality signals ----------------
print("\n[5/6] Computing quality signals...")

master["views_log"] = np.log1p(master["views"].fillna(0))

# channel key
if "channel" in master.columns and master["channel"].notna().any():
    ch_key = "channel"
elif "yt_channel" in master.columns and master["yt_channel"].notna().any():
    ch_key = "yt_channel"
else:
    master["_channel"] = "all"
    ch_key = "_channel"

# week key
if "published_date" in master.columns:
    master["_week"] = pd.to_datetime(master["published_date"], errors="coerce").dt.to_period("W").astype(str)
else:
    master["_week"] = "all"

# Laplace-smoothed engagement rate
k_prior = 4.0
likes_rate    = (master["likes"]    + 1) / (master["views"] + k_prior)
comments_rate = (master["comments"] + 1) / (master["views"] + k_prior)
eng_rate = 0.8 * likes_rate + 0.2 * comments_rate
master["engagement_rate"] = eng_rate

# User engagement signal (within-week percentile → channel-normalized z)
eng_rank = master.groupby("_week")["engagement_rate"].rank(pct=True)
er_mu = eng_rank.groupby(master[ch_key]).transform("mean")
er_sd = eng_rank.groupby(master[ch_key]).transform("std").replace(0, 1)
user_eng = (eng_rank - er_mu) / er_sd
user_eng = user_eng.fillna(0)

# Platform quality: views_log residual within week, normalized by channel
wk_mu = master.groupby("_week")["views_log"].transform("mean")
resid = master["views_log"] - wk_mu
rw_mu = resid.groupby(master[ch_key]).transform("mean")
rw_sd = resid.groupby(master[ch_key]).transform("std").replace(0, 1)
plat_qual = (resid - rw_mu) / rw_sd
# fallback to simple channel z if needed
ch_mu = master.groupby(ch_key)["views_log"].transform("mean")
ch_sd = master.groupby(ch_key)["views_log"].transform("std").replace(0, 1)
z_ch = (master["views_log"] - ch_mu) / ch_sd
plat_qual = plat_qual.fillna(z_ch)

# Blend & winsorize
qual_final = 0.6 * plat_qual + 0.4 * user_eng
lo, hi = np.nanquantile(qual_final, [0.01, 0.99])
qual_final = np.clip(qual_final, lo, hi)

# Standardize signals
def standardize(x):
    x = pd.Series(x).astype(float)
    m, s = x.mean(), x.std()
    return (x - m) / (s if s > 0 else 1)

master["user_engagement_signal"] = standardize(user_eng).astype("float32")
master["platform_quality_signal"] = standardize(plat_qual).astype("float32")
master["quality_final"] = standardize(qual_final).astype("float32")

# Clean temp cols
master.drop(columns=[c for c in ["_channel","_week","views_log","engagement_rate"] if c in master.columns], inplace=True)

print("✓ Quality signals computed")
print(f"  Samples: {len(master):,}")

# ---------------- [6/6] Save & update config ----------------
print("\n[6/6] Saving master metadata...")
out_file = OUTPUT_DIR / "META_master_4k.parquet"
master.to_parquet(out_file, index=False)

cfg["master_file"] = str(out_file)
cfg["master_rows"] = int(len(master))
cfg["targets"] = ["user_engagement_signal","platform_quality_signal","quality_final"]
CONFIG_PATH.write_text(json.dumps(cfg, indent=2))

print("\n" + "="*70)
print("METADATA COMPLETE")
print("="*70)
print(f"Samples:  {len(master):,}")
print(f"Targets:  {cfg['targets']}")
print(f"Output:   {out_file}")
print("="*70)

print("\nTarget statistics:")
for t in cfg["targets"]:
    vals = master[t]
    print(f"  {t}: mean={vals.mean():.3f}  std={vals.std():.3f}  min={vals.min():.3f}  max={vals.max():.3f}")
print("\n✅ Cell 3 Complete")


CELL 3: Build Master with Quality Signals

[1/6] Loading configuration...

[2/6] Loading feature list...
✓ Loaded 4,000 features

[3/6] Loading META.parquet & Kaggle CSV...
✓ META rows: 9,578

[4/6] Matching features to metadata...
✓ Matched 1,607 / 4,200 (38.3%)
✓ After filtering: 1,598 rows

[5/6] Computing quality signals...
✓ Quality signals computed
  Samples: 1,598

[6/6] Saving master metadata...

METADATA COMPLETE
Samples:  1,598
Targets:  ['user_engagement_signal', 'platform_quality_signal', 'quality_final']
Output:   /content/models/run_01/META_master_4k.parquet

Target statistics:
  user_engagement_signal: mean=0.000  std=1.000  min=-1.730  max=1.730
  platform_quality_signal: mean=-0.000  std=1.000  min=-3.965  max=1.752
  quality_final: mean=0.000  std=1.000  min=-2.413  max=1.759

✅ Cell 3 Complete


In [None]:
# ============================================================
# REBUILD + ENRICH MASTER (META + Kaggle) → TRAINING SUBSET
# One cell. Safe to re-run. Non-destructive.
# ============================================================
import pandas as pd
import numpy as np
from pathlib import Path
import glob, re
from urllib.parse import urlparse, parse_qs

# ---------- Config ----------
FEATURE_DIR = Path("/content/aivideo-dataset/features_logmel_sr16k_v1")
META_PATH   = Path("/content/aivideo-dataset/META.parquet")
KAGGLE_CSV  = Path("/content/drive/My Drive/STA 160/Spotify Youtube Dataset.csv")

OUTDIR = Path("/content/models/run_01")
OUTDIR.mkdir(parents=True, exist_ok=True)
MASTER_OUT   = OUTDIR / "META_final_clean.parquet"
TRAIN_OUT    = OUTDIR / "META_training_ready.parquet"
ENRICHED_OUT = OUTDIR / "META_final_enriched.parquet"   # also write an enriched copy

print("\n" + "="*70)
print("REBUILDING + ENRICHING MASTER (META + KAGGLE)")
print("="*70)

# ---------- Helpers ----------
def extract_video_id(url):
    if pd.isna(url):
        return None
    try:
        url = str(url)
        if "youtube.com/watch" in url:
            q = parse_qs(urlparse(url).query)
            vid = q.get("v", [None])[0]
            return vid
        if "youtu.be/" in url:
            return url.split("youtu.be/")[-1].split("?")[0]
        m = re.search(r"[?&]v=([^&]+)", url)
        if m:
            return m.group(1)
    except Exception:
        pass
    return None

def coerce_numeric(df, cols):
    for c in cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")

# ---------- Load META ----------
meta = pd.read_parquet(META_PATH)
print(f"✓ Loaded META: {len(meta):,} rows")

# Normalize/alias video_id
if "video_id" not in meta.columns:
    for alt in ["id","videoId","yt_video_id","youtube_id"]:
        if alt in meta.columns:
            meta = meta.rename(columns={alt: "video_id"})
            break
meta["video_id_upper"] = meta["video_id"].astype(str).str.upper()

# Standardize target columns in META
count_aliases = {
    "views":    ["views","view_count","yt_views","Views"],
    "likes":    ["likes","like_count","yt_likes","Likes"],
    "comments": ["comments","comment_count","yt_comments","Comments"],
}
for tgt, alts in count_aliases.items():
    if tgt not in meta.columns:
        for a in alts:
            if a in meta.columns:
                meta = meta.rename(columns={a: tgt})
                break
coerce_numeric(meta, ["views","likes","comments"])

# ---------- Load feature file list ----------
npz_files = sorted(glob.glob(str(FEATURE_DIR / "*.npz")))
df_features = pd.DataFrame({"feature_path": npz_files})
df_features["basename"] = df_features["feature_path"].map(lambda p: Path(p).name)
df_features["video_id"] = df_features["basename"].str.replace(".npz","",regex=False)
df_features["video_id_upper"] = df_features["video_id"].astype(str).str.upper()

print(f"✓ Found NPZ features: {len(df_features):,}")

# ---------- Base merge: features ⟵ META ----------
keep_meta_cols = ["video_id_upper","views","likes","comments","artist","track","channel","yt_channel","published_date"]
keep_meta_cols = [c for c in keep_meta_cols if c in meta.columns]
master = df_features.merge(meta[keep_meta_cols], on="video_id_upper", how="left")

# Coverage before enrichment
base_all = int(master[["views","likes","comments"]].notna().all(axis=1).sum())
print(f"→ After META join, with ALL 3 targets: {base_all:,}/{len(master):,} "
      f"({base_all/len(master)*100:.1f}%)")

# ---------- Kaggle enrichment (fill only missing) ----------
print("\nEnriching from Kaggle CSV (fill missing targets only)…")
try:
    kaggle = pd.read_csv(KAGGLE_CSV)
except Exception as e:
    kaggle = pd.DataFrame()
    print(f"⚠ Kaggle CSV not loaded: {e}")

if len(kaggle):
    # Extract video_id from URLs if needed
    if "video_id" not in kaggle.columns:
        url_col = None
        for c in ["Url_youtube","url_youtube","youtube_url","yt_url","url"]:
            if c in kaggle.columns:
                url_col = c; break
        if url_col:
            kaggle["video_id"] = kaggle[url_col].apply(extract_video_id)
        else:
            kaggle["video_id"] = None

    kaggle["video_id_upper"] = kaggle["video_id"].astype(str).str.upper()
    # Standardize counts
    for tgt, alts in count_aliases.items():
        if tgt not in kaggle.columns:
            for a in alts:
                if a in kaggle.columns:
                    kaggle = kaggle.rename(columns={a: tgt})
                    break
    coerce_numeric(kaggle, ["views","likes","comments"])

    keep_kag_cols = ["video_id_upper","views","likes","comments","Artist","Track"]
    keep_kag_cols = [c for c in keep_kag_cols if c in kaggle.columns]
    ksub = kaggle[keep_kag_cols].drop_duplicates("video_id_upper")

    # Merge (left) & fill only where META is missing
    master = master.merge(ksub, on="video_id_upper", how="left", suffixes=("","_kag"))
    for tgt in ["views","likes","comments"]:
        if f"{tgt}_kag" in master.columns:
            master[tgt] = master[tgt].where(master[tgt].notna(), master[f"{tgt}_kag"])
            master.drop(columns=[f"{tgt}_kag"], inplace=True, errors="ignore")

    # informational enrichment (non-essential)
    if "Artist" in master.columns and "artist" in master.columns:
        master["artist"] = master["artist"].where(master["artist"].notna(), master["Artist"])
        master.drop(columns=["Artist"], inplace=True, errors="ignore")
    if "Track" in master.columns and "track" in master.columns:
        master["track"] = master["track"].where(master["track"].notna(), master["Track"])
        master.drop(columns=["Track"], inplace=True, errors="ignore")

# ---------- Provenance & coverage report ----------
prov = {}
for tgt in ["views","likes","comments"]:
    src_meta = tgt in meta.columns
    src_kag  = tgt in kaggle.columns if len(kaggle) else False
    if src_meta and src_kag and f"{tgt}_kag" not in master.columns:
        # we already filled; estimate by presence before fill
        # build temporary booleans from available data
        prov[f"{tgt}_from_meta"] = int((df_features.merge(meta[["video_id_upper",tgt]], on="video_id_upper", how="left")[tgt].notna()).sum())
        prov[f"{tgt}_from_kaggle"] = int((master[tgt].notna().sum()) - prov[f"{tgt}_from_meta"])
    else:
        prov[f"{tgt}_from_meta"]   = int(master[tgt].notna().sum()) if src_meta else 0
        prov[f"{tgt}_from_kaggle"] = 0

has_all = master[["views","likes","comments"]].notna().all(axis=1)
print("\n" + "-"*70)
print("COVERAGE SUMMARY")
print("-"*70)
print(f"Total NPZ: {len(master):,}")
print(f"With ALL targets (after enrichment): {has_all.sum():,} ({has_all.sum()/len(master)*100:.1f}%)")
for tgt in ["views","likes","comments"]:
    present = master[tgt].notna().sum()
    print(f"  {tgt:<9}: {present:6,} present")

# ---------- Save outputs ----------
# Full master (may include rows with missing targets)
master.to_parquet(MASTER_OUT, index=False)
# Training subset (must have all three)
train_ready = master[has_all].copy()
train_ready.to_parquet(TRAIN_OUT, index=False)
# Also keep an explicit “enriched” copy (alias of MASTER_OUT for downstream cells)
master.to_parquet(ENRICHED_OUT, index=False)

print("\n" + "="*70)
print("FILES WRITTEN")
print("="*70)
print(f"MASTER : {MASTER_OUT}")
print(f"TRAIN  : {TRAIN_OUT}")
print(f"ENRICH : {ENRICHED_OUT}")

# ---------- Final summary ----------
print("\n" + "="*70)
print("FINAL DATASET SUMMARY")
print("="*70)
print(f"Total NPZ files: {len(master):,}")
print(f"Matched video_ids (any targets): {(master['video_id'].notna()).sum():,}")
print(f"With all targets (views/likes/comments): {len(train_ready):,}")
print(f"Coverage: {len(train_ready)/len(master)*100:.1f}%")

# Sample preview
display_cols = ['video_id', 'feature_path', 'views', 'likes', 'comments', 'artist', 'track']
print("\nSAMPLE (first 10 rows with targets):")
print(train_ready[[c for c in display_cols if c in train_ready.columns]].head(10).to_string(index=False))
print("\n✅ DATASET READY FOR TRAINING")



REBUILDING + ENRICHING MASTER (META + KAGGLE)
✓ Loaded META: 9,578 rows
✓ Found NPZ features: 9,565
→ After META join, with ALL 3 targets: 3,286/9,578 (34.3%)

Enriching from Kaggle CSV (fill missing targets only)…

----------------------------------------------------------------------
COVERAGE SUMMARY
----------------------------------------------------------------------
Total NPZ: 9,578
With ALL targets (after enrichment): 3,336 (34.8%)
  views    :  3,363 present
  likes    :  3,349 present
  comments :  3,349 present

FILES WRITTEN
MASTER : /content/models/run_01/META_final_clean.parquet
TRAIN  : /content/models/run_01/META_training_ready.parquet
ENRICH : /content/models/run_01/META_final_enriched.parquet

FINAL DATASET SUMMARY
Total NPZ files: 9,578
Matched video_ids (any targets): 9,578
With all targets (views/likes/comments): 3,336
Coverage: 34.8%

SAMPLE (first 10 rows with targets):
   video_id                                                      feature_path       views     

In [None]:
# ==============================================================
# Bootstrap: safely resolve config + master_file before target gen
# ==============================================================

import json
from pathlib import Path

# Fallback default directory
OUTPUT_DIR = Path(globals().get("OUTPUT_DIR", "/content/drive/MyDrive/STA 160/models/run_01"))
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

config_file = OUTPUT_DIR / "config.json"

# Load existing config or initialize new one
if config_file.exists():
    with open(config_file, "r") as f:
        config = json.load(f)
else:
    config = {}

# Try to find a master file
master_file = config.get("master_file", None)

if master_file is None or not Path(master_file).exists():
    candidates = [
        "/content/models/run_01/META_training_clean.parquet",
        "/content/drive/MyDrive/STA 160/META_training_clean.parquet",
        "/content/drive/MyDrive/META_clean_v2.parquet",
        "/content/drive/MyDrive/STA 160/models/run_01/META_master.parquet",
    ]
    master_file = next((p for p in candidates if Path(p).exists()), None)

if master_file is None:
    raise FileNotFoundError("Cannot locate master parquet. Place it in /content/drive/MyDrive/STA 160/.")

# Normalize to Path and save back to config
master_file = str(Path(master_file))
config["master_file"] = master_file
with open(config_file, "w") as f:
    json.dump(config, f, indent=2)

print(f"✓ Using master_file: {master_file}")


✓ Using master_file: /content/models/run_01/META_master_4k.parquet


In [None]:
# ======================================================================
# CELL 2.9: Target generation & verification (run this before Cell 3)
# ======================================================================
import numpy as np
import pandas as pd
from pathlib import Path

# 1) Load the current master parquet (use the same 'master_file' you already resolved)
_master_path = Path(config.get('master_file', master_file))
assert _master_path.exists(), f"Master file not found: {_master_path}"
master = pd.read_parquet(_master_path)
print(f"Loaded master: {len(master):,} rows, {master.columns.size} columns")

# 2) Utility helpers
def safe_z(x):
    x = pd.to_numeric(x, errors='coerce')
    m = x.mean(skipna=True)
    s = x.std(skipna=True)
    if not np.isfinite(m): m = 0.0
    if not np.isfinite(s) or s < 1e-6: s = 1.0
    return (x - m) / s

def nz(v, fill=0.0):
    v = pd.to_numeric(v, errors='coerce')
    return v.fillna(fill)

def exists(col):
    return col in master.columns

# 3) Create common engineered columns (only if present → no KeyErrors)
if exists('likes'):
    master['likes_log'] = np.log1p(nz(master['likes']))
if exists('views'):
    master['views_log'] = np.log1p(nz(master['views']))
if exists('comments'):
    master['comments_log'] = np.log1p(nz(master['comments']))

# Engagement rate proxy (robust to missing cols)
likes = nz(master['likes']) if exists('likes') else 0.0
comments = nz(master['comments']) if exists('comments') else 0.0
views = nz(master['views']).replace(0, np.nan) if exists('views') else np.nan
eng_rate = (likes + 0.5 * comments) / views
master['engagement_rate'] = eng_rate.replace([np.inf, -np.inf], np.nan)

# 4) Build TARGETS if missing (derivations are proxies; replace with your official formulas when available)
need_user = 'user_engagement_signal' not in master.columns
need_platform = 'platform_quality_signal' not in master.columns
need_quality = 'quality_final' not in master.columns

# user_engagement_signal: emphasize engagement metrics
if need_user:
    parts = []
    if 'likes_log' in master: parts.append(safe_z(master['likes_log']))
    if 'comments_log' in master: parts.append(safe_z(master['comments_log']))
    if 'engagement_rate' in master: parts.append(safe_z(master['engagement_rate']))
    if parts:
        master['user_engagement_signal'] = np.nanmean(np.vstack(parts), axis=0)
        print("Created user_engagement_signal from available components.")
    else:
        master['user_engagement_signal'] = np.nan

# platform_quality_signal: softer blend with scale exposure (views)
if need_platform:
    parts = []
    if 'views_log' in master: parts.append(0.6 * safe_z(master['views_log']))
    if 'engagement_rate' in master: parts.append(0.4 * safe_z(master['engagement_rate']))
    if parts:
        master['platform_quality_signal'] = np.nansum(np.vstack(parts), axis=0)
        print("Created platform_quality_signal from available components.")
    else:
        master['platform_quality_signal'] = np.nan

# quality_final: balanced composite (tune weights as you like)
if need_quality:
    parts = []
    if 'user_engagement_signal' in master: parts.append(0.5 * safe_z(master['user_engagement_signal']))
    if 'platform_quality_signal' in master: parts.append(0.5 * safe_z(master['platform_quality_signal']))
    # optional bonus: recency decay if you have days_since_publish
    if exists('days_since_publish'):
        # newer content gets small boost; cap effect to be gentle
        dsp = pd.to_numeric(master['days_since_publish'], errors='coerce')
        recency = -safe_z(np.log1p(dsp.clip(lower=0)))
        parts.append(0.2 * recency)
    if parts:
        master['quality_final'] = np.nansum(np.vstack(parts), axis=0)
        print("Created quality_final composite target.")
    else:
        master['quality_final'] = np.nan

# 5) Final cleaning: coerce to float32, drop rows with all-targets-missing
TARGETS = ['user_engagement_signal', 'platform_quality_signal', 'quality_final']
for t in TARGETS:
    master[t] = pd.to_numeric(master[t], errors='coerce').astype('float32')

non_null_counts = master[TARGETS].notna().sum()
print("Non-null counts:", dict(non_null_counts))

# Keep rows that have at least one target (you can require 'all' if you prefer)
mask_any = master[TARGETS].notna().any(axis=1)
kept = int(mask_any.sum())
print(f"Rows with ≥1 target present: {kept:,} / {len(master):,}")

# Hard fail if everything is empty → prevents training on random weights
if non_null_counts.max() == 0:
    raise RuntimeError(
        "All requested targets are empty. Either run your official Preprocessing "
        "to populate them or switch TARGETS to existing columns (e.g., likes_log/views_log)."
    )

# 6) Persist the enriched master and update config
enriched_path = _master_path.with_name(_master_path.stem + "_enriched.parquet")
master.to_parquet(enriched_path, index=False)
config['master_file'] = str(enriched_path)
config['targets'] = TARGETS
with open(config_file, 'w') as f:
    import json; json.dump(config, f, indent=2)
print(f"✓ Enriched master written to: {enriched_path}")
print("✓ Config updated with targets & new master_file")

# 7) Hand-off to your existing Cell 3+:
#    Downstream code should read config['master_file'] and config['targets'].


Loaded master: 1,598 rows, 18 columns
Non-null counts: {'user_engagement_signal': np.int64(1598), 'platform_quality_signal': np.int64(1598), 'quality_final': np.int64(1598)}
Rows with ≥1 target present: 1,598 / 1,598
✓ Enriched master written to: /content/models/run_01/META_master_4k_enriched.parquet
✓ Config updated with targets & new master_file


In [None]:
#!/usr/bin/env python3
"""
CELL 3: Build Master with Quality Signals (with duration-ID recovery)
====================================================================

- Load features from config['feature_list']
- Recover video_id from filename; fill missing via /content/sha1_ALL_matched_final.csv
- Join META by video_id (uppercased consistently)
- Compute: user_engagement_signal, platform_quality_signal, quality_final
"""

import os, json, re, glob
from pathlib import Path
import numpy as np
import pandas as pd

print("="*70)
print("CELL 3: Build Master with Quality Signals (duration-aware)")
print("="*70)

# ---------------- Config ----------------
print("\n[1/6] Loading configuration...")
OUTPUT_DIR = Path("/content/models/run_01")
with open(OUTPUT_DIR / "config.json", "r") as f:
    config = json.load(f)

feature_list_file = Path(config["feature_list"])
ROOT_DIR = Path(config["ROOT_DIR"])
DURATION_MAP = Path("/content/sha1_ALL_matched_final.csv")  # produced by your duration matching

# ---------------- Features ----------------
print("\n[2/6] Loading feature list...")
with open(feature_list_file, "r") as f:
    feature_paths = [line.strip() for line in f if line.strip()]
df_features = pd.DataFrame({"feature_path": feature_paths})
df_features["basename"] = df_features["feature_path"].map(lambda p: Path(p).name)
df_features["filename"] = df_features["basename"]

def is_ytid(s: str) -> bool:
    s = str(s)
    if len(s) != 11: return False
    # allow A-Z/a-z/0-9/-/_
    return all(ch.isalnum() or ch in "-_" for ch in s)

# video_id from filename stem
df_features["stem"] = df_features["basename"].str.replace(".npz", "", regex=False)
df_features["video_id"] = df_features["stem"].where(df_features["stem"].map(is_ytid))

# fill via duration map for non-IDs (sha1_* or word-named)
if DURATION_MAP.exists():
    dur = pd.read_csv(DURATION_MAP)
    # npz_basename -> video_id
    if "npz_basename" in dur.columns and "video_id" in dur.columns:
        dur = dur[["npz_basename", "video_id"]].dropna()
        dur["npz_basename"] = dur["npz_basename"].astype(str)
        dur["video_id"] = dur["video_id"].astype(str)
        df_features = df_features.merge(
            dur.rename(columns={"npz_basename": "basename", "video_id": "video_id_from_dur"}),
            on="basename", how="left"
        )
        df_features["video_id"] = df_features["video_id"].fillna(df_features["video_id_from_dur"])
        df_features.drop(columns=["video_id_from_dur"], inplace=True, errors="ignore")

print(f"✓ Loaded {len(df_features):,} features")

# ---------------- META ----------------
print("\n[3/6] Loading META.parquet...")
meta_candidates = [
    ROOT_DIR / "META.parquet",
    Path("/content/aivideo-dataset/META.parquet"),
    Path("/content/drive/MyDrive/STA 160/dataset/META.parquet"),
]
meta_path = next((p for p in meta_candidates if p.exists()), None)
if meta_path is None:
    found = glob.glob(str(ROOT_DIR / "**" / "META*.parquet"), recursive=True)
    meta_path = Path(found[0]) if found else None
if meta_path is None:
    raise FileNotFoundError("META.parquet not found")

print("  Found:", meta_path)
meta = pd.read_parquet(meta_path)
print(f"✓ Loaded {len(meta):,} rows")

# Normalize video_id column
if "video_id" not in meta.columns:
    for alt in ["id", "videoId", "yt_video_id", "youtube_id"]:
        if alt in meta.columns:
            meta = meta.rename(columns={alt: "video_id"})
            print(f"  Using '{alt}' as video_id")
            break

# Standardize casing for JOIN ONLY
meta["video_id_upper"] = meta["video_id"].astype(str).str.upper()
df_features["video_id_upper"] = df_features["video_id"].astype(str).str.upper()

# Normalize raw count column names and numeric types
alias_map = {
    "views":     ["views", "view_count", "yt_views", "Views"],
    "likes":     ["likes", "like_count", "yt_likes", "Likes"],
    "comments":  ["comments", "comment_count", "yt_comments", "Comments"],
}
for std, alts in alias_map.items():
    if std not in meta.columns:
        for c in alts:
            if c in meta.columns:
                meta = meta.rename(columns={c: std})
                break
for c in ["views", "likes", "comments"]:
    if c in meta.columns:
        meta[c] = pd.to_numeric(meta[c], errors="coerce")

print("✓ Found columns:",
      [c for c in ["views","likes","comments"] if c in meta.columns])

# ---------------- Match ----------------
print("\n[4/6] Matching features to metadata...")

keep_cols = ["video_id", "views", "likes", "comments"]
for opt in ["channel", "yt_channel", "published_date"]:
    if opt in meta.columns:
        keep_cols.append(opt)

# join using *_upper, but keep original meta['video_id']
meta_join = meta[["video_id_upper"] + [c for c in keep_cols if c != "video_id"]].copy()
df = df_features.merge(meta_join, on="video_id_upper", how="left")

matched = int(df["views"].notna().sum()) if "views" in df.columns else 0
print(f"✓ Matched {matched:,} / {len(df):,} ({matched/len(df)*100:.1f}%)")

# require all three counts
have_all = all(c in df.columns for c in ["views","likes","comments"])
master = df.dropna(subset=["views","likes","comments"], how="any").reset_index(drop=True) if have_all else df.copy()
print(f"✓ After filtering: {len(master):,} rows")

if len(master) < 100:
    print("⚠️  Low match count; check that duration map and META align.")

# ---------------- Quality signals ----------------
print("\n[5/6] Computing quality signals...")

# safe helpers
def nz(a, fill=0.0):
    return pd.to_numeric(a, errors="coerce").fillna(fill)

master["views_log"] = np.log1p(nz(master.get("views", 0)))
# Channel key
if "channel" in master.columns and master["channel"].notna().any():
    ch_key = "channel"
elif "yt_channel" in master.columns and master["yt_channel"].notna().any():
    ch_key = "yt_channel"
else:
    master["_channel"] = "all"
    ch_key = "_channel"

# Week key
if "published_date" in master.columns:
    master["_week"] = pd.to_datetime(master["published_date"], errors="coerce").dt.to_period("W").astype(str)
else:
    master["_week"] = "all"

# engagement rate (Laplace)
k_prior = 4.0
likes_rate    = (nz(master["likes"])    + 1.0) / (nz(master["views"]) + k_prior)
comments_rate = (nz(master["comments"]) + 1.0) / (nz(master["views"]) + k_prior)
master["engagement_rate"] = 0.8 * likes_rate + 0.2 * comments_rate

# rank by week
eng_rank = master.groupby("_week")["engagement_rate"].rank(pct=True)

# channel-standardize
er_mu = eng_rank.groupby(master[ch_key]).transform("mean")
er_sd = eng_rank.groupby(master[ch_key]).transform("std").replace(0, 1)
user_eng = (eng_rank - er_mu) / er_sd
user_eng = user_eng.fillna(0)

# platform quality
ch_mu = master.groupby(ch_key)["views_log"].transform("mean")
ch_sd = master.groupby(ch_key)["views_log"].transform("std").replace(0, 1)
z_ch  = (master["views_log"] - ch_mu) / ch_sd

wk_mu = master.groupby("_week")["views_log"].transform("mean")
res   = master["views_log"] - wk_mu
rw_mu = res.groupby(master[ch_key]).transform("mean")
rw_sd = res.groupby(master[ch_key]).transform("std").replace(0, 1)
plat_qual = (res - rw_mu) / rw_sd
plat_qual = plat_qual.fillna(z_ch)

# blend + winsorize + standardize
qual_final = 0.6 * plat_qual + 0.4 * user_eng
lo, hi = qual_final.quantile([0.01, 0.99])
qual_final = qual_final.clip(lo, hi)

def standardize(x: pd.Series) -> pd.Series:
    m, s = x.mean(), x.std()
    return (x - m) / (s if s > 0 else 1)

master["user_engagement_signal"]  = standardize(user_eng).astype("float32")
master["platform_quality_signal"] = standardize(plat_qual).astype("float32")
master["quality_final"]           = standardize(qual_final).astype("float32")

# clean temp cols
for col in ["_channel", "_week", "views_log", "engagement_rate"]:
    if col in master.columns:
        master.drop(columns=[col], inplace=True)

print("✓ Quality signals computed")
print(f"  Samples: {len(master):,}")
print("  Signals: user_engagement, platform_quality, quality_final")

# ---------------- Save ----------------
print("\n[6/6] Saving master metadata...")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
output_file = OUTPUT_DIR / "META_master_4k.parquet"
master.to_parquet(output_file, index=False)

TARGETS = ["user_engagement_signal", "platform_quality_signal", "quality_final"]
config["master_file"]  = str(output_file)
config["master_rows"]  = int(len(master))
config["targets"]      = TARGETS

with open(OUTPUT_DIR / "config.json", "w") as f:
    json.dump(config, f, indent=2)

print("\n" + "="*70)
print("METADATA COMPLETE")
print("="*70)
print(f"Samples:  {len(master):,}")
print(f"Targets:  {TARGETS}")
print(f"Output:   {output_file}")
print("="*70)

print("\nTarget statistics:")
for t in TARGETS:
    s = master[t]
    print(f"  {t}: mean={s.mean():.3f}  std={s.std():.3f}  min={s.min():.3f}  max={s.max():.3f}")
print("\n✅ Cell 3 Complete")


CELL 3: Build Master with Quality Signals (duration-aware)

[1/6] Loading configuration...

[2/6] Loading feature list...
✓ Loaded 4,000 features

[3/6] Loading META.parquet...
  Found: /content/aivideo-dataset/META.parquet
✓ Loaded 9,578 rows
✓ Found columns: ['views', 'likes', 'comments']

[4/6] Matching features to metadata...
✓ Matched 1,391 / 4,001 (34.8%)
✓ After filtering: 1,376 rows

[5/6] Computing quality signals...
✓ Quality signals computed
  Samples: 1,376
  Signals: user_engagement, platform_quality, quality_final

[6/6] Saving master metadata...

METADATA COMPLETE
Samples:  1,376
Targets:  ['user_engagement_signal', 'platform_quality_signal', 'quality_final']
Output:   /content/models/run_01/META_master_4k.parquet

Target statistics:
  user_engagement_signal: mean=0.000  std=1.000  min=-1.730  max=1.730
  platform_quality_signal: mean=-0.000  std=1.000  min=-3.695  max=1.802
  quality_final: mean=0.000  std=1.000  min=-2.374  max=1.816

✅ Cell 3 Complete


In [None]:
# ============================================================
# CELL 3.5b — Tight tolerance (±0.5s) + IN-PLACE ENRICHMENT
# Re-run duration matching, then FILL missing targets in master.
# Only append rows if the feature_path isn't already in master.
# ============================================================
import pandas as pd
import numpy as np
from pathlib import Path
from urllib.parse import urlparse, parse_qs
import re, glob

FEATURE_DIR = Path("/content/aivideo-dataset/features_logmel_sr16k_v1")
KAGGLE_CSV  = Path("/content/drive/My Drive/STA 160/Spotify Youtube Dataset.csv")
OUTDIR      = Path("/content/models/run_01")
MASTER_PATH = OUTDIR / "META_final_clean.parquet"
TRAIN_OUT   = OUTDIR / "META_training_ready.parquet"

TOLERANCE_SEC = 0.5  # ← tighten window

def extract_video_id(url):
    if pd.isna(url): return None
    try:
        url = str(url)
        if "youtube.com/watch" in url:
            q = parse_qs(urlparse(url).query); return q.get("v", [None])[0]
        if "youtu.be/" in url:
            return url.split("youtu.be/")[-1].split("?")[0]
        m = re.search(r"[?&]v=([^&]+)", url)
        if m: return m.group(1)
    except Exception:
        pass
    return None

def find_col(df, candidates):
    for c in candidates:
        if c in df.columns: return c
    return None

def coerce_numeric(df, cols):
    for c in cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")

print("="*70)
print("CELL 3.5b: Tight tolerance (±0.5s) & in-place enrichment")
print("="*70)

# --- Load master
master = pd.read_parquet(MASTER_PATH)
already_paths = set(master.get("feature_path", pd.Series(dtype=str)).astype(str))
print(f"Loaded master: {len(master):,} rows")

# --- Gather sha1 files & their durations
sha1_files = sorted(glob.glob(str(FEATURE_DIR / "sha1_*.npz")))
rows = []
for i, fpath in enumerate(sha1_files):
    try:
        with np.load(fpath, allow_pickle=True) as data:
            dur = float(data["duration_sec"]) if "duration_sec" in data else np.nan
        rows.append({"feature_path": fpath, "npz_basename": Path(fpath).name,
                     "duration_sec_npz": dur})
    except Exception:
        pass
df_npz = pd.DataFrame(rows).dropna(subset=["duration_sec_npz"])
print(f"SHA1 with duration: {len(df_npz):,}")

# --- Load Kaggle & normalize columns
kaggle = pd.read_csv(KAGGLE_CSV)
dur_col  = find_col(kaggle, ["Duration_ms","duration_ms","duration_sec","Duration_sec"])
if dur_col is None:
    raise RuntimeError("Need Duration_ms or duration_sec in Kaggle CSV")
kaggle["duration_sec"] = (kaggle[dur_col] / 1000.0) if dur_col.lower().endswith("_ms") \
                         else pd.to_numeric(kaggle[dur_col], errors="coerce")

url_col = find_col(kaggle, ["Url_youtube","url_youtube","YouTube URL","youtube_url","yt_url","url"])
if url_col is None:
    raise RuntimeError("Need a YouTube URL column in Kaggle CSV (e.g., Url_youtube)")
kaggle["video_id_upper"] = kaggle[url_col].apply(extract_video_id)
kaggle["video_id_upper"] = kaggle["video_id_upper"].astype(str).str.upper()

views_col    = find_col(kaggle, ["Views","views","view_count","View Count","yt_views"])
likes_col    = find_col(kaggle, ["Likes","likes","like_count","Like Count","yt_likes"])
comments_col = find_col(kaggle, ["Comments","comments","comment_count","Comment Count","yt_comments"])
artist_col   = find_col(kaggle, ["Artist","artist","ArtistName","artist_name"])
track_col    = find_col(kaggle, ["Track","track","Title","title","song","Song"])

keep = ["video_id_upper","duration_sec"]
for c in [views_col, likes_col, comments_col, artist_col, track_col]:
    if c: keep.append(c)
kg = kaggle[keep].dropna(subset=["video_id_upper","duration_sec"]).drop_duplicates(subset=["video_id_upper"]).copy()
kg = kg.rename(columns={views_col:"views", likes_col:"likes", comments_col:"comments",
                        artist_col:"Artist", track_col:"Track"} if views_col or likes_col or comments_col else {})
coerce_numeric(kg, ["views","likes","comments"])

# --- Duration match with tight window
npz = df_npz.copy(); npz["bin"] = npz["duration_sec_npz"].round().astype(int)
kg2 = kg.copy();     kg2["bin"] = kg2["duration_sec"].round().astype(int)

cand = npz.merge(kg2, on="bin", how="inner", suffixes=("","_k"))
cand = cand[(cand["duration_sec"].sub(cand["duration_sec_npz"]).abs() <= TOLERANCE_SEC)].copy()
if cand.empty:
    print("No candidates within ±0.5s."); raise SystemExit

cand["abs_diff"] = (cand["duration_sec"] - cand["duration_sec_npz"]).abs()
cand = cand.sort_values(["feature_path","abs_diff"])
best = cand.drop_duplicates(subset=["feature_path"], keep="first").copy()

print(f"Matched: {len(best):,}  (diff<0.5s: {(best['abs_diff'] < 0.5).sum():,})")

# --- Build fill/append frames
fill_cols = ["views","likes","comments","Artist","Track"]
for c in fill_cols:
    if c not in best.columns:
        best[c] = np.nan

best = best.rename(columns={"video_id_upper":"video_id"})
best["video_id"] = best["video_id"].astype(str).str.upper()

# Update existing rows’ missing targets by feature_path
m = master.merge(best[["feature_path","video_id"] + fill_cols],
                 on="feature_path", how="left", suffixes=("","_kag"))
# only fill if original is NA
for tgt in ["views","likes","comments"]:
    if tgt in m.columns and f"{tgt}_kag" in m.columns:
        m[tgt] = m[tgt].fillna(m[f"{tgt}_kag"])

# Fill optional metadata
for opt in ["Artist","Track"]:
    if opt in m.columns and f"{opt}_kag" in m.columns:
        m[opt] = m[opt].fillna(m[f"{opt}_kag"])

# Clean helper cols
drop_helper = [c for c in m.columns if c.endswith("_kag")]
m.drop(columns=drop_helper, inplace=True)

# Append only those matched features not already in master
to_append = best[~best["feature_path"].isin(already_paths)].copy()
append_cols = [c for c in m.columns if c in to_append.columns]
m_final = pd.concat([m, to_append[append_cols]], ignore_index=True)

# Save
m_final.to_parquet(MASTER_PATH, index=False)
has_all = m_final[["views","likes","comments"]].notna().all(axis=1) \
          if all(c in m_final.columns for c in ["views","likes","comments"]) else pd.Series(False, index=m_final.index)
m_final.loc[has_all].to_parquet(TRAIN_OUT, index=False)

print("\n" + "="*70)
print("ENRICHMENT (±0.5s) COMPLETE")
print("="*70)
print(f"Master rows (before): {len(master):,}")
print(f"Master rows (after) : {len(m_final):,}")
print(f"Filled targets now  : {has_all.sum():,} with all three")
print("="*70)


CELL 3.5b: Tight tolerance (±0.5s) & in-place enrichment
Loaded master: 9,578 rows
SHA1 with duration: 5,866
Matched: 5,740  (diff<0.5s: 5,740)

ENRICHMENT (±0.5s) COMPLETE
Master rows (before): 9,578
Master rows (after) : 9,578
Filled targets now  : 8,965 with all three


In [None]:
# ==============================================================================
# HYBRID 3.9 → Union strict (Cell 3) with enriched (3.5b), enforce file-exists,
#               coalesce targets, compute signals, finalize & make train split.
# ==============================================================================

import os, json
from pathlib import Path
import numpy as np
import pandas as pd

OUTDIR = Path("/content/models/run_01")
OUTDIR.mkdir(parents=True, exist_ok=True)

# Inputs (adjust names if yours differ)
STRICT_MASTER    = OUTDIR / "META_master_4k.parquet"        # from Cell 3 (feature-strict)
ENRICHED_MASTER  = OUTDIR / "META_final_clean.parquet"      # from Cell 3.5b (duration/Kaggle)
META_PATH        = Path("/content/aivideo-dataset/META.parquet")

# Outputs
FINAL_MASTER     = OUTDIR / "META_master_postenrich.parquet"
TRAIN_OUT        = OUTDIR / "META_training_ready.parquet"
STATS_OUT        = OUTDIR / "target_statistics.json"

# ---- load config
with open(OUTDIR / "config.json", "r") as f:
    config = json.load(f)

def _load_df(p: Path) -> pd.DataFrame:
    if p.exists():
        df = pd.read_parquet(p)
        # normalize key columns if present
        if "video_id" in df.columns:
            df["video_id"] = df["video_id"].astype(str).str.upper()
        if "feature_path" in df.columns:
            df["feature_path"] = df["feature_path"].astype(str)
        # ensure numeric for targets (if present)
        for c in ["views","likes","comments"]:
            if c in df.columns:
                df[c] = pd.to_numeric(df[c], errors="coerce")
        return df
    return pd.DataFrame()

# ---- 1) Load both masters
strict   = _load_df(STRICT_MASTER)
enriched = _load_df(ENRICHED_MASTER)

print("=== HYBRID 3.9 ===")
print(f"strict rows   : {len(strict):,} ({STRICT_MASTER.name})")
print(f"enriched rows : {len(enriched):,} ({ENRICHED_MASTER.name})")

if strict.empty and enriched.empty:
    raise RuntimeError("No inputs to merge. Make sure Cell 3 and/or 3.5b ran.")

# ---- 2) Minimal column set we care about
base_cols = ["feature_path","video_id","views","likes","comments"]
extra_cols = [c for c in ["artist","track","title","channel","yt_channel","published_date"]
              if (c in strict.columns) or (c in enriched.columns)]
want_cols = base_cols + extra_cols

strict   = strict.reindex(columns=sorted(set(strict.columns)   | set(want_cols)))
enriched = enriched.reindex(columns=sorted(set(enriched.columns) | set(want_cols)))

# ---- 3) Union on feature_path, preferring enriched values when present
# concat then drop_duplicates keeps the FIRST; so put enriched first
both = pd.concat([enriched, strict], ignore_index=True, sort=False)

# keep first by feature_path (enriched has precedence)
if "feature_path" not in both.columns:
    raise RuntimeError("Missing 'feature_path' after union.")
both = both.dropna(subset=["feature_path"])
both = both.drop_duplicates(subset=["feature_path"], keep="first").copy()

# coerce types again
if "video_id" in both.columns:
    both["video_id"] = both["video_id"].astype(str).str.upper()
for c in ["views","likes","comments"]:
    if c in both.columns:
        both[c] = pd.to_numeric(both[c], errors="coerce")

print(f"union rows (pre-file-check): {len(both):,}")

# ---- 4) Enforce feature file exists
exists_mask = both["feature_path"].map(os.path.exists)
missing = int((~exists_mask).sum())
if missing > 0:
    print(f"• dropping {missing:,} rows (feature_path not found on disk)")
both = both[exists_mask].copy()
print(f"rows after file-exists check: {len(both):,}")

# ---- 5) Bring descriptive extras from META (left merge; no row adds)
if META_PATH.exists():
    meta = pd.read_parquet(META_PATH)
    if "video_id" not in meta.columns and "id" in meta.columns:
        meta = meta.rename(columns={"id":"video_id"})
    meta["video_id"] = meta["video_id"].astype(str).str.upper()

    keep_extra = [c for c in ["artist","track","title","channel","yt_channel","published_date"]
                  if c in meta.columns]
    if keep_extra:
        meta_extra = meta[["video_id"] + keep_extra].drop_duplicates("video_id")
        merged = both.merge(meta_extra, on="video_id", how="left", suffixes=("","_meta"))
        # coalesce extras
        for c in keep_extra:
            cm = f"{c}_meta"
            if c in merged.columns and cm in merged.columns:
                merged[c] = merged[c].where(merged[c].notna(), merged[cm])
                merged.drop(columns=[cm], inplace=True)
        both = merged

# ---- 6) Compute simple quality signals (optional; doesn’t gate training)
v = pd.to_numeric(both.get("views"), errors="coerce")
l = pd.to_numeric(both.get("likes"), errors="coerce")
c = pd.to_numeric(both.get("comments"), errors="coerce")

views_log = np.log1p(v.fillna(0))
eng_rate  = 0.8 * (l.fillna(0) + 1) / (v.fillna(0) + 4.0) + 0.2 * (c.fillna(0) + 1) / (v.fillna(0) + 4.0)

def zscore(x):
    x = pd.to_numeric(x, errors="coerce")
    finite = x[np.isfinite(x)]
    mu, sd = finite.mean(), finite.std()
    if not np.isfinite(sd) or sd < 1e-6: sd = 1.0
    return (x - mu) / sd

both["platform_quality_signal"] = zscore(views_log).astype("float32")
both["user_engagement_signal"]  = zscore(eng_rate).astype("float32")
both["quality_final"]           = (0.6*both["platform_quality_signal"] +
                                   0.4*both["user_engagement_signal"]).astype("float32")

# ---- 7) Save final master + training subset (require the 3 raw counts)
both.to_parquet(FINAL_MASTER, index=False)
TARGETS = ["views","likes","comments"]
has_all = both[TARGETS].notna().all(axis=1)
train = both.loc[has_all].copy()
train.to_parquet(TRAIN_OUT, index=False)

print("\n=== OUTPUT ===")
print(f"Final master : {FINAL_MASTER}  (rows={len(both):,})")
print(f"Training set : {TRAIN_OUT}      (rows={len(train):,})")

# ---- 8) Stats (counts only)
stats = {}
for t in TARGETS:
    arr = pd.to_numeric(train[t], errors="coerce").to_numpy()
    stats[t] = {
        "count": int(np.isfinite(arr).sum()),
        "mean" : float(np.nanmean(arr)),
        "std"  : float(np.nanstd(arr) or 1.0),
        "min"  : float(np.nanmin(arr)),
        "max"  : float(np.nanmax(arr)),
    }
    print(f"\n{t.upper()}: count={stats[t]['count']:,}  "
          f"mean={stats[t]['mean']:,.2f}  std={stats[t]['std']:,.2f}  "
          f"min={stats[t]['min']:,.2f}  max={stats[t]['max']:,.2f}")

with open(STATS_OUT, "w") as f:
    json.dump(stats, f, indent=2)

# keep config simple and stable
config["master_file"] = str(FINAL_MASTER)
config["training_file"] = str(TRAIN_OUT)
config["targets"] = TARGETS
config["final_samples"] = int(len(train))
with open(OUTDIR / "config.json", "w") as f:
    json.dump(config, f, indent=2)

print("\n✅ Hybrid 3.9 complete.")


=== HYBRID 3.9 ===
strict rows   : 1,376 (META_master_4k.parquet)
enriched rows : 9,578 (META_final_clean.parquet)
union rows (pre-file-check): 9,565
rows after file-exists check: 9,565

=== OUTPUT ===
Final master : /content/models/run_01/META_master_postenrich.parquet  (rows=9,565)
Training set : /content/models/run_01/META_training_ready.parquet      (rows=8,965)

VIEWS: count=8,965  mean=94,252,260.65  std=275,194,939.90  min=26.00  max=5,773,797,147.00

LIKES: count=8,965  mean=689,817.86  std=1,767,627.86  min=0.00  max=40,147,618.00

COMMENTS: count=8,965  mean=28,585.15  std=126,413.90  min=0.00  max=5,331,537.00

✅ Hybrid 3.9 complete.


In [None]:
# ==============================================================================
# CELL 4: Verify Data Quality (1 minute)
# ==============================================================================

print("="*70)
print("CELL 4: Verify Data Quality")
print("="*70)

# Load config and master
with open(OUTPUT_DIR / "config.json", 'r') as f:
    config = json.load(f)

master = pd.read_parquet(config['master_file'])
TARGETS = config['targets']

print(f"✓ Loaded {len(master):,} samples")

# Check missing values
master = master.dropna(subset=TARGETS, how='any').reset_index(drop=True)
print(f"✓ Samples with all targets: {len(master):,}")

# Compute target statistics
print("\nTarget Statistics:")
print("="*70)

target_stats = {}
for target in TARGETS:
    values = master[target].values
    stats = {
        'mean': float(np.mean(values)),
        'std': float(np.std(values)),
        'min': float(np.min(values)),
        'max': float(np.max(values))
    }
    if stats['std'] < 1e-8:
        stats['std'] = 1.0

    target_stats[target] = stats

    print(f"\n{target.upper()}:")
    print(f"  Mean: {stats['mean']:,.2f}")
    print(f"  Std:  {stats['std']:,.2f}")
    print(f"  Min:  {stats['min']:,.2f}")
    print(f"  Max:  {stats['max']:,.2f}")

# Save statistics
stats_file = OUTPUT_DIR / "target_statistics.json"
with open(stats_file, 'w') as f:
    json.dump(target_stats, f, indent=2)

config['target_stats'] = target_stats
config['final_samples'] = len(master)
with open(OUTPUT_DIR / "config.json", 'w') as f:
    json.dump(config, f, indent=2)

print(f"\n✅ Cell 4 Complete")
print(f"Statistics saved: {stats_file}")




CELL 4: Verify Data Quality
✓ Loaded 9,565 samples
✓ Samples with all targets: 8,965

Target Statistics:

VIEWS:
  Mean: 94,252,260.65
  Std:  275,194,939.90
  Min:  26.00
  Max:  5,773,797,147.00

LIKES:
  Mean: 689,817.86
  Std:  1,767,627.86
  Min:  0.00
  Max:  40,147,618.00

COMMENTS:
  Mean: 28,585.15
  Std:  126,413.90
  Min:  0.00
  Max:  5,331,537.00

✅ Cell 4 Complete
Statistics saved: /content/models/run_01/target_statistics.json


In [None]:
# ==============================================================================
# CELL 4.1 — Strict training filter + reject report
# Ensures: no NaNs, finite targets, non-negative, file exists, sane ratios, de-duped
# Outputs:
#   /content/models/run_01/META_training_clean.parquet
#   /content/models/run_01/META_training_rejects.csv
# ==============================================================================

import os, json, math
import numpy as np
import pandas as pd
from pathlib import Path

OUTDIR = Path("/content/models/run_01")
CONF = OUTDIR / "config.json"

# --- Load master from config (fallback to postenrich/master you used) ---
with open(CONF, "r") as f:
    cfg = json.load(f)

master_path = Path(cfg.get("master_file", OUTDIR / "META_training_ready.parquet"))
if not master_path.exists():
    raise FileNotFoundError(f"Master not found: {master_path}")

TARGETS = cfg.get("targets", ["views","likes","comments"])
df = pd.read_parquet(master_path).copy()

# Normalize columns
df["video_id"] = df["video_id"].astype(str).str.upper()
for c in TARGETS:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

# Some pipelines store the feature path under different names — normalize that too
feat_col = None
for cand in ["feature_path","feature","path","feat_path"]:
    if cand in df.columns:
        feat_col = cand
        break
if feat_col is None:
    raise RuntimeError("No feature path column found (expected one of: feature_path, feature, path, feat_path).")

# --- Helper flags ---
def is_finite(x):
    return np.isfinite(x)

def safe_ratio(num, den):
    num = float(num) if np.isfinite(num) else np.nan
    den = float(den) if np.isfinite(den) else np.nan
    if not np.isfinite(num) or not np.isfinite(den) or den <= 0:
        return np.nan
    return num/den

# --- Build rejection reasons per row ---
reasons = []

# Thresholds (conservative defaults; tweak if you like)
MIN_VIEWS = 1            # require at least 1 view
MAX_LIKE_RATE = 1.0      # likes <= views (per-sample)
MAX_COMM_RATE = 2.0      # comments <= 2x views (very lax; protects data entry issues)

exists_mask = df[feat_col].map(lambda p: Path(str(p)).exists())
reasons.append(np.where(~exists_mask, "missing_feature_file", ""))

# Missing or non-finite targets
for t in TARGETS:
    miss = ~df[t].notna()
    nonfin = ~df[t].map(is_finite)
    neg = df[t] < 0
    reasons.append(np.where(miss, f"{t}_missing", ""))
    reasons.append(np.where(nonfin, f"{t}_nonfinite", ""))
    reasons.append(np.where(neg, f"{t}_negative", ""))

# Basic sanity constraints
views = df["views"]
likes = df["likes"]
comments = df["comments"]

low_views = views.fillna(-1) < MIN_VIEWS
reasons.append(np.where(low_views, "views_below_min", ""))

like_rate = [safe_ratio(l, v) for l, v in zip(likes, views)]
comm_rate = [safe_ratio(c, v) for c, v in zip(comments, views)]

reasons.append(np.where(pd.Series(like_rate) > MAX_LIKE_RATE, "likes>views", ""))
reasons.append(np.where(pd.Series(comm_rate) > MAX_COMM_RATE, "comments>2xviews", ""))

# Aggregate reasons
reason_df = pd.DataFrame({f"r{i}": r for i, r in enumerate(reasons)})
df["reject_reasons"] = reason_df.apply(lambda row: ",".join([x for x in row if x]), axis=1)
df["reject_reasons"] = df["reject_reasons"].str.strip(",")

# Keep rows with NO reasons
ok_mask = df["reject_reasons"] == ""
df_ok = df[ok_mask].copy()
df_bad = df[~ok_mask].copy()

# --- Deduplicate by video_id (keep the most complete & largest views, file exists) ---
def completeness_score(row):
    score = 0
    for t in TARGETS:
        score += int(pd.notna(row[t]) and np.isfinite(row[t]))
    score += int(Path(str(row[feat_col])).exists())
    # prefer larger views if ties
    v = row["views"]
    try:
        score = (score, float(v) if np.isfinite(v) else -1.0)
    except Exception:
        score = (score, -1.0)
    return score

df_ok["_score"] = df_ok.apply(completeness_score, axis=1)
df_ok = df_ok.sort_values(["video_id","_score"], ascending=[True, False])
df_ok = df_ok[~df_ok.duplicated(subset=["video_id"], keep="first")].drop(columns=["_score"])

# --- Final training set ---
train = df_ok.copy()

# --- Prints & saves ---
print("\n" + "="*70)
print("TRAINING FILTER SUMMARY")
print("="*70)
print(f"Master rows:           {len(df):,}")
print(f"Feature file exists:   {exists_mask.sum():,}/{len(df):,}")
print(f"Rejected rows total:   {len(df_bad):,}")
if len(df_bad):
    print("Top rejection reasons:")
    print(df_bad["reject_reasons"].str.get_dummies(sep=",").sum().sort_values(ascending=False).head(10))

print(f"\nKept (pre-dedupe):     {ok_mask.sum():,}")
print(f"Kept (post-dedupe):    {len(train):,}")

# Target completeness in final set
all_targets_ok = train[TARGETS].notna().all(axis=1) & train[TARGETS].applymap(np.isfinite).all(axis=1)
print(f"Rows with ALL targets: {all_targets_ok.sum():,}/{len(train):,}")

# Basic target stats on final set
print("\nTarget stats on final training set:")
for t in TARGETS:
    x = pd.to_numeric(train[t], errors="coerce")
    print(f"  {t}: count={x.notna().sum():,}, min={x.min():.0f}, p50={x.quantile(0.5):.0f}, p90={x.quantile(0.9):.0f}, max={x.max():.0f}")

# Save artifacts
clean_path   = OUTDIR / "META_training_clean.parquet"
rejects_path = OUTDIR / "META_training_rejects.csv"
train.to_parquet(clean_path, index=False)
df_bad.to_csv(rejects_path, index=False)

# Update config pointers
cfg["master_file_clean"] = str(clean_path)
cfg["final_samples_clean"] = int(len(train))
with open(CONF, "w") as f:
    json.dump(cfg, f, indent=2)

print("\n✅ Wrote:")
print(f"  Clean training set : {clean_path}")
print(f"  Reject report      : {rejects_path}")



TRAINING FILTER SUMMARY
Master rows:           9,565
Feature file exists:   9,565/9,565
Rejected rows total:   600
Top rejection reasons:
comments_missing      583
comments_nonfinite    583
likes_missing         529
likes_nonfinite       529
views_below_min       462
views_missing         462
views_nonfinite       462
dtype: int64

Kept (pre-dedupe):     8,965
Kept (post-dedupe):    8,965
Rows with ALL targets: 8,965/8,965

Target stats on final training set:
  views: count=8,965, min=26, p50=3635478, p90=281131541, max=5773797147
  likes: count=8,965, min=0, p50=36177, p90=2188757, max=40147618
  comments: count=8,965, min=0, p50=733, p90=67730, max=5331537

✅ Wrote:
  Clean training set : /content/models/run_01/META_training_clean.parquet
  Reject report      : /content/models/run_01/META_training_rejects.csv


  all_targets_ok = train[TARGETS].notna().all(axis=1) & train[TARGETS].applymap(np.isfinite).all(axis=1)


In [None]:
# ==============================================================================
# CELL 5: Define Model (1 minute)
# ==============================================================================

import torch
import torch.nn as nn

print("="*70)
print("CELL 5: Model Definition")
print("="*70)

class ConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, padding):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, padding=padding)
        self.bn = nn.BatchNorm2d(out_channels)
        self.act = nn.SiLU(inplace=True)

    def forward(self, x):
        return self.act(self.bn(self.conv(x)))

class AudioBackbone(nn.Module):
    def __init__(self):
        super().__init__()
        self.stem = ConvBlock(1, 32, (3,7), (1,3))
        self.block1 = ConvBlock(32, 64, (3,5), (1,2))
        self.block2 = ConvBlock(64, 128, (3,5), (1,2))
        self.block3 = ConvBlock(128, 192, (3,3), (1,1))
        self.pool = nn.AdaptiveAvgPool2d((1,1))

    def forward(self, x):
        x = self.stem(x)
        x = self.block1(x)
        x = self.block2(x)
        x = self.block3(x)
        return self.pool(x).flatten(1)

class MultiTaskModel(nn.Module):
    def __init__(self, n_targets, embedding_dim=192):
        super().__init__()
        self.n_targets = n_targets
        self.backbone = AudioBackbone()
        self.heads = nn.ModuleList([
            nn.Sequential(
                nn.Linear(embedding_dim, 128),
                nn.ReLU(inplace=True),
                nn.Dropout(0.3),
                nn.Linear(128, 1)
                # NO Sigmoid - regression not classification!
            )
            for _ in range(n_targets)
        ])

    def forward(self, x):
        features = self.backbone(x)
        outputs = [head(features).squeeze(1) for head in self.heads]
        return torch.stack(outputs, dim=1)

# Test model
test_model = MultiTaskModel(n_targets=3)
total_params = sum(p.numel() for p in test_model.parameters())
print(f"✓ Model defined")
print(f"  Parameters: {total_params:,}")
print(f"  ✓ No Sigmoid in output!")

print(f"\n✅ Cell 5 Complete")




CELL 5: Model Definition
✓ Model defined
  Parameters: 451,203
  ✓ No Sigmoid in output!

✅ Cell 5 Complete


In [None]:
# CELL 5: Model Definition
import torch
import torch.nn as nn

print("="*70)
print("CELL 5: Model Definition")
print("="*70)

class ConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, padding):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, padding=padding)
        self.bn = nn.BatchNorm2d(out_channels)
        self.act = nn.SiLU(inplace=True)

    def forward(self, x):
        return self.act(self.bn(self.conv(x)))

class AudioBackbone(nn.Module):
    def __init__(self):
        super().__init__()
        self.stem = ConvBlock(1, 32, (3,7), (1,3))
        self.block1 = ConvBlock(32, 64, (3,5), (1,2))
        self.block2 = ConvBlock(64, 128, (3,5), (1,2))
        self.block3 = ConvBlock(128, 192, (3,3), (1,1))
        self.pool = nn.AdaptiveAvgPool2d((1,1))

    def forward(self, x):
        x = self.stem(x)
        x = self.block1(x)
        x = self.block2(x)
        x = self.block3(x)
        return self.pool(x).flatten(1)

class MultiTaskModel(nn.Module):
    def __init__(self, n_targets, embedding_dim=192):
        super().__init__()
        self.n_targets = n_targets
        self.backbone = AudioBackbone()
        self.heads = nn.ModuleList([
            nn.Sequential(
                nn.Linear(embedding_dim, 128),
                nn.ReLU(inplace=True),
                nn.Dropout(0.3),
                nn.Linear(128, 1)
                # NO Sigmoid - regression not classification!
            )
            for _ in range(n_targets)
        ])

    def forward(self, x):
        features = self.backbone(x)
        outputs = [head(features).squeeze(1) for head in self.heads]
        return torch.stack(outputs, dim=1)

# Test model
test_model = MultiTaskModel(n_targets=3)
total_params = sum(p.numel() for p in test_model.parameters())
print(f"✓ Model defined")
print(f"  Parameters: {total_params:,}")
print(f"  ✓ No Sigmoid in output!")

print(f"\n✅ Cell 5 Complete")

CELL 5: Model Definition
✓ Model defined
  Parameters: 451,203
  ✓ No Sigmoid in output!

✅ Cell 5 Complete


In [None]:
# Check what columns you have
import pandas as pd
master = pd.read_parquet("/content/models/run_01/META_master_4k.parquet")
print(master.columns.tolist())

['feature_path', 'basename', 'filename', 'stem', 'video_id', 'video_id_upper', 'views', 'likes', 'comments', 'channel', 'published_date', 'user_engagement_signal', 'platform_quality_signal', 'quality_final']


In [None]:
# ================================================================
# Alternative: Use Raw Log-Transformed Targets (Most Interpretable!)
# ================================================================
import numpy as np
import pandas as pd
from pathlib import Path

print("="*70)
print("CREATING LOG-TRANSFORMED TARGETS (Alternative Approach)")
print("="*70)

# ---------- Paths ----------
MASTER_FILE = Path("/content/models/run_01/META_training_ready.parquet")
OUTPUT_FILE = Path("/content/drive/MyDrive/STA 160/models/run_01/META_log_targets.parquet")

print(f"Input:  {MASTER_FILE}")
print(f"Output: {OUTPUT_FILE}")

# ---------- Load data ----------
df = pd.read_parquet(MASTER_FILE)
print(f"\n✓ Loaded {len(df):,} rows")

# ---------- Create log-transformed targets ----------
print("\n" + "="*70)
print("CREATING LOG TARGETS:")
print("="*70)

# Ensure we have the raw metrics
for col in ['views', 'likes', 'comments']:
    if col not in df.columns:
        print(f"⚠️ Missing column: {col}")
        continue

    # Log transform (handles zeros)
    log_col = f'{col}_log'
    df[log_col] = np.log1p(df[col].fillna(0))

    print(f"\n{col} → {log_col}:")
    print(f"  Original range: [{df[col].min():.0f}, {df[col].max():.0f}]")
    print(f"  Log range:      [{df[log_col].min():.2f}, {df[log_col].max():.2f}]")
    print(f"  Log mean:       {df[log_col].mean():.2f}")
    print(f"  Log std:        {df[log_col].std():.2f}")

# ---------- Statistics ----------
print("\n" + "="*70)
print("LOG TARGET STATISTICS:")
print("="*70)

for col in ['views_log', 'likes_log', 'comments_log']:
    if col in df.columns:
        s = df[col]
        print(f"\n{col}:")
        print(f"  mean: {s.mean():.3f}")
        print(f"  std:  {s.std():.3f}")
        print(f"  min:  {s.min():.3f}")
        print(f"  25%:  {s.quantile(0.25):.3f}")
        print(f"  50%:  {s.quantile(0.50):.3f}")
        print(f"  75%:  {s.quantile(0.75):.3f}")
        print(f"  max:  {s.max():.3f}")

# ---------- Examples ----------
print("\n" + "="*70)
print("EXAMPLE CONVERSIONS (Log → Real):")
print("="*70)

examples = [
    ("Low popularity", 10),
    ("Below average", 12),
    ("Average", 14),
    ("Above average", 16),
    ("Popular", 18),
    ("Very popular", 20),
    ("Viral hit", 22),
]

print("\nviews_log → actual views:")
for label, log_val in examples:
    actual = np.expm1(log_val)
    if actual >= 1e6:
        display = f"{actual/1e6:.1f}M"
    elif actual >= 1e3:
        display = f"{actual/1e3:.0f}K"
    else:
        display = f"{actual:.0f}"
    print(f"  {log_val:2.0f} → {display:>8s}  ({label})")

# ---------- Save ----------
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(OUTPUT_FILE, index=False)

print(f"\n✓ Saved log-target data: {OUTPUT_FILE}")

# ---------- Update config ----------
import json
config_path = OUTPUT_FILE.parent / "config.json"
if config_path.exists():
    with open(config_path, 'r') as f:
        config = json.load(f)
else:
    config = {}

config['master_file'] = str(OUTPUT_FILE)
config['use_log_targets'] = True
config['log_target_stats'] = {
    col: {
        'mean': float(df[col].mean()),
        'std': float(df[col].std()),
        'min': float(df[col].min()),
        'max': float(df[col].max())
    }
    for col in ['views_log', 'likes_log', 'comments_log'] if col in df.columns
}

with open(config_path, 'w') as f:
    json.dump(config, f, indent=2)

print(f"✓ Updated config.json")

print("\n" + "="*70)
print("NEXT STEPS:")
print("="*70)
print("1. Update Cell 6:")
print("   a. Change master_file path")
print("   b. Change TARGETS = ['views_log', 'likes_log', 'comments_log']")
print("2. Retrain for 30-50 epochs")
print("3. Predictions will be in log scale (interpretable!)")
print("4. Convert back: actual_views = np.expm1(prediction)")
print("="*70)

print("\n" + "="*70)
print("ADVANTAGES OF LOG TARGETS:")
print("="*70)
print("✓ Full dynamic range preserved (no compression)")
print("✓ Directly interpretable (can convert to real numbers)")
print("✓ No z-score confusion")
print("✓ More robust to outliers (log dampens extremes)")
print("✓ Industry standard for popularity prediction")
print("="*70)

CREATING LOG-TRANSFORMED TARGETS (Alternative Approach)
Input:  /content/models/run_01/META_training_ready.parquet
Output: /content/drive/MyDrive/STA 160/models/run_01/META_log_targets.parquet

✓ Loaded 8,965 rows

CREATING LOG TARGETS:

views → views_log:
  Original range: [26, 5773797147]
  Log range:      [3.30, 22.48]
  Log mean:       14.87
  Log std:        3.58

likes → likes_log:
  Original range: [0, 40147618]
  Log range:      [0.00, 17.51]
  Log mean:       10.35
  Log std:        3.29

comments → comments_log:
  Original range: [0, 5331537]
  Log range:      [0.00, 15.49]
  Log mean:       6.40
  Log std:        3.66

LOG TARGET STATISTICS:

views_log:
  mean: 14.866
  std:  3.578
  min:  3.296
  25%:  12.328
  50%:  15.106
  75%:  17.626
  max:  22.477

likes_log:
  mean: 10.348
  std:  3.291
  min:  0.000
  25%:  8.123
  50%:  10.496
  75%:  12.965
  max:  17.508

comments_log:
  mean: 6.400
  std:  3.660
  min:  0.000
  25%:  3.807
  50%:  6.599
  75%:  9.343
  max:  15.

In [None]:
# ================================================================
# CELL 6 — Fast, Robust Trainer (Google Drive Autosave + Improved Architecture)
# ================================================================
import os, time, json, random, numpy as np, pandas as pd
from pathlib import Path
from typing import Optional, Tuple, List

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.amp import GradScaler, autocast

# ---------- Paths / Config (NOW SAVES TO GOOGLE DRIVE!) ----------
OUTDIR = Path("/content/drive/MyDrive/STA 160/models/run_01")
OUTDIR.mkdir(parents=True, exist_ok=True)

config_path = OUTDIR / "config.json"
if config_path.exists():
    with open(config_path, "r") as f:
        config = json.load(f)
else:
    config = {}

master_file = Path(config.get("master_file", OUTDIR / "META_log_targets.parquet"))
# Fallback search for master file
if not master_file.exists():
    for candidate in [
        OUTDIR / "META_training_ready.parquet",
        OUTDIR / "META_master_postenrich.parquet",
        Path("/content/models/run_01/META_training_ready.parquet"),
        Path("/content/models/run_01/META_master_postenrich.parquet"),
    ]:
        if candidate.exists():
            master_file = candidate
            break

print("="*70)
print("CELL 6 — Fast, Robust Trainer (Google Drive Autosave)")
print("="*70)
print(f"✓ Output directory: {OUTDIR}")
print(f"✓ Using master: {master_file}")

# ---------- Hyperparams ----------
MEL_BINS  = 128
FRAMES    = 768
VAL_SPLIT = 0.15
BATCH_SIZE = 48
EPOCHS     = 30  # Increased from 30
LR, WD     = 1e-3, 1e-4
MAX_GRAD_NORM = 1.0
PATIENCE   = 10  # Early stopping patience
SEED = 2025

USE_CUDA = torch.cuda.is_available()
DEVICE   = torch.device("cuda" if USE_CUDA else "cpu")
AMP_EN   = USE_CUDA

if USE_CUDA:
    torch.backends.cudnn.benchmark = True
    torch.set_float32_matmul_precision("high")

random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if USE_CUDA: torch.cuda.manual_seed_all(SEED)

# ---------- Load data ----------
df = pd.read_parquet(master_file)
print(f"✓ Loaded {len(df):,} master rows")

# prefer signals, else counts
CAND_SETS = [
    ["views_log", "likes_log", "comments_log"],
    ["views","likes","comments"],
]
for cand in CAND_SETS:
    if all(c in df.columns for c in cand):
        TARGETS = cand; break
else:
    raise RuntimeError("No valid target set in master file.")
print(f"✓ Targets: {TARGETS}")

idx = np.arange(len(df)); np.random.shuffle(idx)
cut = int(len(idx)*(1-VAL_SPLIT))
df_train, df_val = df.iloc[idx[:cut]].reset_index(drop=True), df.iloc[idx[cut:]].reset_index(drop=True)
print(f"✓ Train: {len(df_train):,} | Val: {len(df_val):,}")

target_stats = {t: {"mean": float(pd.to_numeric(df[t], errors="coerce").mean()),
                    "std" : float(pd.to_numeric(df[t], errors="coerce").std() or 1.0)}
                for t in TARGETS}

# ---------- Feature loader ----------
def _finite(arr): return np.isfinite(arr).all()

def load_feature(fp: str, frames: int = FRAMES, center: bool = False) -> Optional[np.ndarray]:
    try:
        with np.load(fp, allow_pickle=False, mmap_mode="r" if USE_CUDA else None) as z:
            key = next((k for k in ("logmel","log_mel","mel","features","x","S") if k in z.files), None)
            if key is None:
                return None
            x = z[key]

        # Squeeze and orient to [F, T]
        if x.ndim == 3:
            x = x.squeeze()
        if x.ndim != 2:
            return None
        if x.shape[0] != MEL_BINS and x.shape[1] == MEL_BINS:
            x = x.T
        if x.shape[0] != MEL_BINS:
            return None

        # Force a safe float dtype before any magnitude checks
        if x.dtype != np.float32:
            x = x.astype(np.float32, copy=False)

        # Fast finite check first
        if not np.isfinite(x).all():
            return None

        # Suppress noisy overflow warnings during the magnitude guard
        with np.errstate(over="ignore", invalid="ignore"):
            mx = np.nanmax(np.abs(x))
        if not np.isfinite(mx) or mx > 1e6:
            return None

        # Center/Random crop or pad to FRAMES
        T = x.shape[1]
        if T >= frames:
            start = (T - frames)//2 if center else np.random.randint(0, T - frames + 1)
            x = x[:, start:start+frames]
        else:
            pad = frames - T
            x = np.pad(x, ((0,0), (pad//2, pad - pad//2)), mode="constant")

        # Robust winsorize + standardize
        lo, hi = np.percentile(x, [0.5, 99.5]).astype(np.float32)
        x = np.clip(x, lo, hi)

        m = float(np.mean(x, dtype=np.float64))
        s = float(np.std(x,  dtype=np.float64))
        if not np.isfinite(s) or s < 1e-6:
            return None
        x = (x - m) / s
        x = np.clip(x, -10, 10)

        if not np.isfinite(x).all():
            return None

        # [C=1, F, T]
        return x.astype(np.float32, copy=False)[None, ...]

    except Exception:
        return None


class FastAudioDS(Dataset):
    def __init__(self, df: pd.DataFrame, targets: List[str], center: bool = False):
        self.df = df.reset_index(drop=True); self.targets = targets; self.center = center
    def __len__(self): return len(self.df)
    def __getitem__(self, i: int) -> Optional[Tuple[torch.Tensor, torch.Tensor]]:
        r = self.df.iloc[i]
        x = load_feature(r["feature_path"], center=self.center)
        if x is None: return None
        y = np.array([float(r[t]) for t in self.targets], dtype=np.float32)
        if not np.isfinite(y).all(): return None
        return torch.from_numpy(x), torch.from_numpy(y)

def collate_fn(batch):
    batch = [b for b in batch if b is not None]
    if not batch: return None
    X, y = zip(*batch); return torch.stack(X,0), torch.stack(y,0)

# ---------- DataLoader factory ----------
if not USE_CUDA:
    BATCH_SIZE = min(BATCH_SIZE, 16)
NUM_WORKERS = max(2, os.cpu_count()//2) if USE_CUDA else 0
PIN = USE_CUDA

def make_loader(dataset, shuffle, drop_last, workers=NUM_WORKERS):
    kwargs = dict(batch_size=BATCH_SIZE, shuffle=shuffle, num_workers=workers,
                  pin_memory=PIN, collate_fn=collate_fn, drop_last=drop_last)
    if workers > 0:
        kwargs.update(dict(persistent_workers=True, prefetch_factor=4))
    return DataLoader(dataset, **kwargs)

print("\nCreating train/val datasets (fast & robust)…")
train_ds = FastAudioDS(df_train, TARGETS, center=False)
val_ds   = FastAudioDS(df_val,   TARGETS, center=True)

train_loader = make_loader(train_ds, shuffle=True,  drop_last=True)
val_loader   = make_loader(val_ds,   shuffle=False, drop_last=False)
print(f"✓ Train batches: {len(train_loader)} | Val batches: {len(val_loader)}")

# ---------- IMPROVED Model Architecture (Fixed + Deeper) ----------
class ConvBlock(nn.Module):
    def __init__(self, cin, cout, k=(3,7), s=(1,1), p=None):
        super().__init__()
        p = p or (k[0]//2, k[1]//2)
        self.conv = nn.Conv2d(cin, cout, k, s, p)
        self.bn = nn.BatchNorm2d(cout)
        self.act = nn.SiLU()
    def forward(self, x):
        return self.act(self.bn(self.conv(x)))

class ResidualBlock(nn.Module):
    """Simple residual connection for same-channel blocks"""
    def __init__(self, channels, k=(3,3)):
        super().__init__()
        p = (k[0]//2, k[1]//2)
        self.conv1 = nn.Conv2d(channels, channels, k, padding=p)
        self.bn1 = nn.BatchNorm2d(channels)
        self.act1 = nn.SiLU()
        self.conv2 = nn.Conv2d(channels, channels, k, padding=p)
        self.bn2 = nn.BatchNorm2d(channels)
        self.act2 = nn.SiLU()

    def forward(self, x):
        identity = x
        out = self.act1(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out = out + identity  # Residual connection
        return self.act2(out)

class MultiTaskModel(nn.Module):
    """
    FIXED Architecture - Now properly uses all 5 channel sizes with residual connections

    Architecture:
    - stem: 1 -> 32 (initial feature extraction)
    - b1: 32 -> 64 + residual (low-level features)
    - b2: 64 -> 128 + residual (mid-level features) + maxpool
    - b3: 128 -> 256 + residual (high-level features) + maxpool
    - b4: 256 -> 512 + residual (abstract features)
    - head: 512 -> 256 -> n_targets (prediction)

    Total depth: 5 stages (vs original 4)
    Residual connections help gradient flow for deeper network
    """
    def __init__(self, n_targets):
        super().__init__()
        C = [32, 64, 128, 256, 512]  # Now ALL channels are used!

        # Initial feature extraction
        self.stem = ConvBlock(1, C[0], (3,7))
        self.res_stem = ResidualBlock(C[0], (3,3))

        # Hierarchical feature learning
        self.b1 = ConvBlock(C[0], C[1], (3,5))
        self.res1 = ResidualBlock(C[1], (3,3))

        self.b2 = ConvBlock(C[1], C[2], (3,5))
        self.res2 = ResidualBlock(C[2], (3,3))

        self.b3 = ConvBlock(C[2], C[3], (3,3))
        self.res3 = ResidualBlock(C[3], (3,3))

        self.b4 = ConvBlock(C[3], C[4], (3,3))  # NEW: Now using C[4]=512
        self.res4 = ResidualBlock(C[4], (3,3))

        # Global pooling
        self.pool = nn.AdaptiveAvgPool2d((1,1))

        # Prediction head (wider for more capacity)
        self.head = nn.Sequential(
            nn.Flatten(),
            nn.Linear(C[4], 256),
            nn.SiLU(),
            nn.Dropout(0.3),  # Increased dropout for regularization
            nn.Linear(256, n_targets)
        )

    def forward(self, x):
        # Stage 1: Initial features
        x = self.stem(x)
        x = self.res_stem(x)

        # Stage 2: Low-level features
        x = self.b1(x)
        x = self.res1(x)

        # Stage 3: Mid-level features + downsample
        x = self.b2(x)
        x = self.res2(x)
        x = nn.functional.max_pool2d(x, 2)

        # Stage 4: High-level features + downsample
        x = self.b3(x)
        x = self.res3(x)
        x = nn.functional.max_pool2d(x, 2)

        # Stage 5: Abstract features (NEW)
        x = self.b4(x)
        x = self.res4(x)

        # Global pooling + prediction
        x = self.pool(x)
        return self.head(x)

model = MultiTaskModel(n_targets=len(TARGETS)).to(DEVICE)
if USE_CUDA:
    model = model.to(memory_format=torch.channels_last)

# Count parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"✓ Model created: {trainable_params:,} trainable parameters (total: {total_params:,})")

optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WD)
criterion = nn.MSELoss()
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.5, patience=5)
scaler = GradScaler(device="cuda") if AMP_EN else None

def to_dev(X, y):
    if USE_CUDA:
        X = X.to(DEVICE, non_blocking=True).to(memory_format=torch.channels_last)
        y = y.to(DEVICE, non_blocking=True)
    else:
        X, y = X.to(DEVICE), y.to(DEVICE)
    return X, y

@torch.no_grad()
def evaluate():
    model.eval(); losses = []
    for batch in val_loader:
        if batch is None: continue
        X, y = to_dev(*batch)
        with autocast(device_type="cuda", enabled=AMP_EN):
            pred = model(X); loss = criterion(pred, y)
        if torch.isfinite(loss): losses.append(loss.item())
    return float(np.mean(losses)) if losses else float("inf")

# ---------- Train with Early Stopping (auto-save to Google Drive) ----------
print("\nTraining…")
print("="*70)
print(f"Output dir: {OUTDIR}")
print(f"Patience: {PATIENCE} epochs (early stopping)")
print("="*70)

best_val, history = float("inf"), []
patience_counter = 0
ckpt_path = OUTDIR / "best_model_robust.pt"

for epoch in range(1, EPOCHS+1):
    t0 = time.time(); model.train(); batch_losses = []

    for batch in make_loader(train_ds, shuffle=True, drop_last=True):
        if batch is None: continue
        X, y = to_dev(*batch)
        optimizer.zero_grad(set_to_none=True)

        if AMP_EN:
            with autocast(device_type="cuda", enabled=True):
                pred = model(X); loss = criterion(pred, y)
            if not torch.isfinite(loss): continue
            scaler.scale(loss).backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
            scaler.step(optimizer); scaler.update()
        else:
            pred = model(X); loss = criterion(pred, y)
            if not torch.isfinite(loss): continue
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
            optimizer.step()

        batch_losses.append(loss.item())

    train_loss = float(np.mean(batch_losses)) if batch_losses else float("inf")
    val_loss = evaluate()
    scheduler.step(val_loss)

    current_lr = optimizer.param_groups[0]['lr']
    print(f"Epoch {epoch:02d}/{EPOCHS} | Train {train_loss:.4f} | Val {val_loss:.4f} | "
          f"LR {current_lr:.1e} | {time.time()-t0:.0f}s")

    history.append({
        "epoch": epoch,
        "train_loss": train_loss,
        "val_loss": val_loss,
        "lr": current_lr
    })

    # Save to Google Drive if new best
    if np.isfinite(val_loss) and val_loss < best_val:
        best_val = val_loss
        patience_counter = 0

        state = {k: v.detach().cpu() for k,v in model.state_dict().items()}
        torch.save({
            "model_state_dict": state,
            "targets": TARGETS,
            "target_stats": target_stats,
            "best_val_loss": best_val,
            "epoch": epoch,
            "config": {
                "mel_bins": MEL_BINS,
                "frames": FRAMES,
                "batch_size": BATCH_SIZE,
                "lr": LR,
                "weight_decay": WD,
                "architecture": "MultiTaskModel_v2_with_residuals"
            }
        }, ckpt_path)

        print(f"  ✓ New best {best_val:.4f} — saved to Google Drive: {ckpt_path}")
    else:
        patience_counter += 1
        if patience_counter >= PATIENCE:
            print(f"\n⚠️ Early stopping triggered after {epoch} epochs (no improvement for {PATIENCE} epochs)")
            break

# Save history to Google Drive
history_path = OUTDIR / "training_history_robust.csv"
pd.DataFrame(history).to_csv(history_path, index=False)
print(f"✓ Training history saved: {history_path}")

# Update config in Google Drive
config.update({
    "best_val_loss": best_val,
    "targets": TARGETS,
    "target_stats": target_stats,
    "checkpoint_path": str(ckpt_path),
    "training_complete": True,
    "total_epochs": len(history),
    "architecture": "MultiTaskModel_v2_with_residuals"
})
with open(OUTDIR / "config.json", "w") as f:
    json.dump(config, f, indent=2)
print(f"✓ Config updated: {OUTDIR / 'config.json'}")

print("\n" + "="*70)
print("✅ Training complete!")
print("="*70)
print(f"Best val loss: {best_val:.4f}")
print(f"Total epochs: {len(history)}")
print(f"Best model saved to Google Drive: {ckpt_path}")
print("="*70)

CELL 6 — Fast, Robust Trainer (Google Drive Autosave)
✓ Output directory: /content/drive/MyDrive/STA 160/models/run_01
✓ Using master: /content/drive/MyDrive/STA 160/models/run_01/META_log_targets.parquet
✓ Loaded 8,965 master rows
✓ Targets: ['views_log', 'likes_log', 'comments_log']
✓ Train: 7,620 | Val: 1,345

Creating train/val datasets (fast & robust)…
✓ Train batches: 158 | Val batches: 29
✓ Model created: 8,055,171 trainable parameters (total: 8,055,171)

Training…
Output dir: /content/drive/MyDrive/STA 160/models/run_01
Patience: 10 epochs (early stopping)
Epoch 01/30 | Train 20.9186 | Val 20.9258 | LR 1.0e-03 | 109s
  ✓ New best 20.9258 — saved to Google Drive: /content/drive/MyDrive/STA 160/models/run_01/best_model_robust.pt
Epoch 02/30 | Train 13.1405 | Val 13.2261 | LR 1.0e-03 | 123s
  ✓ New best 13.2261 — saved to Google Drive: /content/drive/MyDrive/STA 160/models/run_01/best_model_robust.pt
Epoch 03/30 | Train 12.9762 | Val 12.6015 | LR 1.0e-03 | 110s
  ✓ New best 12.601

In [None]:
# ======================================================================
# CELL 8.5 — Model loader (Updated for improved architecture with residuals)
# ======================================================================
from pathlib import Path
import torch, torch.nn as nn
import numpy as np

# --------------------------- discover test audio ---------------------------
DRIVE_FOLDER = Path("/content/drive/MyDrive/STA 160/test")
candidates = sorted(
    [*DRIVE_FOLDER.glob("*.mp3"), *DRIVE_FOLDER.glob("*.wav"),
     *DRIVE_FOLDER.glob("*.m4a"), *DRIVE_FOLDER.glob("*.flac"),
     *DRIVE_FOLDER.glob("*.ogg")]
)
test_files = [p for p in candidates if not p.name.endswith(".crdownload") and p.stat().st_size > 0]
song_names = [p.stem[:80] for p in test_files]

print("Testing these files:")
for i, (p, name) in enumerate(zip(test_files, song_names), 1):
    print(f"  {i}. ✓ {name} ({p.suffix.lower()} {p.stat().st_size/1_048_576:.1f} MB)")

assert len(test_files) >= 2, (
    "Need at least 2 finalized audio files in /STA 160/test folder."
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Updated checkpoint path (now in Google Drive)
ckpt_path = Path("/content/drive/MyDrive/STA 160/models/run_01/best_model_robust.pt")

# =============== MODEL ARCHITECTURES ===============

# --- Basic building blocks ---
class ConvBlock(nn.Module):
    def __init__(self, cin, cout, k=(3,7), s=(1,1), p=None):
        super().__init__()
        p = p or (k[0]//2, k[1]//2)
        self.conv = nn.Conv2d(cin, cout, k, s, p)
        self.bn = nn.BatchNorm2d(cout)
        self.act = nn.SiLU()
    def forward(self, x):
        return self.act(self.bn(self.conv(x)))

class ResidualBlock(nn.Module):
    """Residual connection for improved gradient flow"""
    def __init__(self, channels, k=(3,3)):
        super().__init__()
        p = (k[0]//2, k[1]//2)
        self.conv1 = nn.Conv2d(channels, channels, k, padding=p)
        self.bn1 = nn.BatchNorm2d(channels)
        self.act1 = nn.SiLU()
        self.conv2 = nn.Conv2d(channels, channels, k, padding=p)
        self.bn2 = nn.BatchNorm2d(channels)
        self.act2 = nn.SiLU()

    def forward(self, x):
        identity = x
        out = self.act1(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out = out + identity
        return self.act2(out)

# --- NEW Improved Architecture (with residuals) ---
class MultiTaskModel_v2(nn.Module):
    """
    Improved architecture with residual connections
    5 stages: stem -> b1 -> b2 -> b3 -> b4
    Channels: [32, 64, 128, 256, 512]
    """
    def __init__(self, n_targets):
        super().__init__()
        C = [32, 64, 128, 256, 512]

        self.stem = ConvBlock(1, C[0], (3,7))
        self.res_stem = ResidualBlock(C[0], (3,3))

        self.b1 = ConvBlock(C[0], C[1], (3,5))
        self.res1 = ResidualBlock(C[1], (3,3))

        self.b2 = ConvBlock(C[1], C[2], (3,5))
        self.res2 = ResidualBlock(C[2], (3,3))

        self.b3 = ConvBlock(C[2], C[3], (3,3))
        self.res3 = ResidualBlock(C[3], (3,3))

        self.b4 = ConvBlock(C[3], C[4], (3,3))
        self.res4 = ResidualBlock(C[4], (3,3))

        self.pool = nn.AdaptiveAvgPool2d((1,1))

        self.head = nn.Sequential(
            nn.Flatten(),
            nn.Linear(C[4], 256),
            nn.SiLU(),
            nn.Dropout(0.3),
            nn.Linear(256, n_targets)
        )

    def forward(self, x):
        x = self.stem(x)
        x = self.res_stem(x)

        x = self.b1(x)
        x = self.res1(x)

        x = self.b2(x)
        x = self.res2(x)
        x = nn.functional.max_pool2d(x, 2)

        x = self.b3(x)
        x = self.res3(x)
        x = nn.functional.max_pool2d(x, 2)

        x = self.b4(x)
        x = self.res4(x)

        x = self.pool(x)
        return self.head(x)

# --- Original Architecture (4 stages, no residuals) ---
class MultiTaskModel_v1(nn.Module):
    """Original architecture from first training"""
    def __init__(self, n_targets):
        super().__init__()
        C = [32, 64, 128, 192]
        self.stem = ConvBlock(1, C[0], (3,7))
        self.b1   = ConvBlock(C[0], C[1], (3,5))
        self.b2   = ConvBlock(C[1], C[2], (3,5))
        self.b3   = ConvBlock(C[2], C[3], (3,3))
        self.pool = nn.AdaptiveAvgPool2d((1,1))
        self.head = nn.Sequential(
            nn.Flatten(),
            nn.Linear(C[3], 256),
            nn.SiLU(),
            nn.Dropout(0.2),
            nn.Linear(256, n_targets)
        )

    def forward(self, x):
        x = self.stem(x)
        x = self.b1(x)
        x = nn.functional.max_pool2d(x, 2)
        x = self.b2(x)
        x = nn.functional.max_pool2d(x, 2)
        x = self.b3(x)
        x = self.pool(x)
        return self.head(x)

# --------------------------- safe checkpoint load ---------------------------
def load_checkpoint_safe(path, device):
    """Try weights_only=True, then allowlist numpy, finally fallback"""
    try:
        return torch.load(path, map_location=device, weights_only=True)
    except Exception:
        try:
            from torch.serialization import add_safe_globals
            add_safe_globals([np._core.multiarray.scalar, np.dtype])
            return torch.load(path, map_location=device, weights_only=True)
        except Exception:
            print("[warn] Falling back to weights_only=False (trusted local checkpoint).")
            return torch.load(path, map_location=device, weights_only=False)

# ---------- robust checkpoint extraction ----------
def extract_state_targets_cfg(ckpt):
    """Return (state_dict, targets:list[str], cfg:dict)"""
    if isinstance(ckpt, torch.nn.Module):
        return ckpt.state_dict(), ["quality_final"], {}

    if not isinstance(ckpt, dict):
        raise RuntimeError(f"Unsupported checkpoint type: {type(ckpt)}")

    # Find state dict
    candidate_keys = [
        "model_state_dict", "state_dict", "ema_state_dict",
        "model", "net", "weights", "params"
    ]
    state = None
    for k in candidate_keys:
        v = ckpt.get(k)
        if isinstance(v, dict) and all(isinstance(x, torch.Tensor) for x in v.values()):
            state = v
            break

    if state is None and all(isinstance(v, torch.Tensor) for v in ckpt.values()):
        state = ckpt

    if state is None:
        raise RuntimeError("Could not locate model state_dict in checkpoint.")

    # Strip common prefixes
    STRIP_PREFIXES = ("module.", "_orig_mod.", "model.", "net.")
    def _strip(k: str) -> str:
        for p in STRIP_PREFIXES:
            if k.startswith(p):
                return k[len(p):]
        return k
    state = {_strip(k): v for k, v in state.items()}

    targets = ckpt.get("targets") or ckpt.get("target_names") or ["quality_final"]
    cfg = ckpt.get("config") or ckpt.get("cfg") or {}

    return state, targets, cfg

# ---------------------- Load checkpoint ----------------------
print("\n[1/4] Loading checkpoint...")
checkpoint = load_checkpoint_safe(ckpt_path, device)
state, TARGETS, cfg = extract_state_targets_cfg(checkpoint)

# Get config values
DEFAULT_MEL_BINS = 128
DEFAULT_FRAMES   = 768
MEL_BINS = int(cfg.get("mel_bins", DEFAULT_MEL_BINS))
FRAMES   = int(cfg.get("frames", DEFAULT_FRAMES))

print(f"✓ Checkpoint loaded from Google Drive")
print(f"  Targets: {TARGETS}")
print(f"  MEL_BINS: {MEL_BINS}, FRAMES: {FRAMES}")

# ---------------------- Detect architecture ----------------------
def _has_prefix_keys(sdict, prefix: str) -> bool:
    return any(k.startswith(prefix) for k in sdict.keys())

_sample_keys = list(state.keys())[:20]
print(f"\n[DEBUG] Sample checkpoint keys: {_sample_keys[:10]}...")

# Auto-detect architecture based on checkpoint keys
has_res_blocks = _has_prefix_keys(state, "res_stem.") or _has_prefix_keys(state, "res1.")
has_b4 = _has_prefix_keys(state, "b4.")

if has_res_blocks and has_b4:
    print("[arch] ✓ Detected MultiTaskModel_v2 (5 stages with residuals)")
    model = MultiTaskModel_v2(n_targets=len(TARGETS))
    strict = True
elif _has_prefix_keys(state, "stem.") and _has_prefix_keys(state, "head."):
    if has_b4:
        print("[arch] ✓ Detected MultiTaskModel with b4 (trying v2)")
        model = MultiTaskModel_v2(n_targets=len(TARGETS))
        strict = False
    else:
        print("[arch] ✓ Detected MultiTaskModel_v1 (4 stages, no residuals)")
        model = MultiTaskModel_v1(n_targets=len(TARGETS))
        strict = True
else:
    print("[arch] ⚠️ Unknown architecture, trying v2 as fallback")
    model = MultiTaskModel_v2(n_targets=len(TARGETS))
    strict = False

# Load weights
missing, unexpected = model.load_state_dict(state, strict=strict)

if missing:
    print(f"[warn] Missing keys ({len(missing)}): {missing[:5]}")
if unexpected:
    print(f"[warn] Unexpected keys ({len(unexpected)}): {unexpected[:5]}")

model.to(device).eval()
print(f"✓ Model loaded: {model.__class__.__name__} on {device}")

# ==================== AUDIO PROCESSING ====================
print("\n[2/4] Setting up audio processing...")

import librosa, tempfile, subprocess, os

SAMPLE_RATE = 16000
N_FFT = 2048
HOP_LENGTH = 512

def _convert_to_wav_ffmpeg(audio_path):
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp.close()
    cmd = ["ffmpeg", "-y", "-i", str(audio_path), "-ac", "1", "-ar", str(SAMPLE_RATE), tmp.name]
    try:
        subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
        return tmp.name
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"ffmpeg failed: {e}") from e

def safe_load_audio(path):
    try:
        y, sr = librosa.load(path, sr=SAMPLE_RATE, mono=True)
        if y is None or len(y) == 0:
            raise RuntimeError("empty audio")
        return y, sr, "librosa"
    except Exception:
        wav = None
        try:
            wav = _convert_to_wav_ffmpeg(path)
            y, sr = librosa.load(wav, sr=SAMPLE_RATE, mono=True)
            if y is None or len(y) == 0:
                raise RuntimeError("empty audio after ffmpeg")
            return y, sr, "ffmpeg->librosa"
        finally:
            if wav and os.path.exists(wav):
                try:
                    os.unlink(wav)
                except:
                    pass

def make_logmel(y, sr=SAMPLE_RATE):
    S = librosa.feature.melspectrogram(
        y=y, sr=sr, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mels=MEL_BINS, power=2.0
    )
    S_db = librosa.power_to_db(S, ref=np.max)

    # Crop/pad to FRAMES
    T = S_db.shape[1]
    if T >= FRAMES:
        start = max(0, (T - FRAMES)//2)
        S_db = S_db[:, start:start+FRAMES]
    else:
        pad = FRAMES - T
        S_db = np.pad(S_db, ((0,0),(pad//2, pad - pad//2)), mode="edge")

    # Normalize
    m = float(np.mean(S_db))
    s = float(np.std(S_db))
    if not np.isfinite(m) or not np.isfinite(s) or s < 1e-6:
        s = 1.0
    Z = (S_db - m) / s
    np.clip(Z, -10, 10, out=Z)

    if not np.isfinite(Z).all():
        raise RuntimeError("non-finite after normalization")

    return torch.from_numpy(Z.astype(np.float32)).unsqueeze(0).unsqueeze(0)

print("✓ Audio processing ready")

# ==================== PREDICT ====================
print("\n[3/4] Predicting quality for your songs...")
print("="*70)

@torch.no_grad()
def predict_one(audio_path: Path):
    y, sr, how = safe_load_audio(str(audio_path))
    x = make_logmel(y, sr).to(device)
    pred = model(x).cpu().numpy()[0]
    return pred, how

results = []
errors  = []

for i, (audio_file, nice_name) in enumerate(zip(test_files, song_names), 1):
    print("\n" + "="*70)
    print(f"SONG {i}: {nice_name}")
    print("="*70)

    try:
        pred, how = predict_one(audio_file)
        print(f"  ✓ Loaded via: {how}")

        for j, t in enumerate(TARGETS):
            val = float(pred[j])
            print(f"\n{t.upper().replace('_',' ')}:\n  Predicted: {val:7.3f}")

        results.append({
            "song": nice_name,
            "file": audio_file.name,
            "pred": pred.tolist()
        })
    except Exception as e:
        print(f"❌ Failed: {audio_file.name} — {e}")
        errors.append((audio_file.name, str(e)))

# ==================== COMPARE ====================
print("\n[4/4] Comparing your songs...")
print("="*70)

if len(results) >= 2:
    a, b = results[0], results[1]
    print(f"\nSong 1: {a['song']}\nSong 2: {b['song']}")

    diffs = []
    for j, t in enumerate(TARGETS):
        v1, v2 = a["pred"][j], b["pred"][j]
        diff = abs(v1 - v2)
        diffs.append(diff)
        print(f"\n{t.upper().replace('_',' ')}:")
        print(f"  Song 1: {v1:7.3f}")
        print(f"  Song 2: {v2:7.3f}")
        print(f"  Diff:   {diff:7.3f}")

    print("\n" + "="*70)
    print(f"AVERAGE DIFFERENCE: {np.mean(diffs):.3f}")

elif len(results) == 1:
    print("\n⚠️ Only 1 song processed successfully — add another file for comparison.")
else:
    print("\n❌ No songs processed — see errors above.")

# Save results to Google Drive
if results:
    import json
    outp = Path("/content/drive/MyDrive/STA 160/models/run_01/custom_songs_test.json")
    outp.parent.mkdir(parents=True, exist_ok=True)
    with open(outp, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\n✓ Results saved to Google Drive: {outp}")

print("\n" + "="*70)
print("✅ EVALUATION COMPLETE!")
print("="*70)

Testing these files:
  1. ✓ Adele_-_Hello_Original_-_yana_enik_(mp3.pm) (.mp3 6.7 MB)
  2. ✓ DJ_Smash_feat._Ridley_-_The_Night_Is_Young_(mp3.pm) (.mp3 6.7 MB)
  3. ✓ 《孤勇者》（《英雄聯盟：雙城之戰》動畫劇集中文主題曲）陳奕迅 Eason Chan [Official MV] (.mp3 4.2 MB)

[1/4] Loading checkpoint...
✓ Checkpoint loaded from Google Drive
  Targets: ['views_log', 'likes_log', 'comments_log']
  MEL_BINS: 128, FRAMES: 768

[DEBUG] Sample checkpoint keys: ['stem.conv.weight', 'stem.conv.bias', 'stem.bn.weight', 'stem.bn.bias', 'stem.bn.running_mean', 'stem.bn.running_var', 'stem.bn.num_batches_tracked', 'res_stem.conv1.weight', 'res_stem.conv1.bias', 'res_stem.bn1.weight']...
[arch] ✓ Detected MultiTaskModel_v2 (5 stages with residuals)
✓ Model loaded: MultiTaskModel_v2 on cuda

[2/4] Setting up audio processing...
✓ Audio processing ready

[3/4] Predicting quality for your songs...

SONG 1: Adele_-_Hello_Original_-_yana_enik_(mp3.pm)
  ✓ Loaded via: librosa

VIEWS LOG:
  Predicted:  14.373

LIKES LOG:
  Predicted:  10.044

C

In [None]:
# Add this to the END of your Cell 8.5 (after the comparison section)

# ==================== BONUS: CONVERT TO REAL NUMBERS ====================
if any('_log' in t for t in TARGETS):
    print("\n" + "="*70)
    print("[BONUS] Converting Log Predictions to Real Numbers")
    print("="*70)

    for i, res in enumerate(results, 1):
        print(f"\n{'='*70}")
        print(f"SONG {i}: {res['song']}")
        print('='*70)

        # Find indices
        views_idx = TARGETS.index('views_log') if 'views_log' in TARGETS else None
        likes_idx = TARGETS.index('likes_log') if 'likes_log' in TARGETS else None
        comments_idx = TARGETS.index('comments_log') if 'comments_log' in TARGETS else None

        # Convert and display
        if views_idx is not None:
            log_val = res['pred'][views_idx]
            actual = np.expm1(log_val)

            if actual >= 1e9:
                display = f"{actual/1e9:.2f}B"
            elif actual >= 1e6:
                display = f"{actual/1e6:.2f}M"
            elif actual >= 1e3:
                display = f"{actual/1e3:.0f}K"
            else:
                display = f"{actual:.0f}"

            print(f"\n📊 VIEWS:")
            print(f"  Log value:    {log_val:.3f}")
            print(f"  Actual views: {display:>10s}  ({actual:,.0f})")

        if likes_idx is not None:
            log_val = res['pred'][likes_idx]
            actual = np.expm1(log_val)

            if actual >= 1e6:
                display = f"{actual/1e6:.2f}M"
            elif actual >= 1e3:
                display = f"{actual/1e3:.1f}K"
            else:
                display = f"{actual:.0f}"

            print(f"\n👍 LIKES:")
            print(f"  Log value:    {log_val:.3f}")
            print(f"  Actual likes: {display:>10s}  ({actual:,.0f})")

        if comments_idx is not None:
            log_val = res['pred'][comments_idx]
            actual = np.expm1(log_val)

            if actual >= 1e3:
                display = f"{actual/1e3:.1f}K"
            else:
                display = f"{actual:.0f}"

            print(f"\n💬 COMMENTS:")
            print(f"  Log value:       {log_val:.3f}")
            print(f"  Actual comments: {display:>10s}  ({actual:,.0f})")

        # Engagement ratio
        if views_idx is not None and likes_idx is not None:
            views_actual = np.expm1(res['pred'][views_idx])
            likes_actual = np.expm1(res['pred'][likes_idx])
            ratio = (likes_actual / views_actual * 100) if views_actual > 0 else 0
            print(f"\n📈 ENGAGEMENT:")
            print(f"  Like ratio: {ratio:.2f}%")

    # Summary comparison
    if len(results) >= 2:
        print("\n" + "="*70)
        print("POPULARITY COMPARISON")
        print("="*70)

        if views_idx is not None:
            views_actual = [np.expm1(r['pred'][views_idx]) for r in results]

            # Sort by views
            sorted_idx = sorted(range(len(results)), key=lambda i: views_actual[i], reverse=True)

            print("\nRanked by Predicted Views:")
            for rank, idx in enumerate(sorted_idx, 1):
                actual = views_actual[idx]
                if actual >= 1e6:
                    display = f"{actual/1e6:.2f}M"
                elif actual >= 1e3:
                    display = f"{actual/1e3:.0f}K"
                else:
                    display = f"{actual:.0f}"

                name = results[idx]['song'][:50]  # Truncate long names
                print(f"  {rank}. {name:<50s} {display:>10s}")

            # Range
            max_views = max(views_actual)
            min_views = min(views_actual)
            ratio = max_views / min_views if min_views > 0 else float('inf')

            print(f"\n📊 Spread Analysis:")
            print(f"  Most popular: {max_views/1e6:.2f}M views")
            print(f"  Least popular: {min_views/1e6:.2f}M views")
            print(f"  Ratio: {ratio:.2f}x")

            if ratio < 1.5:
                verdict = "Similar popularity (all average)"
            elif ratio < 3:
                verdict = "Clear winner (1 standout song)"
            elif ratio < 10:
                verdict = "Very different (wide popularity gap)"
            else:
                verdict = "Extreme difference (viral vs unknown)"

            print(f"  Verdict: {verdict}")

print("\n" + "="*70)
print("✅ EVALUATION COMPLETE!")
print("="*70)


[BONUS] Converting Log Predictions to Real Numbers

SONG 1: Adele_-_Hello_Original_-_yana_enik_(mp3.pm)

📊 VIEWS:
  Log value:    14.373
  Actual views:      1.75M  (1,745,879)

👍 LIKES:
  Log value:    10.044
  Actual likes:      23.0K  (23,017)

💬 COMMENTS:
  Log value:       6.185
  Actual comments:        484  (484)

📈 ENGAGEMENT:
  Like ratio: 1.32%

SONG 2: DJ_Smash_feat._Ridley_-_The_Night_Is_Young_(mp3.pm)

📊 VIEWS:
  Log value:    15.005
  Actual views:      3.29M  (3,285,917)

👍 LIKES:
  Log value:    10.498
  Actual likes:      36.3K  (36,259)

💬 COMMENTS:
  Log value:       6.538
  Actual comments:        690  (690)

📈 ENGAGEMENT:
  Like ratio: 1.10%

SONG 3: 《孤勇者》（《英雄聯盟：雙城之戰》動畫劇集中文主題曲）陳奕迅 Eason Chan [Official MV]

📊 VIEWS:
  Log value:    14.452
  Actual views:      1.89M  (1,889,662)

👍 LIKES:
  Log value:    10.092
  Actual likes:      24.2K  (24,157)

💬 COMMENTS:
  Log value:       6.206
  Actual comments:        495  (495)

📈 ENGAGEMENT:
  Like ratio: 1.28%

POPULARIT

In [None]:
# ======================================================================
# CELL 9 — Comprehensive Model Validation & Metrics
# ======================================================================
import numpy as np
import pandas as pd
from pathlib import Path
import torch
import torch.nn as nn
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score,
    precision_score, recall_score, f1_score, accuracy_score,
    confusion_matrix, classification_report
)
from scipy.stats import pearsonr, spearmanr
import matplotlib.pyplot as plt
import seaborn as sns

print("="*70)
print("COMPREHENSIVE MODEL VALIDATION")
print("="*70)

# ---------- Configuration ----------
DRIVE_BASE = Path("/content/drive/MyDrive/STA 160")
MODEL_DIR = DRIVE_BASE / "models/audio_features"
ckpt_path = MODEL_DIR / "/models/run_01/best_model_robust.pt"

# Load validation data (ground truth)
val_data_path = DRIVE_BASE / "models/run_01/META_audio_features.parquet"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ======================== LOAD MODEL ========================

print("\n[1/6] Loading model...")
print("="*70)

# Import model architecture (same as training)
class ConvBlock(nn.Module):
    def __init__(self, cin, cout, k=(3,7), s=(1,1), p=None):
        super().__init__()
        p = p or (k[0]//2, k[1]//2)
        self.conv = nn.Conv2d(cin, cout, k, s, p)
        self.bn = nn.BatchNorm2d(cout)
        self.act = nn.SiLU()
    def forward(self, x):
        return self.act(self.bn(self.conv(x)))

class ResidualBlock(nn.Module):
    def __init__(self, channels, k=(3,3)):
        super().__init__()
        p = (k[0]//2, k[1]//2)
        self.conv1 = nn.Conv2d(channels, channels, k, padding=p)
        self.bn1 = nn.BatchNorm2d(channels)
        self.act1 = nn.SiLU()
        self.conv2 = nn.Conv2d(channels, channels, k, padding=p)
        self.bn2 = nn.BatchNorm2d(channels)
        self.act2 = nn.SiLU()

    def forward(self, x):
        identity = x
        out = self.act1(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out = out + identity
        return self.act2(out)

class AudioFeatureModel(nn.Module):
    def __init__(self, n_targets):
        super().__init__()
        C = [32, 64, 128, 256, 512]

        self.stem = ConvBlock(1, C[0], (3,7))
        self.res_stem = ResidualBlock(C[0], (3,3))

        self.b1 = ConvBlock(C[0], C[1], (3,5))
        self.res1 = ResidualBlock(C[1], (3,3))

        self.b2 = ConvBlock(C[1], C[2], (3,5))
        self.res2 = ResidualBlock(C[2], (3,3))

        self.b3 = ConvBlock(C[2], C[3], (3,3))
        self.res3 = ResidualBlock(C[3], (3,3))

        self.b4 = ConvBlock(C[3], C[4], (3,3))
        self.res4 = ResidualBlock(C[4], (3,3))

        self.pool = nn.AdaptiveAvgPool2d((1,1))

        self.head = nn.Sequential(
            nn.Flatten(),
            nn.Linear(C[4], 256),
            nn.SiLU(),
            nn.Dropout(0.3),
            nn.Linear(256, n_targets),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.stem(x)
        x = self.res_stem(x)

        x = self.b1(x)
        x = self.res1(x)

        x = self.b2(x)
        x = self.res2(x)
        x = nn.functional.max_pool2d(x, 2)

        x = self.b3(x)
        x = self.res3(x)
        x = nn.functional.max_pool2d(x, 2)

        x = self.b4(x)
        x = self.res4(x)

        x = self.pool(x)
        return self.head(x)

# Load checkpoint
def load_checkpoint_safe(path, device):
    try:
        return torch.load(path, map_location=device, weights_only=True)
    except Exception:
        try:
            from torch.serialization import add_safe_globals
            add_safe_globals([np._core.multiarray.scalar, np.dtype])
            return torch.load(path, map_location=device, weights_only=True)
        except Exception:
            return torch.load(path, map_location=device, weights_only=False)

checkpoint = load_checkpoint_safe(ckpt_path, device)
state = checkpoint.get("model_state_dict", checkpoint)
TARGETS = checkpoint.get("targets", [])
cfg = checkpoint.get("config", {})

MEL_BINS = int(cfg.get("mel_bins", 128))
FRAMES = int(cfg.get("frames", 768))

model = AudioFeatureModel(n_targets=len(TARGETS)).to(device)
model.load_state_dict(state)
model.eval()

print(f"✓ Model loaded: {len(TARGETS)} targets")
print(f"  Targets: {TARGETS}")

# ======================== LOAD VALIDATION DATA ========================

print("\n[2/6] Loading validation data...")
print("="*70)

df = pd.read_parquet(val_data_path)

# Split into train/val (use same seed as training)
np.random.seed(2025)
idx = np.arange(len(df))
np.random.shuffle(idx)
val_split = 0.15
cut = int(len(idx) * (1 - val_split))
val_idx = idx[cut:]

df_val = df.iloc[val_idx].reset_index(drop=True)

# Filter: must have feature_path and all targets
has_features = df_val['feature_path'].notna()
has_all_targets = df_val[TARGETS].notna().all(axis=1)
df_val = df_val[has_features & has_all_targets].copy()

print(f"✓ Validation samples: {len(df_val):,}")

# ======================== AUDIO PROCESSING ========================

print("\n[3/6] Setting up audio processing...")
print("="*70)

import librosa

SAMPLE_RATE = 16000
N_FFT = 2048
HOP_LENGTH = 512

def make_logmel(audio_path, sr=SAMPLE_RATE):
    try:
        y, _ = librosa.load(audio_path, sr=sr, mono=True)

        S = librosa.feature.melspectrogram(
            y=y, sr=sr, n_fft=N_FFT, hop_length=HOP_LENGTH,
            n_mels=MEL_BINS, power=2.0
        )
        S_db = librosa.power_to_db(S, ref=np.max)

        T = S_db.shape[1]
        if T >= FRAMES:
            start = max(0, (T - FRAMES)//2)
            S_db = S_db[:, start:start+FRAMES]
        else:
            pad = FRAMES - T
            S_db = np.pad(S_db, ((0,0),(pad//2, pad - pad//2)), mode="edge")

        m = float(np.mean(S_db))
        s = float(np.std(S_db))
        if not np.isfinite(s) or s < 1e-6:
            s = 1.0
        Z = (S_db - m) / s
        np.clip(Z, -10, 10, out=Z)

        return torch.from_numpy(Z.astype(np.float32)).unsqueeze(0).unsqueeze(0)
    except Exception as e:
        return None

print("✓ Audio processing ready")

# ======================== MAKE PREDICTIONS ========================

print("\n[4/6] Making predictions on validation set...")
print("="*70)

predictions = []
ground_truth = []
failed = 0

from tqdm import tqdm

for idx, row in tqdm(df_val.iterrows(), total=len(df_val), desc="Predicting"):
    audio_path = row['feature_path']

    # Load audio and predict
    x = make_logmel(audio_path)
    if x is None:
        failed += 1
        continue

    with torch.no_grad():
        x = x.to(device)
        pred = model(x).cpu().numpy()[0]

    predictions.append(pred)
    ground_truth.append([row[t] for t in TARGETS])

predictions = np.array(predictions)
ground_truth = np.array(ground_truth)

print(f"\n✓ Predictions: {len(predictions):,} samples")
print(f"✗ Failed: {failed} samples")

# ======================== COMPUTE METRICS ========================

print("\n[5/6] Computing metrics...")
print("="*70)

# Feature name mapping
FEATURE_NAMES = {
    'danceability_norm': 'Danceability',
    'energy_norm': 'Energy',
    'loudness_norm': 'Loudness',
    'speechiness_norm': 'Speechiness',
    'acousticness_norm': 'Acousticness',
    'instrumentalness_norm': 'Instrumentalness',
    'liveness_norm': 'Liveness',
    'valence_norm': 'Valence',
    'tempo_norm': 'Tempo',
}

results = {}

print("\n" + "="*70)
print("REGRESSION METRICS (Per Feature)")
print("="*70)

for i, target in enumerate(TARGETS):
    y_true = ground_truth[:, i]
    y_pred = predictions[:, i]

    # Regression metrics
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    # Correlation
    pearson_r, pearson_p = pearsonr(y_true, y_pred)
    spearman_r, spearman_p = spearmanr(y_true, y_pred)

    # Custom: Within-tolerance accuracy
    tolerance_5 = np.mean(np.abs(y_true - y_pred) < 0.05)
    tolerance_10 = np.mean(np.abs(y_true - y_pred) < 0.10)
    tolerance_15 = np.mean(np.abs(y_true - y_pred) < 0.15)

    results[target] = {
        'mse': mse,
        'rmse': rmse,
        'mae': mae,
        'r2': r2,
        'pearson_r': pearson_r,
        'spearman_r': spearman_r,
        'within_5%': tolerance_5,
        'within_10%': tolerance_10,
        'within_15%': tolerance_15,
    }

    nice_name = FEATURE_NAMES.get(target, target)

    print(f"\n{nice_name}:")
    print(f"  RMSE:  {rmse:.4f}  (lower is better)")
    print(f"  MAE:   {mae:.4f}  (lower is better)")
    print(f"  R²:    {r2:.4f}  (higher is better, max=1.0)")
    print(f"  Pearson r:  {pearson_r:.4f}")
    print(f"  Spearman r: {spearman_r:.4f}")
    print(f"  Within ±0.05: {tolerance_5*100:.1f}%")
    print(f"  Within ±0.10: {tolerance_10*100:.1f}%")
    print(f"  Within ±0.15: {tolerance_15*100:.1f}%")

# Overall metrics
overall_mse = mean_squared_error(ground_truth.flatten(), predictions.flatten())
overall_rmse = np.sqrt(overall_mse)
overall_mae = mean_absolute_error(ground_truth.flatten(), predictions.flatten())
overall_r2 = r2_score(ground_truth.flatten(), predictions.flatten())

print("\n" + "="*70)
print("OVERALL METRICS (All Features)")
print("="*70)
print(f"RMSE:  {overall_rmse:.4f}")
print(f"MAE:   {overall_mae:.4f}")
print(f"R²:    {overall_r2:.4f}")

# ======================== CLASSIFICATION METRICS ========================

print("\n" + "="*70)
print("CLASSIFICATION METRICS (Discretized)")
print("="*70)
print("(Binning: Low=0-0.33, Medium=0.33-0.67, High=0.67-1.0)")
print("="*70)

def discretize(values):
    """Convert continuous 0-1 to Low/Medium/High"""
    return np.select(
        [values < 0.33, values < 0.67],
        [0, 1],  # 0=Low, 1=Medium, 2=High
        default=2
    )

for i, target in enumerate(TARGETS):
    y_true = ground_truth[:, i]
    y_pred = predictions[:, i]

    y_true_class = discretize(y_true)
    y_pred_class = discretize(y_pred)

    # Classification metrics
    accuracy = accuracy_score(y_true_class, y_pred_class)
    precision = precision_score(y_true_class, y_pred_class, average='weighted', zero_division=0)
    recall = recall_score(y_true_class, y_pred_class, average='weighted', zero_division=0)
    f1 = f1_score(y_true_class, y_pred_class, average='weighted', zero_division=0)

    nice_name = FEATURE_NAMES.get(target, target)

    print(f"\n{nice_name}:")
    print(f"  Accuracy:  {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall:    {recall:.4f}")
    print(f"  F1-Score:  {f1:.4f}")

# Overall classification metrics
y_true_all_class = discretize(ground_truth.flatten())
y_pred_all_class = discretize(predictions.flatten())

overall_accuracy = accuracy_score(y_true_all_class, y_pred_all_class)
overall_precision = precision_score(y_true_all_class, y_pred_all_class, average='weighted')
overall_recall = recall_score(y_true_all_class, y_pred_all_class, average='weighted')
overall_f1 = f1_score(y_true_all_class, y_pred_all_class, average='weighted')

print("\n" + "="*70)
print("OVERALL CLASSIFICATION METRICS")
print("="*70)
print(f"Accuracy:  {overall_accuracy:.4f}")
print(f"Precision: {overall_precision:.4f}")
print(f"Recall:    {overall_recall:.4f}")
print(f"F1-Score:  {overall_f1:.4f}")

# ======================== VISUALIZATIONS ========================

print("\n[6/6] Creating visualizations...")
print("="*70)

# Create figure with subplots
n_features = len(TARGETS)
n_cols = 3
n_rows = (n_features + n_cols - 1) // n_cols

fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5*n_rows))
axes = axes.flatten() if n_rows > 1 else [axes]

for i, target in enumerate(TARGETS):
    y_true = ground_truth[:, i]
    y_pred = predictions[:, i]

    ax = axes[i]

    # Scatter plot with regression line
    ax.scatter(y_true, y_pred, alpha=0.3, s=10)
    ax.plot([0, 1], [0, 1], 'r--', linewidth=2, label='Perfect prediction')

    # Add trend line
    z = np.polyfit(y_true, y_pred, 1)
    p = np.poly1d(z)
    ax.plot([0, 1], p([0, 1]), "b-", linewidth=2, alpha=0.5, label=f'Trend')

    nice_name = FEATURE_NAMES.get(target, target)
    r2 = results[target]['r2']
    mae = results[target]['mae']

    ax.set_xlabel('True Value')
    ax.set_ylabel('Predicted Value')
    ax.set_title(f'{nice_name}\nR²={r2:.3f}, MAE={mae:.3f}')
    ax.legend(fontsize=8)
    ax.grid(True, alpha=0.3)
    ax.set_xlim(-0.05, 1.05)
    ax.set_ylim(-0.05, 1.05)

# Hide extra subplots
for i in range(n_features, len(axes)):
    axes[i].axis('off')

plt.tight_layout()
plot_path = MODEL_DIR / "validation_scatter_plots.png"
plt.savefig(plot_path, dpi=150, bbox_inches='tight')
print(f"✓ Saved scatter plots: {plot_path}")
plt.close()

# ======================== ERROR ANALYSIS ========================

print("\n" + "="*70)
print("ERROR ANALYSIS")
print("="*70)

errors = np.abs(ground_truth - predictions)

print("\nWorst predictions by feature:")
for i, target in enumerate(TARGETS):
    nice_name = FEATURE_NAMES.get(target, target)
    max_error = errors[:, i].max()
    mean_error = errors[:, i].mean()
    median_error = np.median(errors[:, i])

    print(f"\n{nice_name}:")
    print(f"  Mean error:   {mean_error:.4f}")
    print(f"  Median error: {median_error:.4f}")
    print(f"  Max error:    {max_error:.4f}")

    # Find worst prediction
    worst_idx = errors[:, i].argmax()
    print(f"  Worst case:")
    print(f"    True:  {ground_truth[worst_idx, i]:.3f}")
    print(f"    Pred:  {predictions[worst_idx, i]:.3f}")
    print(f"    Error: {errors[worst_idx, i]:.3f}")

# ======================== SAVE RESULTS ========================

print("\n" + "="*70)
print("SAVING RESULTS")
print("="*70)

# Save metrics to CSV
metrics_df = pd.DataFrame(results).T
metrics_df.index.name = 'feature'
metrics_csv_path = MODEL_DIR / "validation_metrics.csv"
metrics_df.to_csv(metrics_csv_path)
print(f"✓ Saved metrics: {metrics_csv_path}")

# Save detailed results
detailed_results = {
    'predictions': predictions.tolist(),
    'ground_truth': ground_truth.tolist(),
    'targets': TARGETS,
    'overall_metrics': {
        'rmse': float(overall_rmse),
        'mae': float(overall_mae),
        'r2': float(overall_r2),
        'accuracy': float(overall_accuracy),
        'precision': float(overall_precision),
        'recall': float(overall_recall),
        'f1_score': float(overall_f1),
    },
    'per_feature_metrics': {
        target: {k: float(v) for k, v in metrics.items()}
        for target, metrics in results.items()
    }
}

import json
results_json_path = MODEL_DIR / "validation_results.json"
with open(results_json_path, 'w') as f:
    json.dump(detailed_results, f, indent=2)
print(f"✓ Saved detailed results: {results_json_path}")

# ======================== SUMMARY ========================

print("\n" + "="*70)
print("VALIDATION SUMMARY")
print("="*70)

print(f"\n📊 Dataset:")
print(f"  Validation samples: {len(predictions):,}")
print(f"  Features evaluated: {len(TARGETS)}")

print(f"\n📈 Overall Performance:")
print(f"  RMSE:      {overall_rmse:.4f}  {'✓ Excellent' if overall_rmse < 0.05 else '✓ Good' if overall_rmse < 0.10 else '⚠ Needs improvement'}")
print(f"  MAE:       {overall_mae:.4f}  {'✓ Excellent' if overall_mae < 0.04 else '✓ Good' if overall_mae < 0.08 else '⚠ Needs improvement'}")
print(f"  R²:        {overall_r2:.4f}  {'✓ Excellent' if overall_r2 > 0.85 else '✓ Good' if overall_r2 > 0.70 else '⚠ Needs improvement'}")
print(f"  F1-Score:  {overall_f1:.4f}  {'✓ Excellent' if overall_f1 > 0.85 else '✓ Good' if overall_f1 > 0.70 else '⚠ Needs improvement'}")

print(f"\n🎯 Best Performing Features (by R²):")
sorted_features = sorted(results.items(), key=lambda x: x[1]['r2'], reverse=True)
for target, metrics in sorted_features[:3]:
    nice_name = FEATURE_NAMES.get(target, target)
    print(f"  {nice_name}: R²={metrics['r2']:.4f}, MAE={metrics['mae']:.4f}")

print(f"\n⚠️ Needs Improvement (by R²):")
for target, metrics in sorted_features[-3:]:
    nice_name = FEATURE_NAMES.get(target, target)
    print(f"  {nice_name}: R²={metrics['r2']:.4f}, MAE={metrics['mae']:.4f}")

print("\n" + "="*70)
print("✅ VALIDATION COMPLETE!")
print("="*70)
print(f"\nFiles saved:")
print(f"  • Metrics CSV: {metrics_csv_path}")
print(f"  • Results JSON: {results_json_path}")
print(f"  • Scatter plots: {plot_path}")
print("="*70)

MODEL EVALUATION - SIMPLIFIED
Using device: cpu

[1/5] Loading model...


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/STA 160/models/run_01/best_model_robust.pt'

below are old version


In [None]:
# ================================================================
# CELL 6 — Fast, Robust Trainer (prefetch_factor-safe + autosave)
# ================================================================
import os, time, json, random, numpy as np, pandas as pd
from pathlib import Path
from typing import Optional, Tuple, List

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.amp import GradScaler, autocast

# ---------- Paths / Config ----------
OUTDIR = Path("/content/models/run_01"); OUTDIR.mkdir(parents=True, exist_ok=True)
with open(OUTDIR / "config.json", "r") as f: config = json.load(f)

master_file = Path(config.get("master_file", OUTDIR / "META_training_ready.parquet"))
if not master_file.exists() and (OUTDIR / "META_training_ready.parquet").exists():
    master_file = OUTDIR / "META_training_ready.parquet"

print("="*70); print("CELL 6 — Fast, Robust Trainer"); print("="*70)
print(f"Using master: {master_file}")

# ---------- Hyperparams ----------
MEL_BINS  = 128
FRAMES    = 768
VAL_SPLIT = 0.15
BATCH_SIZE = 48
EPOCHS     = 30
LR, WD     = 1e-3, 1e-4
MAX_GRAD_NORM = 1.0
SEED = 2025

USE_CUDA = torch.cuda.is_available()
DEVICE   = torch.device("cuda" if USE_CUDA else "cpu")
AMP_EN   = USE_CUDA

if USE_CUDA:
    torch.backends.cudnn.benchmark = True
    torch.set_float32_matmul_precision("high")

random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if USE_CUDA: torch.cuda.manual_seed_all(SEED)

# ---------- Load data ----------
df = pd.read_parquet(master_file)
print(f"✓ Loaded {len(df):,} master rows")

# prefer signals, else counts
CAND_SETS = [
    ["user_engagement_signal","platform_quality_signal","quality_final"],
    ["views","likes","comments"],
]
for cand in CAND_SETS:
    if all(c in df.columns for c in cand):
        TARGETS = cand; break
else:
    raise RuntimeError("No valid target set in master file.")
print(f"✓ Targets: {TARGETS}")

idx = np.arange(len(df)); np.random.shuffle(idx)
cut = int(len(idx)*(1-VAL_SPLIT))
df_train, df_val = df.iloc[idx[:cut]].reset_index(drop=True), df.iloc[idx[cut:]].reset_index(drop=True)
print(f"✓ Train: {len(df_train):,} | Val: {len(df_val):,}")

target_stats = {t: {"mean": float(pd.to_numeric(df[t], errors="coerce").mean()),
                    "std" : float(pd.to_numeric(df[t], errors="coerce").std() or 1.0)}
                for t in TARGETS}

# ---------- Feature loader ----------
def _finite(arr): return np.isfinite(arr).all()

def load_feature(fp: str, frames: int = FRAMES, center: bool = False) -> Optional[np.ndarray]:
    try:
        with np.load(fp, allow_pickle=False, mmap_mode="r" if USE_CUDA else None) as z:
            key = next((k for k in ("logmel","log_mel","mel","features","x","S") if k in z.files), None)
            if key is None:
                return None
            x = z[key]

        # Squeeze and orient to [F, T]
        if x.ndim == 3:
            x = x.squeeze()
        if x.ndim != 2:
            return None
        if x.shape[0] != MEL_BINS and x.shape[1] == MEL_BINS:
            x = x.T
        if x.shape[0] != MEL_BINS:
            return None

        # --- Key changes start here ---
        # Force a safe float dtype before any magnitude checks
        if x.dtype != np.float32:
            x = x.astype(np.float32, copy=False)

        # Fast finite check first
        if not np.isfinite(x).all():
            return None

        # Suppress noisy overflow warnings during the magnitude guard
        with np.errstate(over="ignore", invalid="ignore"):
            mx = np.nanmax(np.abs(x))
        if not np.isfinite(mx) or mx > 1e6:
            return None
        # --- Key changes end here ---

        # Center/Random crop or pad to FRAMES
        T = x.shape[1]
        if T >= frames:
            start = (T - frames)//2 if center else np.random.randint(0, T - frames + 1)
            x = x[:, start:start+frames]
        else:
            pad = frames - T
            x = np.pad(x, ((0,0), (pad//2, pad - pad//2)), mode="constant")

        # Robust winsorize + standardize
        lo, hi = np.percentile(x, [0.5, 99.5]).astype(np.float32)
        x = np.clip(x, lo, hi)

        m = float(np.mean(x, dtype=np.float64))
        s = float(np.std(x,  dtype=np.float64))
        if not np.isfinite(s) or s < 1e-6:
            return None
        x = (x - m) / s
        x = np.clip(x, -10, 10)

        if not np.isfinite(x).all():
            return None

        # [C=1, F, T]
        return x.astype(np.float32, copy=False)[None, ...]

    except Exception:
        return None


class FastAudioDS(Dataset):
    def __init__(self, df: pd.DataFrame, targets: List[str], center: bool = False):
        self.df = df.reset_index(drop=True); self.targets = targets; self.center = center
    def __len__(self): return len(self.df)
    def __getitem__(self, i: int) -> Optional[Tuple[torch.Tensor, torch.Tensor]]:
        r = self.df.iloc[i]
        x = load_feature(r["feature_path"], center=self.center)
        if x is None: return None
        y = np.array([float(r[t]) for t in self.targets], dtype=np.float32)
        if not np.isfinite(y).all(): return None
        return torch.from_numpy(x), torch.from_numpy(y)

def collate_fn(batch):
    batch = [b for b in batch if b is not None]
    if not batch: return None
    X, y = zip(*batch); return torch.stack(X,0), torch.stack(y,0)

# ---------- DataLoader factory (fixes your error) ----------
# If NUM_WORKERS == 0 -> DO NOT pass prefetch_factor/persistent_workers
if not USE_CUDA:
    BATCH_SIZE = min(BATCH_SIZE, 16)
NUM_WORKERS = max(2, os.cpu_count()//2) if USE_CUDA else 0
PIN = USE_CUDA

def make_loader(dataset, shuffle, drop_last, workers=NUM_WORKERS):
    kwargs = dict(batch_size=BATCH_SIZE, shuffle=shuffle, num_workers=workers,
                  pin_memory=PIN, collate_fn=collate_fn, drop_last=drop_last)
    if workers > 0:
        kwargs.update(dict(persistent_workers=True, prefetch_factor=4))
    return DataLoader(dataset, **kwargs)

print("\nCreating train/val datasets (fast & robust)…")
train_ds = FastAudioDS(df_train, TARGETS, center=False)
val_ds   = FastAudioDS(df_val,   TARGETS, center=True)

train_loader = make_loader(train_ds, shuffle=True,  drop_last=True)
val_loader   = make_loader(val_ds,   shuffle=False, drop_last=False)
print(f"✓ Train batches: {len(train_loader)} | Val batches: {len(val_loader)}")

# ---------- Model ----------
class ConvBlock(nn.Module):
    def __init__(self, cin, cout, k=(3,7), s=(1,1), p=None):
        super().__init__()
        p = p or (k[0]//2, k[1]//2)
        self.conv = nn.Conv2d(cin, cout, k, s, p); self.bn = nn.BatchNorm2d(cout); self.act = nn.SiLU()
    def forward(self, x): return self.act(self.bn(self.conv(x)))

class MultiTaskModel(nn.Module):
    def __init__(self, n_targets):
        super().__init__()
        C = [32, 64, 128, 256, 512]
        self.stem = ConvBlock(1, C[0], (3,7))
        self.b1   = ConvBlock(C[0], C[1], (3,5))
        self.b2   = ConvBlock(C[1], C[2], (3,5))
        self.b3   = ConvBlock(C[2], C[3], (3,3))
        self.pool = nn.AdaptiveAvgPool2d((1,1))
        self.head = nn.Sequential(nn.Flatten(), nn.Linear(C[3], 256), nn.SiLU(), nn.Dropout(0.2), nn.Linear(256, n_targets))
    def forward(self, x):
        x = self.stem(x); x = self.b1(x); x = nn.functional.max_pool2d(x, 2)
        x = self.b2(x);  x = nn.functional.max_pool2d(x, 2)
        x = self.b3(x);  x = self.pool(x)
        return self.head(x)

model = MultiTaskModel(n_targets=len(TARGETS)).to(DEVICE)
if USE_CUDA: model = model.to(memory_format=torch.channels_last)

optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WD)
criterion = nn.MSELoss()
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.5, patience=3)
scaler = GradScaler(device="cuda") if AMP_EN else None

def to_dev(X, y):
    if USE_CUDA:
        X = X.to(DEVICE, non_blocking=True).to(memory_format=torch.channels_last)
        y = y.to(DEVICE, non_blocking=True)
    else:
        X, y = X.to(DEVICE), y.to(DEVICE)
    return X, y

@torch.no_grad()
def evaluate():
    model.eval(); losses = []
    for batch in val_loader:
        if batch is None: continue
        X, y = to_dev(*batch)
        with autocast(device_type="cuda", enabled=AMP_EN):
            pred = model(X); loss = criterion(pred, y)
        if torch.isfinite(loss): losses.append(loss.item())
    return float(np.mean(losses)) if losses else float("inf")

# ---------- Train (auto-save best) ----------
print("\nTraining…"); print("="*70)
best_val, history = float("inf"), []
ckpt_path = OUTDIR / "best_model_robust.pt"

for epoch in range(1, EPOCHS+1):
    t0 = time.time(); model.train(); batch_losses = []
    for batch in make_loader(train_ds, shuffle=True, drop_last=True):  # fresh shuffles
        if batch is None: continue
        X, y = to_dev(*batch)
        optimizer.zero_grad(set_to_none=True)
        if AMP_EN:
            with autocast(device_type="cuda", enabled=True):
                pred = model(X); loss = criterion(pred, y)
            if not torch.isfinite(loss): continue
            scaler.scale(loss).backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
            scaler.step(optimizer); scaler.update()
        else:
            pred = model(X); loss = criterion(pred, y)
            if not torch.isfinite(loss): continue
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
            optimizer.step()
        batch_losses.append(loss.item())

    train_loss = float(np.mean(batch_losses)) if batch_losses else float("inf")
    val_loss = evaluate(); scheduler.step(val_loss)
    print(f"Epoch {epoch:02d} | Train {train_loss:.4f} | Val {val_loss:.4f} | {time.time()-t0:.0f}s")
    history.append({"epoch": epoch, "train_loss": train_loss, "val_loss": val_loss})

    if np.isfinite(val_loss) and val_loss < best_val:
        best_val = val_loss
        state = {k: v.detach().cpu() for k,v in model.state_dict().items()}
        torch.save({
            "model_state_dict": state,
            "targets": TARGETS,
            "target_stats": target_stats,
            "best_val_loss": best_val,
            "epoch": epoch,
            "config": {"mel_bins": MEL_BINS, "frames": FRAMES,
                       "batch_size": BATCH_SIZE, "lr": LR, "weight_decay": WD}
        }, ckpt_path)
        print(f"  ✓ New best {best_val:.4f} — saved to {ckpt_path}")

# Save history + update config
pd.DataFrame(history).to_csv(OUTDIR / "training_history_robust.csv", index=False)
config.update({
    "best_val_loss": best_val,
    "targets": TARGETS,
    "target_stats": target_stats,
    "checkpoint_path": str(ckpt_path),
})
with open(OUTDIR / "config.json", "w") as f: json.dump(config, f, indent=2)

print("\n✅ Training complete")
print(f"Best val loss: {best_val:.4f}")
print(f"Best model: {ckpt_path}")


CELL 6 — Fast, Robust Trainer
Using master: /content/models/run_01/META_master_postenrich.parquet
✓ Loaded 9,565 master rows
✓ Targets: ['user_engagement_signal', 'platform_quality_signal', 'quality_final']
✓ Train: 8,130 | Val: 1,435

Creating train/val datasets (fast & robust)…
✓ Train batches: 169 | Val batches: 30

Training…


KeyboardInterrupt: 

In [None]:
# In a new cell:
import pandas as pd
df = pd.read_parquet("/content/models/run_01/META_master_postenrich.parquet")

print("Target statistics:")
for col in ['user_engagement_signal', 'platform_quality_signal', 'quality_final']:
    print(f"\n{col}:")
    print(df[col].describe())

Target statistics:

user_engagement_signal:
count    9.565000e+03
mean     9.571639e-09
std      1.000001e+00
min     -4.611747e-01
25%     -3.681757e-01
50%     -2.923262e-01
75%     -1.100938e-01
max      4.270350e+00
Name: user_engagement_signal, dtype: float64

platform_quality_signal:
count    9.565000e+03
mean    -6.381093e-09
std      1.000001e+00
min     -2.991015e+00
25%     -4.720244e-01
50%      1.550676e-01
75%      7.101693e-01
max      1.763698e+00
Name: platform_quality_signal, dtype: float64

quality_final:
count    9.565000e+03
mean     3.988183e-09
std      4.167075e-01
min     -1.336446e+00
25%     -2.801811e-01
50%     -7.660054e-03
75%      2.961998e-01
max      1.284194e+00
Name: quality_final, dtype: float64


In [None]:
# ======================================================================
# CELL 8.5 — Model loader (FIXED to match Cell 6 architecture)
# ======================================================================
from pathlib import Path
import torch, torch.nn as nn
import numpy as np

# --------------------------- discover test audio ---------------------------
DRIVE_FOLDER = Path("/content/drive/MyDrive/STA 160/test")
candidates = sorted(
    [*DRIVE_FOLDER.glob("*.mp3"), *DRIVE_FOLDER.glob("*.wav"),
     *DRIVE_FOLDER.glob("*.m4a"), *DRIVE_FOLDER.glob("*.flac"),
     *DRIVE_FOLDER.glob("*.ogg")]
)
test_files = [p for p in candidates if not p.name.endswith(".crdownload") and p.stat().st_size > 0]
song_names = [p.stem[:80] for p in test_files]

print("Testing these files:")
for i, (p, name) in enumerate(zip(test_files, song_names), 1):
    print(f"  {i}. ✓ {name} ({p.suffix.lower()} {p.stat().st_size/1_048_576:.1f} MB)")

assert len(test_files) >= 2, (
    "Need at least 2 finalized audio files in /STA 160/test folder."
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ckpt_path = Path("/content/models/run_01/best_model_robust.pt")

# =============== MODEL ARCHITECTURES (including Cell 6's MultiTaskModel) ===============

# --- Cell 6 Architecture (THE ONE YOU ACTUALLY TRAINED) ---
class ConvBlock(nn.Module):
    def __init__(self, cin, cout, k=(3,7), s=(1,1), p=None):
        super().__init__()
        p = p or (k[0]//2, k[1]//2)
        self.conv = nn.Conv2d(cin, cout, k, s, p)
        self.bn = nn.BatchNorm2d(cout)
        self.act = nn.SiLU()
    def forward(self, x):
        return self.act(self.bn(self.conv(x)))

class MultiTaskModel(nn.Module):
    """This is the EXACT architecture from Cell 6"""
    def __init__(self, n_targets):
        super().__init__()
        C = [32, 64, 128, 192]
        self.stem = ConvBlock(1, C[0], (3,7))
        self.b1   = ConvBlock(C[0], C[1], (3,5))
        self.b2   = ConvBlock(C[1], C[2], (3,5))
        self.b3   = ConvBlock(C[2], C[3], (3,3))
        self.pool = nn.AdaptiveAvgPool2d((1,1))
        self.head = nn.Sequential(
            nn.Flatten(),
            nn.Linear(C[3], 256),
            nn.SiLU(),
            nn.Dropout(0.2),
            nn.Linear(256, n_targets)
        )

    def forward(self, x):
        x = self.stem(x)
        x = self.b1(x)
        x = nn.functional.max_pool2d(x, 2)
        x = self.b2(x)
        x = nn.functional.max_pool2d(x, 2)
        x = self.b3(x)
        x = self.pool(x)
        return self.head(x)

# --- Legacy architectures (keep for backwards compatibility) ---
class LegacyConvBlock(nn.Module):
    def __init__(self, in_ch, out_ch, k=(3,3), p=(1,1)):
        super().__init__()
        self.conv = nn.Conv2d(in_ch, out_ch, kernel_size=k, padding=p)
        self.bn = nn.BatchNorm2d(out_ch)
        self.act = nn.ReLU(inplace=True)
    def forward(self, x):
        return self.act(self.bn(self.conv(x)))

class AudioBackbone(nn.Module):
    def __init__(self):
        super().__init__()
        self.stem   = LegacyConvBlock(1,   32, (3,7), (1,3))
        self.block1 = LegacyConvBlock(32,  64, (3,5), (1,2))
        self.block2 = LegacyConvBlock(64, 128, (3,5), (1,2))
        self.block3 = LegacyConvBlock(128,192, (3,3), (1,1))
        self.pool   = nn.AdaptiveAvgPool2d((1,1))
    def forward(self, x):
        x = self.stem(x)
        x = self.block1(x)
        x = self.block2(x)
        x = self.block3(x)
        return self.pool(x).flatten(1)

class MultiTask_Backbone(nn.Module):
    def __init__(self, n_targets):
        super().__init__()
        self.backbone = AudioBackbone()
        self.heads = nn.ModuleList([
            nn.Sequential(nn.Linear(192,128), nn.ReLU(True), nn.Dropout(0.3), nn.Linear(128,1))
            for _ in range(n_targets)
        ])
    def forward(self, x):
        f = self.backbone(x)
        outs = [h(f).squeeze(1) for h in self.heads]
        return torch.stack(outs, dim=1)

# --------------------------- safe checkpoint load ---------------------------
def load_checkpoint_safe(path, device):
    """Try weights_only=True, then allowlist numpy, finally fallback"""
    try:
        return torch.load(path, map_location=device, weights_only=True)
    except Exception:
        try:
            from torch.serialization import add_safe_globals
            add_safe_globals([np._core.multiarray.scalar, np.dtype])
            return torch.load(path, map_location=device, weights_only=True)
        except Exception:
            print("[warn] Falling back to weights_only=False (trusted local checkpoint).")
            return torch.load(path, map_location=device, weights_only=False)

# ---------- robust checkpoint extraction ----------
def extract_state_targets_cfg(ckpt):
    """Return (state_dict, targets:list[str], cfg:dict)"""
    if isinstance(ckpt, torch.nn.Module):
        return ckpt.state_dict(), ["quality_final"], {}

    if not isinstance(ckpt, dict):
        raise RuntimeError(f"Unsupported checkpoint type: {type(ckpt)}")

    # Find state dict
    candidate_keys = [
        "model_state_dict", "state_dict", "ema_state_dict",
        "model", "net", "weights", "params"
    ]
    state = None
    for k in candidate_keys:
        v = ckpt.get(k)
        if isinstance(v, dict) and all(isinstance(x, torch.Tensor) for x in v.values()):
            state = v
            break

    if state is None and all(isinstance(v, torch.Tensor) for v in ckpt.values()):
        state = ckpt

    if state is None:
        raise RuntimeError("Could not locate model state_dict in checkpoint.")

    # Strip common prefixes
    STRIP_PREFIXES = ("module.", "_orig_mod.", "model.", "net.")
    def _strip(k: str) -> str:
        for p in STRIP_PREFIXES:
            if k.startswith(p):
                return k[len(p):]
        return k
    state = {_strip(k): v for k, v in state.items()}

    targets = ckpt.get("targets") or ckpt.get("target_names") or ["quality_final"]
    cfg = ckpt.get("config") or ckpt.get("cfg") or {}

    return state, targets, cfg

# ---------------------- Load checkpoint ----------------------
print("\n[1/4] Loading checkpoint...")
checkpoint = load_checkpoint_safe(ckpt_path, device)
state, TARGETS, cfg = extract_state_targets_cfg(checkpoint)

# Get config values
DEFAULT_MEL_BINS = 128
DEFAULT_FRAMES   = 768  # Cell 6 uses 768
MEL_BINS = int(cfg.get("mel_bins", DEFAULT_MEL_BINS))
FRAMES   = int(cfg.get("frames", DEFAULT_FRAMES))

print(f"✓ Checkpoint loaded")
print(f"  Targets: {TARGETS}")
print(f"  MEL_BINS: {MEL_BINS}, FRAMES: {FRAMES}")

# ---------------------- Detect architecture ----------------------
def _has_prefix_keys(sdict, prefix: str) -> bool:
    return any(k.startswith(prefix) for k in sdict.keys())

_sample_keys = list(state.keys())[:15]
print(f"\n[DEBUG] Sample checkpoint keys: {_sample_keys}")

# Architecture detection with Cell 6 support
if _has_prefix_keys(state, "stem.") and _has_prefix_keys(state, "head."):
    print("[arch] ✓ Detected Cell 6 MultiTaskModel (stem.* + head.*)")
    model = MultiTaskModel(n_targets=len(TARGETS))
    strict = True
elif _has_prefix_keys(state, "backbone."):
    print("[arch] Detected MultiTask_Backbone")
    model = MultiTask_Backbone(n_targets=len(TARGETS))
    strict = True
else:
    print("[arch] ⚠️ Unknown architecture, trying Cell 6 MultiTaskModel as fallback")
    model = MultiTaskModel(n_targets=len(TARGETS))
    strict = False

# Load weights
missing, unexpected = model.load_state_dict(state, strict=strict)

if missing:
    print(f"[warn] Missing keys ({len(missing)}): {missing[:5]}")
if unexpected:
    print(f"[warn] Unexpected keys ({len(unexpected)}): {unexpected[:5]}")

model.to(device).eval()
print(f"✓ Model loaded: {model.__class__.__name__} on {device}")

# ==================== AUDIO PROCESSING ====================
print("\n[2/4] Setting up audio processing...")

import librosa, tempfile, subprocess, os

SAMPLE_RATE = 16000
N_FFT = 2048
HOP_LENGTH = 512

def _convert_to_wav_ffmpeg(audio_path):
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp.close()
    cmd = ["ffmpeg", "-y", "-i", str(audio_path), "-ac", "1", "-ar", str(SAMPLE_RATE), tmp.name]
    try:
        subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
        return tmp.name
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"ffmpeg failed: {e}") from e

def safe_load_audio(path):
    try:
        y, sr = librosa.load(path, sr=SAMPLE_RATE, mono=True)
        if y is None or len(y) == 0:
            raise RuntimeError("empty audio")
        return y, sr, "librosa"
    except Exception:
        wav = None
        try:
            wav = _convert_to_wav_ffmpeg(path)
            y, sr = librosa.load(wav, sr=SAMPLE_RATE, mono=True)
            if y is None or len(y) == 0:
                raise RuntimeError("empty audio after ffmpeg")
            return y, sr, "ffmpeg->librosa"
        finally:
            if wav and os.path.exists(wav):
                try:
                    os.unlink(wav)
                except:
                    pass

def make_logmel(y, sr=SAMPLE_RATE):
    S = librosa.feature.melspectrogram(
        y=y, sr=sr, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mels=MEL_BINS, power=2.0
    )
    S_db = librosa.power_to_db(S, ref=np.max)

    # Crop/pad to FRAMES
    T = S_db.shape[1]
    if T >= FRAMES:
        start = max(0, (T - FRAMES)//2)
        S_db = S_db[:, start:start+FRAMES]
    else:
        pad = FRAMES - T
        S_db = np.pad(S_db, ((0,0),(pad//2, pad - pad//2)), mode="edge")

    # Normalize
    m = float(np.mean(S_db))
    s = float(np.std(S_db))
    if not np.isfinite(m) or not np.isfinite(s) or s < 1e-6:
        s = 1.0
    Z = (S_db - m) / s
    np.clip(Z, -10, 10, out=Z)

    if not np.isfinite(Z).all():
        raise RuntimeError("non-finite after normalization")

    return torch.from_numpy(Z.astype(np.float32)).unsqueeze(0).unsqueeze(0)

print("✓ Audio processing ready")

# ==================== PREDICT ====================
print("\n[3/4] Predicting quality for your songs...")
print("="*70)

@torch.no_grad()
def predict_one(audio_path: Path):
    y, sr, how = safe_load_audio(str(audio_path))
    x = make_logmel(y, sr).to(device)
    pred = model(x).cpu().numpy()[0]
    return pred, how

results = []
errors  = []

for i, (audio_file, nice_name) in enumerate(zip(test_files, song_names), 1):
    print("\n" + "="*70)
    print(f"SONG {i}: {nice_name}")
    print("="*70)

    try:
        pred, how = predict_one(audio_file)
        print(f"  ✓ Loaded via: {how}")

        for j, t in enumerate(TARGETS):
            val = float(pred[j])
            print(f"\n{t.upper().replace('_',' ')}:\n  Predicted: {val:7.3f}")

        results.append({
            "song": nice_name,
            "file": audio_file.name,
            "pred": pred.tolist()
        })
    except Exception as e:
        print(f"❌ Failed: {audio_file.name} — {e}")
        errors.append((audio_file.name, str(e)))

# ==================== COMPARE ====================
print("\n[4/4] Comparing your songs...")
print("="*70)

if len(results) >= 2:
    a, b = results[0], results[1]
    print(f"\nSong 1: {a['song']}\nSong 2: {b['song']}")

    diffs = []
    for j, t in enumerate(TARGETS):
        v1, v2 = a["pred"][j], b["pred"][j]
        diff = abs(v1 - v2)
        diffs.append(diff)
        print(f"\n{t.upper().replace('_',' ')}:")
        print(f"  Song 1: {v1:7.3f}")
        print(f"  Song 2: {v2:7.3f}")
        print(f"  Diff:   {diff:7.3f}")

    print("\n" + "="*70)
    print(f"AVERAGE DIFFERENCE: {np.mean(diffs):.3f}")

elif len(results) == 1:
    print("\n⚠️ Only 1 song processed successfully — add another file for comparison.")
else:
    print("\n❌ No songs processed — see errors above.")

# Save results
if results:
    import json
    outp = Path("/content/drive/MyDrive/STA 160/models/run_01/custom_songs_test.json")
    outp.parent.mkdir(parents=True, exist_ok=True)
    with open(outp, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\n✓ Results saved: {outp}")

print("\n" + "="*70)
print("✅ EVALUATION COMPLETE!")
print("="*70)

Testing these files:
  1. ✓ Adele_-_Hello_Original_-_yana_enik_(mp3.pm) (.mp3 6.7 MB)
  2. ✓ DJ_Smash_feat._Ridley_-_The_Night_Is_Young_(mp3.pm) (.mp3 6.7 MB)
  3. ✓ 《孤勇者》（《英雄聯盟：雙城之戰》動畫劇集中文主題曲）陳奕迅 Eason Chan [Official MV] (.mp3 4.2 MB)

[1/4] Loading checkpoint...
✓ Checkpoint loaded
  Targets: ['user_engagement_signal', 'platform_quality_signal', 'quality_final']
  MEL_BINS: 128, FRAMES: 768

[DEBUG] Sample checkpoint keys: ['stem.conv.weight', 'stem.conv.bias', 'stem.bn.weight', 'stem.bn.bias', 'stem.bn.running_mean', 'stem.bn.running_var', 'stem.bn.num_batches_tracked', 'b1.conv.weight', 'b1.conv.bias', 'b1.bn.weight', 'b1.bn.bias', 'b1.bn.running_mean', 'b1.bn.running_var', 'b1.bn.num_batches_tracked', 'b2.conv.weight']
[arch] ✓ Detected Cell 6 MultiTaskModel (stem.* + head.*)
✓ Model loaded: MultiTaskModel on cuda

[2/4] Setting up audio processing...
✓ Audio processing ready

[3/4] Predicting quality for your songs...

SONG 1: Adele_-_Hello_Original_-_yana_enik_(mp3.pm)
  ✓ Loade

In [None]:
# ==============================================================================
# CELL 7: Save Model (FIXED - Saves Best Model)
# ==============================================================================

import json
import torch
from pathlib import Path

print("="*70)
print("CELL 7: Save Best Model")
print("="*70)

OUTPUT_DIR = Path("/content/models/run_01")

# Load config
with open(OUTPUT_DIR / "config.json", 'r') as f:
    config = json.load(f)

print(f"\n[1/3] Checking training status...")

# Check if training completed
if 'best_val_loss' not in config:
    print("⚠️  Warning: Training may not have completed")
    print("   Saving current model anyway...")

best_val_loss = config.get('best_val_loss', None)
if best_val_loss:
    print(f"✓ Best validation loss: {best_val_loss:.4f}")

# Get targets and stats
TARGETS = config.get('targets', ['user_engagement_signal', 'platform_quality_signal', 'quality_final'])
target_stats = config.get('target_stats', {})

print(f"✓ Targets: {TARGETS}")
print(f"✓ Target stats loaded: {len(target_stats)} targets")

# Check if model exists in memory
if 'model' not in globals():
    print("\n⚠️  WARNING: Model not found in memory!")
    print("   Did Cell 6 (training) complete?")
    print("   You may need to re-run Cell 6 first.")
    raise NameError("Model not defined. Run Cell 6 first.")

print(f"\n[2/3] Creating checkpoint...")

# Create checkpoint
checkpoint = {
    # Model
    'model_state_dict': model.state_dict(),
    'model_architecture': 'MultiTaskModel',

    # Targets
    'targets': TARGETS,
    'n_targets': len(TARGETS),

    # Statistics (CRITICAL for denormalization!)
    'target_stats': target_stats,

    # Training info
    'best_val_loss': best_val_loss,
    'epochs_trained': config.get('epochs_trained', None),

    # Configuration
    'config': {
        'mel_bins': 128,
        'frames': 1024,
        'batch_size': 32,
        'learning_rate': 1e-3,
        'seed': 2025
    },

    # Metadata
    'training_samples': config.get('final_samples', len(target_stats) if target_stats else None),
}

print("✓ Checkpoint created")
print(f"  Keys: {list(checkpoint.keys())}")

# Verify critical components
critical_keys = ['model_state_dict', 'target_stats', 'targets']
for key in critical_keys:
    if key in checkpoint and checkpoint[key]:
        print(f"  ✓ {key}: present")
    else:
        print(f"  ⚠️  {key}: MISSING or empty!")

print(f"\n[3/3] Saving checkpoint...")

# Save
checkpoint_path = OUTPUT_DIR / "best_model.pt"
torch.save(checkpoint, checkpoint_path)

file_size = checkpoint_path.stat().st_size / (1024 * 1024)
print(f"✓ Saved to: {checkpoint_path}")
print(f"  File size: {file_size:.2f} MB")

if file_size < 0.5:
    print("  ⚠️  Warning: File seems small, may be incomplete")

# Verify checkpoint can be loaded
print("\nVerifying checkpoint...")
try:
    loaded = torch.load(checkpoint_path, map_location='cpu')
    print("✓ Checkpoint can be loaded")
    print(f"  Keys: {list(loaded.keys())}")

    if 'target_stats' in loaded:
        print(f"  ✓ target_stats: {len(loaded['target_stats'])} targets")
        for target, stats in loaded['target_stats'].items():
            print(f"    {target}: mean={stats['mean']:.4f}, std={stats['std']:.4f}")

    if 'model_state_dict' in loaded:
        print(f"  ✓ model_state_dict: {len(loaded['model_state_dict'])} parameters")

except Exception as e:
    print(f"  ✗ Verification failed: {e}")
    raise

# Update config
config['checkpoint_path'] = str(checkpoint_path)
config['checkpoint_saved'] = True

with open(OUTPUT_DIR / "config.json", 'w') as f:
    json.dump(config, f, indent=2)

print("\n" + "="*70)
print("MODEL SAVED SUCCESSFULLY")
print("="*70)
print(f"Location:     {checkpoint_path}")
print(f"Size:         {file_size:.2f} MB")
print(f"Targets:      {TARGETS}")
print(f"Best val loss: {best_val_loss if best_val_loss else 'N/A'}")
print("="*70)

print("\n✅ Cell 7 Complete - Ready for Cell 8 (Testing)")

In [None]:
# Check training history
import pandas as pd
from pathlib import Path

OUTPUT_DIR = Path("/content/models/run_01")
history_file = OUTPUT_DIR / "training_history.csv"

if history_file.exists():
    history = pd.read_csv(history_file)

    print("COMPLETE TRAINING HISTORY")
    print("="*70)
    print(history.to_string(index=False))

    print("\n" + "="*70)
    print("ANALYSIS")
    print("="*70)

    best_epoch = history.loc[history['val_loss'].idxmin()]
    last_epoch = history.iloc[-1]

    print(f"\nBest Epoch: {int(best_epoch['epoch'])}")
    print(f"  Train Loss: {best_epoch['train_loss']:.4f}")
    print(f"  Val Loss:   {best_epoch['val_loss']:.4f}")

    print(f"\nLast Epoch: {int(last_epoch['epoch'])}")
    print(f"  Train Loss: {last_epoch['train_loss']:.4f}")
    print(f"  Val Loss:   {last_epoch['val_loss']:.4f}")

    print(f"\nSaved Model Val Loss: 0.9822")

    if abs(last_epoch['val_loss'] - 0.9822) < 0.01:
        print("\n⚠️  PROBLEM: Saved LAST epoch model (not best!)")
        print(f"   You should have saved epoch {int(best_epoch['epoch'])} instead")
        print(f"   Lost improvement: {0.9822 - best_epoch['val_loss']:.4f}")
    elif abs(best_epoch['val_loss'] - 0.9822) < 0.01:
        print("\n✓ Saved the BEST epoch model")
    else:
        print("\n🤔 Unclear which epoch was saved")

else:
    print("❌ No training history found")

❌ No training history found


In [None]:
# ==============================================================================
# CELL 8: Comprehensive Testing - Both Approaches
# ==============================================================================

import os
import json
import tempfile
import subprocess
from pathlib import Path
from datetime import datetime
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

print("="*70)
print("CELL 8: Comprehensive Model Testing")
print("="*70)
print("\nThis cell tests TWO approaches:")
print("  1. Quality Signals (normalized, z-scores)")
print("  2. Percentage Scores (0-100%, with Sigmoid)")
print("="*70)

# ==================== SETUP ====================
OUTPUT_DIR = Path("/content/models/run_01")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ==================== APPROACH 1: Quality Signals ====================
print("\n" + "="*70)
print("APPROACH 1: Quality Signal Predictions")
print("="*70)

# Load config
with open(OUTPUT_DIR / "config.json", 'r') as f:
    config = json.load(f)

checkpoint_path = Path(config['checkpoint_path'])
checkpoint = torch.load(checkpoint_path, map_location='cpu')

TARGETS_QUALITY = checkpoint['targets']
target_stats = checkpoint['target_stats']

print(f"✓ Targets: {TARGETS_QUALITY}")
print(f"✓ These are normalized quality signals (z-scores)")

# Model definition (NO Sigmoid)
class ConvBlock(nn.Module):
    def __init__(self, in_ch, out_ch, kernel, padding):
        super().__init__()
        self.conv = nn.Conv2d(in_ch, out_ch, kernel_size=kernel, padding=padding)
        self.bn = nn.BatchNorm2d(out_ch)
        self.act = nn.SiLU(inplace=True)
    def forward(self, x):
        return self.act(self.bn(self.conv(x)))

class AudioBackbone(nn.Module):
    def __init__(self):
        super().__init__()
        self.stem = ConvBlock(1, 32, (3,7), (1,3))
        self.block1 = ConvBlock(32, 64, (3,5), (1,2))
        self.block2 = ConvBlock(64, 128, (3,5), (1,2))
        self.block3 = ConvBlock(128, 192, (3,3), (1,1))
        self.pool = nn.AdaptiveAvgPool2d((1,1))

    def forward(self, x):
        x = self.stem(x)
        x = self.block1(x)
        x = self.block2(x)
        x = self.block3(x)
        return self.pool(x).flatten(1)

class MultiTaskModel(nn.Module):
    def __init__(self, n_targets):
        super().__init__()
        self.backbone = AudioBackbone()
        self.heads = nn.ModuleList([
            nn.Sequential(
                nn.Linear(192, 128),
                nn.ReLU(inplace=True),
                nn.Dropout(0.3),
                nn.Linear(128, 1)
                # NO Sigmoid!
            )
            for _ in range(n_targets)
        ])

    def forward(self, x):
        features = self.backbone(x)
        outputs = [head(features).squeeze(1) for head in self.heads]
        return torch.stack(outputs, dim=1)

# Load model
model_quality = MultiTaskModel(n_targets=len(TARGETS_QUALITY))
model_quality.load_state_dict(checkpoint['model_state_dict'])
model_quality.to(device)
model_quality.eval()

print(f"✓ Quality signal model loaded")

# ==================== APPROACH 2: Percentage Scores ====================
print("\n" + "="*70)
print("APPROACH 2: Percentage Score Predictions")
print("="*70)

# Try to find a model with Sigmoid heads
MODEL_PATH_PERCENT = None
CANDIDATE_PATHS = [
    OUTPUT_DIR / "best_model_sigmoid.pt",
    OUTPUT_DIR / "phase1_best.pt",
    Path("/content/drive/MyDrive/models/phase1/phase1_model.pth"),
]

for path in CANDIDATE_PATHS:
    if path.exists():
        MODEL_PATH_PERCENT = path
        break

if MODEL_PATH_PERCENT:
    print(f"✓ Found percentage model: {MODEL_PATH_PERCENT}")

    # Model with Sigmoid
    class MultiTaskModelSigmoid(nn.Module):
        def __init__(self, target_names):
            super().__init__()
            self.backbone = AudioBackbone()
            self.heads = nn.ModuleDict({
                name: nn.Sequential(
                    nn.Linear(192, 128),
                    nn.ReLU(inplace=True),
                    nn.Linear(128, 1),
                    nn.Sigmoid()  # Output 0-1
                )
                for name in target_names
            })

        def forward(self, x):
            z = self.backbone(x)
            return {name: head(z).squeeze(-1) for name, head in self.heads.items()}

    try:
        ckpt_percent = torch.load(MODEL_PATH_PERCENT, map_location=device)

        # Extract targets
        TARGETS_PERCENT = ckpt_percent.get('targets', TARGETS_QUALITY)

        model_percent = MultiTaskModelSigmoid(TARGETS_PERCENT)
        state_dict = ckpt_percent.get('model_state_dict', ckpt_percent)
        model_percent.load_state_dict(state_dict, strict=False)
        model_percent.to(device)
        model_percent.eval()

        print(f"✓ Percentage model loaded")
        print(f"  Targets: {TARGETS_PERCENT}")
        HAS_PERCENT_MODEL = True
    except Exception as e:
        print(f"⚠️  Could not load percentage model: {e}")
        HAS_PERCENT_MODEL = False
else:
    print("⚠️  No percentage model found (optional)")
    HAS_PERCENT_MODEL = False

# ==================== AUDIO PREPROCESSING ====================
print("\n" + "="*70)
print("Setting up audio preprocessing...")
print("="*70)

MEL_BINS = 128
FRAMES = 1024

def load_feature(filepath):
    """Load preprocessed feature file"""
    if str(filepath).endswith('.npy'):
        arr = np.load(filepath, allow_pickle=False)
    else:
        with np.load(filepath, allow_pickle=False) as data:
            arr = None
            for key in ['logmel', 'log_mel', 'mel', 'features', 'x', 'S']:
                if key in data:
                    arr = data[key]
                    break
            if arr is None:
                arr = data[data.files[0]]

    if arr.ndim == 3 and 1 in arr.shape:
        arr = arr.squeeze()

    if arr.shape[0] != MEL_BINS and arr.shape[1] == MEL_BINS:
        arr = arr.T

    if arr.shape[1] > FRAMES:
        start = (arr.shape[1] - FRAMES) // 2
        arr = arr[:, start:start+FRAMES]
    elif arr.shape[1] < FRAMES:
        pad = FRAMES - arr.shape[1]
        arr = np.pad(arr, ((0, 0), (pad//2, pad - pad//2)), mode='constant')

    mean = arr.mean()
    std = arr.std() + 1e-6
    arr = (arr - mean) / std

    return torch.from_numpy(arr.astype(np.float32)).unsqueeze(0).unsqueeze(0)

print("✓ Preprocessing ready")

# ==================== PREDICTION FUNCTIONS ====================

@torch.no_grad()
def predict_quality_signals(feature_path):
    """Predict quality signals (normalized)"""
    x = load_feature(feature_path).to(device)
    pred_norm = model_quality(x).cpu().numpy()[0]

    # Denormalize
    pred_real = {}
    for i, target in enumerate(TARGETS_QUALITY):
        mean = target_stats[target]['mean']
        std = target_stats[target]['std']
        pred_real[target] = pred_norm[i] * std + mean

    return pred_norm, pred_real

@torch.no_grad()
def predict_percentage_scores(feature_path):
    """Predict percentage scores (0-100%)"""
    if not HAS_PERCENT_MODEL:
        return None

    x = load_feature(feature_path).to(device)
    out = model_percent(x)

    # Convert to percentage
    return {t: float(v.item()) * 100.0 for t, v in out.items()}

# ==================== DISPLAY FUNCTIONS ====================

def display_quality_predictions(filename, pred_norm, pred_real, actual=None):
    """Display quality signal predictions"""
    print("\n" + "="*70)
    print(f"QUALITY SIGNALS: {filename}")
    print("="*70)

    for target in pred_real.keys():
        print(f"\n{target.upper().replace('_', ' ')}:")
        print("-" * 70)

        value = pred_real[target]
        print(f"  Predicted: {value:7.3f}")

        if actual and target in actual:
            print(f"  Actual:    {actual[target]:7.3f}")
            error = abs(value - actual[target])
            print(f"  Error:     {error:7.3f}")

        # Interpretation (z-score)
        if value > 1.5:
            print(f"  → Exceptional (top 7%) ⭐⭐⭐⭐⭐")
        elif value > 0.5:
            print(f"  → Above average (top 31%) ⭐⭐⭐⭐")
        elif value > -0.5:
            print(f"  → Average (middle 38%) ⭐⭐⭐")
        elif value > -1.5:
            print(f"  → Below average (bottom 31%) ⭐⭐")
        else:
            print(f"  → Poor (bottom 7%) ⭐")

def display_percentage_predictions(filename, scores):
    """Display percentage predictions"""
    if not scores:
        return

    print("\n" + "="*70)
    print(f"PERCENTAGE SCORES: {filename}")
    print("="*70)

    labels = {
        "user_engagement_signal": "👤 User Engagement",
        "platform_quality_signal": "🎯 Platform Quality",
        "relative_quality_signal": "⭐ Relative Quality",
        "quality_final": "🏆 Overall Quality",
    }

    for target, value in scores.items():
        bar = "█" * int(value/5) + "░" * (20 - int(value/5))
        label = labels.get(target, target)
        print(f"{label:30s} [{bar}] {value:5.1f}%")

# ==================== FIND TEST FILES ====================
print("\n" + "="*70)
print("Finding test files...")
print("="*70)

master_file = config['master_file']
master = pd.read_parquet(master_file)

# Select 2 test samples
np.random.seed(42)
test_indices = np.random.choice(len(master), size=min(2, len(master)), replace=False)
test_samples = master.iloc[test_indices].reset_index(drop=True)

print(f"✓ Selected {len(test_samples)} test files")

# ==================== RUN PREDICTIONS ====================
print("\n" + "="*70)
print("RUNNING PREDICTIONS")
print("="*70)

all_results = []

for i, row in test_samples.iterrows():
    print(f"\n{'='*70}")
    print(f"TEST FILE {i+1}/{len(test_samples)}")
    print(f"{'='*70}")
    print(f"Video ID: {row['video_id']}")
    print(f"File: {Path(row['feature_path']).name}")

    try:
        # Approach 1: Quality signals
        pred_norm, pred_real = predict_quality_signals(row['feature_path'])
        actual = {t: row[t] for t in TARGETS_QUALITY if t in row.index}
        display_quality_predictions(
            Path(row['feature_path']).name,
            pred_norm,
            pred_real,
            actual if actual else None
        )

        # Approach 2: Percentage scores
        if HAS_PERCENT_MODEL:
            percent_scores = predict_percentage_scores(row['feature_path'])
            display_percentage_predictions(
                Path(row['feature_path']).name,
                percent_scores
            )

        # Store results
        result = {
            'video_id': row['video_id'],
            'filename': Path(row['feature_path']).name,
            'timestamp': datetime.now().isoformat()
        }

        # Add quality predictions
        for target in TARGETS_QUALITY:
            result[f'quality_{target}'] = pred_real[target]
            if target in actual:
                result[f'actual_{target}'] = actual[target]

        # Add percentage predictions
        if HAS_PERCENT_MODEL and percent_scores:
            for target, value in percent_scores.items():
                result[f'percent_{target}'] = value

        all_results.append(result)

    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()

# ==================== SAVE RESULTS ====================
print("\n" + "="*70)
print("SAVING RESULTS")
print("="*70)

if all_results:
    df_results = pd.DataFrame(all_results)

    # Save CSV
    results_csv = OUTPUT_DIR / "test_predictions_both.csv"
    df_results.to_csv(results_csv, index=False)
    print(f"✓ CSV saved: {results_csv}")

    # Convert numpy types to Python types for JSON
    results_for_json = []
    for result in all_results:
        clean_result = {}
        for key, value in result.items():
            if isinstance(value, (np.float32, np.float64)):
                clean_result[key] = float(value)
            elif isinstance(value, (np.int32, np.int64)):
                clean_result[key] = int(value)
            else:
                clean_result[key] = value
        results_for_json.append(clean_result)

    # Save JSON
    results_json = OUTPUT_DIR / "test_predictions_both.json"
    with open(results_json, 'w') as f:
        json.dump(results_for_json, f, indent=2)
    print(f"✓ JSON saved: {results_json}")

# ==================== COMPARISON ====================
print("\n" + "="*70)
print("COMPARISON: Quality Signals vs Percentage Scores")
print("="*70)

print("""
APPROACH 1: Quality Signals (What you just trained)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
✓ Normalized z-scores (mean=0, std=1)
✓ Can be positive or negative
✓ Interpretable: >1.5 = exceptional, 0 = average, <-1.5 = poor
✓ Better for comparing songs relatively
✓ No artificial constraints

Example output:
  user_engagement_signal:  0.85  (above average)
  platform_quality_signal: 1.23  (exceptional)
  quality_final:           1.05  (high quality)

APPROACH 2: Percentage Scores (Alternative approach)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
✓ Constrained to 0-100% (using Sigmoid)
✓ Always positive
✓ More intuitive for end users
✓ Like a "score card"
✓ But may not capture extremes well

Example output:
  user_engagement:  72.5%
  platform_quality: 85.3%
  quality_final:    78.9%

WHICH IS BETTER?
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Quality Signals (what you have) are BETTER for:
  ✓ Ranking songs
  ✓ Finding outliers
  ✓ Statistical analysis
  ✓ Model training

Percentage Scores are BETTER for:
  ✓ User-facing dashboards
  ✓ Intuitive interpretation
  ✓ Business presentations

You can always CONVERT between them:
  percentage = (quality_signal + 3) / 6 * 100  # Rough conversion
""")

# ==================== FINAL VERIFICATION ====================
print("\n" + "="*70)
print("VERIFICATION")
print("="*70)

all_good = True
for result in all_results:
    for key, value in result.items():
        if key.startswith('quality_') and abs(value) < 0.001:
            print(f"⚠️  {result['video_id']}: {key} is near zero")
            all_good = False

if all_good:
    print("✅ ALL QUALITY PREDICTIONS ARE NON-ZERO!")
    print("\n✓ Model trained successfully")
    print("✓ Predictions are meaningful")
    print("✓ Different songs give different scores")
else:
    print("⚠️  Some predictions near zero - check normalization")

print("\n" + "="*70)
print("🎉 TESTING COMPLETE!")
print("="*70)
print(f"\nTested {len(all_results)} files")
print(f"Results: {results_csv if 'results_csv' in locals() else 'N/A'}")
print("\nYour model is ready!")
print("="*70)

print("\n✅ Cell 8 Complete - Pipeline Finished!")

CELL 8: Comprehensive Model Testing

This cell tests TWO approaches:
  1. Quality Signals (normalized, z-scores)
  2. Percentage Scores (0-100%, with Sigmoid)

APPROACH 1: Quality Signal Predictions
✓ Targets: ['user_engagement_signal', 'platform_quality_signal', 'quality_final']
✓ These are normalized quality signals (z-scores)


RuntimeError: Error(s) in loading state_dict for MultiTaskModel:
	Missing key(s) in state_dict: "backbone.stem.conv.weight", "backbone.stem.conv.bias", "backbone.stem.bn.weight", "backbone.stem.bn.bias", "backbone.stem.bn.running_mean", "backbone.stem.bn.running_var", "backbone.block1.conv.weight", "backbone.block1.conv.bias", "backbone.block1.bn.weight", "backbone.block1.bn.bias", "backbone.block1.bn.running_mean", "backbone.block1.bn.running_var", "backbone.block2.conv.weight", "backbone.block2.conv.bias", "backbone.block2.bn.weight", "backbone.block2.bn.bias", "backbone.block2.bn.running_mean", "backbone.block2.bn.running_var", "backbone.block3.conv.weight", "backbone.block3.conv.bias", "backbone.block3.bn.weight", "backbone.block3.bn.bias", "backbone.block3.bn.running_mean", "backbone.block3.bn.running_var", "heads.0.0.weight", "heads.0.0.bias", "heads.0.3.weight", "heads.0.3.bias", "heads.1.0.weight", "heads.1.0.bias", "heads.1.3.weight", "heads.1.3.bias", "heads.2.0.weight", "heads.2.0.bias", "heads.2.3.weight", "heads.2.3.bias". 
	Unexpected key(s) in state_dict: "stem.conv.weight", "stem.conv.bias", "stem.bn.weight", "stem.bn.bias", "stem.bn.running_mean", "stem.bn.running_var", "stem.bn.num_batches_tracked", "b1.conv.weight", "b1.conv.bias", "b1.bn.weight", "b1.bn.bias", "b1.bn.running_mean", "b1.bn.running_var", "b1.bn.num_batches_tracked", "b2.conv.weight", "b2.conv.bias", "b2.bn.weight", "b2.bn.bias", "b2.bn.running_mean", "b2.bn.running_var", "b2.bn.num_batches_tracked", "b3.conv.weight", "b3.conv.bias", "b3.bn.weight", "b3.bn.bias", "b3.bn.running_mean", "b3.bn.running_var", "b3.bn.num_batches_tracked", "head.1.weight", "head.1.bias", "head.4.weight", "head.4.bias". 