In [None]:
import sys, subprocess, importlib

def ensure(mod_name, pip_spec=None):
    """Import ``mod_name``; pip-install it into the running interpreter if missing.

    Parameters
    ----------
    mod_name : str
        Importable module name (e.g. ``"numpy"``).
    pip_spec : str, optional
        Requirement string handed to pip (e.g. ``"numpy>=1.26"``). Defaults to
        ``mod_name``. It is only used when the import fails: a module that is
        already importable is NOT checked against the version constraint.

    Raises
    ------
    subprocess.CalledProcessError
        If the pip invocation exits with a non-zero status.
    """
    try:
        importlib.import_module(mod_name)
        print(f"OK: {mod_name}")
    except ImportError:
        pkg = pip_spec or mod_name
        print(f"Installing: {pkg}")
        # Use the kernel's own interpreter so the package lands in this environment.
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", pkg])
        # The import system caches directory listings; without this a package
        # installed after interpreter start may not be found by a later import.
        importlib.invalidate_caches()

# Core scientific stack
# NOTE: ensure() only installs when the import fails, so the ">=" specifiers
# below apply to fresh installs only, not to packages that are already present.
ensure("numpy", "numpy>=1.26")
ensure("pandas", "pandas>=2.2")
ensure("tqdm", "tqdm>=4.66")

# Deep learning + NLP
ensure("torch", "torch>=2.2")
ensure("transformers", "transformers>=4.41")
ensure("accelerate", "accelerate>=0.30")  # optional, helps large models

# Re-check and print versions for reproducibility
import numpy, pandas, transformers
tqdm_mod = importlib.import_module("tqdm")    # module used ONLY for version
from tqdm.auto import tqdm as tqdm             # function handle used in loops

# torch is imported defensively so the version report still prints if it fails to load.
try:
    import torch
    torch_v = torch.__version__
except Exception:
    torch_v = "not importable"

# One dict so the installed versions appear together in the cell output.
print({
    "numpy": numpy.__version__,
    "pandas": pandas.__version__,
    "tqdm": tqdm_mod.__version__,
    "transformers": transformers.__version__,
    "torch": torch_v
})
# Sanity check that `tqdm` is bound to the progress-bar callable, not the module.
print("tqdm callable =", callable(tqdm))


# =========================
# Imports and configuration
# =========================
import re
import json
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from transformers import pipeline

# Reproducibility and clean logs
np.random.seed(42)
torch.manual_seed(42)
warnings.filterwarnings("ignore", category=FutureWarning)

# Project configuration kept in one place for auditability
# (the whole dict is also embedded in RUN_LOG and written to the run-log JSON).
CONFIG = {
    "DATE_START": "2021-01-01",
    "DATE_END": "2023-12-31",
    "TICKERS": ["AAPL", "AMZN", "MSFT", "TSLA", "AMD"],
    "SRC_TW": "informal_twitter_stock_tweets.csv",
    "SRC_RD": "informal_reddit_wsb_posts.csv",
    "SRC_SP": "formal_sp500_company_headlines.csv",
    "PX_TIDY": "historical_stock_yfinance_full_2021_2023_tidy.csv",
    "OUT_DIR": "final_inputs",
    "INT_DIR": "intermediates",
    "SINGLE_CASHTAG_ONLY": True,   # enforce single-cashtag across ALL sources
    "BATCH_CUDA": 64,
    "BATCH_CPU": 8,
    "MAXLEN_CUDA": 96,
    "MAXLEN_CPU": 64,

    # Models: proposal parity is FinBERT + DeBERTa + Falcon.
    # DistilBERT is acceptable if documented; swap to Falcon if you want verbatim parity.
    "MODEL_FINBERT": "ProsusAI/finbert",
    "MODEL_DEBERTA": "mrm8488/deberta-v3-ft-financial-news-sentiment-analysis",
    "MODEL_DISTIL": "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
    # e.g. "MODEL_FALCON": "<your-falcon-checkpoint>"
}

# Output folders are relative to the working directory and created up front.
BASE = Path(".")
INT_DIR = BASE / CONFIG["INT_DIR"]
OUT_DIR = BASE / CONFIG["OUT_DIR"]
INT_DIR.mkdir(exist_ok=True, parents=True)
OUT_DIR.mkdir(exist_ok=True, parents=True)

# Parsed window bounds; both ends are compared inclusively (>= / <=) downstream.
DATE_START_TS = pd.to_datetime(CONFIG["DATE_START"])
DATE_END_TS = pd.to_datetime(CONFIG["DATE_END"])
TICKERS = CONFIG["TICKERS"]

# device_id follows the transformers pipeline convention used below: 0 = first CUDA GPU, -1 = CPU.
device_id = 0 if torch.cuda.is_available() else -1
# BATCH is the chunk size in the scoring loops; MAXLEN is the max_length passed to each pipeline call.
BATCH = CONFIG["BATCH_CUDA"] if device_id == 0 else CONFIG["BATCH_CPU"]
MAXLEN = CONFIG["MAXLEN_CUDA"] if device_id == 0 else CONFIG["MAXLEN_CPU"]

# NLP pipelines
# Three sentiment classifiers, all built on the device selected above.
# Constructing them here may download model weights on first run.
# FinBERT: the only model whose per-class probabilities are kept (see score_finbert).
finbert = pipeline(
    "sentiment-analysis",
    model=CONFIG["MODEL_FINBERT"],
    tokenizer=CONFIG["MODEL_FINBERT"],
    device=device_id,
)
# DeBERTa: scored through score_generic (positive minus negative).
deberta = pipeline(
    "sentiment-analysis",
    model=CONFIG["MODEL_DEBERTA"],
    device=device_id,
)
# DistilBERT: also scored through score_generic.
distilb = pipeline(
    "sentiment-analysis",
    model=CONFIG["MODEL_DISTIL"],
    device=device_id,
)

# Columns that commonly appear in raw sources (expanded for robustness)
# Each list is in priority order: pick() returns the first candidate that
# matches a column after normalisation (case and punctuation are ignored).
DATE_COLS = [
    "date","datetime","timestamp","time","published_at","created","created_at",
    "created time","posted_at","post_timestamp"
]
TEXT_COLS = ["headline","title","text","content","body","description","tweet","post","message"]
TICKER_COLS = ["ticker","symbol","stock","root_ticker","stock name","company","company_name"]
AUTHOR_COLS = [
    "author","username","user_name","screen_name","handle","name",
    "user","user.screen_name","user.username","user.name","byline","poster","source",
    "user_handle","profile","account","created_by"
]

# Map common surface names to target tickers for fallback when cashtags are missing
# Keys are lower-case because map_company_names lower-cases the text before matching.
NAME_MAP = {
    "apple": "AAPL",
    "amazon": "AMZN",
    "microsoft": "MSFT",
    "tesla": "TSLA",
    "advanced micro devices": "AMD",
    "amd": "AMD",
}

# Run log collector
# load_source() stores one counter dict per source label under "sources".
RUN_LOG = {"config": CONFIG, "sources": {}}


# =========================
# Helper functions
# =========================

def _normkey(s: str) -> str:
    return re.sub(r"[^a-z0-9]", "", str(s).lower())

def pick(cols, candidates):
    """
    Return the actual column name matching the earliest entry of ``candidates``.

    Matching is done on normalised keys (see ``_normkey``), so case,
    underscores, dots and spaces are ignored. Returns None when no candidate
    matches any column.
    """
    by_key = {}
    for col in cols:
        # Later columns overwrite earlier ones that normalise to the same key.
        by_key[_normkey(col)] = col
    hits = (by_key[_normkey(name)] for name in candidates if _normkey(name) in by_key)
    return next(hits, None)

def clean_text(x):
    """
    Normalise one raw text cell into a single-line ASCII string.

    Steps, in order: drop URLs, drop HTML tags, decode HTML entities
    (``&amp;`` -> ``&``), turn non-breaking spaces into plain spaces, strip
    non-ASCII characters, collapse whitespace.

    Parameters
    ----------
    x : any
        Raw cell value. Anything that is not a ``str`` yields ``""``.

    Returns
    -------
    str
        Cleaned text, possibly empty.
    """
    import html  # local import: keeps this block self-contained

    if not isinstance(x, str):
        return ""
    x = re.sub(r"http\S+|www\.\S+", " ", x)
    x = re.sub(r"<[^>]+>", " ", x)
    # Entities are decoded AFTER tag stripping so that a decoded "&lt;...&gt;"
    # pair in ordinary prose is never mistaken for markup and deleted.
    x = html.unescape(x)
    # NBSP (e.g. from &nbsp;) would be deleted by the ASCII step and glue words together.
    x = x.replace("\xa0", " ")
    x = x.encode("ascii", "ignore").decode("ascii")
    x = re.sub(r"\s+", " ", x).strip()
    return x

def norm_headline(x):
    """Build the de-duplication key for an item: cleaned, lower-cased, with only a-z, 0-9, '$' and single spaces kept."""
    canonical = clean_text(x).lower()
    # First pass blanks out punctuation, second pass collapses the resulting runs of whitespace.
    for pattern in (r"[^a-z0-9\s$]", r"\s+"):
        canonical = re.sub(pattern, " ", canonical)
    return canonical.strip()

def extract_cashtags(x):
    """
    Return the distinct cashtags found in ``x``, upper-cased, in first-seen order.

    A cashtag is ``$`` followed by 1-5 ASCII letters; the ``$`` is not part of
    the returned symbol. Non-string input yields an empty list.
    """
    if not isinstance(x, str):
        return []
    symbols = (t.upper() for t in re.findall(r"\$([A-Za-z]{1,5})", x))
    # dict.fromkeys de-duplicates while keeping insertion order, so the result
    # does not depend on string hashing the way list(set(...)) does.
    return list(dict.fromkeys(symbols))

def map_company_names(s):
    """
    Return the ticker of the first NAME_MAP entry mentioned in ``s``, else None.

    Names are matched case-insensitively as whole tokens: a name only counts
    when it is not directly preceded or followed by a letter or digit. This
    stops short keys from firing inside unrelated words (e.g. "amd" inside
    "camden", "apple" inside "pineapple"). Entries are tried in NAME_MAP
    order. Non-string input yields None.
    """
    if not isinstance(s, str):
        return None
    t = s.lower()
    for k, v in NAME_MAP.items():
        # re.escape keeps multi-word keys (containing spaces) literal.
        if re.search(rf"(?<![a-z0-9]){re.escape(k)}(?![a-z0-9])", t):
            return v
    return None

def to_text_list(seq):
    """Convert ``seq`` to a list of strings, mapping None, float NaN and the literal text 'nan' (any case) to ''."""
    def _as_text(item):
        if item is None or (isinstance(item, float) and np.isnan(item)):
            return ""
        rendered = str(item)
        return rendered if rendered.lower() != "nan" else ""

    return [_as_text(item) for item in seq]

def parse_to_et(ts_series):
    """
    Parse timestamps and return them as tz-aware America/New_York values.

    Pass 1 parses every value with ``utc=True``. Values that fail go through
    pass 2: a trailing ET/EST/EDT/BST/GMT/UTC token is stripped, the remainder
    is parsed as a naive wall-clock time and localised to America/New_York.

    Parameters
    ----------
    ts_series : array-like
        Raw timestamp values; they are stringified before parsing.

    Returns
    -------
    pandas.Series
        datetime64[ns, America/New_York]; NaT where parsing failed or where a
        pass-2 wall-clock time is ambiguous (DST fall-back hour).
    """
    s = pd.Series(ts_series).astype(str)
    dt_utc = pd.to_datetime(s, errors="coerce", utc=True)
    m = dt_utc.isna()
    if m.any():
        # NOTE(review): BST/GMT/UTC suffixes are stripped and the value is then
        # treated as ET wall-clock time; confirm that this is intended.
        s2 = s[m].str.replace(r"\s*(ET|EST|EDT|BST|GMT|UTC)\s*$", "", regex=True)
        dt_local = pd.to_datetime(s2, errors="coerce")
        # ambiguous="infer" relies on the order of the data and raises on an
        # isolated fall-back-hour timestamp; raw feeds are not ordered, so
        # ambiguous times become NaT instead of aborting the whole load.
        dt_local = dt_local.dt.tz_localize("America/New_York", ambiguous="NaT", nonexistent="shift_forward")
        dt_utc.loc[m] = dt_local.dt.tz_convert("UTC")
    return dt_utc.dt.tz_convert("America/New_York")

def build_trading_days():
    """
    Derive the trading calendar from the tidy price file.

    Reads CONFIG["PX_TIDY"], normalises its "Date" column to tz-naive
    midnight, keeps rows inside [DATE_START_TS, DATE_END_TS] for the target
    TICKERS, and returns the sorted unique dates as a NumPy array.
    """
    prices = pd.read_csv(CONFIG["PX_TIDY"])
    prices["Date"] = pd.to_datetime(prices["Date"]).dt.tz_localize(None).dt.floor("D")
    inside_window = (prices["Date"] >= DATE_START_TS) & (prices["Date"] <= DATE_END_TS)
    wanted_ticker = prices["ticker"].isin(TICKERS)
    kept_dates = prices.loc[inside_window & wanted_ticker, "Date"]
    return np.array(sorted(kept_dates.unique()))

# Trading calendar, built once when this cell runs; read by assign_trading_date and load_source.
TRADING_DAYS = build_trading_days()

def next_trading_day_array(dates_array, d):
    """
    Snap ``d`` to the same or the next trading day in ``dates_array``.

    Parameters
    ----------
    dates_array : numpy.ndarray
        Sorted array of trading days.
    d : datetime-like
        Day to snap.

    Returns
    -------
    The matching element of ``dates_array``, or ``pandas.NaT`` when no such
    day exists: ``d`` is missing, the calendar is empty, or ``d`` lies after
    the last trading day. Falling back to the last trading day in that final
    case would attribute an item to an EARLIER date than it was posted.
    """
    if len(dates_array) == 0 or pd.isna(d):
        return pd.NaT
    # searchsorted (left side) gives the first position whose value is >= d,
    # which is d itself when d is a trading day and the next one otherwise.
    i = dates_array.searchsorted(d)
    if i < len(dates_array):
        return dates_array[i]
    return pd.NaT

def assign_trading_date(ts_et):
    """
    Assign each ET timestamp to a trading day.

    Items posted at or after 16:00 are moved to the following calendar day
    before snapping, so that they are not attributed to a session that had
    already closed. Snapping uses next_trading_day_array on TRADING_DAYS.
    """
    wall_clock = ts_et.dt.tz_localize(None)
    minute_of_day = wall_clock.dt.hour * 60 + wall_clock.dt.minute
    posted_after_close = minute_of_day >= 16 * 60

    same_day = wall_clock.dt.floor("D")
    following_day = same_day + pd.Timedelta(days=1)
    effective_day = following_day.where(posted_after_close, same_day)

    def _snap(day):
        return next_trading_day_array(TRADING_DAYS, day)

    return pd.to_datetime(effective_day.apply(_snap))

def _safe_call(clf, texts, title):
    """
    Call a pipeline defensively. Use top_k=None (full score list) to avoid
    return_all_scores deprecation warning. On failure, return neutral scores.
    """
    try:
        # top_k=None yields per-class (label, score) dicts
        return clf(texts, truncation=True, padding=True, max_length=MAXLEN, top_k=None)
    except Exception as e:
        model_name = getattr(getattr(clf, "model", None), "name_or_path", "?")
        print(f"[WARN] {title}: batch of {len(texts)} failed in {model_name}. Error: {e}. Returning neutral.")
        # Neutral distribution for 3-class models; binary models will still be parsed safely below
        neutral = [{"label": "positive", "score": 0.0},
                   {"label": "neutral", "score": 1.0},
                   {"label": "negative", "score": 0.0}]
        return [neutral] * len(texts)

def score_finbert(texts, title):
    """
    Score texts with FinBERT in batches of BATCH.

    Returns four float arrays aligned with ``texts``: the net score
    (positive minus negative) and the positive, neutral and negative class
    probabilities. Labels missing from the model output count as 0.0.
    """
    if len(texts) == 0:
        empty = np.zeros(0, dtype=float)
        return empty, empty, empty, empty

    docs = to_text_list(texts)
    total = len(docs)
    net, prob_pos, prob_neu, prob_neg = (np.zeros(total, dtype=float) for _ in range(4))

    for start in tqdm(range(0, total, BATCH), desc=title):
        batch_scores = _safe_call(finbert, docs[start:start + BATCH], title)
        for offset, label_scores in enumerate(batch_scores):
            by_label = {entry["label"].lower(): float(entry["score"]) for entry in label_scores}
            row = start + offset
            prob_pos[row] = by_label.get("positive", 0.0)
            prob_neu[row] = by_label.get("neutral", 0.0)
            prob_neg[row] = by_label.get("negative", 0.0)
            net[row] = prob_pos[row] - prob_neg[row]

    return net, prob_pos, prob_neu, prob_neg

def score_generic(clf, texts, title):
    """
    Score texts with an arbitrary classifier pipeline in batches of BATCH.

    Returns one float per text: positive score minus negative score. The
    labels "positive"/"negative" are preferred, with "pos"/"neg" as the
    fallback spelling; a label that is absent counts as 0.0.
    """
    if len(texts) == 0:
        return np.zeros(0, dtype=float)

    docs = to_text_list(texts)
    net = np.zeros(len(docs), dtype=float)

    for start in tqdm(range(0, len(docs), BATCH), desc=title):
        batch_scores = _safe_call(clf, docs[start:start + BATCH], title)
        for offset, label_scores in enumerate(batch_scores):
            by_label = {entry["label"].lower(): float(entry["score"]) for entry in label_scores}
            # Allow for binary classifiers that use pos/neg labels
            if "positive" in by_label:
                up = by_label["positive"]
            else:
                up = by_label.get("pos", 0.0)
            if "negative" in by_label:
                down = by_label["negative"]
            else:
                down = by_label.get("neg", 0.0)
            net[start + offset] = up - down

    return net

def _extract_author_cell(x):
    """
    Extract an author/handle robustly:
    - String as-is
    - If JSON-like, try keys: screen_name, username, user_name, name
    """
    if pd.isna(x):
        return ""
    s = str(x)
    if s.startswith("{") and s.endswith("}"):
        try:
            obj = json.loads(s)
            for k in ("screen_name","username","user_name","name"):
                if k in obj and obj[k]:
                    return str(obj[k])
        except Exception:
            return ""
    return s

def apply_bot_filters(df, log_bucket, min_text_len=5, max_cashtags=5, max_author_posts_per_day=50):
    """
    Remove low-information and bot-like rows without using future information.

    Filters, applied in order:
      1. text shorter than ``min_text_len`` characters (missing text counts as 0);
      2. more than ``max_cashtags`` cashtag occurrences (counted, not de-duplicated);
      3. if an "author" column exists and there are at least 10 rows with a
         non-blank author spread over at least 10 distinct authors: every row
         of an (author, calendar date) pair with more than
         ``max_author_posts_per_day`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a "text" column; filter 3 also needs "author" and a datetime "date".
        The input frame is not modified.
    log_bucket : dict
        Counters are ADDED to the keys drop_text_too_short,
        drop_too_many_cashtags, drop_author_activity_gt50 and
        bot_filter_total_dropped. The "gt50" key name is kept for run-log
        compatibility even when another threshold is passed.
    min_text_len, max_cashtags, max_author_posts_per_day : int
        Thresholds; the defaults reproduce the previously hard-coded values.

    Returns
    -------
    pandas.DataFrame
        Surviving rows with the original index labels; columns named "len" or
        "n_cashtags" are removed if present.
    """
    def _bump(key, n):
        log_bucket[key] = log_bucket.get(key, 0) + int(n)

    before = len(df)
    # Masks are computed as standalone Series instead of helper columns, so
    # nothing is ever assigned onto a filtered slice (no SettingWithCopyWarning).
    out = df.copy()

    too_short = out["text"].str.len().fillna(0) < min_text_len
    _bump("drop_text_too_short", too_short.sum())
    out = out.loc[~too_short]

    too_many_tags = out["text"].str.count(r"\$[A-Za-z]{1,5}") > max_cashtags
    _bump("drop_too_many_cashtags", too_many_tags.sum())
    out = out.loc[~too_many_tags]

    if "author" in out.columns:
        authors = out["author"].astype(str).str.strip()
        known = authors.ne("")
        # Only trust the per-author statistic when authors are really populated.
        if known.sum() >= 10 and authors[known].nunique() >= 10:
            per_author_day = (
                out.loc[known]
                .groupby([authors[known], out.loc[known, "date"].dt.date])["text"]
                .transform("count")
            )
            over_active = per_author_day > max_author_posts_per_day
            _bump("drop_author_activity_gt50", over_active.sum())
            keep = pd.Series(True, index=out.index)
            keep.loc[known] = ~over_active
            out = out.loc[keep]

    # Kept so the output schema matches earlier versions that created these helper columns.
    out = out.drop(columns=["len", "n_cashtags"], errors="ignore")
    _bump("bot_filter_total_dropped", before - len(out))
    return out

def load_source(path, label):
    """
    Load one raw source CSV and reduce it to a clean item-level frame.

    Pipeline: locate the date/text/ticker/author columns by name, parse
    timestamps to ET, snap each row to a trading day, keep rows inside the
    configured date window, resolve a ticker (source column first, then a
    single cashtag, then the company-name fallback), drop cashtag/ticker
    conflicts and non-target tickers, de-duplicate on (date, ticker,
    normalised text) and apply the bot filters.

    Parameters
    ----------
    path : str or Path
        CSV file to read.
    label : str
        Key under which the reason-coded counters are stored in
        RUN_LOG["sources"].

    Returns
    -------
    pandas.DataFrame
        Columns date, ticker, text, author. Empty (same columns) when the file
        does not exist or has no recognisable date or text column.
    """
    p = Path(path)
    log = {
        "read_path": str(p),
        "raw_rows": 0,
        "dropped_unparsable_time": 0,
        "dropped_outside_date_window": 0,
        "rolled_after_close": 0,         # informational
        "rolled_weekend_holiday": 0,     # informational
        "filled_ticker_from_name": 0,
        "dropped_multi_cashtag": 0,
        "dropped_cashtag_ticker_mismatch": 0,
        "dropped_non_target_ticker": 0,
        "dropped_duplicates": 0,
        "drop_text_too_short": 0,
        "drop_too_many_cashtags": 0,
        "drop_author_activity_gt50": 0,
        "bot_filter_total_dropped": 0,
        "final_rows": 0
    }

    if not p.exists():
        RUN_LOG["sources"][label] = log
        return pd.DataFrame(columns=["date", "ticker", "text", "author"])

    df = pd.read_csv(p, low_memory=False)
    log["raw_rows"] = len(df)

    dcol = pick(df.columns, DATE_COLS)
    xcol = pick(df.columns, TEXT_COLS)
    tcol = pick(df.columns, TICKER_COLS)
    acol = pick(df.columns, AUTHOR_COLS)

    if dcol is None or xcol is None:
        RUN_LOG["sources"][label] = log
        return pd.DataFrame(columns=["date", "ticker", "text", "author"])

    # Timestamps to ET then trading-day assignment
    ts_et = parse_to_et(df[dcol])
    unparsable = ts_et.isna()
    log["dropped_unparsable_time"] = int(unparsable.sum())

    # Informational roll counts
    naive_day = pd.to_datetime(ts_et.dt.tz_localize(None).dt.floor("D"), errors="coerce")
    after_close = (ts_et.dt.hour * 60 + ts_et.dt.minute) >= 16 * 60
    base = naive_day + pd.to_timedelta(after_close.astype(int), unit="D")
    in_calendar = base.dt.normalize().isin(pd.DatetimeIndex(TRADING_DAYS))
    log["rolled_after_close"] = int(after_close.sum())
    log["rolled_weekend_holiday"] = int((~in_calendar & ~naive_day.isna()).sum())

    date_assigned = assign_trading_date(ts_et)
    # A row whose timestamp could not be parsed must never be kept, whatever
    # the calendar snap returned for it; this keeps the counter above truthful.
    date_assigned = date_assigned.mask(unparsable)
    in_window = (date_assigned >= DATE_START_TS) & (date_assigned <= DATE_END_TS)
    log["dropped_outside_date_window"] = int((~in_window & ~date_assigned.isna()).sum())

    # `sub` gets a fresh 0..n-1 index; every column below is aligned to it by position.
    sub = pd.DataFrame({"date": date_assigned[in_window].to_numpy()})
    txt = pd.Series(to_text_list(df.loc[in_window, xcol].tolist()))
    sub["text"] = txt.map(clean_text)

    # Author (robust extraction)
    if acol is not None:
        raw_auth = df.loc[in_window, acol]
        sub["author"] = raw_auth.map(_extract_author_cell).astype(str).fillna("").reset_index(drop=True)
    else:
        sub["author"] = pd.Series([""] * len(sub))

    # Ticker from source column if present
    if tcol is not None:
        raw_tk = df.loc[in_window, tcol]
        tk = raw_tk.astype(str).str.upper().str.extract(r"([A-Z.]{1,10})", expand=False)
        # A missing cell stringifies to "nan" and would become the bogus ticker
        # "NAN", which blocks the cashtag/name fallbacks and gets rows with a
        # valid single cashtag dropped as mismatches. Blank such cells (and
        # cells with no ticker-like token) so the fallbacks below can fill them.
        tk = tk.where(raw_tk.notna(), "").fillna("")
        sub["ticker"] = tk.values
    else:
        sub["ticker"] = ""

    # Extract cashtags from text for ALL sources
    cas = sub["text"].apply(extract_cashtags)
    sub["n_tags"] = cas.apply(len)
    sub["tag"] = cas.apply(lambda lst: lst[0] if len(lst) == 1 else "")

    # Enforce SINGLE_CASHTAG_ONLY across all sources
    if CONFIG["SINGLE_CASHTAG_ONLY"]:
        multi = sub["n_tags"] > 1
        log["dropped_multi_cashtag"] += int(multi.sum())
        # .copy() so the assignments below write to an independent frame
        # (no SettingWithCopyWarning).
        sub = sub[~multi].copy()

    # If ticker missing and exactly one cashtag exists, fill ticker from cashtag
    need_fill = sub["ticker"].astype(str).str.strip().eq("") & sub["tag"].ne("")
    sub.loc[need_fill, "ticker"] = sub.loc[need_fill, "tag"]

    # If still missing ticker, try company-name fallback
    still_missing = sub["ticker"].astype(str).str.strip().eq("")
    fill_from_name = sub.loc[still_missing, "text"].apply(map_company_names)
    # Reindex to the full frame so the mask arithmetic never mixes differently indexed Series.
    fill_from_name = fill_from_name.reindex(sub.index)
    fill_mask = still_missing & fill_from_name.notna()
    log["filled_ticker_from_name"] += int(fill_mask.sum())
    sub.loc[fill_mask, "ticker"] = fill_from_name[fill_mask]

    # Drop rows where there is a single cashtag and it DOES NOT match the assigned ticker
    has_single_tag = sub["tag"].ne("")
    mismatch = has_single_tag & sub["ticker"].ne(sub["tag"])
    log["dropped_cashtag_ticker_mismatch"] += int(mismatch.sum())
    sub = sub[~mismatch].copy()

    # Keep only target tickers
    sub["ticker"] = sub["ticker"].astype(str).str.upper()
    non_target = ~sub["ticker"].isin(TICKERS)
    log["dropped_non_target_ticker"] += int(non_target.sum())
    sub = sub[~non_target].copy()

    # De-duplicate on (date, ticker, norm_text)
    sub["norm_text"] = sub["text"].map(norm_headline)
    before_dups = len(sub)
    sub = sub.drop_duplicates(subset=["date", "ticker", "norm_text"], keep="first").drop(columns=["norm_text"])
    log["dropped_duplicates"] = int(before_dups - len(sub))

    # Apply bot/low-information filters with counters
    sub = apply_bot_filters(sub, log)

    # Finalise
    sub = sub[["date", "ticker", "text", "author"]].reset_index(drop=True)
    log["final_rows"] = int(len(sub))
    RUN_LOG["sources"][label] = log
    return sub

def score_items(items, label):
    """
    Attach the scores of all three models to a standardised items frame.

    Adds s_finbert, p_pos, p_neu, p_neg, s_deberta, s_distil and the three
    model-name columns (model_finbert, model_deberta, model_distil), in that
    order. An empty input gets the same columns with no rows, and no model is
    called. ``label`` only prefixes the progress-bar titles.
    """
    score_cols = ("s_finbert", "p_pos", "p_neu", "p_neg", "s_deberta", "s_distil")
    model_tags = {
        "model_finbert": CONFIG["MODEL_FINBERT"],
        "model_deberta": CONFIG["MODEL_DEBERTA"],
        "model_distil": CONFIG["MODEL_DISTIL"],
    }

    if items.empty:
        blank_scores = {col: pd.Series(dtype=float) for col in score_cols}
        return items.assign(**blank_scores, **model_tags)

    docs = items["text"].tolist()
    fin_net, fin_pos, fin_neu, fin_neg = score_finbert(docs, f"{label} finbert")
    deb_net = score_generic(deberta, docs, f"{label} deberta")
    dis_net = score_generic(distilb, docs, f"{label} distilbert")

    scored = items.copy()
    for col, values in zip(score_cols, (fin_net, fin_pos, fin_neu, fin_neg, deb_net, dis_net)):
        scored[col] = values
    for col, model_name in model_tags.items():
        scored[col] = model_name
    return scored


# =========================
# Main
# =========================
# Stage 1: read and standardise each raw source. A missing file or a file
# without a recognisable date/text column yields an empty frame, not an error.
print("Loading sources...")
tw_items = load_source(CONFIG["SRC_TW"], "twitter")
rd_items = load_source(CONFIG["SRC_RD"], "reddit")
sp_items = load_source(CONFIG["SRC_SP"], "sp500")

# Stage 2: run all three sentiment models over every item (the slow part of the notebook).
print("Scoring sources...")
tw_scored = score_items(tw_items, "twitter")
rd_scored = score_items(rd_items, "reddit")
sp_scored = score_items(sp_items, "sp500")

# Persist item-level scored data
INT_DIR.mkdir(parents=True, exist_ok=True)
tw_scored.to_csv(INT_DIR / "twitter_scored.csv", index=False)
rd_scored.to_csv(INT_DIR / "reddit_scored.csv", index=False)
sp_scored.to_csv(INT_DIR / "sp500_scored.csv", index=False)

# Write structured run log with reason-coded counts
RUN_LOG["summary"] = {
    "twitter_rows": int(len(tw_scored)),
    "reddit_rows": int(len(rd_scored)),
    "sp500_rows": int(len(sp_scored)),
}
# default=str is the fallback for any value json cannot encode natively.
with open(OUT_DIR / "nb1_run_log.json", "w", encoding="utf-8") as f:
    json.dump(RUN_LOG, f, indent=2, default=str)

print(json.dumps(RUN_LOG["summary"], indent=2))


# =========================
# Post-run verification for Notebook 1
# =========================
import numpy as _np
import pandas as _pd

# Columns every scored CSV must contain (the schema produced by score_items).
REQ_COLS = {"date","ticker","text","author","s_finbert","p_pos","p_neu","p_neg","s_deberta","s_distil","model_finbert","model_deberta","model_distil"}
# (file name inside INT_DIR, source label) pairs to verify.
FILES = [
    ("twitter_scored.csv", "twitter"),
    ("reddit_scored.csv", "reddit"),
    ("sp500_scored.csv", "sp500"),
]

def check_scored_file(p: Path, label: str):
    """
    Run sanity checks on one scored CSV and return them as a flat dict.

    Checks: file presence, row count, required columns, date range against
    CONFIG, ticker scope against CONFIG["TICKERS"], fully duplicated rows,
    total NaN count, whether the FinBERT probabilities sum to ~1 on more than
    99% of rows, and whether the score columns are numeric. Returns early
    (with a note) when the file is missing or has no rows. Also prints the
    first three rows for manual inspection.
    """
    file_found = p.exists()
    report = dict(
        source=label,
        exists=file_found,
        rows=0,
        cols_ok=False,
        missing_cols="",
        date_min=None,
        date_max=None,
        dates_ok=False,
        tickers_ok=False,
        unique_tickers="",
        dup_items=0,
        total_nans=0,
        probs_close_to_one=None,
        notes="",
    )
    if not file_found:
        report["notes"] = "file missing"
        return report

    frame = _pd.read_csv(p, low_memory=False)
    n_rows = len(frame)
    report["rows"] = n_rows
    if n_rows == 0:
        report["notes"] = "file empty"
        return report

    # Required columns
    present = {str(col) for col in frame.columns}
    absent = sorted(REQ_COLS - present)
    report["missing_cols"] = ";".join(absent)
    report["cols_ok"] = not absent

    # Parse and check dates (unparsable dates are tolerated by this check)
    if "date" in frame.columns:
        parsed = _pd.to_datetime(frame["date"], errors="coerce")
        if parsed.notna().any():
            report["date_min"] = _pd.to_datetime(parsed.min()).date()
            report["date_max"] = _pd.to_datetime(parsed.max()).date()
        window_start = _pd.to_datetime(CONFIG["DATE_START"])
        window_end = _pd.to_datetime(CONFIG["DATE_END"])
        inside = (parsed >= window_start) & (parsed <= window_end)
        offending = parsed.notna() & ~inside
        report["dates_ok"] = not bool(offending.any())

    # Ticker scope
    if "ticker" in frame.columns:
        upper_tickers = frame["ticker"].astype(str).str.upper()
        seen = sorted(value for value in upper_tickers.unique() if isinstance(value, str))
        report["unique_tickers"] = ",".join(seen[:20])
        report["tickers_ok"] = set(seen) <= set(CONFIG["TICKERS"])

    # Simple integrity counts
    report["dup_items"] = int(frame.duplicated().sum())
    report["total_nans"] = int(frame.isna().sum().sum())

    # Probability sanity (FinBERT only)
    prob_cols = ["p_pos", "p_neu", "p_neg"]
    if all(col in frame.columns for col in prob_cols):
        prob_total = sum(_pd.to_numeric(frame[col], errors="coerce").fillna(0) for col in prob_cols)
        near_one = (prob_total - 1.0).abs() < 1e-3
        report["probs_close_to_one"] = bool(near_one.mean() > 0.99)

    # Brief peek for manual inspection
    print(f"\n--- HEAD: {label} ---")
    print(frame.head(3).to_string(index=False))

    # Numeric type sanity on key score columns if present
    for col in ("s_finbert", "p_pos", "p_neu", "p_neg", "s_deberta", "s_distil"):
        if col not in frame.columns:
            continue
        try:
            _pd.to_numeric(frame[col], errors="raise")
        except Exception:
            report["notes"] += f" non-numeric in {col};"

    return report

# Confirm raw source paths also exist (useful if scored files are missing)
print("\nRaw source presence check:")
for k in ["SRC_TW","SRC_RD","SRC_SP","PX_TIDY"]:
    p = Path(CONFIG[k])
    print(f"{k}: {p} -> {'OK' if p.exists() else 'MISSING'}")

# Run checks for the three scored files
results = []
for fname, label in FILES:
    p = INT_DIR / fname
    res = check_scored_file(p, label)
    res["file_path"] = str(p)
    results.append(res)

# One row per scored file; the explicit column list fixes the display order.
rep = _pd.DataFrame(results, columns=[
    "source","exists","rows","cols_ok","missing_cols","date_min","date_max",
    "dates_ok","tickers_ok","unique_tickers","dup_items","total_nans","probs_close_to_one","file_path","notes"
])

print("\n=== Notebook 1 scored-file verification ===")
print(rep.to_string(index=False))

# Fail fast hint if anything is off
# (this only prints guidance; it does not raise or stop execution)
problems = rep[
    (~rep["exists"]) |
    (rep["rows"] == 0) |
    (~rep["cols_ok"]) |
    (~rep["dates_ok"]) |
    (~rep["tickers_ok"])
]
if len(problems) > 0:
    print("\nIssues detected. Common fixes:")
    print("- Check CONFIG paths for SRC_TW, SRC_RD, SRC_SP, and PX_TIDY.")
    print("- Confirm raw CSVs have any of the expected column names for date/text/author.")
    print("- Confirm date range and ticker list in CONFIG are correct.")
else:
    print("\nAll three scored files look OK.")


# =========================
# Export NB1 artefacts (ZIP + optional download)
# =========================
from pathlib import Path as _Path
import zipfile as _zipfile

# Bundle the three scored CSVs and the run log into a single ZIP under OUT_DIR.
zip_path = OUT_DIR / "NB1_item_level_sentiment_outputs.zip"
files_to_include = [
    INT_DIR / "twitter_scored.csv",
    INT_DIR / "reddit_scored.csv",
    INT_DIR / "sp500_scored.csv",
    OUT_DIR / "nb1_run_log.json",
]

# In this cell a missing input only produces a warning; no archive is written.
missing = [str(p) for p in files_to_include if not p.exists()]
if missing:
    print("WARNING: Missing expected files -> " + ", ".join(missing))
else:
    with _zipfile.ZipFile(zip_path, "w", compression=_zipfile.ZIP_DEFLATED) as zf:
        for p in files_to_include:
            # arcname=p.name stores the files flat, without their folder prefix.
            zf.write(p, arcname=p.name)
    print(f"ZIP created at: {zip_path.resolve()} ({zip_path.stat().st_size/1024:.1f} KB)")

    # Try Colab download; otherwise show a clickable link (Jupyter)
    try:
        from google.colab import files as _colab_files
        _colab_files.download(str(zip_path))
        print("Triggered browser download of the ZIP.")
    except Exception:
        try:
            from IPython.display import FileLink, display
            display(FileLink(str(zip_path)))
        except Exception as link_err:
            # Still best-effort, but tell the user where the file is instead of failing silently.
            print(f"No download helper available ({link_err}); fetch the ZIP manually from: {zip_path.resolve()}")

OK: numpy
OK: pandas
OK: tqdm
OK: torch
OK: transformers
OK: accelerate
{'numpy': '2.0.2', 'pandas': '2.2.2', 'tqdm': '4.67.1', 'transformers': '4.55.4', 'torch': '2.8.0+cu126'}
tqdm callable = True


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Device set to use cpu
Device set to use cpu
Device set to use cpu


Loading sources...
Scoring sources...


twitter finbert:   0%|          | 0/4463 [00:00<?, ?it/s]

twitter deberta:   0%|          | 0/4463 [00:00<?, ?it/s]

twitter distilbert:   0%|          | 0/4463 [00:00<?, ?it/s]

reddit finbert:   0%|          | 0/61 [00:00<?, ?it/s]

reddit deberta:   0%|          | 0/61 [00:00<?, ?it/s]

reddit distilbert:   0%|          | 0/61 [00:00<?, ?it/s]

sp500 finbert:   0%|          | 0/73 [00:00<?, ?it/s]

sp500 deberta:   0%|          | 0/73 [00:00<?, ?it/s]

sp500 distilbert:   0%|          | 0/73 [00:00<?, ?it/s]

{
  "twitter_rows": 35697,
  "reddit_rows": 486,
  "sp500_rows": 580
}

Raw source presence check:
SRC_TW: informal_twitter_stock_tweets.csv -> OK
SRC_RD: informal_reddit_wsb_posts.csv -> OK
SRC_SP: formal_sp500_company_headlines.csv -> OK
PX_TIDY: historical_stock_yfinance_full_2021_2023_tidy.csv -> OK

--- HEAD: twitter ---
      date ticker                                                                                                                                                                                                                                                                                           text  author  s_finbert    p_pos    p_neu    p_neg  s_deberta  s_distil    model_finbert                                           model_deberta                                               model_distil
2022-09-30   TSLA Mainstream media has done an amazing job at brainwashing people. Today at work, we were asked what companies we believe in &amp; I said @Tesla because

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Triggered browser download of the ZIP.


In [None]:
# === Export NB1 artefacts (ZIP + optional download)
from pathlib import Path
import zipfile

# Standalone variant of the export step: the folders are re-declared here so
# the cell can run without the pipeline cell having been executed first.
INT_DIR = Path("intermediates")
OUT_DIR = Path("final_inputs")
OUT_DIR.mkdir(parents=True, exist_ok=True)

zip_path = OUT_DIR / "NB1_item_level_sentiment_outputs.zip"
files_to_include = [
    INT_DIR / "twitter_scored.csv",
    INT_DIR / "reddit_scored.csv",
    INT_DIR / "sp500_scored.csv",
    OUT_DIR / "nb1_run_log.json",
]

# This cell is strict: an incomplete artefact set raises instead of producing a partial ZIP.
missing = [str(p) for p in files_to_include if not p.exists()]
if missing:
    raise FileNotFoundError("Missing expected files: " + ", ".join(missing))

with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
    for p in files_to_include:
        # arcname=p.name stores the files flat, without their folder prefix.
        zf.write(p, arcname=p.name)

print(f"ZIP created at: {zip_path.resolve()} ({zip_path.stat().st_size/1024:.1f} KB)")

# Try Colab download; otherwise show a clickable link (Jupyter)
try:
    from google.colab import files as _colab_files
    _colab_files.download(str(zip_path))
    print("Triggered browser download of the ZIP.")
except Exception:
    try:
        from IPython.display import FileLink, display
        display(FileLink(str(zip_path)))
    except Exception as link_err:
        # Still best-effort, but tell the user where the file is instead of failing silently.
        print(f"No download helper available ({link_err}); fetch the ZIP manually from: {zip_path.resolve()}")