In [4]:
!pip install pytrends


Collecting pytrends
  Downloading pytrends-4.9.2-py3-none-any.whl.metadata (13 kB)
Collecting lxml (from pytrends)
  Downloading lxml-6.0.2-cp313-cp313-win_amd64.whl.metadata (3.7 kB)
Downloading pytrends-4.9.2-py3-none-any.whl (15 kB)
Downloading lxml-6.0.2-cp313-cp313-win_amd64.whl (4.0 MB)
   ---------------------------------------- 0.0/4.0 MB ? eta -:--:--
   ----------------------- ---------------- 2.4/4.0 MB 20.0 MB/s eta 0:00:01
   ---------------------------------------- 4.0/4.0 MB 17.4 MB/s  0:00:00
Installing collected packages: lxml, pytrends

   ---------------------------------------- 0/2 [lxml]
   ---------------------------------------- 0/2 [lxml]
   -------------------- ------------------- 1/2 [pytrends]
   ---------------------------------------- 2/2 [pytrends]

Successfully installed lxml-6.0.2 pytrends-4.9.2


In [5]:
# === CNT "Wake-Up" Single-Cell Pipeline ======================================
# Dependencies: pandas, numpy, requests, statsmodels, scikit-learn, pytrends, matplotlib
# If needed: pip installs (uncomment)
# import sys, subprocess; [subprocess.check_call([sys.executable,"-m","pip","install",p]) for p in
#  ["pandas","numpy","requests","statsmodels","scikit-learn","pytrends","matplotlib"]]

import os, io, json, math, time, datetime as dt
from pathlib import Path
import numpy as np, pandas as pd, requests
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.statespace.structural import UnobservedComponents
from pytrends.request import TrendReq
import matplotlib.pyplot as plt

# ---------------------------- CONFIG -----------------------------------------
CFG = dict(
    ROOT                = r"C:\Users\caleb\CNT_Lab",   # change if needed
    REGION              = "US",
    HORIZON_WEEKS       = 52,       # forecast horizon
    THRESHOLD_TAU       = 0.65,     # OAI threshold in [0,1]
    RUN_LENGTH_K        = 6,        # sustained weeks above threshold
    BACKTEST_START      = "2012-01-01",
    OAI_TOPICS          = ["government surveillance","censorship","civil liberties","free speech"],
    GDELT_QUERY         = '(censorship OR surveillance OR "civil liberties")',
    PEW_CSV             = r"C:\Users\caleb\CNT_Lab\data\pew_trust_government.csv",   # optional
    GALLUP_CSV          = r"C:\Users\caleb\CNT_Lab\data\gallup_trust_confidence.csv",# optional
    SAVE_DIR            = "artifacts",
    FIG_DIR             = "artifacts\\figures",
    RNG_SEED            = 1337
)

np.random.seed(CFG["RNG_SEED"])
ROOT = Path(CFG["ROOT"]); (ROOT/CFG["SAVE_DIR"]).mkdir(parents=True, exist_ok=True); (ROOT/CFG["FIG_DIR"]).mkdir(parents=True, exist_ok=True)

# ------------------------ UTILITIES ------------------------------------------
def as_week_index(dts):
    """Snap to Monday weeks for consistent alignment."""
    return pd.to_datetime(dts).to_period('W-MON').to_timestamp()

def spectral_entropy(x, nseg=8, eps=1e-12):
    """Simple forecastability proxy: 1 - normalized spectral entropy."""
    x = pd.Series(x).dropna().values
    if len(x) < 16: return np.nan
    x = (x - x.mean()) / (x.std()+1e-9)
    # Welch-ish periodogram
    seg = max(16, len(x)//nseg)
    ps = []
    for i in range(0, len(x)-seg+1, seg):
        X = np.fft.rfft(x[i:i+seg])
        P = (X*np.conj(X)).real
        P = P / (P.sum()+eps)
        ps.append(P)
    Pm = np.mean(ps, axis=0)
    H = -(Pm*np.log(Pm+eps)).sum()
    Hmax = math.log(len(Pm))
    return float(1.0 - H/Hmax)  # higher => more forecastable

def zscore(s):
    m, sd = np.nanmean(s), np.nanstd(s)
    if sd==0 or np.isnan(sd): return pd.Series([0]*len(s), index=s.index)
    return (s-m)/sd

def logistic_scale(s):
    """Scale to ~[0,1] via robust logistic mapping."""
    s = pd.Series(s)
    q1, q2, q3 = s.quantile([0.1,0.5,0.9])
    scale = (q3 - q1)/2.0 if q3>q1 else (s.std() or 1.0)
    return 1/(1+np.exp(-(s-q2)/(scale+1e-9)))

def weekly_resample_sum(df, col, datecol):
    x = df.copy()
    x[datecol] = as_week_index(x[datecol])
    return x.groupby(datecol, as_index=True)[col].sum().sort_index()

def weekly_resample_mean(df, col, datecol):
    x = df.copy()
    x[datecol] = as_week_index(x[datecol])
    return x.groupby(datecol, as_index=True)[col].mean().sort_index()

# ------------------------ DATA SOURCES ---------------------------------------
# 1) Google Trends (topics as keywords; you can map to topic IDs later)
def fetch_trends(topics, geo="US", since="2012-01-01"):
    pytrends = TrendReq(hl="en-US", tz=360)
    frames = []
    for kw in topics:
        try:
            pytrends.build_payload([kw], cat=0, timeframe=f"{since} {dt.date.today():%Y-%m-%d}", geo=geo, gprop="")
            df = pytrends.interest_over_time()
            if df.empty: continue
            s = df[kw].rename(kw)
            s.index = as_week_index(s.index)
            frames.append(s)
            time.sleep(1.2)  # be gentle
        except Exception as e:
            print("Trends fetch error for", kw, "->", e)
    if not frames: 
        return pd.DataFrame()
    out = pd.concat(frames, axis=1).sort_index()
    return out

# 2) GDELT counts (simple doc volume via GDELT 2.1 doc API)
def fetch_gdelt_counts(query, since="2012-01-01", until=None):
    # GDELT Doc API time series; we’ll chunk by month to keep it simple.
    # Reference: https://blog.gdeltproject.org/gdelt-doc-2-0-api-debuts/
    if until is None: until = dt.date.today().isoformat()
    start = pd.Timestamp(since).to_period("M").to_timestamp()
    end   = pd.Timestamp(until).to_period("M").to_timestamp()
    months = pd.period_range(start, end, freq="M").to_timestamp()
    rows = []
    for m in months:
        try:
            url = ("https://api.gdeltproject.org/api/v2/doc/doc?"
                   f"query={requests.utils.quote(query)}&mode=TimelineVol"
                   f"&format=json&startdatetime={m:%Y%m%d000000}&enddatetime={(m+pd.offsets.MonthEnd(0)):%Y%m%d235959}")
            r = requests.get(url, timeout=20)
            js = r.json()
            if "timelines" not in js or not js["timelines"]:
                continue
            for pt in js["timelines"][0].get("data", []):
                rows.append((pd.to_datetime(pt["date"]), int(pt["value"])))
            time.sleep(0.7)
        except Exception as e:
            print("GDELT fetch error:", e)
    if not rows: return pd.Series(dtype=float)
    s = pd.Series({d:v for d,v in rows}).sort_index()
    s = s.resample("W-MON").sum()
    s.name = "gdelt_volume"
    return s

# 3) Polling spine (optional local CSVs; expected columns)
#    PEW CSV columns: date,value  (value in [0,100] trust/approval or inverse—your choice)
#    GALLUP CSV columns: date,value
def load_optional_poll(csv_path, colname):
    p = Path(csv_path)
    if not p.exists(): return None
    df = pd.read_csv(p)
    # Heuristic parse
    datecol = [c for c in df.columns if "date" in c.lower()][0]
    valcol  = [c for c in df.columns if c!=datecol][0]
    s = weekly_resample_mean(df, valcol, datecol)
    s.name = colname
    return s

# -------------------- INGEST & ALIGN -----------------------------------------
since = CFG["BACKTEST_START"]
trends = fetch_trends(CFG["OAI_TOPICS"], geo=CFG["REGION"], since=since)
gdelt  = fetch_gdelt_counts(CFG["GDELT_QUERY"], since=since)

pew    = load_optional_poll(CFG["PEW_CSV"], "pew_trust")
gallup = load_optional_poll(CFG["GALLUP_CSV"], "gallup_conf")

frames = []
if trends.shape[0]: frames.append(trends)
if isinstance(gdelt, pd.Series) and gdelt.shape[0]: frames.append(gdelt.to_frame())
if pew is not None: frames.append(pew.to_frame())
if gallup is not None: frames.append(gallup.to_frame())

if not frames:
    raise RuntimeError("No data sources available. Provide internet or local CSVs.")

df = pd.concat(frames, axis=1).sort_index()
df = df[~df.index.duplicated(keep="first")]

# Fill small gaps with Kalman-ish fallback (here: forward/back fill + small EMA)
df = df.astype(float)
df = df.fillna(method="ffill").fillna(method="bfill")
df = df.ewm(span=3, adjust=False).mean()

# -------------------- FEATURES & FORECASTABILITY ------------------------------
# Derive basic transforms
feat = pd.DataFrame(index=df.index)
for c in df.columns:
    feat[c] = df[c]
    feat[c+"_z"] = zscore(df[c])
    # volatility
    feat[c+"_vol4"] = df[c].pct_change().rolling(4).std()

# Forecastability weights (ForeCA proxy)
weights = {}
for c in df.columns:
    w = spectral_entropy(df[c].values)
    weights[c] = w
w_series = pd.Series(weights).replace({np.nan: w_series.mean() if 'w_series' in locals() else 0.5})
w_series = (w_series - w_series.min())/(w_series.max()-w_series.min()+1e-12)
w_series = w_series.clip(0.05, 1.0)

# -------------------- BUILD OAI (weighted PCA -> logistic scaling) ------------
# Stack standardized base channels only (original columns)
base = df.copy()
scaler = StandardScaler()
X = scaler.fit_transform(base.values)
# weight columns by sqrt(weight) before PCA
W = np.diag(np.sqrt(w_series.reindex(base.columns).fillna(0.5).values))
Xw = X.dot(W)

pca = PCA(n_components=1, random_state=CFG["RNG_SEED"])
oai_raw = pca.fit_transform(Xw).ravel()
OAI = pd.Series(oai_raw, index=base.index, name="OAI_raw")
OAI = logistic_scale(zscore(OAI))  # [0,1] scaled

# -------------------- NOWCAST via LocalLevel state-space ----------------------
# Use OAI as target; regressors = z-scored exogenous (Trends+GDELT+Polls derivatives)
exo_cols = [c for c in feat.columns if c.endswith("_z") or c.endswith("_vol4")]
Xreg = feat[exo_cols].fillna(0.0)

# Align and fit Unobserved Components (local level + weekly seasonality optional)
y = pd.Series(OAI.values, index=OAI.index, name="OAI")
model = UnobservedComponents(endog=y, level='local', exog=Xreg)
res = model.fit(disp=False)

# Produce a HORIZON forecast (with exog rolled forward by last observed values)
h = CFG["HORIZON_WEEKS"]
lastX = Xreg.iloc[-1:].values
X_future = np.repeat(lastX, h, axis=0)
fc = res.get_forecast(steps=h, exog=X_future)
mean_fc = fc.predicted_mean
ci_fc   = fc.conf_int(alpha=0.20)  # 80% band

idx_future = pd.date_range(y.index[-1] + pd.offsets.Week(1), periods=h, freq="W-MON")
OAI_f = pd.Series(mean_fc, index=idx_future, name="OAI_fc")
OAI_f = OAI_f.clip(0,1)

# -------------------- SIMPLE CHANGE-POINT (CUSUM) ----------------------------
# CUSUM on residuals as a pragmatic online shift detector
resid = y - res.fittedvalues.reindex_like(y).fillna(method="bfill")
k_cusum = resid.std() * 0.25  # reference
h_cusum = resid.std() * 3.0   # threshold
pos, neg = 0.0, 0.0
alarms = []
for t, e in resid.items():
    pos = max(0.0, pos + e - k_cusum)
    neg = min(0.0, neg + e + k_cusum)
    if pos > h_cusum or abs(neg) > h_cusum:
        alarms.append(t); pos = 0.0; neg = 0.0
cp_dates = alarms[-5:]  # keep last few for plotting

# -------------------- TIME-TO-EVENT (threshold + sustain) --------------------
tau = CFG["THRESHOLD_TAU"]; k = CFG["RUN_LENGTH_K"]
# Monte Carlo from Gaussian forecast errors using in-sample resid volatility
sig = float(resid.std() or 0.05)
n_sims = 2000
paths = []
for _ in range(n_sims):
    noise = np.random.normal(0, sig, size=h)
    sim = (OAI_f.values + noise)
    sim = np.clip(sim, 0, 1)
    paths.append(sim)
paths = np.vstack(paths)

def first_sustained(sim, tau, k):
    above = sim >= tau
    run = 0
    for i, a in enumerate(above):
        run = run + 1 if a else 0
        if run >= k: return i  # index within horizon
    return None

hits = [first_sustained(p, tau, k) for p in paths]
hit_idxs = [x for x in hits if x is not None]
if hit_idxs:
    dates = [idx_future[i] for i in hit_idxs]
    median_date = pd.to_datetime(pd.Series(dates)).median()
    p_next_H = len(hit_idxs)/n_sims
    # credible window
    d80 = (pd.Series(dates).sort_values().iloc[int(0.10*len(dates))],
           pd.Series(dates).sort_values().iloc[int(0.90*len(dates))])
else:
    median_date, p_next_H, d80 = None, 0.0, (None, None)

# -------------------- PLOTS ---------------------------------------------------
plt.figure(figsize=(10,5))
plt.plot(y.index, y.values, label="OAI (nowcasted fit)")
plt.plot(res.fittedvalues.index, res.fittedvalues.values.clip(0,1), linestyle="--", label="State-space fit")
plt.axhline(tau, linestyle=":", label=f"Threshold τ={tau}")
for d in cp_dates:
    plt.axvline(d, linestyle=":", alpha=0.6)
plt.title("Overreach Awareness Index (OAI) + fit + change-point hints")
plt.legend()
fig1 = ROOT/CFG["FIG_DIR"]/f"oai_fit_{int(time.time())}.png"
plt.tight_layout(); plt.savefig(fig1, dpi=160); plt.close()

plt.figure(figsize=(10,4))
plt.plot(OAI_f.index, OAI_f.values, label="Forecast mean")
plt.fill_between(OAI_f.index, ci_fc.iloc[:,0].clip(0,1), ci_fc.iloc[:,1].clip(0,1), alpha=0.2, label="80% band")
plt.axhline(tau, linestyle=":", label=f"τ={tau}")
plt.title("OAI Forecast (next {} weeks)".format(h))
plt.legend()
fig2 = ROOT/CFG["FIG_DIR"]/f"oai_fc_{int(time.time())}.png"
plt.tight_layout(); plt.savefig(fig2, dpi=160); plt.close()

# Probability curve: chance of sustained crossing within t weeks (empirical CDF)
prob_curve = []
for t in range(h):
    prob_curve.append(np.mean([ (x is not None and x<=t) for x in hits ]))
prob_s = pd.Series(prob_curve, index=idx_future, name="P(T* ≤ t)")

plt.figure(figsize=(10,4))
plt.plot(prob_s.index, prob_s.values, label="Pr(sustained crossing by week t)")
plt.ylim(0,1.0)
plt.title("Time-to-Event Probability (sustained τ for k={} weeks)".format(k))
plt.legend()
fig3 = ROOT/CFG["FIG_DIR"]/f"oai_event_prob_{int(time.time())}.png"
plt.tight_layout(); plt.savefig(fig3, dpi=160); plt.close()

# -------------------- SAVE SUMMARY -------------------------------------------
summary = dict(
    generated_at = dt.datetime.utcnow().isoformat()+"Z",
    horizon_weeks = h,
    threshold_tau = tau,
    run_length_k = k,
    last_obs_week = y.index[-1].strftime("%Y-%m-%d"),
    cp_dates = [pd.Timestamp(d).strftime("%Y-%m-%d") for d in cp_dates],
    prob_within_horizon = round(float(p_next_H),4),
    median_event_date = (pd.Timestamp(median_date).strftime("%Y-%m-%d") if median_date is not None else None),
    event_window_80 = tuple(d.strftime("%Y-%m-%d") if d is not None else None for d in d80),
    figures = [str(fig1), str(fig2), str(fig3)],
    weights = {k: float(v) for k,v in w_series.to_dict().items()},
    pca_explained_var = float(pca.explained_variance_ratio_[0])
)
out_json = ROOT/CFG["SAVE_DIR"]/f"oai_event_forecast_{int(time.time())}.json"
with open(out_json, "w") as f: json.dump(summary, f, indent=2)

print("== CNT Wake-Up Single-Cell ==")
print(json.dumps(summary, indent=2))
print("\nSaved figures:")
print(" -", fig1)
print(" -", fig2)
print(" -", fig3)
print("\n-> Summary JSON:", out_json)


Trends fetch error for government surveillance -> The request failed: Google returned a response with code 429
Trends fetch error for censorship -> The request failed: Google returned a response with code 429


  df = df.fillna(False)


Trends fetch error for free speech -> The request failed: Google returned a response with code 429
GDELT fetch error: Expecting value: line 1 column 1 (char 0)
GDELT fetch error: Expecting value: line 1 column 1 (char 0)
GDELT fetch error: Expecting value: line 1 column 1 (char 0)
GDELT fetch error: Expecting value: line 1 column 1 (char 0)
GDELT fetch error: Expecting value: line 1 column 1 (char 0)
GDELT fetch error: Expecting value: line 1 column 1 (char 0)
GDELT fetch error: Expecting value: line 1 column 1 (char 0)
GDELT fetch error: Expecting value: line 1 column 1 (char 0)
GDELT fetch error: Expecting value: line 1 column 1 (char 0)
GDELT fetch error: Expecting value: line 1 column 1 (char 0)
GDELT fetch error: Expecting value: line 1 column 1 (char 0)
GDELT fetch error: Expecting value: line 1 column 1 (char 0)
GDELT fetch error: Expecting value: line 1 column 1 (char 0)
GDELT fetch error: Expecting value: line 1 column 1 (char 0)
GDELT fetch error: Expecting value: line 1 colu

  df = df.fillna(method="ffill").fillna(method="bfill")


ValueError: Invalid level/trend specification: 'local'

In [6]:
# === CNT "Wake-Up" Single-Cell Pipeline ======================================
# Dependencies: pandas, numpy, requests, statsmodels, scikit-learn, pytrends, matplotlib
# If needed: pip installs (uncomment)
# import sys, subprocess; [subprocess.check_call([sys.executable,"-m","pip","install",p]) for p in
#  ["pandas","numpy","requests","statsmodels","scikit-learn","pytrends","matplotlib"]]

import os, io, json, math, time, datetime as dt
from pathlib import Path
import numpy as np, pandas as pd, requests
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.statespace.structural import UnobservedComponents
from pytrends.request import TrendReq
import matplotlib.pyplot as plt

# ---------------------------- CONFIG -----------------------------------------
CFG = dict(
    ROOT                = r"C:\Users\caleb\CNT_Lab",   # change if needed
    REGION              = "US",
    HORIZON_WEEKS       = 52,       # forecast horizon
    THRESHOLD_TAU       = 0.65,     # OAI threshold in [0,1]
    RUN_LENGTH_K        = 6,        # sustained weeks above threshold
    BACKTEST_START      = "2012-01-01",
    OAI_TOPICS          = ["government surveillance","censorship","civil liberties","free speech"],
    GDELT_QUERY         = '(censorship OR surveillance OR "civil liberties")',
    PEW_CSV             = r"C:\Users\caleb\CNT_Lab\data\pew_trust_government.csv",   # optional
    GALLUP_CSV          = r"C:\Users\caleb\CNT_Lab\data\gallup_trust_confidence.csv",# optional
    SAVE_DIR            = "artifacts",
    FIG_DIR             = "artifacts\\figures",
    RNG_SEED            = 1337
)

np.random.seed(CFG["RNG_SEED"])
ROOT = Path(CFG["ROOT"]); (ROOT/CFG["SAVE_DIR"]).mkdir(parents=True, exist_ok=True); (ROOT/CFG["FIG_DIR"]).mkdir(parents=True, exist_ok=True)

# ------------------------ UTILITIES ------------------------------------------
def as_week_index(dts):
    """Snap to Monday weeks for consistent alignment."""
    return pd.to_datetime(dts).to_period('W-MON').to_timestamp()

def spectral_entropy(x, nseg=8, eps=1e-12):
    """Simple forecastability proxy: 1 - normalized spectral entropy."""
    x = pd.Series(x).dropna().values
    if len(x) < 16: return np.nan
    x = (x - x.mean()) / (x.std()+1e-9)
    # Welch-ish periodogram
    seg = max(16, len(x)//nseg)
    ps = []
    for i in range(0, len(x)-seg+1, seg):
        X = np.fft.rfft(x[i:i+seg])
        P = (X*np.conj(X)).real
        P = P / (P.sum()+eps)
        ps.append(P)
    Pm = np.mean(ps, axis=0)
    H = -(Pm*np.log(Pm+eps)).sum()
    Hmax = math.log(len(Pm))
    return float(1.0 - H/Hmax)  # higher => more forecastable

def zscore(s):
    m, sd = np.nanmean(s), np.nanstd(s)
    if sd==0 or np.isnan(sd): return pd.Series([0]*len(s), index=s.index)
    return (s-m)/sd

def logistic_scale(s):
    """Scale to ~[0,1] via robust logistic mapping."""
    s = pd.Series(s)
    q1, q2, q3 = s.quantile([0.1,0.5,0.9])
    scale = (q3 - q1)/2.0 if q3>q1 else (s.std() or 1.0)
    return 1/(1+np.exp(-(s-q2)/(scale+1e-9)))

def weekly_resample_sum(df, col, datecol):
    x = df.copy()
    x[datecol] = as_week_index(x[datecol])
    return x.groupby(datecol, as_index=True)[col].sum().sort_index()

def weekly_resample_mean(df, col, datecol):
    x = df.copy()
    x[datecol] = as_week_index(x[datecol])
    return x.groupby(datecol, as_index=True)[col].mean().sort_index()

# ------------------------ DATA SOURCES ---------------------------------------
# 1) Google Trends (topics as keywords; you can map to topic IDs later)
def fetch_trends(topics, geo="US", since="2012-01-01"):
    pytrends = TrendReq(hl="en-US", tz=360)
    frames = []
    for kw in topics:
        try:
            pytrends.build_payload([kw], cat=0, timeframe=f"{since} {dt.date.today():%Y-%m-%d}", geo=geo, gprop="")
            df = pytrends.interest_over_time()
            if df.empty: continue
            s = df[kw].rename(kw)
            s.index = as_week_index(s.index)
            frames.append(s)
            time.sleep(1.2)  # be gentle
        except Exception as e:
            print("Trends fetch error for", kw, "->", e)
    if not frames: 
        return pd.DataFrame()
    out = pd.concat(frames, axis=1).sort_index()
    return out

# 2) GDELT counts (simple doc volume via GDELT 2.1 doc API)
def fetch_gdelt_counts(query, since="2012-01-01", until=None):
    # GDELT Doc API time series; we’ll chunk by month to keep it simple.
    # Reference: https://blog.gdeltproject.org/gdelt-doc-2-0-api-debuts/
    if until is None: until = dt.date.today().isoformat()
    start = pd.Timestamp(since).to_period("M").to_timestamp()
    end   = pd.Timestamp(until).to_period("M").to_timestamp()
    months = pd.period_range(start, end, freq="M").to_timestamp()
    rows = []
    for m in months:
        try:
            url = ("https://api.gdeltproject.org/api/v2/doc/doc?"
                   f"query={requests.utils.quote(query)}&mode=TimelineVol"
                   f"&format=json&startdatetime={m:%Y%m%d000000}&enddatetime={(m+pd.offsets.MonthEnd(0)):%Y%m%d235959}")
            r = requests.get(url, timeout=20)
            js = r.json()
            if "timelines" not in js or not js["timelines"]:
                continue
            for pt in js["timelines"][0].get("data", []):
                rows.append((pd.to_datetime(pt["date"]), int(pt["value"])))
            time.sleep(0.7)
        except Exception as e:
            print("GDELT fetch error:", e)
    if not rows: return pd.Series(dtype=float)
    s = pd.Series({d:v for d,v in rows}).sort_index()
    s = s.resample("W-MON").sum()
    s.name = "gdelt_volume"
    return s

# 3) Polling spine (optional local CSVs; expected columns)
#    PEW CSV columns: date,value  (value in [0,100] trust/approval or inverse—your choice)
#    GALLUP CSV columns: date,value
def load_optional_poll(csv_path, colname):
    p = Path(csv_path)
    if not p.exists(): return None
    df = pd.read_csv(p)
    # Heuristic parse
    datecol = [c for c in df.columns if "date" in c.lower()][0]
    valcol  = [c for c in df.columns if c!=datecol][0]
    s = weekly_resample_mean(df, valcol, datecol)
    s.name = colname
    return s

# -------------------- INGEST & ALIGN -----------------------------------------
since = CFG["BACKTEST_START"]
trends = fetch_trends(CFG["OAI_TOPICS"], geo=CFG["REGION"], since=since)
gdelt  = fetch_gdelt_counts(CFG["GDELT_QUERY"], since=since)

pew    = load_optional_poll(CFG["PEW_CSV"], "pew_trust")
gallup = load_optional_poll(CFG["GALLUP_CSV"], "gallup_conf")

frames = []
if trends.shape[0]: frames.append(trends)
if isinstance(gdelt, pd.Series) and gdelt.shape[0]: frames.append(gdelt.to_frame())
if pew is not None: frames.append(pew.to_frame())
if gallup is not None: frames.append(gallup.to_frame())

if not frames:
    raise RuntimeError("No data sources available. Provide internet or local CSVs.")

df = pd.concat(frames, axis=1).sort_index()
df = df[~df.index.duplicated(keep="first")]

# Fill small gaps with Kalman-ish fallback (here: forward/back fill + small EMA)
df = df.astype(float)
df = df.fillna(method="ffill").fillna(method="bfill")
df = df.ewm(span=3, adjust=False).mean()

# -------------------- FEATURES & FORECASTABILITY ------------------------------
# Derive basic transforms
feat = pd.DataFrame(index=df.index)
for c in df.columns:
    feat[c] = df[c]
    feat[c+"_z"] = zscore(df[c])
    # volatility
    feat[c+"_vol4"] = df[c].pct_change().rolling(4).std()

# Forecastability weights (ForeCA proxy)
weights = {}
for c in df.columns:
    w = spectral_entropy(df[c].values)
    weights[c] = w
w_series = pd.Series(weights).replace({np.nan: w_series.mean() if 'w_series' in locals() else 0.5})
w_series = (w_series - w_series.min())/(w_series.max()-w_series.min()+1e-12)
w_series = w_series.clip(0.05, 1.0)

# -------------------- BUILD OAI (weighted PCA -> logistic scaling) ------------
# Stack standardized base channels only (original columns)
base = df.copy()
scaler = StandardScaler()
X = scaler.fit_transform(base.values)
# weight columns by sqrt(weight) before PCA
W = np.diag(np.sqrt(w_series.reindex(base.columns).fillna(0.5).values))
Xw = X.dot(W)

pca = PCA(n_components=1, random_state=CFG["RNG_SEED"])
oai_raw = pca.fit_transform(Xw).ravel()
OAI = pd.Series(oai_raw, index=base.index, name="OAI_raw")
OAI = logistic_scale(zscore(OAI))  # [0,1] scaled

# -------------------- NOWCAST via LocalLevel state-space ----------------------
# Use OAI as target; regressors = z-scored exogenous (Trends+GDELT+Polls derivatives)
exo_cols = [c for c in feat.columns if c.endswith("_z") or c.endswith("_vol4")]
Xreg = feat[exo_cols].fillna(0.0)

# Align and fit Unobserved Components (local level + weekly seasonality optional)
y = pd.Series(OAI.values, index=OAI.index, name="OAI")
model = UnobservedComponents(endog=y, level='local', exog=Xreg)
res = model.fit(disp=False)

# Produce a HORIZON forecast (with exog rolled forward by last observed values)
h = CFG["HORIZON_WEEKS"]
lastX = Xreg.iloc[-1:].values
X_future = np.repeat(lastX, h, axis=0)
fc = res.get_forecast(steps=h, exog=X_future)
mean_fc = fc.predicted_mean
ci_fc   = fc.conf_int(alpha=0.20)  # 80% band

idx_future = pd.date_range(y.index[-1] + pd.offsets.Week(1), periods=h, freq="W-MON")
OAI_f = pd.Series(mean_fc, index=idx_future, name="OAI_fc")
OAI_f = OAI_f.clip(0,1)

# -------------------- SIMPLE CHANGE-POINT (CUSUM) ----------------------------
# CUSUM on residuals as a pragmatic online shift detector
resid = y - res.fittedvalues.reindex_like(y).fillna(method="bfill")
k_cusum = resid.std() * 0.25  # reference
h_cusum = resid.std() * 3.0   # threshold
pos, neg = 0.0, 0.0
alarms = []
for t, e in resid.items():
    pos = max(0.0, pos + e - k_cusum)
    neg = min(0.0, neg + e + k_cusum)
    if pos > h_cusum or abs(neg) > h_cusum:
        alarms.append(t); pos = 0.0; neg = 0.0
cp_dates = alarms[-5:]  # keep last few for plotting

# -------------------- TIME-TO-EVENT (threshold + sustain) --------------------
tau = CFG["THRESHOLD_TAU"]; k = CFG["RUN_LENGTH_K"]
# Monte Carlo from Gaussian forecast errors using in-sample resid volatility
sig = float(resid.std() or 0.05)
n_sims = 2000
paths = []
for _ in range(n_sims):
    noise = np.random.normal(0, sig, size=h)
    sim = (OAI_f.values + noise)
    sim = np.clip(sim, 0, 1)
    paths.append(sim)
paths = np.vstack(paths)

def first_sustained(sim, tau, k):
    above = sim >= tau
    run = 0
    for i, a in enumerate(above):
        run = run + 1 if a else 0
        if run >= k: return i  # index within horizon
    return None

hits = [first_sustained(p, tau, k) for p in paths]
hit_idxs = [x for x in hits if x is not None]
if hit_idxs:
    dates = [idx_future[i] for i in hit_idxs]
    median_date = pd.to_datetime(pd.Series(dates)).median()
    p_next_H = len(hit_idxs)/n_sims
    # credible window
    d80 = (pd.Series(dates).sort_values().iloc[int(0.10*len(dates))],
           pd.Series(dates).sort_values().iloc[int(0.90*len(dates))])
else:
    median_date, p_next_H, d80 = None, 0.0, (None, None)

# -------------------- PLOTS ---------------------------------------------------
plt.figure(figsize=(10,5))
plt.plot(y.index, y.values, label="OAI (nowcasted fit)")
plt.plot(res.fittedvalues.index, res.fittedvalues.values.clip(0,1), linestyle="--", label="State-space fit")
plt.axhline(tau, linestyle=":", label=f"Threshold τ={tau}")
for d in cp_dates:
    plt.axvline(d, linestyle=":", alpha=0.6)
plt.title("Overreach Awareness Index (OAI) + fit + change-point hints")
plt.legend()
fig1 = ROOT/CFG["FIG_DIR"]/f"oai_fit_{int(time.time())}.png"
plt.tight_layout(); plt.savefig(fig1, dpi=160); plt.close()

plt.figure(figsize=(10,4))
plt.plot(OAI_f.index, OAI_f.values, label="Forecast mean")
plt.fill_between(OAI_f.index, ci_fc.iloc[:,0].clip(0,1), ci_fc.iloc[:,1].clip(0,1), alpha=0.2, label="80% band")
plt.axhline(tau, linestyle=":", label=f"τ={tau}")
plt.title("OAI Forecast (next {} weeks)".format(h))
plt.legend()
fig2 = ROOT/CFG["FIG_DIR"]/f"oai_fc_{int(time.time())}.png"
plt.tight_layout(); plt.savefig(fig2, dpi=160); plt.close()

# Probability curve: chance of sustained crossing within t weeks (empirical CDF)
prob_curve = []
for t in range(h):
    prob_curve.append(np.mean([ (x is not None and x<=t) for x in hits ]))
prob_s = pd.Series(prob_curve, index=idx_future, name="P(T* ≤ t)")

plt.figure(figsize=(10,4))
plt.plot(prob_s.index, prob_s.values, label="Pr(sustained crossing by week t)")
plt.ylim(0,1.0)
plt.title("Time-to-Event Probability (sustained τ for k={} weeks)".format(k))
plt.legend()
fig3 = ROOT/CFG["FIG_DIR"]/f"oai_event_prob_{int(time.time())}.png"
plt.tight_layout(); plt.savefig(fig3, dpi=160); plt.close()

# -------------------- SAVE SUMMARY -------------------------------------------
summary = dict(
    generated_at = dt.datetime.utcnow().isoformat()+"Z",
    horizon_weeks = h,
    threshold_tau = tau,
    run_length_k = k,
    last_obs_week = y.index[-1].strftime("%Y-%m-%d"),
    cp_dates = [pd.Timestamp(d).strftime("%Y-%m-%d") for d in cp_dates],
    prob_within_horizon = round(float(p_next_H),4),
    median_event_date = (pd.Timestamp(median_date).strftime("%Y-%m-%d") if median_date is not None else None),
    event_window_80 = tuple(d.strftime("%Y-%m-%d") if d is not None else None for d in d80),
    figures = [str(fig1), str(fig2), str(fig3)],
    weights = {k: float(v) for k,v in w_series.to_dict().items()},
    pca_explained_var = float(pca.explained_variance_ratio_[0])
)
out_json = ROOT/CFG["SAVE_DIR"]/f"oai_event_forecast_{int(time.time())}.json"
with open(out_json, "w") as f: json.dump(summary, f, indent=2)

print("== CNT Wake-Up Single-Cell ==")
print(json.dumps(summary, indent=2))
print("\nSaved figures:")
print(" -", fig1)
print(" -", fig2)
print(" -", fig3)
print("\n-> Summary JSON:", out_json)


Trends fetch error for government surveillance -> The request failed: Google returned a response with code 429
Trends fetch error for censorship -> The request failed: Google returned a response with code 429


  df = df.fillna(False)
  df = df.fillna(False)


GDELT fetch error: Expecting value: line 1 column 1 (char 0)
GDELT fetch error: Expecting value: line 1 column 1 (char 0)
GDELT fetch error: Expecting value: line 1 column 1 (char 0)
GDELT fetch error: Expecting value: line 1 column 1 (char 0)
GDELT fetch error: Expecting value: line 1 column 1 (char 0)
GDELT fetch error: Expecting value: line 1 column 1 (char 0)
GDELT fetch error: Expecting value: line 1 column 1 (char 0)
GDELT fetch error: Expecting value: line 1 column 1 (char 0)
GDELT fetch error: Expecting value: line 1 column 1 (char 0)
GDELT fetch error: Expecting value: line 1 column 1 (char 0)
GDELT fetch error: Expecting value: line 1 column 1 (char 0)
GDELT fetch error: Expecting value: line 1 column 1 (char 0)
GDELT fetch error: Expecting value: line 1 column 1 (char 0)
GDELT fetch error: Expecting value: line 1 column 1 (char 0)
GDELT fetch error: Expecting value: line 1 column 1 (char 0)
GDELT fetch error: Expecting value: line 1 column 1 (char 0)
GDELT fetch error: Expec

  df = df.fillna(method="ffill").fillna(method="bfill")


ValueError: Invalid level/trend specification: 'local'