# Sentiment-Enhanced Baseline (last 30 days overlap)

This notebook adds **news sentiment** to your price-only baseline for the **overlapping period** you have news for (e.g., last 30 days).

We will:
1. Load `adj_close_wide.parquet` and compute returns.
2. Load `daily_sentiment_last_30d.parquet` (from your local aggregation) and align it to business days.
3. Build a feature set: lagged returns + sentiment features (the same-day score plus its 7- and 14-business-day rolling means).
4. Train/test split and compare **Ridge (price-only)** vs **Ridge (price + sentiment)** on the **same dates**.
5. Save the sentiment-enhanced model artifacts.

**Note**: Entry-level news API plans often provide only ~30 days of history, so this notebook focuses on that window to show the *incremental value* of sentiment.

In [None]:
#@title 0) Setup
!pip -q install pandas pyarrow scikit-learn joblib matplotlib --upgrade
import pandas as pd, numpy as np, os, json
from pathlib import Path
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import joblib
import matplotlib.pyplot as plt
print('Setup complete.')

In [None]:
#@title 1) Inputs and parameters
# Every tunable input lives in this cell; the `#@param` markers render as Colab form fields.

# Parquet file of adjusted close prices; must contain a column named TARGET_TICKER.
ADJ_CLOSE_WIDE = "/content/drive/MyDrive/ai_stock_watcher/data/curated/adj_close_wide.parquet"  #@param {type:"string"}
# Parquet file of daily sentiment; the notebook reads its `date`, `ticker` and `sent` columns.
DAILY_SENT_30D = "/content/drive/MyDrive/ai_stock_watcher/data/curated/daily_sentiment_last_30d.parquet"  #@param {type:"string"}
# Ticker to model: selects the price column and filters the sentiment rows.
TARGET_TICKER = "AAPL"  #@param {type:"string"}
# Number of lagged-return feature columns (lag_1 … lag_LAGS).
LAGS = 20  #@param {type:"integer"}
# Fraction of rows, taken from the END of the sample, held out as the test set.
TEST_SPLIT_FRACTION = 0.2  #@param {type:"number"}
# Output directory for the saved model and its metadata JSON.
ART_DIR = "/content/drive/MyDrive/ai_stock_watcher/artifacts"  #@param {type:"string"}
# Create the artifact directory (and any missing parents) up front; no error if it already exists.
Path(ART_DIR).mkdir(parents=True, exist_ok=True)
print('ART_DIR =', ART_DIR)

In [None]:
#@title 2) Load prices (5y) → returns; load daily sentiment (30d)
# --- Prices → daily returns -------------------------------------------------
# The price frame has one column per ticker; conform it to a business-day index.
prices = pd.read_parquet(ADJ_CLOSE_WIDE).sort_index().asfreq('B')
if TARGET_TICKER not in prices.columns:
    raise ValueError(f"{TARGET_TICKER} not found in adj_close_wide.parquet columns.")
# asfreq('B') inserts NaN rows for business days that have no price.
# Forward-fill those gaps explicitly and switch off pct_change's own filling:
# its implicit fill_method='pad' default is deprecated, and without it the gaps
# would turn into NaN returns. A filled gap day therefore gets a 0% return.
rets = prices.ffill().pct_change(fill_method=None)
target_ret = rets[[TARGET_TICKER]].rename(columns={TARGET_TICKER: 'ret'})

# --- Daily sentiment for the target ticker ----------------------------------
sent = pd.read_parquet(DAILY_SENT_30D)
if sent.empty:
    raise SystemExit("daily_sentiment_last_30d.parquet is empty. Run your local aggregation first.")
# Fail early with a readable message instead of a bare KeyError further down.
missing_cols = {'date', 'ticker', 'sent'} - set(sent.columns)
if missing_cols:
    raise ValueError(f"daily_sentiment_last_30d.parquet is missing columns: {sorted(missing_cols)}")
sent['date'] = pd.to_datetime(sent['date'])
sent_t = sent[sent['ticker'] == TARGET_TICKER].copy()
if sent_t.empty:
    raise SystemExit(f"No sentiment rows for {TARGET_TICKER}.")
sent_t = sent_t.set_index('date').sort_index()
# asfreq reindexes and cannot handle repeated labels; say so clearly.
if sent_t.index.has_duplicates:
    raise ValueError(f"Multiple sentiment rows per date for {TARGET_TICKER}; aggregate to one row per day first.")
# Conform to business days: rows dated on weekends are dropped, and business
# days without a row are inserted as NaN.
sent_t = sent_t.asfreq('B')  # business days
# A day without a sentiment row is treated as neutral (0.0).
sent_t['sent'] = sent_t['sent'].fillna(0.0)
# Rolling means over the last 7 / 14 business-day ROWS (not calendar days);
# min_periods=1 yields a value from the first row of the window onwards.
sent_t['sent_ma7'] = sent_t['sent'].rolling(7, min_periods=1).mean()
sent_t['sent_ma14'] = sent_t['sent'].rolling(14, min_periods=1).mean()

display(target_ret.tail())
display(sent_t.tail())

In [None]:
#@title 3) Build aligned feature frame on the overlapping dates
def make_lagged(y: pd.Series, lags:int) -> pd.DataFrame:
    """Return a frame holding `y` next to its first `lags` lagged copies.

    The result shares the index of `y`. Column 'y' is the series itself and
    column 'lag_k' (k = 1 … lags) is `y` shifted down by k rows, so the first
    k entries of 'lag_k' are NaN.
    """
    columns = {"y": y}
    columns.update({f"lag_{step}": y.shift(step) for step in range(1, lags + 1)})
    return pd.DataFrame(columns)

# Lags are computed on the FULL return history, so rows inside the (short)
# sentiment window already carry LAGS past returns.
# Row t holds returns t-1 … t-LAGS; the same-day return stays in 'y' and is
# not used as a feature.
sent_cols = ['sent', 'sent_ma7', 'sent_ma14']
df = make_lagged(target_ret['ret'], LAGS)
lag_cols = [c for c in df.columns if c.startswith('lag_')]

# Predict next-day return (shift target by -1). The shift is done on the full
# business-day index BEFORE any rows are removed, so the target is always the
# return of the adjacent row and is aligned by index, not by position.
df['y_next'] = df['y'].shift(-1)

# Inner join: keep only dates that have BOTH a return row and a sentiment row.
# A left join followed by fillna(0.0) would keep the whole price history with
# zero sentiment outside the news window, so the two models would not be
# compared on the overlapping period.
df = df.join(sent_t[sent_cols], how='inner')

# Drop rows with an incomplete lag history or no next-day target (the last row).
df = df.dropna(subset=lag_cols + ['y_next'])
if df.empty:
    raise ValueError("No usable rows: returns and sentiment do not overlap after dropping incomplete rows.")

y_next = df['y_next'].rename('y')
X_all = df.drop(columns=['y', 'y_next'])
assert X_all.index.equals(y_next.index)

# Price-only view on same rows
X_price_only = X_all.filter(like='lag_')
X_with_sent = X_all.copy()

print('Rows available for comparison:', len(X_with_sent))
display(X_with_sent.tail())

In [None]:
#@title 4) Train/test split (time-based) and compare
# Chronological split: the first (1 - TEST_SPLIT_FRACTION) of the rows train the
# models and the remaining, most recent rows are held out. No shuffling, so the
# test period always follows the training period.
split = int(len(X_with_sent) * (1 - TEST_SPLIT_FRACTION))
# With a short window either side can end up empty; fail here with a clear
# message rather than inside scikit-learn.
if not 0 < split < len(X_with_sent):
    raise ValueError(
        f"Cannot split {len(X_with_sent)} rows with TEST_SPLIT_FRACTION={TEST_SPLIT_FRACTION}: "
        "train and test sets must both be non-empty."
    )
# Both feature views are cut at the same position, so they share the same dates and target.
Xtr_p, Xte_p = X_price_only.iloc[:split], X_price_only.iloc[split:]
Xtr_s, Xte_s = X_with_sent.iloc[:split], X_with_sent.iloc[split:]
ytr, yte = y_next.iloc[:split], y_next.iloc[split:]

def fit_eval(Xtr, Xte, ytr, yte, label):
    """Fit a StandardScaler → Ridge pipeline on the training split and score it on the test split.

    Parameters
    ----------
    Xtr, Xte : feature matrices for the training and test rows.
    ytr, yte : targets for the training and test rows.
    label : str
        Name printed in front of the metrics line.

    Returns
    -------
    tuple
        ``(pipe, yhat, metrics)``: the fitted pipeline, its predictions for
        ``Xte``, and a dict with float entries ``"rmse"``, ``"mae"`` and
        ``"dir_acc"``.
    """
    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("ridge", Ridge(alpha=1.0, random_state=42))
    ])
    pipe.fit(Xtr, ytr)
    yhat = pipe.predict(Xte)
    # Take the square root ourselves: the `squared=False` keyword of
    # mean_squared_error was deprecated in scikit-learn 1.4 and later removed,
    # so passing it raises TypeError on current releases.
    rmse = np.sqrt(mean_squared_error(yte, yhat))
    mae = mean_absolute_error(yte, yhat)
    # Directional accuracy: share of test rows where prediction and actual
    # return have the same sign (a zero only matches another zero).
    dacc = (np.sign(yhat) == np.sign(np.asarray(yte))).mean()
    print(f"{label} → RMSE={rmse:.6f}  MAE={mae:.6f}  DirAcc={dacc:.3f}")
    return pipe, yhat, {"rmse": float(rmse), "mae": float(mae), "dir_acc": float(dacc)}

# Both models see the same rows and the same target; only the feature columns differ.
pipe_p, yhat_p, m_price = fit_eval(Xtr_p, Xte_p, ytr, yte, "Ridge (price-only)")
pipe_s, yhat_s, m_sent = fit_eval(Xtr_s, Xte_s, ytr, yte, "Ridge (price+sentiment)")

In [None]:
#@title 5) Plot test predictions (price-only vs with sentiment)
# Overlay the realised test-period returns with both models' predictions.
plt.figure()
for series_label, series_values in (
    ('Actual', yte.values),
    ('Price-only', yhat_p),
    ('Price+Sentiment', yhat_s),
):
    plt.plot(yte.index, series_values, label=series_label)
plt.title(f"{TARGET_TICKER} — Test returns: adding sentiment")
plt.xlabel('Date')
plt.ylabel('Daily return')
plt.legend()
plt.show()

In [None]:
#@title 6) Save sentiment-enhanced model and metadata
# Persist the fitted price+sentiment pipeline.
sent_model_path = os.path.join(ART_DIR, f"ridge_with_sent_{TARGET_TICKER}.joblib")
joblib.dump(pipe_s, sent_model_path)

# Sidecar JSON next to the model: ticker, lag count, the feature columns of
# both views, and the test metrics of both models.
meta_path = os.path.join(ART_DIR, f"ridge_with_sent_{TARGET_TICKER}_meta.json")
meta = dict(
    ticker=TARGET_TICKER,
    lags=int(LAGS),
    features_price_only=X_price_only.columns.tolist(),
    features_with_sent=X_with_sent.columns.tolist(),
    metrics=dict(price_only=m_price, with_sent=m_sent),
)
with open(meta_path, 'w') as meta_file:
    meta_file.write(json.dumps(meta, indent=2))

print('Saved model →', sent_model_path)
print('Saved meta  →', meta_path)