# Price-Only Baselines (5y) — AI Stock Watcher

This notebook trains **price-only** baselines using the 5-year adjusted close panel (`adj_close_wide.parquet`).
We start simple and strong:

1. **Naive-1** baseline: predict next-day return = last return.
2. **Ridge (lagged returns)** baseline: predict next-day return using the last *L* days of returns.
3. *(Optional)* **ARIMA** quick baseline for comparison.

Outputs:
- `artifacts/price_only_ridge.joblib` — StandardScaler+Ridge pipeline
- `artifacts/price_only_meta.json` — feature/ticker metadata
- `artifacts/arima_<TICKER>.pkl` — pickled fitted ARIMA results (optional; written only when `USE_ARIMA` is enabled)

**Usage**: Upload or mount your `adj_close_wide.parquet` and set the input path below.

In [None]:
#@title 0) Setup
!pip -q install pandas pyarrow scikit-learn statsmodels joblib matplotlib --upgrade
import pandas as pd, numpy as np, json, os, math
from pathlib import Path
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import joblib
import matplotlib.pyplot as plt
print('Setup complete.')

In [None]:
#@title 1) Input paths and parameters
# Notebook-wide configuration. The trailing `#@param` annotations are Colab form
# markers, so each assignment must stay on a single line.

# Parquet file holding the wide adjusted-close panel; read with pd.read_parquet
# in the loading cell.
ADJ_CLOSE_WIDE = "/content/drive/MyDrive/ai_stock_watcher/data/curated/adj_close_wide.parquet"  #@param {type:"string"}
# Column of the panel to model; the loading cell raises ValueError if it is absent.
TARGET_TICKER = "AAPL"  #@param {type:"string"}
# Number of lagged-return feature columns (lag_1 .. lag_LAGS).
LAGS = 20  #@param {type:"integer"}
# Fraction of the (time-ordered) samples held out at the end as the test set.
TEST_SPLIT_FRACTION = 0.2  #@param {type:"number"}
# Directory that receives the saved pipeline, metadata JSON and optional ARIMA pickle.
ART_DIR = "/content/drive/MyDrive/ai_stock_watcher/artifacts"  #@param {type:"string"}

# Create the artifact directory up front (no error if it already exists).
Path(ART_DIR).mkdir(parents=True, exist_ok=True)
print('ART_DIR =', ART_DIR)

In [None]:
#@title 2) Load 5y panel and compute returns
prices = pd.read_parquet(ADJ_CLOSE_WIDE)
# Keep only the dates that are actually present in the panel. Re-indexing to a
# business-day calendar (asfreq('B')) would insert all-NaN rows for market
# holidays; those rows either become spurious 0% returns (legacy pad-filling
# pct_change) or NaN returns that knock out every feature row whose lag window
# touches them.
prices = prices.sort_index()
if TARGET_TICKER not in prices.columns:
    raise ValueError(f"{TARGET_TICKER} not found in adj_close_wide.parquet columns.")

# simple returns (you can switch to log returns if you prefer)
# fill_method=None: never forward-fill missing prices, so a missing price yields
# a NaN return instead of a fabricated zero return.
rets = prices.pct_change(fill_method=None)
target = rets[[TARGET_TICKER]].rename(columns={TARGET_TICKER: 'y'})
display(target.tail())
print('Data points:', len(target), 'Date range:', target.index.min(), '→', target.index.max())

In [None]:
#@title 3) Build lagged-return features
def make_lagged_features(y: pd.Series, lags: int) -> pd.DataFrame:
    """Build a frame with the series and its first `lags` lagged copies.

    Parameters
    ----------
    y : pd.Series
        Series to lag (here: daily returns).
    lags : int
        Number of lag columns to create.

    Returns
    -------
    pd.DataFrame
        Columns ``y, lag_1, ..., lag_<lags>`` where ``lag_k`` is ``y`` shifted
        down by ``k`` rows. Every row containing a NaN in any column is removed.
    """
    columns = {"y": y}
    columns.update({f"lag_{k}": y.shift(k) for k in range(1, lags + 1)})
    # dropna removes the leading rows lacking a full lag window, plus any row
    # whose window overlaps a missing value.
    return pd.DataFrame(columns).dropna()

df = make_lagged_features(target['y'], LAGS)
# In row t, `y` is the return of day t and lag_k is the return of day t-k, so the
# lag columns already contain only information available before day t.
# The target is therefore `y` itself. Shifting it by -1 would pair the return of
# day t+1 with lags that stop at day t-1, i.e. the most recent return would never
# be used, and lag_1 would mean something different from the inference window
# (where lag_1 is the latest observed return).
X = df.filter(like='lag_')
y = df['y']  # next-day return relative to the lag window
assert len(X) == len(y)
assert X.index.equals(y.index)
display(df.tail())
print('Feature matrix:', X.shape)

In [None]:
#@title 4) Train/validation split (time-based)
# Chronological split: the first (1 - TEST_SPLIT_FRACTION) of the rows train the
# model, the remaining tail is the test set (no shuffling for time series).
split = int(len(X) * (1 - TEST_SPLIT_FRACTION))
X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y.iloc[:split], y.iloc[split:]
if len(X_train) == 0 or len(X_test) == 0:
    raise ValueError(
        f"Empty train or test split (train={len(X_train)}, test={len(X_test)}); "
        "check TEST_SPLIT_FRACTION and the amount of data."
    )
print('Train:', X_train.shape, 'Test:', X_test.shape)

# Naive-1 baseline: predict next = last return
yhat_naive = X_test['lag_1'].values
# RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error has been
# removed from recent scikit-learn releases, while this form works on all versions.
rmse_naive = float(np.sqrt(mean_squared_error(y_test, yhat_naive)))
mae_naive = mean_absolute_error(y_test, yhat_naive)
# Directional accuracy: share of test days where predicted and actual signs agree.
dir_acc_naive = (np.sign(yhat_naive) == np.sign(y_test.values)).mean()
print(f"Naive-1 → RMSE={rmse_naive:.6f}  MAE={mae_naive:.6f}  DirAcc={dir_acc_naive:.3f}")

In [None]:
#@title 5) Ridge baseline (StandardScaler + Ridge)
# Standardise the lag features, then fit an L2-regularised linear model.
# The scaler lives inside the pipeline so it is fitted on the training rows only.
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("ridge", Ridge(alpha=1.0, random_state=42))
])
pipe.fit(X_train, y_train)
yhat = pipe.predict(X_test)

# RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error has been
# removed from recent scikit-learn releases, while this form works on all versions.
rmse = float(np.sqrt(mean_squared_error(y_test, yhat)))
mae = mean_absolute_error(y_test, yhat)
# Directional accuracy: share of test days where predicted and actual signs agree.
dir_acc = (np.sign(yhat) == np.sign(y_test.values)).mean()
print(f"Ridge → RMSE={rmse:.6f}  MAE={mae:.6f}  DirAcc={dir_acc:.3f}")

In [None]:
#@title 6) Plot: actual vs predicted returns (test set)
# Overlay realised and Ridge-predicted daily returns over the test period.
fig, ax = plt.subplots()
ax.plot(y_test.index, y_test.values, label='Actual')
ax.plot(y_test.index, yhat, label='Predicted')
ax.set_title(f"{TARGET_TICKER} — Returns (test set)")
ax.legend()
ax.set_xlabel('Date')
ax.set_ylabel('Daily return')
plt.show()

In [None]:
#@title 7) (Optional) ARIMA quick baseline
USE_ARIMA = False  #@param {type:"boolean"}
ARIMA_ORDER = (1,1,1)  #@param {type:"raw"}
if USE_ARIMA:
    # Imported lazily so statsmodels is only needed when the baseline is enabled.
    import pickle
    import warnings
    from statsmodels.tsa.arima.model import ARIMA

    # ARIMA is fitted on price levels; returns are derived from the forecast path.
    p = prices[TARGET_TICKER].dropna()
    p = p.loc[X_train.index.min():X_test.index.max()]
    p_train = p.loc[:X_train.index.max()]
    # Everything after the last training date, so train and test prices are
    # contiguous and the k-th forecast step corresponds to the k-th test date.
    p_test = p.loc[p.index > p_train.index[-1]]

    # Silence statsmodels' convergence/index warnings for this fit only, instead
    # of disabling warnings for the remainder of the notebook session.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        model = ARIMA(p_train, order=ARIMA_ORDER)
        res = model.fit()
        fc_raw = res.forecast(steps=len(p_test))

    # The training index carries no frequency after dropna(), in which case the
    # forecast comes back with a positional index. Re-label it with the test
    # dates so that it can be aligned with y_test by date.
    fc = pd.Series(np.asarray(fc_raw, dtype=float), index=p_test.index)
    # Prepend the last training price so the first forecast step gets a real
    # return rather than a NaN.
    fc_path = pd.concat([p_train.iloc[[-1]], fc])
    fc_ret = fc_path.pct_change(fill_method=None).iloc[1:]
    fc_ret = fc_ret.reindex(y_test.index).fillna(0.0)

    # sqrt(MSE) instead of the removed `squared=False` keyword.
    rmse_a = float(np.sqrt(mean_squared_error(y_test, fc_ret)))
    mae_a = mean_absolute_error(y_test, fc_ret)
    dir_acc_a = (np.sign(fc_ret.values) == np.sign(y_test.values)).mean()
    print(f"ARIMA{ARIMA_ORDER} → RMSE={rmse_a:.6f}  MAE={mae_a:.6f}  DirAcc={dir_acc_a:.3f}")

    arima_path = os.path.join(ART_DIR, f"arima_{TARGET_TICKER}.pkl")
    with open(arima_path, 'wb') as f:
        pickle.dump(res, f)
    print('Saved ARIMA model:', arima_path)

In [None]:
#@title 8) Save artifacts (pipeline + metadata)
# Write the fitted pipeline, then a JSON sidecar describing how it was built
# and how it scored against the naive baseline.
pipe_path = os.path.join(ART_DIR, "price_only_ridge.joblib")
meta_path = os.path.join(ART_DIR, "price_only_meta.json")
joblib.dump(pipe, pipe_path)

ridge_scores = {"rmse": float(rmse), "mae": float(mae), "dir_acc": float(dir_acc)}
naive_scores = {"rmse": float(rmse_naive), "mae": float(mae_naive), "dir_acc": float(dir_acc_naive)}
meta = {
    "ticker": TARGET_TICKER,
    "lags": int(LAGS),
    "features": list(X.columns),
    "metrics": {"ridge": ridge_scores, "naive1": naive_scores},
}
with open(meta_path, 'w') as f:
    f.write(json.dumps(meta, indent=2))

print('Saved pipeline →', pipe_path)
print('Saved meta →', meta_path)

In [None]:
#@title 9) Local inference helper (Ridge) — price-only
import json, joblib, pandas as pd
from pathlib import Path

def load_price_only(art_dir:str):
    """Load the saved Ridge pipeline and its metadata from `art_dir`.

    Reads ``price_only_ridge.joblib`` and ``price_only_meta.json`` from the
    given directory and returns them as a ``(pipeline, metadata_dict)`` tuple.
    """
    model_file = os.path.join(art_dir, 'price_only_ridge.joblib')
    meta_file = os.path.join(art_dir, 'price_only_meta.json')
    fitted = joblib.load(model_file)
    with open(meta_file) as fh:
        info = json.load(fh)
    return fitted, info

def build_window_from_series(returns: pd.Series, lags:int) -> pd.DataFrame:
    """Build a one-row feature frame from the most recent `lags` returns.

    Parameters
    ----------
    returns : pd.Series
        Return series ordered oldest to newest; the last element is the most
        recent observation.
    lags : int
        Number of lag features to produce.

    Returns
    -------
    pd.DataFrame
        Single row with columns ``lag_1 .. lag_<lags>``, where ``lag_k`` is the
        k-th most recent value of `returns`.

    Raises
    ------
    IndexError
        If `returns` holds fewer than `lags` observations.
    """
    n_obs = len(returns)
    if lags > n_obs:
        # Same exception type positional indexing would raise, but with a
        # message that names the actual problem.
        raise IndexError(
            f"need at least {lags} returns to build the lag window, got {n_obs}"
        )
    row = {f'lag_{k}': returns.iloc[-k] for k in range(1, lags+1)}
    return pd.DataFrame([row])

# Example usage (requires a returns series for TARGET_TICKER):
# pipe, meta = load_price_only(ART_DIR)
# latest_window = build_window_from_series(rets[TARGET_TICKER].dropna(), meta['lags'])
# pred_next_ret = pipe.predict(latest_window)[0]
# print('Predicted next-day return:', float(pred_next_ret))