In [3]:
# Train & Save a simple pipeline to model/model.pkl
import pandas as pd
from pathlib import Path
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import joblib
import numpy as np

PROC = Path("../data/processed")
MODEL_DIR = Path("../model"); MODEL_DIR.mkdir(exist_ok=True)

# 1) Load your latest model-ready features (update pattern if yours differs)
cands = sorted(PROC.glob("prices_with_tech_features_model*.csv"),
               key=lambda p: p.stat().st_mtime, reverse=True)
assert cands, "No model-ready features found under data/processed/. Expected files like prices_with_tech_features_model*.csv"
df = pd.read_csv(cands[0], parse_dates=["date"]).sort_values("date").reset_index(drop=True)

# 2) Define features + target
FEAT = ["gap_pct","daily_range_pct","ma_ratio_5_20","ret_vol_10","volume_z20","rsi_14","macd","macd_signal"]
df["y_up"] = (df["ret_1d"].shift(-1) > 0).astype(int)   # next-day direction

dfm = df.dropna(subset=FEAT + ["y_up"]).copy()
cut = int(len(dfm)*0.8)
train = dfm.iloc[:cut]
X_tr, y_tr = train[FEAT], train["y_up"]

# 3) Fit a simple pipeline
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=2000))
])
pipe.fit(X_tr, y_tr)

# 4) Save bundle
bundle = {"pipeline": pipe, "features": FEAT, "threshold": 0.44,
          "trained_on": cands[0].name, "version": "1.0.0"}
joblib.dump(bundle, MODEL_DIR / "model.pkl")
print("Saved:", MODEL_DIR / "model.pkl")

# 5) Quick reload smoke test
b2 = joblib.load(MODEL_DIR / "model.pkl")
x0 = np.zeros((1, len(b2["features"])))
print("Smoke test proba:", b2["pipeline"].predict_proba(x0)[:,1])


Saved: ../model/model.pkl
Smoke test proba: [0.28148647]


