# Stock Forecasting Notebook

This notebook mirrors the CLI workflow in `src/main.py`.
Set `ticker` and `horizon_text` in the Parameters cell (e.g., `"AAPL"` with `"6 days"`, `"6 weeks"`, `"3 months"`, or `"1 year"`), then run the cells from top to bottom.

In [None]:
import math
import os
import re
import warnings
from typing import Dict, Optional, Tuple
# Ensure local src/ is on path so `lib.*` imports work when running from notebooks/
import sys
import pathlib

try:
    CWD = pathlib.Path(__file__).resolve().parent
except NameError:
    # Jupyter/IPython kernels do not define __file__; use the kernel's working directory instead.
    CWD = pathlib.Path.cwd().resolve()

# Default layout: this notebook lives one level below the repository root.
repo_root = CWD.parent
src_path = repo_root / "src"
if not src_path.exists() and (CWD / "src").exists():
    # The kernel was started at the repository root rather than inside notebooks/.
    repo_root = CWD
    src_path = repo_root / "src"
if src_path.exists() and str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import yfinance as yf
from arch import arch_model
from dotenv import load_dotenv
from pmdarima import auto_arima
from statsmodels.tsa.arima.model import ARIMA
from xgboost import XGBRegressor

from lib.data_utils import build_exog
from lib.fetchers import fetch_yfinance
from lib.model_utils import run_backtest

# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings("ignore")
# Load API keys and other settings from a local .env file, if one exists.
load_dotenv()
# Expand "~" explicitly: the path is otherwise used as given, which would create a
# literal "~" directory under the current working directory.
yf.set_tz_cache_location(os.path.expanduser("~/.cache/yfinance"))

# Shared HTTP session with a browser-like User-Agent; it is handed to the data fetcher below.
session = requests.Session()
session.headers.update(
    {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/124.0 Safari/537.36"
        )
    }
)

# News API key; the news section is skipped when this is unset or blank.
API_KEY = os.getenv("API_KEY")
# NOTE(review): a real-looking FRED key is hard-coded as the fallback. Credentials should not
# live in source control - rotate this key and supply FRED_API_KEY via the environment/.env.
# The default is kept here only so existing runs keep working.
FRED_API_KEY = os.getenv("FRED_API_KEY", "dde92ad42f9e89f8f7d889a4b79f2efb")


ModuleNotFoundError: No module named 'lib'

In [None]:
# Notebook inputs
ticker: str = "AAPL"  # symbol to download and forecast
horizon_text: str = "6 weeks"  # examples: "6 days", "3 months", "1 year"

# Number of Monte Carlo price paths to simulate
num_sims: int = 500


In [None]:
# Accepted unit spellings mapped to their canonical singular form.
_HORIZON_UNITS = {
    "d": "day",
    "day": "day",
    "days": "day",
    "w": "week",
    "week": "week",
    "weeks": "week",
    "m": "month",
    "mo": "month",
    "month": "month",
    "months": "month",
    "y": "year",
    "yr": "year",
    "year": "year",
    "years": "year",
}
# "<number><optional space><letters>", e.g. "6 days", "1.5y", ".5 month".
_HORIZON_PATTERN = re.compile(r"^\s*([0-9]*\.?[0-9]+)\s*([a-zA-Z]+)\s*$")


def parse_horizon(text: str) -> Tuple[float, str]:
    """Parse a free-text horizon such as ``"6 days"`` into ``(value, unit)``.

    Matching is case-insensitive and tolerates surrounding whitespace.

    Args:
        text: Horizon string made of a non-negative decimal number followed by a unit
            (``d``/``day(s)``, ``w``/``week(s)``, ``m``/``mo``/``month(s)``,
            ``y``/``yr``/``year(s)``).

    Returns:
        The numeric value and the canonical unit: ``"day"``, ``"week"``, ``"month"``
        or ``"year"``.

    Raises:
        ValueError: If the text does not match the expected shape, the unit is not
            recognised, or the value is zero or not finite.
    """
    match = _HORIZON_PATTERN.match(text.lower())
    if not match:
        raise ValueError("Horizon must be like '6 days', '3 months', '1 year'.")

    unit = _HORIZON_UNITS.get(match.group(2))
    if unit is None:
        raise ValueError("Unsupported unit. Use days, weeks, months, or years.")

    value = float(match.group(1))
    # The pattern cannot match a sign, so only zero and overflow need rejecting. A zero
    # horizon would otherwise be rounded up to one step downstream and labelled "0 days".
    if value <= 0 or math.isinf(value):
        raise ValueError("Horizon must be a positive, finite number.")
    return value, unit


# unit -> (steps per unit of `value`, mode code, bar interval, pandas frequency alias,
#          trading days represented by one step)
_HORIZON_PROFILES = {
    "day": (1, "D", "1d", "B", 1),
    "week": (1, "W", "1wk", "W-FRI", 5),
    "month": (1, "M", "1mo", "ME", 21),
    "year": (12, "M", "1mo", "ME", 21),
}
# (largest invested_days covered, download period); anything longer uses "10y".
_DOWNLOAD_PERIODS = ((90, "90d"), (252, "1y"), (504, "2y"), (1260, "5y"))


def compute_horizon_settings(value: float, unit: str) -> Dict[str, object]:
    """Translate a parsed horizon into forecasting and download settings.

    Args:
        value: Horizon length expressed in ``unit``.
        unit: Canonical unit: ``"day"``, ``"week"``, ``"month"`` or ``"year"``.

    Returns:
        A dict with these keys:

        * ``steps`` - number of forecast steps (``value`` rounded up, at least 1;
          years are converted to monthly steps).
        * ``mode`` - ``"D"``, ``"W"`` or ``"M"``.
        * ``interval`` - bar interval string (``"1d"``, ``"1wk"``, ``"1mo"``).
        * ``freq`` - pandas frequency alias for the forecast index.
        * ``invested_days`` - approximate trading days covered by the horizon.
        * ``download_period`` - history window to request.
        * ``label`` - human-readable horizon, e.g. ``"6 weeks"``.

    Raises:
        ValueError: If ``unit`` is not one of the canonical units. Previously any
            unknown unit was silently treated as years.
    """
    try:
        steps_per_unit, mode, interval, freq, days_per_step = _HORIZON_PROFILES[unit]
    except KeyError:
        raise ValueError(
            f"Unsupported unit {unit!r}. Use 'day', 'week', 'month', or 'year'."
        ) from None

    steps = max(1, int(math.ceil(value * steps_per_unit)))
    invested_days = steps * days_per_step

    # NOTE(review): the window is picked from trading days alone, so a short monthly
    # horizon (e.g. 3 months -> "90d" at "1mo" bars) yields very few bars - confirm intended.
    download_period = next(
        (period for limit, period in _DOWNLOAD_PERIODS if invested_days <= limit),
        "10y",
    )

    label_unit = unit if value == 1 else f"{unit}s"
    label = f"{value:g} {label_unit}"

    return {
        "steps": steps,
        "mode": mode,
        "interval": interval,
        "freq": freq,
        "invested_days": invested_days,
        "download_period": download_period,
        "label": label,
    }


def forecast_with_xgb(log_returns: pd.Series, exog_df: Optional[pd.DataFrame], steps: int, lags: int = 5) -> Optional[np.ndarray]:
    """Fit an XGBoost regressor on lagged returns and roll it forward ``steps`` times.

    The training frame holds the return as target, its ``lags`` previous values as
    ``lag_<i>`` features and, when given, the exogenous columns aligned to the
    return index. Rows with any missing value are dropped before fitting.

    Forecasting is recursive: each prediction is appended to the history and feeds
    the lag features of the next step. Exogenous features are held at their last
    aligned row for every step.

    Args:
        log_returns: Historical log returns.
        exog_df: Optional exogenous features; ``None`` or an empty frame disables them.
        steps: Number of future steps to predict.
        lags: Number of lagged returns used as features.

    Returns:
        Array of ``steps`` predicted log returns, or ``None`` when fewer than
        ``max(20, lags + 1)`` complete training rows are available.
    """
    exog_supplied = exog_df is not None and not exog_df.empty
    aligned_exog = exog_df.reindex(log_returns.index) if exog_supplied else None

    # Assemble target, lag features and (optionally) exogenous columns in one frame.
    design = pd.DataFrame({"target": log_returns})
    for lag in range(1, lags + 1):
        design[f"lag_{lag}"] = log_returns.shift(lag)
    if aligned_exog is not None:
        for name in aligned_exog.columns:
            design[name] = aligned_exog[name]
    design = design.dropna()
    y_train = design["target"]
    X_train = design.drop(columns="target")

    min_rows = max(20, lags + 1)
    if len(y_train) < min_rows:
        return None

    model = XGBRegressor(
        n_estimators=300,
        max_depth=3,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="reg:squarederror",
        random_state=42,
    )
    model.fit(X_train, y_train)

    feature_cols = list(X_train.columns)
    history = list(log_returns.values)
    last_exog = None if aligned_exog is None else aligned_exog.iloc[-1]

    # Split the features once: lag columns change every step, exogenous ones stay fixed.
    lag_cols = [col for col in feature_cols if col.startswith("lag_")]
    exog_cols = [
        col
        for col in feature_cols
        if not col.startswith("lag_") and last_exog is not None and col in last_exog
    ]

    preds = []
    for _ in range(steps):
        row = {col: history[-int(col.split("_")[1])] for col in lag_cols}
        row.update((col, last_exog[col]) for col in exog_cols)
        step_input = pd.DataFrame([row], columns=feature_cols)
        predicted = float(model.predict(step_input)[0])
        preds.append(predicted)
        # Feed the prediction back so later steps see it as a lag.
        history.append(predicted)
    return np.asarray(preds)


def forecast_from_arima(model, model_type: str, steps: int) -> np.ndarray:
    """Return ``steps`` forecast values from a fitted model as a flat array.

    Args:
        model: Fitted model. For ``"arima_fixed"`` it must offer
            ``forecast(steps=...)`` returning an object with ``to_numpy()``;
            otherwise it must offer ``predict(n_periods=...)``.
        model_type: ``"arima_fixed"`` selects the ``forecast`` call; any other
            value selects ``predict``.
        steps: Number of periods to forecast.

    Returns:
        One-dimensional array of forecast values.
    """
    if model_type != "arima_fixed":
        raw = model.predict(n_periods=steps)
        return np.asarray(raw).flatten()

    predicted = model.forecast(steps=steps)
    return predicted.to_numpy().flatten()


In [None]:
# Parse the free-text horizon, then derive step count, bar interval and download window from it.
h_val, h_unit = parse_horizon(horizon_text)
h_settings = compute_horizon_settings(h_val, h_unit)
h_settings  # last expression in the cell, so the notebook displays the settings dict


In [None]:
# Download price history for the configured ticker, window and bar interval.
data = fetch_yfinance(ticker, h_settings["download_period"], h_settings["interval"], session)
if data.empty:
    raise ValueError("No data returned for the selected ticker and period.")

# Columns are either flat ("Close") or a (field, ticker) MultiIndex; pick matching keys once.
multi_columns = isinstance(data.columns, pd.MultiIndex)
if multi_columns:
    price_keys = [("Adj Close", ticker), ("Close", ticker)]
    volume_key = ("Volume", ticker)
    missing_price_msg = f"No valid price column found for {ticker} in MultiIndex columns."
else:
    price_keys = ["Adj Close", "Close"]
    volume_key = "Volume"
    missing_price_msg = f"No valid price column found for {ticker}."

# Prefer adjusted closes and fall back to raw closes.
price_key = next((key for key in price_keys if key in data.columns), None)
if price_key is None:
    raise ValueError(missing_price_msg)
prices = data[price_key].dropna()
volume_series = data[volume_key].dropna()

if len(prices) < 10:
    raise ValueError("Not enough data points to fit the model reliably.")

# Put weekly/monthly data on Friday / month-end stamps; daily data is left as downloaded.
prices.index = pd.to_datetime(prices.index)
resample_rule = {"W": "W-FRI", "M": "ME"}.get(h_settings["mode"])
if resample_rule is not None:
    prices = prices.resample(resample_rule).last().dropna()

exog_df = build_exog(
    prices,
    volume_series,
    h_settings["download_period"],
    h_settings["interval"],
    h_settings["mode"],
    FRED_API_KEY,
)

# Log returns with infinities and gaps removed.
log_returns = np.log(prices).diff().replace([np.inf, -np.inf], np.nan).dropna()

# Align exogenous features to the return index; non-finite or missing values become 0.
exog_df = exog_df.reindex(log_returns.index)
exog_df = exog_df.replace([np.inf, -np.inf], np.nan).fillna(0.0)
has_exog = not exog_df.empty and exog_df.shape[1] > 0

log_returns.tail()


In [None]:
forecast_steps = h_settings["steps"]

# Primary ARIMA model: fixed AR(1) for daily data, automatic order search otherwise.
if h_settings["mode"] != "D":
    arima_full = auto_arima(
        log_returns,
        seasonal=False,
        error_action="ignore",
        suppress_warnings=True,
        stepwise=True,
        trace=False,
    )
    model_type = "auto_arima"
else:
    arima_full = ARIMA(log_returns, order=(1, 0, 0)).fit()
    model_type = "arima_fixed"

# AR(1) candidate that also uses the exogenous features when there are any.
ar1_exog = exog_df if has_exog else None
ar1_full = ARIMA(log_returns, order=(1, 0, 0), exog=ar1_exog).fit()

# Drift candidate: the mean historical log return.
drift_full = float(log_returns.mean())

# Backtest only when the history is long enough relative to the forecast horizon.
enough_history = len(prices) > forecast_steps * 2 and len(log_returns) > forecast_steps + 5
if not enough_history:
    best_model_name, best_mape, signal_quality = "arima", None, "unknown"
    print("Not enough history to run a backtest for this horizon; defaulting to ARIMA.")
else:
    best_model_name, best_mape, signal_quality = run_backtest(
        log_returns, exog_df, forecast_steps, model_type, prices
    )

best_model_name, best_mape, signal_quality


In [None]:
# Pick the first forecast timestamp and the index frequency from the bar interval:
# next business day, next Friday, or next month-end after the last observed price.
if h_settings["interval"] == "1d":
    start = prices.index[-1] + pd.offsets.BDay()
    freq = "B"
elif h_settings["interval"] == "1wk":
    start = prices.index[-1] + pd.offsets.Week(weekday=4)
    freq = "W-FRI"
elif h_settings["interval"] == "1mo":
    start = prices.index[-1] + pd.offsets.MonthEnd(1)
    freq = "ME"
else:
    raise ValueError("Unsupported interval period.")

# Produce the log-return forecast with whichever model the backtest selected.
if best_model_name == "arima":
    forecast_returns = forecast_from_arima(arima_full, model_type, forecast_steps)
elif best_model_name == "ar1":
    exog_forecast_full = None
    if has_exog:
        # Future exogenous values are unknown, so the last observed row is repeated for every step.
        exog_forecast_full = np.vstack([exog_df.iloc[-1].values] * forecast_steps)
    forecast_returns = ar1_full.forecast(steps=forecast_steps, exog=exog_forecast_full).to_numpy().flatten()
elif best_model_name == "xgb":
    forecast_returns = forecast_with_xgb(log_returns, exog_df if has_exog else None, forecast_steps)
    if forecast_returns is None:
        print("XGBoost forecast unavailable; falling back to ARIMA.")
        forecast_returns = forecast_from_arima(arima_full, model_type, forecast_steps)
else:
    # Any other model name falls back to constant drift (the mean historical log return).
    forecast_returns = np.full(forecast_steps, drift_full)

# Turn cumulative log returns into a price path anchored at the last observed price.
cum_returns = np.cumsum(forecast_returns)
last_price = float(prices.iloc[-1])
forecast_prices = last_price * np.exp(cum_returns)
future_dates = pd.date_range(start=start, periods=forecast_steps, freq=freq)
forecast_series = pd.Series(forecast_prices, index=future_dates)

# Volatility forecast: GARCH(1,1) with normal errors fitted on returns scaled to percent.
returns_pct = 100 * log_returns.dropna()
garch = arch_model(returns_pct, vol="Garch", p=1, q=1, dist="normal")
garch_fit = garch.fit(disp="off")
garch_forecast = garch_fit.forecast(horizon=forecast_steps)
# Last row of the variance table -> one standard deviation (in percent) per forecast step.
sigma_forecast = pd.Series(garch_forecast.variance.values[-1, :] ** 0.5, index=forecast_series.index)
# NOTE(review): the 0.8 factor shrinks the forecast volatility; its rationale is not shown here.
sigma_soft = sigma_forecast * 0.8
impact_vol = last_price * (sigma_soft / 100)

# +/-1.96 sigma band around the point forecast.
# NOTE(review): the band uses each step's own sigma applied to last_price rather than an
# accumulated multi-step variance, and sigma is scaled by 0.8 - confirm the "95%" label is intended.
hybrid_forecast_upper = forecast_series + 1.96 * impact_vol
hybrid_forecast_lower = forecast_series - 1.96 * impact_vol

# Monte Carlo: add normal shocks (scaled by the per-step sigma, as a decimal) to the forecast
# returns and compound them into num_sims price paths. The draw is not seeded in this cell,
# so results differ between runs unless a seed is set elsewhere.
sigma_dec = sigma_soft.values / 100.0
shocks = np.random.normal(size=(num_sims, forecast_steps)) * sigma_dec
paths = last_price * np.exp(np.cumsum(forecast_returns + shocks, axis=1))
# 10th / 50th / 90th percentile across paths at each forecast step.
narrow_percentiles = np.percentile(paths, [10, 50, 90], axis=0)
mc_p10 = pd.Series(narrow_percentiles[0], index=forecast_series.index)
mc_p50 = pd.Series(narrow_percentiles[1], index=forecast_series.index)
mc_p90 = pd.Series(narrow_percentiles[2], index=forecast_series.index)

print("\nForecasted Volatility (Standard Deviation):\n", sigma_forecast)
print("\nMonte Carlo scenarios (log-return AR/GARCH/XGB):")
print(f" Final price 10th percentile: {mc_p10.iloc[-1]:.2f}")
print(f" Final price median         : {mc_p50.iloc[-1]:.2f}")
print(f" Final price 90th percentile: {mc_p90.iloc[-1]:.2f}")


In [None]:
# Draw history, point forecast, the +/-1.96 sigma band and the Monte Carlo range on one axes.
fig, ax = plt.subplots(figsize=(10, 5))

ax.plot(prices.index, prices, label="Historical Prices")
model_label = f"Forecasted Prices ({best_model_name.upper()})"
ax.plot(forecast_series.index, forecast_series, label=model_label, color="red")

ax.set_title(f"{ticker} Price Forecast")
ax.set_xlabel("Date")
ax.set_ylabel("Price")

# Shaded band plus dashed outlines for the volatility-based bounds.
ax.fill_between(
    hybrid_forecast_upper.index,
    hybrid_forecast_lower,
    hybrid_forecast_upper,
    color="gray",
    alpha=0.3,
    label="95% Confidence Interval",
)
ax.plot(forecast_series.index, hybrid_forecast_upper, linestyle="--", label="Upper Confidence Bound")
ax.plot(forecast_series.index, hybrid_forecast_lower, linestyle="--", label="Lower Confidence Bound")

# Monte Carlo 10th-90th percentile range and median path.
ax.fill_between(mc_p90.index, mc_p10, mc_p90, color="orange", alpha=0.2, label="MC 10-90%")
ax.plot(mc_p50.index, mc_p50, color="orange", linestyle=":", label="MC Median")

ax.legend()
ax.grid()
plt.show()


In [None]:
# Expected return: percentage change from the last observed price to the final forecast price.
initial_price = float(prices.iloc[-1])
final_price = float(forecast_series.iloc[-1])
expected_return = ((final_price - initial_price) / initial_price) * 100
print(f"Expected return over {h_settings['label']}: {expected_return:.2f}%")
if best_mape is not None:
    print(f"Signal quality: {signal_quality.upper()} (backtest MAPE {best_mape:.2f}%)")
else:
    print("Signal quality: UNKNOWN (no backtest window available)")

# Latest quote, best effort: any failure falls back to the last downloaded price,
# so the broad except is deliberate here.
try:
    ticker_hist = yf.Ticker(ticker).history(period="1d")
    if ticker_hist.empty:
        raise ValueError("empty history")
    current_price = float(ticker_hist["Close"].iloc[-1])
    print(f"Current price of {ticker}: ${current_price:.2f}")
except Exception:
    current_price = float(prices.iloc[-1])
    print(f"Current price of {ticker}: ${current_price:.2f} (fallback from downloaded data)")

# Optional news headlines; skipped entirely when no API key is configured.
if not API_KEY or API_KEY.strip() == "":
    print("\n News feature disabled (no API key provided).\n")
else:
    news_failed_msg = "\nFailed to fetch news articles (invalid API key or request error).\n"
    try:
        # The key goes in a header rather than the URL so it cannot leak through exception
        # text or logs. `params` URL-encodes the ticker, and the timeout stops a stalled
        # connection from hanging the notebook.
        response = requests.get(
            "https://newsapi.org/v2/everything",
            params={"q": ticker},
            headers={"X-Api-Key": API_KEY.strip()},
            timeout=10,
        )
    except requests.RequestException:
        print(news_failed_msg)
    else:
        if response.status_code == 200:
            try:
                news_data = response.json()
            except ValueError:
                # A 200 response whose body is not valid JSON.
                news_data = {}
            articles = news_data.get("articles") or []
            if articles:
                print(f"\nRecent news articles about {ticker}:\n")
                for article in articles[:5]:
                    # .get avoids a KeyError on articles that lack a field.
                    print(f"Title: {article.get('title')}")
                    print(f"Description: {article.get('description')}\n")
            else:
                print(f"\nNo recent news articles found for {ticker}.\n")
        else:
            print(news_failed_msg)


NameError: name 'prices' is not defined