<a href="https://colab.research.google.com/github/brysonchristensen/IS-4487/blob/main/Stock_Trader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Multi-Signal Investing Copilot (Intraday → 2 Weeks)
Combines technicals + fundamentals + news/social sentiment + attention spikes + narrative summaries into explainable actions.

Cell 1 — Install dependencies

In [38]:
!pip -q install pandas numpy scipy pyarrow fastparquet tqdm ta
!pip -q install matplotlib scikit-learn statsmodels
!pip -q install vaderSentiment beautifulsoup4 lxml requests

# Alpaca Market Data (historical + streaming)
!pip -q install alpaca-py nest_asyncio


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/122.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.5/122.5 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h

Cell 2 — Imports + global settings

In [39]:
import os, re, json, time, math, datetime as dt
from dataclasses import dataclass
from typing import Dict, List, Tuple, Optional

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import matplotlib.pyplot as plt

Cell 3 — Mount Google Drive + define paths

In [40]:
from google.colab import drive
drive.mount('/content/drive')

BASE_DIR = "/content/drive/MyDrive/msic"
DATA_DIR = f"{BASE_DIR}/data"
CACHE_DIR = f"{BASE_DIR}/cache"
REPORT_DIR = f"{BASE_DIR}/reports"

os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(CACHE_DIR, exist_ok=True)
os.makedirs(REPORT_DIR, exist_ok=True)

print("Ready:", BASE_DIR)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Ready: /content/drive/MyDrive/msic


Cell 4 — Config (single source of truth)

In [41]:
CONFIG = {
    "universe": {
        "include_sp500": True,
        "include_nasdaq100": True,
        "extra_tickers": ["SPY", "QQQ", "IWM", "XLK"],  # regime anchors + sanity checks
        "max_symbols": 650,
    },
    "time": {
        "tz": "America/Denver",
        "bar_sizes": ["1m", "5m", "15m", "1h", "1d"],
        "lookback_days_intraday": 10,   # enough for indicators
        "lookback_days_daily": 365 * 2, # enough for swing/2-week features
    },
    "data": {
        "provider": "yfinance_dev",  # swap later to "alpaca" or other
        "cache_parquet": True,
        "refresh_cache": False,
    },
    "scoring": {
        "horizons": ["intraday", "swing", "twoweek"],
        "weights": {  # horizon blend for final score (tune later)
            "intraday": 0.45,
            "swing": 0.35,
            "twoweek": 0.20,
        },
        "thresholds": {
            "buy": 0.60,
            "sell": -0.60,
            "watch": 0.25,
        }
    },
    "risk": {
        "min_avg_dollar_vol": 20_000_000,   # filter illiquid names
        "max_position_pct": 0.05,
        "max_sector_exposure_pct": 0.25,    # optional later
        "vol_target": 0.20,                # annualized, optional later
    },
    "nlp": {
        "sentiment_model": "vader",  # swap later to finbert/roberta
        "text_window_minutes": [60, 240, 1440],  # 1h, 4h, 1d
    },
    "output": {
        "top_n": 25,
        "save_reports": True,
    }
}

Cell 5 — API keys (environment variables)
(Keep secrets out of notebook outputs.)

In [42]:
# Example:
# os.environ["ALPACA_API_KEY"] = "..."
# os.environ["ALPACA_SECRET_KEY"] = "..."
# os.environ["XAI_API_KEY"] = "..."

def require_env(var: str):
    if not os.getenv(var):
        print(f"Warning: missing env var {var}")

Cell 6 — Logging helpers

In [43]:
def log(msg: str):
    ts = dt.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
    print(f"[{ts} UTC] {msg}")

Cell 7 — Caching utilities (parquet)

In [44]:
def cache_path(name: str) -> str:
    safe = re.sub(r"[^a-zA-Z0-9_\-]+", "_", name)
    return os.path.join(CACHE_DIR, f"{safe}.parquet")

def save_parquet(df: pd.DataFrame, name: str):
    df.to_parquet(cache_path(name), index=False)

def load_parquet(name: str) -> Optional[pd.DataFrame]:
    path = cache_path(name)
    if os.path.exists(path):
        return pd.read_parquet(path)
    return None

Cell 8 — Universe builder (S&P 500 + Nasdaq-100)

In [10]:
import requests

def load_sp500_tickers() -> List[str]:
    # Wikipedia tables are easy for dev; swap to a more robust source later if needed.
    url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    headers = {"User-Agent": "Mozilla/5.0"} # Mimic a browser to avoid 403 Forbidden
    response = requests.get(url, headers=headers)
    response.raise_for_status() # Raise an exception for HTTP errors
    tables = pd.read_html(response.text)
    df = tables[0]
    return df["Symbol"].astype(str).tolist()

def load_nasdaq100_tickers() -> List[str]:
    url = "https://en.wikipedia.org/wiki/Nasdaq-100"
    headers = {"User-Agent": "Mozilla/5.0"} # Mimic a browser to avoid 403 Forbidden
    response = requests.get(url, headers=headers)
    response.raise_for_status() # Raise an exception for HTTP errors
    tables = pd.read_html(response.text)
    # One of the tables contains tickers; sometimes it changes — inspect if it breaks.
    for t in tables:
        cols = [c.lower() for c in t.columns.astype(str)]
        if any("ticker" in c for c in cols) or any("symbol" in c for c in cols):
            # find the ticker-like column
            col = [c for c in t.columns if "Ticker" in str(c) or "Symbol" in str(c)][0]
            return t[col].astype(str).tolist()
    raise ValueError("Could not find Nasdaq-100 ticker table on page.")

def normalize_ticker(t: str) -> str:
    # Many data APIs use BRK-B instead of BRK.B
    return t.replace(".", "-").strip().upper()

def build_universe(cfg=CONFIG) -> List[str]:
    tickers = []
    if cfg["universe"]["include_sp500"]:
        tickers += load_sp500_tickers()
    if cfg["universe"]["include_nasdaq100"]:
        tickers += load_nasdaq100_tickers()
    tickers += cfg["universe"]["extra_tickers"]
    tickers = [normalize_ticker(t) for t in tickers]
    tickers = sorted(list(set(tickers)))
    return tickers[:cfg["universe"]["max_symbols"]]

universe = build_universe()
log(f"Universe size: {len(universe)}")
universe[:20]

  tables = pd.read_html(response.text)


[2025-12-28 02:17:09 UTC] Universe size: 520


  tables = pd.read_html(response.text)
  ts = dt.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")


['A',
 'AAPL',
 'ABBV',
 'ABNB',
 'ABT',
 'ACGL',
 'ACN',
 'ADBE',
 'ADI',
 'ADM',
 'ADP',
 'ADSK',
 'AEE',
 'AEP',
 'AES',
 'AFL',
 'AIG',
 'AIZ',
 'AJG',
 'AKAM']

Cell 9 — Market data adapter interface

In [11]:
class MarketDataAdapter:
    def fetch_bars(self, symbols: List[str], start: dt.datetime, end: dt.datetime, timeframe: str) -> pd.DataFrame:
        """
        Return standardized bars:
        timestamp, symbol, open, high, low, close, volume
        timeframe: '1m','5m','15m','1h','1d'
        """
        raise NotImplementedError

Cell 10 — Dev adapter (yfinance)

(Not true real-time; good for development + backtesting pipeline.)

In [12]:
import yfinance as yf

class YFinanceAdapter(MarketDataAdapter):
    def fetch_bars(self, symbols, start, end, timeframe):
        interval_map = {"1m":"1m","5m":"5m","15m":"15m","1h":"60m","1d":"1d"}
        interval = interval_map[timeframe]

        # yfinance minute data has limited lookback; keep intraday window small.
        data = yf.download(
            tickers=" ".join(symbols),
            start=start, end=end,
            interval=interval,
            group_by="ticker",
            auto_adjust=False,
            threads=True,
            progress=False
        )

        rows = []
        # yfinance shape differs for single vs multiple tickers
        if isinstance(data.columns, pd.MultiIndex):
            for sym in symbols:
                if sym not in data.columns.levels[0]:
                    continue
                df = data[sym].dropna().copy()
                df["symbol"] = sym
                df = df.rename(columns={"Open":"open","High":"high","Low":"low","Close":"close","Volume":"volume"})
                df["timestamp"] = df.index
                rows.append(df[["timestamp","symbol","open","high","low","close","volume"]])
        else:
            df = data.dropna().copy()
            df["symbol"] = symbols[0]
            df = df.rename(columns={"Open":"open","High":"high","Low":"low","Close":"close","Volume":"volume"})
            df["timestamp"] = df.index
            rows.append(df[["timestamp","symbol","open","high","low","close","volume"]])

        out = pd.concat(rows, ignore_index=True) if rows else pd.DataFrame(
            columns=["timestamp","symbol","open","high","low","close","volume"]
        )
        out["timestamp"] = pd.to_datetime(out["timestamp"], utc=True)
        return out

Cell 11 — (Optional) Real-time adapter placeholder (Alpaca)

In [13]:
class AlpacaAdapter(MarketDataAdapter):
    def __init__(self):
        # from alpaca.data.historical import StockHistoricalDataClient
        # self.client = StockHistoricalDataClient(os.getenv("ALPACA_API_KEY"), os.getenv("ALPACA_SECRET_KEY"))
        pass

    def fetch_bars(self, symbols, start, end, timeframe):
        # TODO: implement using alpaca-py historical client
        raise NotImplementedError("Implement Alpaca historical bars here")

    def stream_bars(self, symbols, timeframe="1Min"):
        # TODO: implement websocket streaming
        raise NotImplementedError("Implement Alpaca WebSocket here")

Cell 12 — Choose provider + define fetch windows

In [14]:
provider = CONFIG["data"]["provider"]
adapter = YFinanceAdapter() if provider == "yfinance_dev" else AlpacaAdapter()

now_utc = dt.datetime.utcnow().replace(tzinfo=dt.timezone.utc)
start_intraday = now_utc - dt.timedelta(days=CONFIG["time"]["lookback_days_intraday"])
start_daily = now_utc - dt.timedelta(days=CONFIG["time"]["lookback_days_daily"])

log(f"Now UTC: {now_utc}")

[2025-12-28 02:18:07 UTC] Now UTC: 2025-12-28 02:18:07.746524+00:00


  now_utc = dt.datetime.utcnow().replace(tzinfo=dt.timezone.utc)
  ts = dt.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")


Cell 13 — Fetch intraday bars (cached)

In [15]:
def get_bars_cached(symbols: List[str], start: dt.datetime, end: dt.datetime, timeframe: str, refresh=False) -> pd.DataFrame:
    name = f"bars_{timeframe}_{start.date()}_{end.date()}_{len(symbols)}"
    if CONFIG["data"]["cache_parquet"] and not refresh:
        cached = load_parquet(name)
        if cached is not None:
            log(f"Loaded cache: {name}")
            return cached

    log(f"Fetching {timeframe} bars for {len(symbols)} symbols...")
    df = adapter.fetch_bars(symbols, start, end, timeframe)

    if CONFIG["data"]["cache_parquet"]:
        save_parquet(df, name)
    return df

# Start with a smaller subset while building:
symbols_dev = universe[:150]  # increase later
bars_5m = get_bars_cached(symbols_dev, start_intraday, now_utc, "5m", refresh=CONFIG["data"]["refresh_cache"])
bars_5m.head()

  ts = dt.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")


[2025-12-28 02:18:17 UTC] Fetching 5m bars for 150 symbols...


Price,timestamp,symbol,open,high,low,close,volume
0,2025-12-18 14:30:00+00:00,A,137.600006,138.520004,137.565002,138.330002,46341.0
1,2025-12-18 14:35:00+00:00,A,138.259995,138.779999,138.029999,138.744995,22202.0
2,2025-12-18 14:40:00+00:00,A,138.910004,138.979996,138.229996,138.259995,20685.0
3,2025-12-18 14:45:00+00:00,A,138.199997,138.315002,138.039993,138.039993,7020.0
4,2025-12-18 14:50:00+00:00,A,138.039993,138.5,138.039993,138.169998,12899.0


Cell 14 — Fetch daily bars (cached)

In [16]:
bars_1d = get_bars_cached(symbols_dev, start_daily, now_utc, "1d", refresh=CONFIG["data"]["refresh_cache"])
bars_1d.head()

  ts = dt.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")


[2025-12-28 02:19:17 UTC] Fetching 1d bars for 150 symbols...


Price,timestamp,symbol,open,high,low,close,volume
0,2023-12-29 00:00:00+00:00,A,139.070007,139.699997,138.360001,139.029999,1014400
1,2024-01-02 00:00:00+00:00,A,138.190002,140.589996,137.910004,138.75,1441600
2,2024-01-03 00:00:00+00:00,A,138.0,138.0,131.070007,131.160004,2074500
3,2024-01-04 00:00:00+00:00,A,130.550003,131.5,130.190002,131.0,2446600
4,2024-01-05 00:00:00+00:00,A,130.0,131.960007,128.619995,130.559998,1394000


Cell 15 — Technical indicator functions (vectorized)

In [17]:
import ta

def add_ta_features(df: pd.DataFrame, timeframe: str) -> pd.DataFrame:
    """
    df: standardized bars
    returns: df with TA columns
    """
    out = []
    for sym, g in df.groupby("symbol", sort=False):
        g = g.sort_values("timestamp").copy()
        close = g["close"]
        high = g["high"]
        low = g["low"]
        vol = g["volume"]

        g[f"{timeframe}_rsi14"] = ta.momentum.rsi(close, window=14, fillna=True)
        g[f"{timeframe}_roc10"] = ta.momentum.roc(close, window=10, fillna=True)
        macd = ta.trend.MACD(close, fillna=True)
        g[f"{timeframe}_macd"] = macd.macd()
        g[f"{timeframe}_macd_hist"] = macd.macd_diff()

        bb = ta.volatility.BollingerBands(close, window=20, window_dev=2, fillna=True)
        g[f"{timeframe}_bbp"] = bb.bollinger_pband()  # 0..1-ish
        g[f"{timeframe}_atr14"] = ta.volatility.average_true_range(high, low, close, window=14, fillna=True)

        g[f"{timeframe}_vol_z"] = (vol - vol.rolling(50).mean()) / (vol.rolling(50).std() + 1e-9)
        g[f"{timeframe}_ret1"] = close.pct_change().fillna(0.0)

        out.append(g)
    return pd.concat(out, ignore_index=True)

bars_5m_feat = add_ta_features(bars_5m, "5m")
bars_1d_feat = add_ta_features(bars_1d, "1d")

Cell 16 — Build multi-timeframe feature snapshots

(For ranking “now”, you want the latest feature row per symbol per timeframe.)

In [18]:
def latest_snapshot(df: pd.DataFrame) -> pd.DataFrame:
    idx = df.sort_values("timestamp").groupby("symbol")["timestamp"].idxmax()
    snap = df.loc[idx].copy()
    return snap

snap_5m = latest_snapshot(bars_5m_feat)
snap_1d = latest_snapshot(bars_1d_feat)

# Merge (outer join to tolerate missing)
features = snap_5m.merge(snap_1d, on="symbol", how="outer", suffixes=("", "_1ddup"))
features = features.drop(columns=[c for c in features.columns if c.endswith("_1ddup")], errors="ignore")
features.head()

Price,timestamp,symbol,open,high,low,close,volume,5m_rsi14,5m_roc10,5m_macd,...,5m_vol_z,5m_ret1,1d_rsi14,1d_roc10,1d_macd,1d_macd_hist,1d_bbp,1d_atr14,1d_vol_z,1d_ret1
0,2025-12-26 20:55:00+00:00,A,138.460007,138.559998,138.380005,138.389999,107660.0,62.36659,0.144726,0.076477,...,6.518394,-0.000361,39.677658,-3.135715,-2.30228,-0.460942,0.296487,2.853705,-1.446163,0.000506
1,2025-12-26 20:55:00+00:00,AAPL,273.049988,273.459991,272.880005,273.25,1116844.0,24.765276,-0.346467,-0.331778,...,6.030944,0.000879,49.11917,-1.66529,0.194261,-0.978415,0.289862,4.480211,-1.339017,-0.001497
2,2025-12-26 20:55:00+00:00,ABBV,229.820007,230.339996,229.735001,230.039993,188500.0,75.579309,0.344599,0.169809,...,6.658312,0.001044,55.966942,2.678811,0.397241,0.57151,0.886984,4.345683,-1.484245,0.000391
3,2025-12-26 20:55:00+00:00,ABNB,136.899994,137.0,136.779999,136.820007,153573.0,59.20383,0.117082,0.103534,...,6.549437,-0.000438,75.212475,6.890631,4.300187,0.978287,0.815361,2.934501,-1.631925,0.000293
4,2025-12-26 20:55:00+00:00,ABT,124.760002,124.870003,124.699997,124.849998,219868.0,67.08806,0.12029,0.044203,...,6.513362,0.000641,46.327493,1.26541,-0.528812,0.061203,0.443129,2.253961,-1.66821,0.00024


Cell 17 — Signal normalization helpers

In [19]:
def clip01(x): return float(np.clip(x, 0.0, 1.0))
def clip11(x): return float(np.clip(x, -1.0, 1.0))

def z_to_unit(z, zcap=3.0):
    z = float(np.clip(z, -zcap, zcap))
    return z / zcap  # -> [-1,1]

Cell 18 — Intraday signals (example set)

In [20]:
def signal_intraday_momentum(row) -> Tuple[float, float, List[str]]:
    # Example: combine RSI + MACD hist + ROC
    rsi = row.get("5m_rsi14", np.nan)
    macd_h = row.get("5m_macd_hist", np.nan)
    roc = row.get("5m_roc10", np.nan)

    reasons = []
    if pd.isna(rsi) or pd.isna(macd_h) or pd.isna(roc):
        return 0.0, 0.0, ["missing intraday features"]

    # map RSI: 50 neutral, >70 overbought, <30 oversold (simple)
    rsi_component = clip11((rsi - 50) / 20)  # 50->0, 70->+1, 30->-1
    macd_component = clip11(z_to_unit(macd_h))
    roc_component = clip11(z_to_unit(roc * 100))  # scale

    value = clip11(0.45*rsi_component + 0.35*macd_component + 0.20*roc_component)

    reasons += [f"RSI14={rsi:.1f}", f"MACD_hist={macd_h:.4f}", f"ROC10={roc:.3%}"]
    confidence = clip01(0.7)  # upgrade later based on vol regime/data freshness
    return value, confidence, reasons

def signal_vwap_reversion_placeholder(row) -> Tuple[float, float, List[str]]:
    # Placeholder until you compute VWAP distance.
    return 0.0, 0.0, ["VWAP reversion not implemented"]

def signal_intraday_volatility(row) -> Tuple[float, float, List[str]]:
    atr = row.get("5m_atr14", np.nan)
    volz = row.get("5m_vol_z", np.nan)
    if pd.isna(atr) or pd.isna(volz):
        return 0.0, 0.0, ["missing volatility features"]

    # High vol & high volume tends to support breakouts (context-dependent)
    value = clip11(0.5*z_to_unit(volz) + 0.5*z_to_unit(atr))
    confidence = clip01(0.6)
    return value, confidence, [f"ATR14={atr:.4f}", f"VolZ={volz:.2f}"]

Cell 19 — Swing + 2-week signals (daily)

In [21]:
def signal_daily_trend(row) -> Tuple[float, float, List[str]]:
    # Cheap trend proxy: MACD hist + RSI
    rsi = row.get("1d_rsi14", np.nan)
    macd_h = row.get("1d_macd_hist", np.nan)
    if pd.isna(rsi) or pd.isna(macd_h):
        return 0.0, 0.0, ["missing daily features"]

    value = clip11(0.6*clip11((rsi-50)/20) + 0.4*z_to_unit(macd_h))
    confidence = clip01(0.65)
    return value, confidence, [f"1D RSI14={rsi:.1f}", f"1D MACD_hist={macd_h:.4f}"]

def signal_daily_mean_reversion(row) -> Tuple[float, float, List[str]]:
    bbp = row.get("1d_bbp", np.nan)  # 0..1 band position
    rsi = row.get("1d_rsi14", np.nan)
    if pd.isna(bbp) or pd.isna(rsi):
        return 0.0, 0.0, ["missing mean reversion features"]

    # If price near bottom band + RSI low -> bullish mean reversion
    value = clip11((0.5 - bbp) * 2)  # bbp 0 => +1, bbp 1 => -1
    value = clip11(0.7*value + 0.3*clip11((50-rsi)/20))
    confidence = clip01(0.55)
    return value, confidence, [f"BBP={bbp:.2f}", f"RSI14={rsi:.1f}"]

Cell 20 — Compute signals table

In [22]:
def compute_signals(features_df: pd.DataFrame) -> pd.DataFrame:
    rows = []
    for _, row in features_df.iterrows():
        sym = row["symbol"]

        intr_val, intr_conf, intr_reas = signal_intraday_momentum(row)
        vol_val, vol_conf, vol_reas = signal_intraday_volatility(row)

        swing_val, swing_conf, swing_reas = signal_daily_trend(row)
        tw_val, tw_conf, tw_reas = signal_daily_mean_reversion(row)  # placeholder "2-week" logic

        rows.append({
            "symbol": sym,
            "intraday_value": intr_val,
            "intraday_conf": intr_conf,
            "intraday_reasons": intr_reas,

            "intraday_vol_value": vol_val,
            "intraday_vol_conf": vol_conf,
            "intraday_vol_reasons": vol_reas,

            "swing_value": swing_val,
            "swing_conf": swing_conf,
            "swing_reasons": swing_reas,

            "twoweek_value": tw_val,
            "twoweek_conf": tw_conf,
            "twoweek_reasons": tw_reas,
        })
    return pd.DataFrame(rows)

signals = compute_signals(features)
signals.head()

Unnamed: 0,symbol,intraday_value,intraday_conf,intraday_reasons,intraday_vol_value,intraday_vol_conf,intraday_vol_reasons,swing_value,swing_conf,swing_reasons,twoweek_value,twoweek_conf,twoweek_reasons
0,A,0.480819,0.7,"[RSI14=62.4, MACD_hist=0.0220, ROC10=14.473%]",0.517224,0.6,"[ATR14=0.1033, VolZ=6.52]",-0.371129,0.65,"[1D RSI14=39.7, 1D MACD_hist=-0.4609]",0.439753,0.55,"[BBP=0.30, RSI14=39.7]"
1,AAPL,-0.663147,0.7,"[RSI14=24.8, MACD_hist=-0.1127, ROC10=-34.647%]",0.54104,0.6,"[ATR14=0.2462, VolZ=6.03]",-0.15688,0.65,"[1D RSI14=49.1, 1D MACD_hist=-0.9784]",0.307405,0.55,"[BBP=0.29, RSI14=49.1]"
2,ABBV,0.658733,0.7,"[RSI14=75.6, MACD_hist=0.0749, ROC10=34.460%]",0.534488,0.6,"[ATR14=0.2069, VolZ=6.66]",0.25521,0.65,"[1D RSI14=56.0, 1D MACD_hist=0.5715]",-0.631281,0.55,"[BBP=0.89, RSI14=56.0]"
3,ABNB,0.408711,0.7,"[RSI14=59.2, MACD_hist=0.0139, ROC10=11.708%]",0.520364,0.6,"[ATR14=0.1222, VolZ=6.55]",0.730438,0.65,"[1D RSI14=75.2, 1D MACD_hist=0.9783]",-0.741505,0.55,"[BBP=0.82, RSI14=75.2]"
4,ABT,0.586987,0.7,"[RSI14=67.1, MACD_hist=0.0215, ROC10=12.029%]",0.513578,0.6,"[ATR14=0.0815, VolZ=6.51]",-0.102015,0.65,"[1D RSI14=46.3, 1D MACD_hist=0.0612]",0.134706,0.55,"[BBP=0.44, RSI14=46.3]"


Cell 21 — Ensemble scoring + action mapping + explanations

In [23]:
W = CONFIG["scoring"]["weights"]
TH = CONFIG["scoring"]["thresholds"]

def combine_score(row) -> Tuple[float, float, List[str]]:
    # Combine values and confidences
    intr = row["intraday_value"]
    swing = row["swing_value"]
    tw = row["twoweek_value"]

    score = W["intraday"]*intr + W["swing"]*swing + W["twoweek"]*tw
    # confidence = weighted conf * agreement factor
    conf = W["intraday"]*row["intraday_conf"] + W["swing"]*row["swing_conf"] + W["twoweek"]*row["twoweek_conf"]

    # Agreement: if signs disagree strongly, reduce confidence
    signs = np.sign([intr, swing, tw])
    agreement = (np.abs(signs.sum()) / 3.0)  # 1.0 if all same sign, 0.33 if mixed
    conf = clip01(conf * agreement)

    # Build explanation
    reasons = []
    reasons += [f"intraday={intr:+.2f} ({', '.join(row['intraday_reasons'][:2])})"]
    reasons += [f"swing={swing:+.2f} ({', '.join(row['swing_reasons'][:2])})"]
    reasons += [f"2wk={tw:+.2f} ({', '.join(row['twoweek_reasons'][:2])})"]
    return float(score), float(conf), reasons

def action_from(score, conf) -> str:
    if conf < 0.25:
        return "AVOID"
    if score >= TH["buy"]:
        return "BUY"
    if score <= TH["sell"]:
        return "SELL"
    if abs(score) >= TH["watch"]:
        return "WATCH"
    return "AVOID"

dec_rows = []
for _, r in signals.iterrows():
    score, conf, reasons = combine_score(r)
    dec_rows.append({
        "symbol": r["symbol"],
        "score": score,
        "confidence": conf,
        "action": action_from(score, conf),
        "reasons": reasons
    })

decisions = pd.DataFrame(dec_rows).sort_values("score", ascending=False)
decisions.head(10)

Unnamed: 0,symbol,score,confidence,action,reasons
30,AMP,0.397448,0.2175,AVOID,"[intraday=+0.58 (RSI14=66.0, MACD_hist=0.1450)..."
107,COF,0.39091,0.2175,AVOID,"[intraday=+0.66 (RSI14=73.9, MACD_hist=0.0718)..."
18,AJG,0.377363,0.2175,AVOID,"[intraday=+0.66 (RSI14=70.4, MACD_hist=0.0437)..."
138,DG,0.377236,0.2175,AVOID,"[intraday=+0.65 (RSI14=81.5, MACD_hist=0.0360)..."
70,BMY,0.358129,0.2175,AVOID,"[intraday=+0.65 (RSI14=76.5, MACD_hist=0.0178)..."
79,CAH,0.348355,0.2175,AVOID,"[intraday=+0.65 (RSI14=71.3, MACD_hist=0.0355)..."
16,AIG,0.348211,0.2175,AVOID,"[intraday=+0.55 (RSI14=65.7, MACD_hist=0.0069)..."
92,CFG,0.340963,0.2175,AVOID,"[intraday=+0.61 (RSI14=68.1, MACD_hist=0.0110)..."
7,ADBE,0.335866,0.2175,AVOID,"[intraday=+0.61 (RSI14=68.0, MACD_hist=0.0658)..."
136,DECK,0.331915,0.2175,AVOID,"[intraday=+0.65 (RSI14=74.7, MACD_hist=0.0335)..."


Cell 22 — Display: top longs / shorts

In [24]:
TOPN = CONFIG["output"]["top_n"]

top_longs = decisions[decisions["action"].isin(["BUY","WATCH"])].head(TOPN)
top_shorts = decisions.sort_values("score").query("action in ['SELL','WATCH']").head(TOPN)

display(top_longs[["symbol","score","confidence","action"]].head(15))
display(top_shorts[["symbol","score","confidence","action"]].head(15))

Unnamed: 0,symbol,score,confidence,action
37,APD,0.299985,0.6525,WATCH
147,DOW,0.294161,0.6525,WATCH


Unnamed: 0,symbol,score,confidence,action
147,DOW,0.294161,0.6525,WATCH
37,APD,0.299985,0.6525,WATCH


Cell 23 — Print explanations for a few symbols

In [25]:
def show_explanations(df, n=5):
    for _, row in df.head(n).iterrows():
        print(f"\n{row['symbol']}  score={row['score']:.2f}  conf={row['confidence']:.2f}  action={row['action']}")
        for s in row["reasons"]:
            print("  -", s)

show_explanations(top_longs, n=5)


APD  score=0.30  conf=0.65  action=WATCH
  - intraday=+0.62 (RSI14=68.4, MACD_hist=0.0982)
  - swing=+0.04 (1D RSI14=48.9, 1D MACD_hist=0.5586)
  - 2wk=+0.02 (BBP=0.50, RSI14=48.9)

DOW  score=0.29  conf=0.65  action=WATCH
  - intraday=+0.60 (RSI14=67.9, MACD_hist=0.0030)
  - swing=+0.02 (1D RSI14=50.9, 1D MACD_hist=-0.0581)
  - 2wk=+0.08 (BBP=0.43, RSI14=50.9)


Text / Sentiment pipeline (News, Reddit, Stocktwits, Grok)
Cell 24 — Define standardized text schema

In [26]:
TEXT_COLS = ["timestamp","symbol","source","text","url","author","meta"]

def make_text_df(rows: List[dict]) -> pd.DataFrame:
    df = pd.DataFrame(rows)
    for c in TEXT_COLS:
        if c not in df.columns:
            df[c] = None
    df = df[TEXT_COLS].copy()
    df["timestamp"] = pd.to_datetime(df["timestamp"], utc=True, errors="coerce")
    df["symbol"] = df["symbol"].astype(str).str.upper()
    return df.dropna(subset=["timestamp","symbol","text"])

Cell 25 — Text adapters (stubs you’ll fill in)

In [27]:
def fetch_news(symbols: List[str], start: dt.datetime, end: dt.datetime) -> pd.DataFrame:
    # TODO: integrate a news provider. Return make_text_df([...])
    return make_text_df([])

def fetch_reddit(symbols: List[str], start: dt.datetime, end: dt.datetime) -> pd.DataFrame:
    # TODO: integrate Reddit API (OAuth). Return make_text_df([...])
    return make_text_df([])

def fetch_stocktwits(symbols: List[str], start: dt.datetime, end: dt.datetime) -> pd.DataFrame:
    # TODO: integrate Stocktwits API. Return make_text_df([...])
    return make_text_df([])

Cell 26 — Clean + dedupe text

In [28]:
def clean_text(s: str) -> str:
    s = re.sub(r"\s+", " ", str(s)).strip()
    return s

def dedupe_text(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df["text_clean"] = df["text"].map(clean_text)
    df["hash"] = (df["symbol"] + "|" + df["source"].astype(str) + "|" + df["text_clean"]).map(lambda x: hash(x))
    df = df.drop_duplicates("hash").drop(columns=["hash"])
    return df

# Example pipeline:
text_df = pd.concat([
    fetch_news(symbols_dev, now_utc - dt.timedelta(days=2), now_utc),
    fetch_reddit(symbols_dev, now_utc - dt.timedelta(days=2), now_utc),
    fetch_stocktwits(symbols_dev, now_utc - dt.timedelta(days=2), now_utc),
], ignore_index=True)

text_df = dedupe_text(text_df)
text_df.head()

Unnamed: 0,timestamp,symbol,source,text,url,author,meta,text_clean


Cell 27 — Sentiment scoring (baseline VADER)

In [29]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

def vader_sentiment(text: str) -> float:
    # returns compound in [-1, 1]
    return analyzer.polarity_scores(text)["compound"]

def add_sentiment(df: pd.DataFrame) -> pd.DataFrame:
    if df.empty:
        return df
    df = df.copy()
    df["sentiment"] = df["text_clean"].map(vader_sentiment)
    return df

text_df = add_sentiment(text_df)
text_df.head()

Unnamed: 0,timestamp,symbol,source,text,url,author,meta,text_clean


Cell 28 — Attention + sentiment features per symbol

In [30]:
def build_text_features(text_df: pd.DataFrame, now: dt.datetime, windows_min=[60,240,1440]) -> pd.DataFrame:
    rows = []
    if text_df.empty:
        return pd.DataFrame(columns=["symbol"] + [f"sent_{w}" for w in windows_min] + [f"attn_{w}" for w in windows_min])

    for sym, g in text_df.groupby("symbol"):
        rec = {"symbol": sym}
        for w in windows_min:
            start = now - dt.timedelta(minutes=w)
            gw = g[g["timestamp"] >= start]
            rec[f"attn_{w}"] = len(gw)
            rec[f"sent_{w}"] = float(gw["sentiment"].mean()) if len(gw) else 0.0
            rec[f"sentchg_{w}"] = float(gw["sentiment"].mean() - g["sentiment"].mean()) if len(gw) else 0.0
        rows.append(rec)
    return pd.DataFrame(rows)

text_features = build_text_features(text_df, now_utc, CONFIG["nlp"]["text_window_minutes"])
text_features.head()

Unnamed: 0,symbol,sent_60,sent_240,sent_1440,attn_60,attn_240,attn_1440


Cell 29 — Sentiment shock signal (plug into ensemble)

In [31]:
def signal_sentiment_shock(row) -> Tuple[float, float, List[str]]:
    # Example: 1h sentiment + attention spike drives intraday signal
    sent_60 = row.get("sent_60", 0.0)
    attn_60 = row.get("attn_60", 0.0)
    sentchg_60 = row.get("sentchg_60", 0.0)

    # Map attention to [0,1] confidence boost
    conf = clip01(min(1.0, attn_60 / 20.0))

    # Value: sentiment + change
    value = clip11(0.7*sent_60 + 0.3*clip11(sentchg_60))
    reasons = [f"sent_60={sent_60:+.2f}", f"attn_60={attn_60}", f"sentchg_60={sentchg_60:+.2f}"]
    return value, conf, reasons

# Merge text features into your feature snapshot:
features2 = features.merge(text_features, on="symbol", how="left").fillna(0.0)

  features2 = features.merge(text_features, on="symbol", how="left").fillna(0.0)


Cell 30 — Recompute signals with sentiment included

In [32]:
def compute_signals_with_sentiment(features_df: pd.DataFrame) -> pd.DataFrame:
    rows = []
    for _, row in features_df.iterrows():
        sym = row["symbol"]

        intr_val, intr_conf, intr_reas = signal_intraday_momentum(row)
        vol_val, vol_conf, vol_reas = signal_intraday_volatility(row)
        sent_val, sent_conf, sent_reas = signal_sentiment_shock(row)

        # Intraday composite (momentum + vol context + sentiment shock)
        intr_comb = clip11(0.55*intr_val + 0.20*vol_val + 0.25*sent_val)
        intr_comb_conf = clip01(0.5*intr_conf + 0.2*vol_conf + 0.3*sent_conf)

        swing_val, swing_conf, swing_reas = signal_daily_trend(row)
        tw_val, tw_conf, tw_reas = signal_daily_mean_reversion(row)

        rows.append({
            "symbol": sym,
            "intraday_value": intr_comb,
            "intraday_conf": intr_comb_conf,
            "intraday_reasons": intr_reas + ["| SENT"] + sent_reas,

            "swing_value": swing_val,
            "swing_conf": swing_conf,
            "swing_reasons": swing_reas,

            "twoweek_value": tw_val,
            "twoweek_conf": tw_conf,
            "twoweek_reasons": tw_reas,
        })
    return pd.DataFrame(rows)

signals2 = compute_signals_with_sentiment(features2)

Cell 31 — Final decisions again

In [33]:
dec_rows = []
for _, r in signals2.iterrows():
    score, conf, reasons = combine_score(r)
    dec_rows.append({
        "symbol": r["symbol"],
        "score": score,
        "confidence": conf,
        "action": action_from(score, conf),
        "reasons": reasons
    })

decisions2 = pd.DataFrame(dec_rows).sort_values("score", ascending=False)
display(decisions2.head(20)[["symbol","score","confidence","action"]])

Unnamed: 0,symbol,score,confidence,action
30,AMP,0.333239,0.183,AVOID
107,COF,0.306017,0.183,AVOID
18,AJG,0.294234,0.183,AVOID
138,DG,0.291977,0.183,AVOID
16,AIG,0.282063,0.183,AVOID
70,BMY,0.271427,0.183,AVOID
82,CB,0.269026,0.183,AVOID
69,BLK,0.265655,0.183,AVOID
92,CFG,0.26351,0.183,AVOID
79,CAH,0.263199,0.183,AVOID


Grok narrative module (optional)
Cell 32 — Grok prompt builder (API call stub)

In [34]:
def build_grok_prompt(symbol: str, recent_text: pd.DataFrame) -> str:
    # Keep it short; include a few headlines/posts and ask for stance + risks
    samples = recent_text.sort_values("timestamp", ascending=False).head(12)
    bullets = "\n".join([f"- [{r['source']}] {r['text_clean'][:240]}" for _, r in samples.iterrows()])
    prompt = f"""
You are an investing analyst. For {symbol}, summarize the current narrative and sentiment.

Return JSON with keys:
- stance: one of ["bullish","bearish","mixed","unclear"]
- summary: 2-4 sentences
- key_risks: list of 3 bullets
- catalysts: list of 3 bullets

Context:
{bullets}
""".strip()
    return prompt

def grok_call(prompt: str) -> dict:
    # TODO: call xAI API with your key; return parsed JSON
    # return {"stance":"unclear","summary":"...","key_risks":[...],"catalysts":[...]}
    return {}

# Example usage (top 3 symbols):
for sym in decisions2.head(3)["symbol"].tolist():
    recent = text_df[text_df["symbol"] == sym]
    prompt = build_grok_prompt(sym, recent) if not recent.empty else None
    print(sym, "prompt ready" if prompt else "no text available")

AMP no text available
COF no text available
AJG no text available


Backtesting (minimal but correct direction)
Cell 33 — Label creation (next-day / next-5d / next-10d returns)

In [35]:
def make_forward_returns(daily_bars: pd.DataFrame, horizons=[1,5,10]) -> pd.DataFrame:
    out = []
    for sym, g in daily_bars.groupby("symbol", sort=False):
        g = g.sort_values("timestamp").copy()
        for h in horizons:
            g[f"fret_{h}d"] = g["close"].shift(-h) / g["close"] - 1
        out.append(g[["timestamp","symbol"] + [f"fret_{h}d" for h in horizons]])
    return pd.concat(out, ignore_index=True)

daily_rets = make_forward_returns(bars_1d_feat, horizons=[1,5,10])
daily_rets.head()

Price,timestamp,symbol,fret_1d,fret_5d,fret_10d
0,2023-12-29 00:00:00+00:00,A,-0.002014,-0.040639,-0.061138
1,2024-01-02 00:00:00+00:00,A,-0.054703,-0.058162,-0.076613
2,2024-01-03 00:00:00+00:00,A,-0.00122,-0.000534,-0.005337
3,2024-01-04 00:00:00+00:00,A,-0.003359,-0.010076,0.001679
4,2024-01-05 00:00:00+00:00,A,0.021599,-0.000153,0.015242


Cell 34 — Simple walk-forward backtest skeleton (ranking-based)

In [36]:
def backtest_rank_strategy(daily_bars: pd.DataFrame, features_daily: pd.DataFrame) -> pd.DataFrame:
    """
    Skeleton:
    - for each date, compute score using only info up to that date
    - pick top N longs and bottom N shorts
    - hold for K days (e.g., 5)
    """
    # TODO: implement properly with no lookahead.
    return pd.DataFrame()

# Placeholder: you’ll implement once your daily feature frame is standardized.

Cell 35 — Save “daily brief” report

In [37]:
def save_brief(decisions_df: pd.DataFrame, filename: str):
    path = os.path.join(REPORT_DIR, filename)
    decisions_df.to_csv(path, index=False)
    log(f"Saved: {path}")

if CONFIG["output"]["save_reports"]:
    stamp = dt.datetime.utcnow().strftime("%Y%m%d_%H%M")
    save_brief(decisions2.head(200), f"brief_{stamp}.csv")

[2025-12-28 02:25:27 UTC] Saved: /content/drive/MyDrive/msic/reports/brief_20251228_0225.csv


  stamp = dt.datetime.utcnow().strftime("%Y%m%d_%H%M")
  ts = dt.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
