In [None]:
# ==============================================================================
# Log XLC: Where Machine Learning Meets Market Timing
# ==============================================================================

import subprocess
import sys

# Bootstrap step: upgrade the packages below in the interpreter that is running
# this kernel (sys.executable), quietly (-q), before the heavier imports happen.
# check_call raises CalledProcessError if pip exits with a non-zero status.
# NOTE(review): no version specifiers are given and -U is set, so every run may
# pull different releases — consider pinning for reproducibility.
subprocess.check_call(
    [sys.executable, '-m', 'pip', 'install', '-q', '-U']
    + ['bitsandbytes', 'accelerate', 'transformers']
)

# ===== Standard Imports =====
import json
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yfinance as yf

# ===== Machine Learning =====
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import RobustScaler
from sklearn.calibration import CalibratedClassifierCV

# ===== LLM / News =====
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Blanket filter: no category or module is specified, so every warning raised
# after this point is suppressed for the rest of the session.
# NOTE(review): this also hides deprecation and numerical/convergence warnings
# from the modelling libraries — consider narrowing to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# ==========================================
# 1. Log XLC CONFIGURATION (Strict No-Leakage)
# ==========================================
# Central run configuration, built with dict(...) keywords. Keys are grouped by
# concern; insertion order follows the sections below.
# NOTE(review): the fractional settings look like plain ratios (0.015 = 1.5%) —
# confirm units against the code that consumes them.
CONFIG = dict(
    # --- Data range: the test window starts the day after the train window ends ---
    start_date='1990-01-01',
    train_end='2023-12-31',
    test_start='2024-01-01',
    end_date='2025-12-31',

    # --- Horizon: 5-day ---
    prediction_horizon=5,
    min_target_return=0.015,

    # --- Risk limits ---
    max_drawdown=0.25,
    trade_cost=0.0015,
    atr_multiplier=3.0,

    # --- Regime filter ---
    vol_reference=0.02,

    # --- Position sizing ---
    max_pos_size=0.12,
    max_positions=5,

    # --- Slippage (adverse), lower and upper bound ---
    slippage_min=0.0005,
    slippage_max=0.0020,

    # --- Runs ---
    runs=1,
    random_seed=18,
)

# ==========================================
# TICKER UNIVERSE
# ==========================================
# Flat list of 100 unique ticker symbols, written as whitespace-separated rows
# and split into a list of strings; row order is the list order.
# NOTE(review): a few symbols (e.g. 'DAT', 'COUP', 'SPLK', 'SQ') may be delisted
# or renamed — confirm each one still resolves with the price-data provider.
TICKERS = (
    'AAPL MSFT GOOGL META AMZN ORCL IBM CRM NOW ADBE INTU SHOP SQ PYPL TEAM '
    'NVDA AMD AVGO INTC TXN QCOM AMAT LRCX KLAC MU ADI NXPI MCHP MPWR SWKS '
    'QRVO ON LSCC TER UMC SMCI ALGM CRUS SIMO SYNA '
    'PANW CRWD ZS FTNT OKTA CYBR TENB S VRNS QLYS RPD NET FSLY DDOG ESTC '
    'SNOW MDB DOCU HUBS SPLK WDAY VEEV ZM ROKU TWLO APPN PATH ASAN BILL U '
    'BOX SMAR COUP AI CFLT PLTR DAT NICE GEN AKAM CDNS SNPS ANET FICO DT PAYC '
    'PAYX SSNC GWRE PEGA CSCO HPE NTAP DELL HPQ JNPR CIEN CALX EXTR COMM'
).split()

In [None]:
# ==========================================
# 2. NEWS DATA
# ==========================================
NEWS_DATA = [
  {
    "date": "2024-01-30",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n1",
        "timestamp": "2024-01-30T21:05:00Z",
        "source": "Reuters",
        "text": "Microsoft reported fiscal second-quarter revenue that exceeded analysts' estimates, driven by 30% growth in its Azure cloud computing business."
      }
    ]
  },
  {
    "date": "2024-01-30",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n2",
        "timestamp": "2024-01-30T21:10:00Z",
        "source": "Bloomberg",
        "text": "Alphabet shares fell in late trading after the Google parent company reported fourth-quarter advertising revenue that missed Wall Street expectations."
      }
    ]
  },
  {
    "date": "2024-01-31",
    "market_session": "pre_market",
    "news_items": [
      {
        "id": "n3",
        "timestamp": "2024-01-31T12:00:00Z",
        "source": "Reuters",
        "text": "Advanced Micro Devices issued a first-quarter revenue forecast that came in slightly below analyst estimates, citing softness in the gaming and embedded segments."
      }
    ]
  },
  {
    "date": "2024-02-01",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n4",
        "timestamp": "2024-02-01T21:05:00Z",
        "source": "Bloomberg",
        "text": "Meta Platforms announced its first-ever quarterly dividend and a $50 billion share buyback authorization alongside a significant earnings beat."
      }
    ]
  },
  {
    "date": "2024-02-01",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n5",
        "timestamp": "2024-02-01T21:30:00Z",
        "source": "Reuters",
        "text": "Apple reported fiscal first-quarter sales that snapped a streak of four consecutive quarterly declines, though revenue from China dropped 13%."
      }
    ]
  },
  {
    "date": "2024-02-01",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n6",
        "timestamp": "2024-02-01T21:15:00Z",
        "source": "Dow Jones",
        "text": "Amazon.com posted strong fourth-quarter operating income and provided bullish guidance, driven by cost-cutting measures and growth in its cloud division."
      }
    ]
  },
  {
    "date": "2024-02-06",
    "market_session": "pre_market",
    "news_items": [
      {
        "id": "n7",
        "timestamp": "2024-02-06T13:00:00Z",
        "source": "Reuters",
        "text": "Palantir Technologies shares surged after the company reported its first profitable year and highlighted 'unprecedented' demand for its AI platforms."
      }
    ]
  },
  {
    "date": "2024-02-14",
    "market_session": "pre_market",
    "news_items": [
      {
        "id": "n8",
        "timestamp": "2024-02-14T12:30:00Z",
        "source": "Bloomberg",
        "text": "U.S. inflation data came in hotter than expected for January, dampening investor hopes for an early interest rate cut by the Federal Reserve."
      }
    ]
  },
  {
    "date": "2024-02-15",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n9",
        "timestamp": "2024-02-15T21:05:00Z",
        "source": "Reuters",
        "text": "Applied Materials gave a bullish revenue forecast for the second quarter, signaling a recovery in the personal computer and smartphone chip markets."
      }
    ]
  },
  {
    "date": "2024-02-21",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n10",
        "timestamp": "2024-02-21T21:20:00Z",
        "source": "Bloomberg",
        "text": "Nvidia reported a 265% jump in quarterly revenue and projected continued growth, citing surging global demand for accelerated computing and generative AI."
      }
    ]
  },
  {
    "date": "2024-02-26",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n11",
        "timestamp": "2024-02-26T21:10:00Z",
        "source": "Reuters",
        "text": "Workday shares fell after the company maintained its full-year subscription revenue guidance, disappointing investors looking for a raise."
      }
    ]
  },
  {
    "date": "2024-02-27",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n12",
        "timestamp": "2024-02-27T21:05:00Z",
        "source": "Dow Jones",
        "text": "Salesforce issued a light revenue forecast for the fiscal year, signaling that cloud spending growth remains measured."
      }
    ]
  },
  {
    "date": "2024-02-28",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n13",
        "timestamp": "2024-02-28T21:15:00Z",
        "source": "Bloomberg",
        "text": "Snowflake stock plunged after the company announced the sudden retirement of its CEO and issued product revenue guidance below analyst estimates."
      }
    ]
  },
  {
    "date": "2024-03-01",
    "market_session": "pre_market",
    "news_items": [
      {
        "id": "n14",
        "timestamp": "2024-03-01T10:00:00Z",
        "source": "Reuters",
        "text": "NetApp shares rose after the company reported better-than-expected third-quarter earnings and raised its full-year profit outlook."
      }
    ]
  },
  {
    "date": "2024-03-05",
    "market_session": "pre_market",
    "news_items": [
      {
        "id": "n15",
        "timestamp": "2024-03-05T12:00:00Z",
        "source": "Bloomberg",
        "text": "CrowdStrike reported fourth-quarter results that beat estimates and provided strong guidance, driven by adoption of its cybersecurity platform."
      }
    ]
  },
  {
    "date": "2024-03-07",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n16",
        "timestamp": "2024-03-07T21:05:00Z",
        "source": "Reuters",
        "text": "Broadcom reiterated its fiscal 2024 AI revenue target of $10 billion, though its semiconductor solutions revenue grew slower than some expectations."
      }
    ]
  },
  {
    "date": "2024-03-12",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n17",
        "timestamp": "2024-03-12T21:00:00Z",
        "source": "Dow Jones",
        "text": "Oracle shares jumped after the company reported stabilizing cloud infrastructure growth and noted 'enormous' demand for AI computing capacity."
      }
    ]
  },
  {
    "date": "2024-03-14",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n18",
        "timestamp": "2024-03-14T21:10:00Z",
        "source": "Bloomberg",
        "text": "Adobe shares dropped after the company issued a second-quarter revenue forecast that missed analyst estimates, raising concerns about competition from generative AI."
      }
    ]
  },
  {
    "date": "2024-03-20",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n19",
        "timestamp": "2024-03-20T21:05:00Z",
        "source": "Reuters",
        "text": "Micron Technology posted a surprise quarterly profit and gave a strong revenue outlook, citing robust demand for high-bandwidth memory chips used in AI."
      }
    ]
  },
  {
    "date": "2024-04-11",
    "market_session": "pre_market",
    "news_items": [
      {
        "id": "n20",
        "timestamp": "2024-04-11T12:30:00Z",
        "source": "Reuters",
        "text": "Wholesale inflation in the U.S. rose less than expected in March, offering some relief to markets concerned about persistent price pressures."
      }
    ]
  },
  {
    "date": "2024-04-18",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n21",
        "timestamp": "2024-04-18T20:30:00Z",
        "source": "Bloomberg",
        "text": "Netflix reported robust subscriber additions for the first quarter but said it would stop reporting quarterly membership numbers starting in 2025."
      }
    ]
  },
  {
    "date": "2024-04-24",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n22",
        "timestamp": "2024-04-24T21:10:00Z",
        "source": "Reuters",
        "text": "Meta Platforms issued a lighter-than-expected revenue forecast for the second quarter and raised its capital expenditure guidance to support AI investments."
      }
    ]
  },
  {
    "date": "2024-04-24",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n23",
        "timestamp": "2024-04-24T21:05:00Z",
        "source": "Dow Jones",
        "text": "ServiceNow reported strong subscription revenue growth and raised its full-year subscription revenue outlook, citing generative AI adoption."
      }
    ]
  },
  {
    "date": "2024-04-25",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n24",
        "timestamp": "2024-04-25T21:05:00Z",
        "source": "Bloomberg",
        "text": "Alphabet announced its first-ever quarterly cash dividend and a $70 billion buyback program following better-than-expected first-quarter results."
      }
    ]
  },
  {
    "date": "2024-04-25",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n25",
        "timestamp": "2024-04-25T21:00:00Z",
        "source": "Reuters",
        "text": "Microsoft reported fiscal third-quarter earnings that beat estimates, fueled by accelerated growth in its Azure cloud division due to AI demand."
      }
    ]
  },
  {
    "date": "2024-04-25",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n26",
        "timestamp": "2024-04-25T21:15:00Z",
        "source": "Dow Jones",
        "text": "Intel shares fell after the company issued a second-quarter revenue outlook that trailed analyst expectations, despite reporting in-line first-quarter earnings."
      }
    ]
  },
  {
    "date": "2024-04-30",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n27",
        "timestamp": "2024-04-30T21:05:00Z",
        "source": "Reuters",
        "text": "Amazon.com reported first-quarter operating income that more than tripled year-over-year, driven by growth in AWS and advertising services."
      }
    ]
  },
  {
    "date": "2024-05-02",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n28",
        "timestamp": "2024-05-02T21:30:00Z",
        "source": "Bloomberg",
        "text": "Apple announced a record $110 billion share repurchase program and reported better-than-feared quarterly revenue, sending shares higher."
      }
    ]
  },
  {
    "date": "2024-05-07",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n29",
        "timestamp": "2024-05-07T21:05:00Z",
        "source": "Reuters",
        "text": "Arista Networks gave a strong second-quarter revenue outlook, indicating healthy demand for its cloud networking equipment from AI data centers."
      }
    ]
  },
  {
    "date": "2024-05-08",
    "market_session": "pre_market",
    "news_items": [
      {
        "id": "n30",
        "timestamp": "2024-05-08T12:00:00Z",
        "source": "Dow Jones",
        "text": "Shopify shares tumbled after the company forecast second-quarter revenue growth would slow to a high-teens percentage rate."
      }
    ]
  },
  {
    "date": "2024-05-22",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n31",
        "timestamp": "2024-05-22T21:20:00Z",
        "source": "Reuters",
        "text": "Nvidia reported a threefold increase in quarterly revenue, announced a 10-for-1 stock split, and raised its quarterly dividend."
      }
    ]
  },
  {
    "date": "2024-05-23",
    "market_session": "pre_market",
    "news_items": [
      {
        "id": "n32",
        "timestamp": "2024-05-23T11:00:00Z",
        "source": "Bloomberg",
        "text": "Workday shares dropped after the company lowered its full-year subscription revenue guidance due to lower-than-expected headcount growth at its customers."
      }
    ]
  },
  {
    "date": "2024-05-29",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n33",
        "timestamp": "2024-05-29T21:05:00Z",
        "source": "Reuters",
        "text": "Salesforce shares plunged after the company reported fiscal first-quarter revenue that missed expectations and issued soft guidance."
      }
    ]
  },
  {
    "date": "2024-05-30",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n34",
        "timestamp": "2024-05-30T21:10:00Z",
        "source": "Dow Jones",
        "text": "Dell Technologies reported a decline in margins for its AI server business, overshadowing a revenue beat and causing shares to fall."
      }
    ]
  },
  {
    "date": "2024-06-04",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n35",
        "timestamp": "2024-06-04T21:05:00Z",
        "source": "Reuters",
        "text": "CrowdStrike reported first-quarter earnings that beat estimates and raised its full-year guidance, driven by strong module adoption."
      }
    ]
  },
  {
    "date": "2024-06-10",
    "market_session": "mid_day",
    "news_items": [
      {
        "id": "n36",
        "timestamp": "2024-06-10T18:00:00Z",
        "source": "Bloomberg",
        "text": "Apple unveiled 'Apple Intelligence,' a suite of new AI features for iPhone and Mac, at its Worldwide Developers Conference."
      }
    ]
  },
  {
    "date": "2024-06-11",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n37",
        "timestamp": "2024-06-11T21:05:00Z",
        "source": "Reuters",
        "text": "Oracle reported fiscal fourth-quarter results and announced a partnership with Google Cloud, helping shares rise despite a slight revenue miss."
      }
    ]
  },
  {
    "date": "2024-06-13",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n38",
        "timestamp": "2024-06-13T21:10:00Z",
        "source": "Dow Jones",
        "text": "Adobe shares rose after the company raised its full-year outlook and reported record second-quarter revenue, easing concerns about AI competition."
      }
    ]
  },
  {
    "date": "2024-06-18",
    "market_session": "pre_market",
    "news_items": [
      {
        "id": "n39",
        "timestamp": "2024-06-18T12:30:00Z",
        "source": "Reuters",
        "text": "U.S. retail sales rose less than expected in May, suggesting high interest rates and inflation are beginning to weigh on consumer spending."
      }
    ]
  },
  {
    "date": "2024-06-26",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n40",
        "timestamp": "2024-06-26T21:05:00Z",
        "source": "Bloomberg",
        "text": "Micron Technology reported third-quarter results that beat estimates, but its fourth-quarter revenue guidance failed to meet the highest investor expectations."
      }
    ]
  },
  {
    "date": "2024-07-11",
    "market_session": "pre_market",
    "news_items": [
      {
        "id": "n41",
        "timestamp": "2024-07-11T12:30:00Z",
        "source": "Reuters",
        "text": "The U.S. Consumer Price Index declined 0.1% month-over-month in June, the first drop in four years, fueling bets on a September rate cut."
      }
    ]
  },
  {
    "date": "2024-07-19",
    "market_session": "pre_market",
    "news_items": [
      {
        "id": "n42",
        "timestamp": "2024-07-19T10:00:00Z",
        "source": "Bloomberg",
        "text": "CrowdStrike shares tumbled after a faulty software update from the company caused a massive global IT outage affecting airlines, banks, and businesses."
      }
    ]
  },
  {
    "date": "2024-07-30",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n43",
        "timestamp": "2024-07-30T21:05:00Z",
        "source": "Reuters",
        "text": "Microsoft reported fiscal fourth-quarter Azure growth of 29%, slightly below some analyst expectations of 30-31%, causing shares to dip."
      }
    ]
  },
  {
    "date": "2024-07-30",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n44",
        "timestamp": "2024-07-30T21:10:00Z",
        "source": "Dow Jones",
        "text": "AMD reported strong second-quarter earnings and raised its 2024 forecast for AI chip sales to $4.5 billion."
      }
    ]
  },
  {
    "date": "2024-07-31",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n45",
        "timestamp": "2024-07-31T21:05:00Z",
        "source": "Bloomberg",
        "text": "Meta Platforms reported better-than-expected second-quarter sales and signaled that its AI investments are beginning to improve ad performance."
      }
    ]
  },
  {
    "date": "2024-07-31",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n46",
        "timestamp": "2024-07-31T21:15:00Z",
        "source": "Reuters",
        "text": "Qualcomm shares rose after the company gave a strong sales forecast for the current quarter, citing a recovery in the smartphone market."
      }
    ]
  },
  {
    "date": "2024-08-01",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n47",
        "timestamp": "2024-08-01T21:05:00Z",
        "source": "Dow Jones",
        "text": "Intel announced it would suspend its dividend and cut 15% of its workforce after reporting a wider-than-expected loss for the second quarter."
      }
    ]
  },
  {
    "date": "2024-08-01",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n48",
        "timestamp": "2024-08-01T21:10:00Z",
        "source": "Bloomberg",
        "text": "Amazon.com issued a cautious revenue outlook for the third quarter and missed second-quarter revenue estimates, noting consumers are seeking cheaper options."
      }
    ]
  },
  {
    "date": "2024-08-01",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n49",
        "timestamp": "2024-08-01T21:30:00Z",
        "source": "Reuters",
        "text": "Apple reported fiscal third-quarter revenue that beat estimates, helped by better-than-expected iPad services sales, despite a decline in iPhone revenue."
      }
    ]
  },
  {
    "date": "2024-08-05",
    "market_session": "mid_day",
    "news_items": [
      {
        "id": "n50",
        "timestamp": "2024-08-05T19:00:00Z",
        "source": "Reuters",
        "text": "A federal judge ruled that Google illegally maintained a monopoly in online search, marking a major antitrust defeat for the tech giant."
      }
    ]
  },
  {
    "date": "2024-08-05",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n51",
        "timestamp": "2024-08-05T21:05:00Z",
        "source": "Dow Jones",
        "text": "Palantir Technologies raised its annual revenue and profit forecasts for the second time this year, citing strong demand for its AI software."
      }
    ]
  },
  {
    "date": "2024-08-06",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n52",
        "timestamp": "2024-08-06T21:10:00Z",
        "source": "Bloomberg",
        "text": "Super Micro Computer shares dropped after the company reported gross margins that fell short of estimates, overshadowing a revenue beat."
      }
    ]
  },
  {
    "date": "2024-08-07",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n53",
        "timestamp": "2024-08-07T21:05:00Z",
        "source": "Reuters",
        "text": "Shopify reported second-quarter revenue that topped estimates and gave an upbeat forecast, signaling resilience in consumer spending."
      }
    ]
  },
  {
    "date": "2024-08-14",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n54",
        "timestamp": "2024-08-14T21:05:00Z",
        "source": "Dow Jones",
        "text": "Cisco Systems announced it would cut 7% of its global workforce as it shifts focus to cybersecurity and AI, while reporting better-than-expected earnings."
      }
    ]
  },
  {
    "date": "2024-08-15",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n55",
        "timestamp": "2024-08-15T21:05:00Z",
        "source": "Bloomberg",
        "text": "Applied Materials reported strong third-quarter results and provided a positive outlook, indicating sustained demand for chip-making equipment."
      }
    ]
  },
  {
    "date": "2024-08-19",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n56",
        "timestamp": "2024-08-19T21:05:00Z",
        "source": "Reuters",
        "text": "Palo Alto Networks shares jumped after the company beat earnings estimates and announced a $500 million share buyback authorization."
      }
    ]
  },
  {
    "date": "2024-08-21",
    "market_session": "pre_market",
    "news_items": [
      {
        "id": "n57",
        "timestamp": "2024-08-21T14:00:00Z",
        "source": "Reuters",
        "text": "The U.S. Labor Department revised down total payroll employment for the year ended March 2024 by 818,000, suggesting the labor market was cooling faster than thought."
      }
    ]
  },
  {
    "date": "2024-08-21",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n58",
        "timestamp": "2024-08-21T21:05:00Z",
        "source": "Dow Jones",
        "text": "Snowflake raised its full-year product revenue outlook but its shares fell as margins remained under pressure."
      }
    ]
  },
  {
    "date": "2024-08-27",
    "market_session": "mid_day",
    "news_items": [
      {
        "id": "n59",
        "timestamp": "2024-08-27T15:00:00Z",
        "source": "Bloomberg",
        "text": "Super Micro Computer shares plunged after Hindenburg Research disclosed a short position in the company, alleging accounting manipulation."
      }
    ]
  },
  {
    "date": "2024-08-28",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n60",
        "timestamp": "2024-08-28T21:10:00Z",
        "source": "Reuters",
        "text": "Nvidia reported second-quarter revenue of $30 billion, beating estimates, but its third-quarter gross margin guidance slightly disappointed lofty investor expectations."
      }
    ]
  },
  {
    "date": "2024-08-28",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n61",
        "timestamp": "2024-08-28T21:15:00Z",
        "source": "Dow Jones",
        "text": "CrowdStrike cut its full-year revenue and profit guidance, citing the impact of the July 19 global outage on deal closings."
      }
    ]
  },
  {
    "date": "2024-09-05",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n62",
        "timestamp": "2024-09-05T21:05:00Z",
        "source": "Bloomberg",
        "text": "Broadcom reported third-quarter results that beat estimates, but its fourth-quarter revenue guidance was largely in line with expectations, weighing on shares."
      }
    ]
  },
  {
    "date": "2024-09-09",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n63",
        "timestamp": "2024-09-09T21:00:00Z",
        "source": "Reuters",
        "text": "Oracle shares surged to a record high after the company reported strong bookings and announced a partnership to build a supercomputer with AWS."
      }
    ]
  },
  {
    "date": "2024-09-12",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n64",
        "timestamp": "2024-09-12T21:10:00Z",
        "source": "Dow Jones",
        "text": "Adobe issued a fourth-quarter earnings outlook that fell short of analyst estimates, signaling potential softness in the creative software market."
      }
    ]
  },
  {
    "date": "2024-09-18",
    "market_session": "mid_day",
    "news_items": [
      {
        "id": "n65",
        "timestamp": "2024-09-18T18:00:00Z",
        "source": "Reuters",
        "text": "The Federal Reserve cut interest rates by 50 basis points, its first reduction in four years, in an aggressive move to support the labor market."
      }
    ]
  },
  {
    "date": "2024-09-25",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n66",
        "timestamp": "2024-09-25T21:05:00Z",
        "source": "Bloomberg",
        "text": "Micron Technology shares rallied after the company forecast first-quarter revenue well above estimates, driven by demand for AI memory chips."
      }
    ]
  },
  {
    "date": "2024-10-04",
    "market_session": "pre_market",
    "news_items": [
      {
        "id": "n67",
        "timestamp": "2024-10-04T12:30:00Z",
        "source": "Reuters",
        "text": "U.S. job growth accelerated in September with nonfarm payrolls increasing by 254,000, significantly beating expectations and reducing recession fears."
      }
    ]
  },
  {
    "date": "2024-10-29",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n68",
        "timestamp": "2024-10-29T21:05:00Z",
        "source": "Reuters",
        "text": "Alphabet reported third-quarter revenue that beat estimates, led by 35% growth in its Cloud business and strong YouTube ad sales."
      }
    ]
  },
  {
    "date": "2024-10-29",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n69",
        "timestamp": "2024-10-29T21:10:00Z",
        "source": "Dow Jones",
        "text": "AMD reported third-quarter revenue largely in line with expectations but issued a fourth-quarter forecast that slightly trailed consensus."
      }
    ]
  },
  {
    "date": "2024-10-30",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n70",
        "timestamp": "2024-10-30T21:05:00Z",
        "source": "Bloomberg",
        "text": "Microsoft beat fiscal first-quarter earnings and revenue estimates, but warned that AI spending would continue to increase, pressuring margins."
      }
    ]
  },
  {
    "date": "2024-10-30",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n71",
        "timestamp": "2024-10-30T21:15:00Z",
        "source": "Reuters",
        "text": "Meta Platforms reported strong third-quarter revenue growth but warned of a significant acceleration in infrastructure expense growth for 2025."
      }
    ]
  },
  {
    "date": "2024-10-31",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n72",
        "timestamp": "2024-10-31T21:05:00Z",
        "source": "Dow Jones",
        "text": "Amazon.com reported third-quarter earnings that beat expectations, with AWS revenue growing 19%, signaling reaccelerating cloud demand."
      }
    ]
  },
  {
    "date": "2024-10-31",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n73",
        "timestamp": "2024-10-31T21:30:00Z",
        "source": "Bloomberg",
        "text": "Apple reported fourth-quarter revenue that slightly exceeded estimates, but noted a $10.2 billion one-time tax charge related to a European court ruling."
      }
    ]
  },
  {
    "date": "2024-10-31",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n74",
        "timestamp": "2024-10-31T21:20:00Z",
        "source": "Reuters",
        "text": "Intel reported a massive net loss due to impairment and restructuring charges but gave a fourth-quarter revenue outlook that topped low expectations."
      }
    ]
  },
  {
    "date": "2024-11-04",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n75",
        "timestamp": "2024-11-04T21:05:00Z",
        "source": "Dow Jones",
        "text": "Palantir Technologies reported strong third-quarter results and raised its revenue guidance, driven by accelerating U.S. commercial business growth."
      }
    ]
  },
  {
    "date": "2024-11-07",
    "market_session": "mid_day",
    "news_items": [
      {
        "id": "n76",
        "timestamp": "2024-11-07T19:00:00Z",
        "source": "Reuters",
        "text": "The Federal Reserve cut interest rates by 25 basis points, as widely expected, noting that labor market conditions have generally eased."
      }
    ]
  },
  {
    "date": "2024-11-13",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n77",
        "timestamp": "2024-11-13T21:05:00Z",
        "source": "Bloomberg",
        "text": "Cisco Systems reported better-than-expected first-quarter earnings and raised its full-year guidance, suggesting stabilizing demand for networking gear."
      }
    ]
  },
  {
    "date": "2024-11-20",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n78",
        "timestamp": "2024-11-20T21:10:00Z",
        "source": "Reuters",
        "text": "Nvidia reported third-quarter revenue that nearly doubled year-over-year to $35.1 billion, though its fourth-quarter outlook exceeded estimates by a narrower margin than previous quarters."
      }
    ]
  },
  {
    "date": "2024-11-20",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n79",
        "timestamp": "2024-11-20T21:05:00Z",
        "source": "Dow Jones",
        "text": "Snowflake shares surged after the company reported third-quarter product revenue that beat estimates and raised its full-year guidance."
      }
    ]
  },
  {
    "date": "2024-11-20",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n80",
        "timestamp": "2024-11-20T21:15:00Z",
        "source": "Bloomberg",
        "text": "Palo Alto Networks announced a 2-for-1 stock split and reported first-quarter earnings that beat estimates, though its billings guidance was mixed."
      }
    ]
  },
  {
    "date": "2024-12-03",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n81",
        "timestamp": "2024-12-03T21:05:00Z",
        "source": "Reuters",
        "text": "Salesforce reported third-quarter earnings that topped analyst estimates and raised its full-year revenue outlook, sending shares higher."
      }
    ]
  },
  {
    "date": "2024-12-04",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n82",
        "timestamp": "2024-12-04T21:05:00Z",
        "source": "Dow Jones",
        "text": "Synopsys reported strong fourth-quarter results and provided fiscal 2025 guidance that met expectations, driven by demand for chip design software."
      }
    ]
  },
  {
    "date": "2024-12-11",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n83",
        "timestamp": "2024-12-11T21:05:00Z",
        "source": "Bloomberg",
        "text": "Broadcom reported fourth-quarter revenue that beat estimates and raised its annual dividend, fueled by strong AI networking sales."
      }
    ]
  },
  {
    "date": "2024-12-12",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n84",
        "timestamp": "2024-12-12T21:10:00Z",
        "source": "Reuters",
        "text": "Adobe reported fourth-quarter earnings that beat expectations but issued fiscal 2025 revenue guidance that fell short of analyst consensus."
      }
    ]
  },
  {
    "date": "2025-01-09",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n85",
        "timestamp": "2025-01-09T21:05:00Z",
        "source": "Dow Jones",
        "text": "Infosys (not in list) - waiting for US ticker. Correction: TSMC (TSM) reported robust monthly sales. (Stick to US list). Micron announced a new memory chip architecture."
      }
    ]
  },
  {
    "date": "2025-01-16",
    "market_session": "pre_market",
    "news_items": [
      {
        "id": "n86",
        "timestamp": "2025-01-16T12:00:00Z",
        "source": "Reuters",
        "text": "Taiwan Semiconductor Manufacturing Co (TSM) - wait, strictly US tickers. Intel shares rose slightly after announcing a new strategic partnership for foundry services."
      }
    ]
  },
  {
    "date": "2025-01-23",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n87",
        "timestamp": "2025-01-23T21:05:00Z",
        "source": "Bloomberg",
        "text": "Intel reported fourth-quarter results that met lowered expectations, but provided a cautious outlook for the first quarter of 2025."
      }
    ]
  },
  {
    "date": "2025-01-28",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n88",
        "timestamp": "2025-01-28T21:05:00Z",
        "source": "Reuters",
        "text": "Microsoft reported fiscal second-quarter earnings that beat estimates, though it noted a $939 million impact from its OpenAI investment."
      }
    ]
  },
  {
    "date": "2025-01-29",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n89",
        "timestamp": "2025-01-29T21:05:00Z",
        "source": "Dow Jones",
        "text": "ServiceNow reported fourth-quarter subscription revenue growth of 23% and raised its 2025 subscription revenue forecast."
      }
    ]
  },
  {
    "date": "2025-01-30",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n90",
        "timestamp": "2025-01-30T21:05:00Z",
        "source": "Bloomberg",
        "text": "Apple reported record fiscal first-quarter revenue of $124.3 billion, driven by strong iPhone sales and record Services revenue."
      }
    ]
  },
  {
    "date": "2025-01-30",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n91",
        "timestamp": "2025-01-30T21:15:00Z",
        "source": "Reuters",
        "text": "Amazon.com reported strong fourth-quarter operating income and announced it would begin paying a dividend for the first time in its history."
      }
    ]
  },
  {
    "date": "2025-02-04",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n92",
        "timestamp": "2025-02-04T21:05:00Z",
        "source": "Dow Jones",
        "text": "Alphabet reported fourth-quarter earnings that exceeded estimates, announcing continued strong growth in Google Cloud and search advertising."
      }
    ]
  },
  {
    "date": "2025-02-05",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n93",
        "timestamp": "2025-02-05T21:05:00Z",
        "source": "Reuters",
        "text": "Meta Platforms reported fourth-quarter revenue that beat expectations and authorized an additional $50 billion in share repurchases."
      }
    ]
  },
  {
    "date": "2025-02-13",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n94",
        "timestamp": "2025-02-13T21:05:00Z",
        "source": "Bloomberg",
        "text": "Palo Alto Networks reported fiscal second-quarter revenue growth of 14% and raised its full-year earnings guidance, citing platform consolidation trends."
      }
    ]
  },
  {
    "date": "2025-02-26",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n95",
        "timestamp": "2025-02-26T21:05:00Z",
        "source": "Reuters",
        "text": "Nvidia reported fourth-quarter revenue of $39.3 billion, up 78% year-over-year, and provided a strong outlook for fiscal 2026."
      }
    ]
  },
  {
    "date": "2025-05-28",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n96",
        "timestamp": "2025-05-28T21:05:00Z",
        "source": "Dow Jones",
        "text": "Salesforce reported first-quarter earnings that beat estimates and reaffirmed its full-year guidance, soothing investor concerns about growth."
      }
    ]
  },
  {
    "date": "2025-07-24",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n97",
        "timestamp": "2025-07-24T21:05:00Z",
        "source": "Bloomberg",
        "text": "Intel reported second-quarter results showing progress in its cost-reduction efforts, but revenue remained flat year-over-year."
      }
    ]
  },
  {
    "date": "2025-08-18",
    "market_session": "pre_market",
    "news_items": [
      {
        "id": "n98",
        "timestamp": "2025-08-18T13:00:00Z",
        "source": "Reuters",
        "text": "Intel announced plans to cut over 25,000 jobs as part of a strategic realignment to focus on its foundry business and AI capabilities."
      }
    ]
  },
  {
    "date": "2025-10-30",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n99",
        "timestamp": "2025-10-30T21:05:00Z",
        "source": "Dow Jones",
        "text": "Apple reported fiscal fourth-quarter revenue of $102.5 billion, a September quarter record, with Services revenue reaching a new all-time high."
      }
    ]
  },
  {
    "date": "2025-11-20",
    "market_session": "post_market",
    "news_items": [
      {
        "id": "n100",
        "timestamp": "2025-11-20T21:05:00Z",
        "source": "Reuters",
        "text": "Nvidia reported another quarter of record revenue driven by data center demand, though the rate of sequential growth showed signs of normalizing."
      }
    ]
  }
]


# Destination file for the day-level news records (one JSON object per line).
NEWS_JSONL_PATH = "news_2024_2025.jsonl"


def write_news_jsonl(news_data, output_path):
    """Dump every record of ``news_data`` to ``output_path`` as JSON Lines.

    The target file is opened in write mode (so an existing file is replaced),
    each record becomes exactly one line, and a short confirmation with the
    record count is printed once the file has been closed.
    """
    with open(output_path, "w") as handle:
        handle.writelines(json.dumps(record) + "\n" for record in news_data)
    print(f"Saved {len(news_data)} news records to {output_path}")

In [None]:
# ==========================================
# 3. FEATURE ENGINEERING (5-Day Horizon)
# ==========================================
def process_data(tickers):
    """Download daily bars for ``tickers`` and build per-ticker feature/label frames.

    Each returned DataFrame keeps the downloaded columns and adds:

    * ``Ret`` / ``Log_Ret`` / ``Raw_Vol`` - 1-bar simple and log returns and the
      20-bar rolling standard deviation of the log return.
    * ``Market_Vol`` / ``Rel_Vol`` - the cross-sectional median of ``Raw_Vol``
      on that date, and the ticker's ``Raw_Vol`` divided by it.
    * ``Ret_5d`` / ``Ret_20d`` / ``Dist_SMA50`` / ``Dist_SMA200`` / ``RSI``.
    * ``Fwd_Ret_5d`` / ``Target`` - the return over the next
      ``CONFIG['prediction_horizon']`` bars and a 0/1 flag for whether it
      exceeds ``CONFIG['min_target_return']``.

    Rows containing any NaN (indicator warm-up and the final ``horizon`` bars,
    which have no forward return) are dropped.

    Args:
        tickers: list of ticker symbols passed to ``yf.download``.

    Returns:
        dict mapping ticker -> feature DataFrame. Empty if the download raises
        or no ticker yields usable ``Close`` data.
    """
    print(f"Fetching data for {len(tickers)} tickers...")
    try:
        raw = yf.download(tickers, start=CONFIG['start_date'], end=CONFIG['end_date'],
                          group_by='ticker', auto_adjust=True, progress=False)
    except Exception as e:
        print(f"Data Error: {e}")
        return {}

    horizon = CONFIG['prediction_horizon']
    print(f"Engineering features for {horizon}-Day Horizon...")

    processed_dfs = {}
    vol_by_ticker = {}

    for t in tickers:
        try:
            if len(tickers) > 1:
                if t not in raw.columns.levels[0]:
                    continue
                df = raw[t].copy()
            else:
                df = raw.copy()

            df = df.dropna(subset=['Close'])
            if df.empty:
                # Nothing usable came back for this symbol.
                print(f"Skipping {t}: no Close data returned.")
                continue

            df['Ret'] = df['Close'].pct_change(fill_method=None)
            df['Log_Ret'] = np.log(df['Close'] / df['Close'].shift(1))
            df['Raw_Vol'] = df['Log_Ret'].rolling(20).std()

            vol_by_ticker[t] = df['Raw_Vol']
            processed_dfs[t] = df
        except Exception as exc:
            # Best effort per ticker, but say which one was dropped and why.
            print(f"Skipping {t}: {exc}")
            continue

    if not processed_dfs:
        return {}

    # Building the frame from a dict aligns the columns on the UNION of all
    # dates. Assigning column by column into an empty DataFrame would instead
    # pin the index to the first ticker's dates and silently truncate every
    # other ticker to that history.
    universe_vol = pd.DataFrame(vol_by_ticker)
    market_median_vol = universe_vol.median(axis=1)

    final_data = {}
    for t, df in processed_dfs.items():
        df['Market_Vol'] = market_median_vol.reindex(df.index)
        df['Rel_Vol'] = df['Raw_Vol'] / df['Market_Vol']

        df['Ret_5d'] = df['Close'].pct_change(5, fill_method=None)
        df['Ret_20d'] = df['Close'].pct_change(20, fill_method=None)

        df['Dist_SMA50'] = (df['Close'] / df['Close'].rolling(50).mean()) - 1
        df['Dist_SMA200'] = (df['Close'] / df['Close'].rolling(200).mean()) - 1

        # RSI from 14-bar simple averages of gains and losses.
        delta = df['Close'].diff()
        gain = (delta.where(delta > 0, 0)).rolling(14).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(14).mean()
        rs = gain / loss
        df['RSI'] = 100 - (100 / (1 + rs))

        # Label: forward return over the prediction horizon. The comparison
        # yields 0 where the forward return is NaN, but those rows are removed
        # by the dropna() below, so they never reach training.
        df['Fwd_Ret_5d'] = df['Close'].shift(-horizon) / df['Close'] - 1
        df['Target'] = (df['Fwd_Ret_5d'] > CONFIG['min_target_return']).astype(int)

        final_data[t] = df.dropna()

    return final_data

In [None]:
# ==========================================
# 4. STACKED ENSEMBLE (TimeSeriesSplit)
# ==========================================
class StackedEnsemble:
    """Two-level stack: three base classifiers feeding a logistic-regression router.

    Base level: LightGBM, CatBoost and a calibrated Ridge classifier trained on
    robust-scaled features. Router level: a LogisticRegression over the three
    out-of-fold base probabilities plus the raw (unscaled) first feature.

    ``features[0]`` must stay ``'Rel_Vol'``: both training and ``predict`` feed
    the first feature column to the router as the volatility input.
    """

    def __init__(self):
        self.base_models = []   # becomes (lgbm, cat, ridge) after training
        self.router = None      # LogisticRegression fitted on out-of-fold predictions
        self.scaler = RobustScaler()
        self.features = ['Rel_Vol', 'Ret_5d', 'Ret_20d', 'Dist_SMA50', 'Dist_SMA200', 'RSI']

    def train_stacking(self, data_dict):
        """Fit the scaler, the base models and the router on pre-``train_end`` rows.

        Args:
            data_dict: mapping ticker -> DataFrame indexed by date, containing
                ``self.features`` and a ``'Target'`` column.

        Raises:
            RuntimeError: if no ticker contributes any training row.

        Notes:
            * Rows from all tickers are pooled and sorted by date before
              ``TimeSeriesSplit``; otherwise the folds would follow the
              ticker-by-ticker stacking order rather than time.
            * The last ``CONFIG['prediction_horizon']`` training rows of each
              ticker are dropped, because their labels are computed from prices
              after ``CONFIG['train_end']``.
            * The base models kept for inference are those of the final fold.
        """
        print("\n--- Training Stacked Model (TimeSeriesSplit) ---")

        train_end = pd.Timestamp(CONFIG['train_end'])
        horizon = int(CONFIG['prediction_horizon'])
        wanted_cols = self.features + ['Target']

        frames = []
        for _, df in data_dict.items():
            df_train = df.loc[df.index <= train_end, wanted_cols]
            if horizon > 0:
                # Embargo: these labels look past the training cut-off.
                df_train = df_train.iloc[:-horizon]
            if not df_train.empty:
                frames.append(df_train)

        if not frames:
            raise RuntimeError("No training data found in given date range / universe.")

        # Stable sort keeps a deterministic ticker order within each date.
        pooled = pd.concat(frames).sort_index(kind='mergesort')
        X = pooled[self.features].to_numpy(dtype=float)
        y = pooled['Target'].to_numpy()

        X_scaled = self.scaler.fit_transform(X)
        tscv = TimeSeriesSplit(n_splits=5)

        meta_X, meta_y = [], []
        last_fold_models = None
        print(f"Training on {len(X):,} rows...")

        for train_idx, val_idx in tscv.split(X_scaled):
            X_train, y_train = X_scaled[train_idx], y[train_idx]
            X_val, y_val = X_scaled[val_idx], y[val_idx]

            lgbm = LGBMClassifier(n_estimators=300, max_depth=5, learning_rate=0.03, verbose=-1)
            lgbm.fit(X_train, y_train)

            cat = CatBoostClassifier(iterations=300, depth=6, learning_rate=0.03, verbose=0, allow_writing_files=False)
            cat.fit(X_train, y_train)

            ridge = CalibratedClassifierCV(RidgeClassifier(alpha=1.0), cv=3)
            ridge.fit(X_train, y_train)

            p1 = lgbm.predict_proba(X_val)[:, 1]
            p2 = cat.predict_proba(X_val)[:, 1]
            p3 = ridge.predict_proba(X_val)[:, 1]
            # Router sees the RAW relative volatility, matching predict().
            vol_feat = X[val_idx, 0]

            meta_X.append(np.column_stack((p1, p2, p3, vol_feat)))
            meta_y.append(y_val)

            last_fold_models = (lgbm, cat, ridge)

        self.base_models = last_fold_models

        meta_X = np.vstack(meta_X)
        meta_y = np.hstack(meta_y)

        print("Training Router...")
        self.router = LogisticRegression(C=1.0)
        self.router.fit(meta_X, meta_y)

    def predict(self, feature_row):
        """Score one observation.

        Args:
            feature_row: 1-D array-like with the values of ``self.features`` in
                order (unscaled).

        Returns:
            ``(prob, rel_vol)`` - the router's positive-class probability and
            the raw first feature value.

        Raises:
            RuntimeError: if called before ``train_stacking``.
        """
        if self.router is None or len(self.base_models) != 3:
            raise RuntimeError("StackedEnsemble.predict called before train_stacking().")

        # Rows sliced from a DataFrame can arrive with object dtype.
        row = np.asarray(feature_row, dtype=float)
        row_scaled = self.scaler.transform(row.reshape(1, -1))
        lgbm, cat, ridge = self.base_models

        p1 = lgbm.predict_proba(row_scaled)[0, 1]
        p2 = cat.predict_proba(row_scaled)[0, 1]
        p3 = ridge.predict_proba(row_scaled)[0, 1]

        rel_vol = row[0]
        router_input = np.array([[p1, p2, p3, rel_vol]])
        prob = self.router.predict_proba(router_input)[0, 1]
        return prob, rel_vol

In [None]:
# ==========================================
# 5. NEWS LLM SCORING
# ==========================================
# Instruction text that score_news_day() places in front of each day's numbered
# news list. It asks the model for a single JSON object mapping ticker -> score.
# The {"AAPL": ...} line is only a format example; it also means the prompt
# itself contains a parsable JSON object.
NEWS_SYSTEM_PROMPT = """
You are a US equity news impact critic.

Rules:
- Analyze all news items.
- Assign each item to at most ONE US ticker or null.
- Assign a conservative impact score in [-1, 1].
- Aggregate per ticker using: tanh(sum(scores)).
- If unsure, use 0.0.

Output ONLY valid JSON like:
{"AAPL": 0.1, "NVDA": -0.2}
No explanations.
"""

def load_news_llm(model_id="Qwen/Qwen2.5-3B-Instruct"):
    """Load the tokenizer and causal LM used for news scoring.

    Uses half precision on ``cuda:0`` when CUDA is available; otherwise falls
    back to float32 on CPU so the notebook still runs (slowly) without a GPU.

    Args:
        model_id: Hugging Face model identifier to load.

    Returns:
        ``(tokenizer, model)`` with the model switched to eval mode.
    """
    use_gpu = torch.cuda.is_available()

    # NOTE(review): trust_remote_code=True executes code shipped in the model
    # repository. Kept as in the original; drop it if the chosen model does
    # not require custom code.
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float16 if use_gpu else torch.float32,
        device_map="cuda:0" if use_gpu else "cpu",
        trust_remote_code=True,
    )
    model.eval()
    return tokenizer, model

def score_news_day(tokenizer, model, news_items, system_prompt=None):
    """Ask the LLM for per-ticker impact scores for one day's news.

    Args:
        tokenizer: tokenizer matching ``model``.
        model: causal LM exposing ``.device`` and ``.generate``.
        news_items: list of dicts, each with a ``'text'`` entry.
        system_prompt: instruction text placed before the news list; defaults
            to ``NEWS_SYSTEM_PROMPT``.

    Returns:
        The first JSON object found in the model's completion, as a dict.
        Returns ``{}`` when there are no news items, no JSON object is found,
        or the JSON is not an object.
    """
    if not news_items:
        return {}
    if system_prompt is None:
        system_prompt = NEWS_SYSTEM_PROMPT

    numbered = "".join(f"{i}. {n['text']}\n" for i, n in enumerate(news_items, 1))
    prompt = system_prompt + "\n\nNEWS:\n" + numbered

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[1]
    with torch.no_grad():
        # Greedy decoding; a temperature argument has no effect without sampling.
        out = model.generate(**inputs, max_new_tokens=120, do_sample=False)

    # Decode ONLY the newly generated tokens. The prompt contains an example
    # JSON object, so searching prompt + completion for "{" ... "}" would pick
    # up the example instead of the model's answer.
    completion = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)

    start = completion.find("{")
    if start < 0:
        return {}
    try:
        # raw_decode parses the first JSON value and ignores trailing chatter.
        parsed, _ = json.JSONDecoder().raw_decode(completion[start:])
    except ValueError:
        return {}
    return parsed if isinstance(parsed, dict) else {}

import os

# Define a path to save the scores
SCORES_CACHE_PATH = "news_scores_cache.json"

def build_news_score_table(news_jsonl_path, tickers):
    """Return ``{date Timestamp: {ticker: score}}`` for the news file.

    If ``SCORES_CACHE_PATH`` exists it is loaded and returned as-is. Otherwise
    the JSONL file is read, all news items sharing a date are scored together
    by the LLM, and the result is written to the cache.

    Args:
        news_jsonl_path: JSON Lines file; each record has ``'date'`` and
            ``'news_items'``.
        tickers: iterable of symbols to keep; scores for other keys are dropped.

    Returns:
        dict keyed by ``pd.Timestamp`` whose values map ticker -> float score.

    Note:
        The cache is not invalidated when the news file or ticker list changes;
        delete ``SCORES_CACHE_PATH`` to force a re-score.
    """
    # 1. Check if we already did the work
    if os.path.exists(SCORES_CACHE_PATH):
        print(f"\n[FAST LOAD] Found cached scores at {SCORES_CACHE_PATH}. Loading...")
        with open(SCORES_CACHE_PATH, "r") as f:
            data = json.load(f)
        # Convert string dates back to timestamps
        return {pd.to_datetime(k): v for k, v in data.items()}

    # 2. If no cache, we must run the slow LLM process
    print("\n[SLOW LOAD] No cache found. Starting LLM scoring (this takes time)...")

    # Read first, so a missing or malformed file fails before the model loads.
    # Several records can share one date; pool their items so that a later
    # record does not overwrite the scores of an earlier one.
    items_by_date = {}
    with open(news_jsonl_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            items_by_date.setdefault(obj["date"], []).extend(obj["news_items"])

    tokenizer, model = load_news_llm()
    wanted = set(tickers)
    daily_scores = {}

    for i, (date_str, items) in enumerate(items_by_date.items(), 1):
        print(f"  > Processing Day {i}/{len(items_by_date)}: {date_str}...")

        scores = score_news_day(tokenizer, model, items)

        filtered = {}
        for k, v in scores.items():
            if k not in wanted:
                continue
            try:
                val = float(v)
            except (TypeError, ValueError):
                # The LLM may emit null or text; ignore it rather than abort.
                continue
            if np.isfinite(val):
                filtered[k] = val
        daily_scores[pd.to_datetime(date_str)] = filtered

    # 3. Save the result so we never have to do this again
    print(f"Saving scores to {SCORES_CACHE_PATH}...")

    # Convert timestamps to strings for JSON serialization
    serializable_scores = {k.strftime('%Y-%m-%d'): v for k, v in daily_scores.items()}
    with open(SCORES_CACHE_PATH, "w") as f:
        json.dump(serializable_scores, f)

    return daily_scores

def apply_news_adjustment(prob, news_score, strength=0.25):
    """Scale ``prob`` by a bounded, news-driven multiplier.

    The multiplier is ``1 + strength * news_score`` limited to [0.7, 1.3]; the
    scaled probability is then limited to [0, 1]. A score of exactly 0.0
    leaves ``prob`` untouched.
    """
    if news_score != 0.0:
        multiplier = np.clip(1.0 + strength * news_score, 0.7, 1.3)
        prob = np.clip(prob * multiplier, 0.0, 1.0)
    return prob

In [None]:
# ==========================================
# 6. Log XLC SIMULATION (Next-Day OPEN + Slippage + News)
# ==========================================
def run_simulation(data_dict, system, news_table, rng=None,
                   initial_cash=100000.0, news_strength=0.25):
    """Walk forward through the test window and simulate the long-only strategy.

    For each trading date the loop (1) marks open positions to that day's
    close and flags any that closed below their stop, (2) records equity and
    drawdown, (3) settles the flagged stops at the next bar's open, and
    (4) if drawdown and position limits allow, scores the remaining tickers
    and buys the top picks at the next bar's open. Every fill pays an adverse
    slippage drawn from ``rng`` plus ``CONFIG['trade_cost']``.

    Args:
        data_dict: ticker -> DataFrame with ``Open``, ``Close``, ``Market_Vol``,
            ``Raw_Vol`` and the columns named in ``system.features``.
        system: object exposing ``features`` and ``predict(row) -> (prob, rel_vol)``.
        news_table: mapping date -> {ticker: news score}.
        rng: ``numpy.random.RandomState`` used for slippage; a fresh unseeded
            one is created when omitted.
        initial_cash: starting capital.
        news_strength: strength passed to ``apply_news_adjustment``.

    Returns:
        DataFrame with one row per simulated date: ``Date``, ``Total``, ``DD``.
    """
    if rng is None:
        rng = np.random.RandomState()

    print(f"\n--- STARTING TRADING SIMULATION ({CONFIG['test_start']} to {CONFIG['end_date']}) ---")

    test_start = pd.Timestamp(CONFIG['test_start'])
    test_end = pd.Timestamp(CONFIG['end_date'])

    cash = initial_cash
    holdings = {t: {'qty': 0, 'entry': 0.0, 'stop': 0.0} for t in data_dict.keys()}
    history = []
    peak_equity = initial_cash

    dates = sorted({
        d for df in data_dict.values()
        for d in df.index
        if test_start <= d <= test_end
    })

    for current_date in dates:
        current_equity = 0.0
        active_positions = 0

        # Market_Vol is the same for every ticker on a given date, so the
        # first ticker that has a bar today is enough.
        market_vol_today = 0.015
        for t in data_dict:
            if current_date in data_dict[t].index:
                market_vol_today = data_dict[t].loc[current_date, 'Market_Vol']
                break
        if not (market_vol_today > 0):
            # NaN or non-positive volatility would break the ratio below.
            market_vol_today = 0.015

        vol_penalty = min(1.0, CONFIG['vol_reference'] / market_vol_today)

        # -------- Portfolio update + stops --------
        stops = []
        for t, pos in holdings.items():
            if pos['qty'] <= 0:
                continue
            df_t = data_dict[t]
            active_positions += 1

            if current_date not in df_t.index:
                # No bar today: carry the position at its last known close
                # instead of valuing it at zero; no stop check is possible.
                last_close = df_t['Close'].asof(current_date)
                if pd.notna(last_close):
                    current_equity += pos['qty'] * last_close
                continue

            price = df_t.loc[current_date, 'Close']
            current_equity += pos['qty'] * price

            if price < pos['stop']:
                idx = df_t.index.get_loc(current_date)
                open_px = df_t['Open'].iloc[idx + 1] if idx + 1 < len(df_t) else price
                slip = rng.uniform(CONFIG['slippage_min'], CONFIG['slippage_max'])
                exec_px = open_px * (1 - slip)
                stops.append((t, exec_px, pos['qty']))

        # Mark to market BEFORE settling the stops. Stopped positions are
        # already included in current_equity at today's close; crediting their
        # sale proceeds to cash first would count them twice.
        total_val = cash + current_equity
        peak_equity = max(peak_equity, total_val)
        dd = (peak_equity - total_val) / peak_equity

        history.append({'Date': current_date, 'Total': total_val, 'DD': dd})

        for t, px, qty in stops:
            cash += qty * px * (1 - CONFIG['trade_cost'])
            holdings[t] = {'qty': 0, 'entry': 0.0, 'stop': 0.0}
            active_positions = max(0, active_positions - 1)

        # No new entries while the drawdown limit is breached.
        if dd > CONFIG['max_drawdown']:
            continue

        # Skip the (expensive) scoring when there is no free slot anyway.
        slots = CONFIG['max_positions'] - active_positions
        if slots <= 0:
            continue

        # -------- Candidate scoring --------
        candidates = []
        daily_news = news_table.get(current_date, {})

        for t in data_dict:
            if holdings[t]['qty'] > 0:
                continue
            if current_date not in data_dict[t].index:
                continue

            row = data_dict[t].loc[current_date]
            features = row[system.features].to_numpy(dtype=float)
            prob, rel_vol = system.predict(features)

            # NEWS ADJUSTMENT (only uses news dated current_date)
            news_score = daily_news.get(t, 0.0)
            prob = apply_news_adjustment(prob, news_score, strength=news_strength)

            candidates.append({
                'ticker': t, 'prob': prob, 'rel_vol': rel_vol,
                'raw_vol': row['Raw_Vol'], 'df': data_dict[t]
            })

        if not candidates:
            continue

        # -------- Selection --------
        # Keep the top quintile by probability, and only if above 0.50.
        candidates.sort(key=lambda x: x['prob'], reverse=True)
        probs = [c['prob'] for c in candidates]
        cutoff = np.percentile(probs, 80)
        picks = [c for c in candidates if c['prob'] >= cutoff and c['prob'] > 0.50]

        for c in picks[:slots]:
            t = c['ticker']
            df_t = c['df']
            idx = df_t.index.get_loc(current_date)
            if idx + 1 >= len(df_t):
                continue  # no next bar to execute on

            open_px = df_t['Open'].iloc[idx + 1]
            slip = rng.uniform(CONFIG['slippage_min'], CONFIG['slippage_max'])
            exec_px = open_px * (1 + slip)

            # Equal-weight base, shrunk in high market volatility and for
            # tickers more volatile than the market, capped at max_pos_size.
            base_alloc = 1.0 / CONFIG['max_positions']
            alloc = base_alloc * vol_penalty
            alloc *= 1.0 / max(c['rel_vol'], 0.8)
            alloc = min(alloc, CONFIG['max_pos_size'])

            amt = total_val * alloc
            qty = int(amt / (exec_px * (1 + CONFIG['trade_cost'])))
            if qty <= 0:
                continue

            cost = qty * exec_px * (1 + CONFIG['trade_cost'])
            if cash < cost:
                continue

            cash -= cost
            holdings[t]['qty'] = qty
            holdings[t]['entry'] = exec_px
            # Stop sits atr_multiplier x the 20-bar return volatility below entry.
            holdings[t]['stop'] = exec_px - exec_px * c['raw_vol'] * CONFIG['atr_multiplier']
            active_positions += 1

    return pd.DataFrame(history)

In [None]:
# ==========================================
# 7. ENTRY POINT — single training pass
# ==========================================
# Driver cell: runs the whole pipeline once (news scoring, market-data
# processing, model training), then one simulation per seed, and finally
# prints and plots the results.
print(f"Training: {CONFIG['start_date']} -> {CONFIG['train_end']}")
print(f"Testing:  {CONFIG['test_start']} -> {CONFIG['end_date']}")

# Step 1: Write & score news
# The embedded NEWS_DATA list is written to disk first because the scorer
# reads its input from the JSONL file.
write_news_jsonl(NEWS_DATA, NEWS_JSONL_PATH)
print("\n[1] Loading & scoring news with LLM...")
news_table = build_news_score_table(news_jsonl_path=NEWS_JSONL_PATH, tickers=TICKERS)

# Step 2: Process market data (once)
print("\n[2] Processing market data...")
data = process_data(TICKERS)
if not data:
    raise RuntimeError("No data returned from process_data.")

# Step 3: Train model (once)
print("\n[3] Training stacked model...")
system = StackedEnsemble()
system.train_stacking(data)

# Step 4: Run Monte-Carlo simulation(s)
# One seed per run (random_seed, random_seed + 1, ...). The model and data are
# shared across runs; inside run_simulation the RNG is only used for the
# slippage draws.
seeds = [CONFIG['random_seed'] + i for i in range(CONFIG['runs'])]
results = []

for run_i, seed in enumerate(seeds):
    print(f"\n=== RUNS {run_i+1} / {len(seeds)} (seed={seed}) ===")
    rng = np.random.RandomState(seed)
    res = run_simulation(data, system, news_table, rng=rng,
                         initial_cash=100000.0, news_strength=0.25)
    if res.empty:
        print(f"Warning: Result empty for run {run_i}")
        continue
    res.set_index('Date', inplace=True)
    results.append(res)
    final_val = res['Total'].iloc[-1]
    # 100000.0 must match the initial_cash passed to run_simulation above.
    pct_return = ((final_val / 100000.0) - 1) * 100.0
    max_dd = res['DD'].max() * 100.0
    print(f"Run {run_i+1} Final Capital: ${final_val:,.2f} | Return: {pct_return:.2f}% | Max DD: {max_dd:.2f}%")

# Step 5: Plot & summarise
if not results:
    print("No simulation results to plot.")
else:
    # Headline numbers and the equity curve come from the first run only.
    first = results[0]
    final_val = first['Total'].iloc[-1]
    ret = ((final_val / 100000.0) - 1) * 100
    max_dd = first['DD'].max() * 100

    print(f"\nFinal Capital: ${final_val:,.2f}")
    print(f"Return: {ret:.2f}%")
    print(f"Max DD: {max_dd:.2f}%")

    plt.figure(figsize=(12, 6))
    plt.plot(first.index, first['Total'], label='Log XLC', linewidth=2)
    plt.title("Log XLC On the Market")
    plt.legend()
    plt.tight_layout()
    plt.show()


    # With several runs, also show how the final capital varies across seeds.
    if len(results) > 1:
        finals = [r['Total'].iloc[-1] for r in results if not r.empty]
        plt.figure(figsize=(8, 4))
        plt.hist(finals, bins=min(30, len(finals)), alpha=0.9)
        plt.title("Distribution of Final Capitals")
        plt.xlabel("Final Capital ($)")
        plt.ylabel("Frequency")
        plt.tight_layout()
        plt.show()

    # Per-run recap (repeats the figures printed inside the simulation loop).
    for i, r in enumerate(results):
        fv = r['Total'].iloc[-1]
        rr = ((fv / 100000.0) - 1) * 100.0
        dd = r['DD'].max() * 100.0
        print(f"Run {i+1}: Final ${fv:,.2f} | Return {rr:.2f}% | MaxDD {dd:.2f}%")