In [23]:
import pandas as pd, yfinance as yf
spy_1m = yf.download("SPY", period="7d", interval="1m", progress=False) \
            .tz_localize(None)
spy_1m.columns = spy_1m.columns.get_level_values(0)
spy_15 = (spy_1m
          .resample("15T")
          .agg({"Open":  "first",
                "High":  "max",
                "Low":   "min",
                "Close": "last",
                "Volume":"sum"}))
spy_15["ret"] = spy_15["Close"].pct_change()
spy_15["ma20"] = spy_15["Close"].rolling(20).mean()
spy_15["std20"] = spy_15["Close"].rolling(20).std()
spy_15.head(30)

  spy_1m = yf.download("SPY", period="7d", interval="1m", progress=False) \
  .resample("15T")
  spy_15["ret"] = spy_15["Close"].pct_change()


Price,Open,High,Low,Close,Volume,ret,ma20,std20
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2025-06-11 13:30:00,604.190002,604.25,602.799988,603.200012,7562647,,,
2025-06-11 13:45:00,603.190002,603.900024,602.25,603.650024,2775591,0.000746,,
2025-06-11 14:00:00,603.659973,604.159973,603.299988,604.14502,1751451,0.00082,,
2025-06-11 14:15:00,604.140015,604.88501,604.0,604.734985,1946343,0.000977,,
2025-06-11 14:30:00,604.73999,604.820007,604.445007,604.700012,1427962,-5.8e-05,,
2025-06-11 14:45:00,604.77002,605.059998,604.607971,604.700012,1486436,0.0,,
2025-06-11 15:00:00,604.900024,604.987915,604.099976,604.460083,2162695,-0.000397,,
2025-06-11 15:15:00,604.359985,604.73999,604.119995,604.219971,1493867,-0.000397,,
2025-06-11 15:30:00,604.210022,604.440002,603.679993,603.759888,1156467,-0.000761,,
2025-06-11 15:45:00,603.75,604.219971,603.22998,603.580017,2219050,-0.000298,,


In [None]:
import pandas as pd
import numpy as np

data = [
    {'game_id': '2025-09-07_BALvsPIT', 'p_xgb': 0.55, 'p_book': 0.52, 'p_polymarket': 0.50},
    {'game_id': '2025-09-07_NYGvsDAL', 'p_xgb': 0.60, 'p_book': 0.58, 'p_polymarket': 0.62},
]
signals_df = pd.DataFrame(data)


def normalize_book_odds(p_book_raw):
    p_book_home, p_book_away = p_book_raw, 1 - p_book_raw  
    total = p_book_home + p_book_away
    return p_book_home / total

class BMAWeights:
    def __init__(self, num_models, alpha=1.0, prior=None):
        self.alpha = alpha
        self.log_weights = np.log(prior) if prior is not None else np.log(np.ones(num_models) / num_models)
    
    def update(self, y_true, preds):
        self.log_weights = self.alpha * self.log_weights + np.log(preds)
        max_log = np.max(self.log_weights)
        self.log_weights -= max_log
        weights = np.exp(self.log_weights)
        self.weights = weights / weights.sum()
        return self.weights

def compute_fair_value(row, weights):
    model_probs = np.array([row['p_xgb'], row['p_book'], row['p_polymarket']])
    return np.dot(weights, model_probs)

bma = BMAWeights(num_models=3, alpha=0.98)

test = [
    (1, [0.55, 0.52, 0.50]),  
    (0, [0.60, 0.58, 0.62]),  
    (1, [0.53, 0.50, 0.48]),
]

for y_true, preds in test:
    model_preds = np.array(preds) if y_true == 1 else 1 - np.array(preds)
    weights = bma.update(y_true, model_preds)

signals_df['fair_p'] = signals_df.apply(lambda r: compute_fair_value(r, bma.weights), axis=1)
signals_df['fair_odds'] = 1 / signals_df['fair_p']

signals_df


Unnamed: 0,game_id,p_xgb,p_book,p_polymarket,fair_p,fair_odds
0,2025-09-07_BALvsPIT,0.55,0.52,0.5,0.525244,1.903879
1,2025-09-07_NYGvsDAL,0.6,0.58,0.62,0.598885,1.669768


In [3]:
import os
os.environ.pop('MPLBACKEND', None)
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score, brier_score_loss
from sklearn.calibration import calibration_curve
import xgboost as xgb
import matplotlib.pyplot as plt
import pickle

np.random.seed(42)
n_samples = 500

df = pd.DataFrame({
    'team_stat_diff': np.random.normal(0, 1, n_samples),
    'rest_day_diff': np.random.randint(-1, 3, n_samples),
    'home_field_advantage': np.random.binomial(1, 0.6, n_samples),
})
logit = 0.5 * df['team_stat_diff'] + 0.3 * df['rest_day_diff'] + 0.8 * df['home_field_advantage']
prob = 1 / (1 + np.exp(-logit)) #sigmoid
df['home_win'] = np.random.binomial(1, prob)

TARGET = 'home_win'
feature_cols = [c for c in df.columns if c != TARGET] #removes home_win for inputs
X = df[feature_cols] 
y = df[TARGET]

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)#20% saved for validation, rest training to prevent overfitting

dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val,label=y_val)


params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 4,
    'eta': 0.1, #learning rate
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'lambda': 1, #L2 regularization
    'seed': 42
}

model = xgb.train(
    params,
    dtrain,
    num_boost_round=100,
    evals=[(dtrain, 'train'), (dval, 'validation')],
    early_stopping_rounds=10,
    verbose_eval=False
)

y_pred_val = model.predict(dval)
print("Validation Log-loss:", log_loss(y_val, y_pred_val)) #ideally <0.5 for probability estimate accuracy
print("ROC AUC:", roc_auc_score(y_val, y_pred_val))
print("Brier Score:", brier_score_loss(y_val, y_pred_val))

prob_true, prob_pred = calibration_curve(y_val, y_pred_val, n_bins=10)
plt.figure()
plt.plot(prob_pred, prob_true, marker='o', linewidth=1)
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('Predicted Probability')
plt.ylabel('True Probability')
plt.title('Calibration Curve on Pseudo Data')
plt.savefig('calibration_curve.png')
with open('xgb_pseudo_model.pkl', 'wb') as f:
    pickle.dump(model, f)

print("Pseudo-model saved to xgb_pseudo_model.pkl")

Validation Log-loss: 0.6169348241026441
ROC AUC: 0.6854838709677419
Brier Score: 0.2140811476917003
Pseudo-model saved to xgb_pseudo_model.pkl
