In [None]:
import pandas as pd
import numpy as np
import pickle
import os

# Market‐odds normalization 
def moneyline_to_prob(ml: float) -> float:
    """Return the implied win probability of an American moneyline.

    ml : American moneyline, e.g. -150 for a favorite or +120 for an underdog.

    Negative lines map to ``|ml| / (|ml| + 100)``; zero and positive lines map
    to ``100 / (ml + 100)``.  No margin (vig) adjustment is applied here.
    """
    # Underdog / even-money side: a 100 stake wins `ml`.
    if ml >= 0:
        return 100 / (ml + 100)

    # Favorite side: `stake` must be risked to win 100.
    stake = -ml
    return stake / (stake + 100)

def normalize_book_odds(p_book_raw: float, ml_away: "float | None" = None) -> float:
    """Convert a home moneyline into a home-win probability, removing vig.

    p_book_raw : American moneyline quoted for the home side.
    ml_away    : optional American moneyline quoted for the away side.

    Removing the bookmaker's margin in a two-way market needs BOTH sides'
    quotes: the two implied probabilities sum to more than 1, and dividing by
    that sum rescales them to a proper distribution.

    When `ml_away` is omitted the away probability is taken as the complement
    of the home probability, so the divisor is 1 and the result is simply the
    raw implied probability of the home line (no vig can be removed from a
    single quote).  This keeps one-argument callers, such as
    ``Series.apply(normalize_book_odds)``, working exactly as before.
    """
    p_home = moneyline_to_prob(p_book_raw)
    if ml_away is None:
        # Only one side known: fall back to the complement (divisor == 1).
        p_away = 1 - p_home
    else:
        p_away = moneyline_to_prob(ml_away)
    # Rescale so that home + away probabilities sum to 1.
    return p_home / (p_home + p_away)

# BMA weights class
class BMAWeights:
    """Online Bayesian-model-averaging weights with exponential forgetting.

    State:
      logw : unnormalized log-weights, shifted so the largest entry is 0
      w    : normalized weights derived from `logw`, each floored/capped by
             `eps` and rescaled to sum to 1
    """

    def __init__(self, num_models: int, alpha: float = 0.98,
                 prior: np.ndarray = None, eps: float = 1e-3):
        """
        num_models: number of forecasting signals
        alpha     : forgetting factor (0<alpha<=1)
        prior     : optional array-like of shape (num_models,) with
                    non-negative entries; it is renormalized internally
        eps       : stability floor on weights

        Raises ValueError if `prior` has the wrong shape, contains negative
        or NaN entries, or has no positive mass.
        """
        self.alpha = alpha
        self.eps   = eps

        if prior is None:
            self.logw = np.log(np.ones(num_models) / num_models)
        else:
            prior = np.asarray(prior, dtype=float)
            # Explicit check instead of `assert`, which is stripped under -O.
            if prior.shape != (num_models,):
                raise ValueError(
                    f"prior must have shape ({num_models},), got {prior.shape}"
                )
            if np.any(prior < 0) or not prior.sum() > 0:
                raise ValueError(
                    "prior must be non-negative with a positive sum"
                )
            # A zero entry is allowed: it maps to -inf and is later floored
            # at `eps` by _renormalize().
            with np.errstate(divide='ignore'):
                self.logw = np.log(prior)

        self._renormalize()

    def _renormalize(self):
        """Shift `logw` so its maximum is 0 and rebuild `w` from it."""
        # subtract max for numeric safety (rebinding, so no array that is
        # shared with a caller gets mutated)
        self.logw = self.logw - np.max(self.logw)
        w = np.exp(self.logw)
        # clip to [eps, 1 - eps], then rescale so the weights sum to 1 again
        w = np.clip(w / w.sum(), self.eps, 1 - self.eps)
        self.w = w / w.sum()

    def update(self, y_true: int, p_preds: np.ndarray) -> np.ndarray:
        """
        y_true   : observed outcome (1=home win, 0=away win)
        p_preds  : array-like of length num_models giving each model's P(y=1)
                   (pass the raw probabilities, NOT likelihoods)

        Returns the updated normalized weights (`self.w`).
        Raises ValueError if `p_preds` does not have one entry per model.
        """
        # Accept lists/tuples as well as arrays; `1 - p_preds` needs an array.
        p_preds = np.asarray(p_preds, dtype=float)
        if p_preds.shape != self.logw.shape:
            raise ValueError(
                f"p_preds must have shape {self.logw.shape}, got {p_preds.shape}"
            )
        # likelihood of the observed outcome under each model
        lik = np.where(y_true == 1, p_preds, 1 - p_preds)
        # Bayesian update in log-space with forgetting; the 1e-15 keeps
        # log() finite when a model assigned probability 0 to the outcome
        self.logw = self.alpha * self.logw + np.log(lik + 1e-15)
        self._renormalize()
        return self.w

    def save(self, path: str):
        """Pickle `logw`, `alpha` and `eps` to `path`."""
        with open(path, 'wb') as f:
            pickle.dump({
                'logw': self.logw,
                'alpha': self.alpha,
                'eps': self.eps
            }, f)

    @classmethod
    def load(cls, path: str):
        """Rebuild an instance from a file written by save().

        SECURITY: unpickling executes arbitrary code embedded in the file.
        Only load state files that this program wrote itself.
        """
        with open(path, 'rb') as f:
            state = pickle.load(f)
        logw = np.asarray(state['logw'], dtype=float)
        obj = cls(
            num_models=len(logw),
            alpha=state['alpha'],
            prior=None,
            eps=state['eps']
        )
        # Replace the uniform start with the persisted log-weights.
        obj.logw = logw
        obj._renormalize()
        return obj

#Fair‐value engine
class FairValueEngine:
    """Blend several win-probability signals into one fair probability.

    The blend weights come from a BMAWeights instance that is persisted to
    `state_path` with pickle, so they carry over between runs.
    """

    def __init__(self,
                 num_models: int = 3,
                 alpha: float = 0.98,
                 state_path: str = "bma_state.pkl"):
        """
        num_models : number of signals (only used when no state file exists)
        alpha      : forgetting factor (only used when no state file exists)
        state_path : pickle file holding the persisted BMA state

        NOTE: when `state_path` already exists its contents win, and
        `num_models` / `alpha` are ignored.
        """
        self.state_path = state_path
        # Try to load existing state, else start new BMA
        if os.path.exists(state_path):
            self.bma = BMAWeights.load(state_path)
        else:
            self.bma = BMAWeights(num_models=num_models, alpha=alpha)

    def update_from_history(self, history: list[tuple[int, list[float]]]):
        """
        history: list of (y_true, [p1, p2, p3,...]) for past events, where
                 each p is that model's predicted P(home win)

        Applies one BMA update per event, then saves the weights.
        """
        for y_true, p_list in history:
            # Pass the raw P(home win) values: BMAWeights.update() converts
            # them to the likelihood of the observed outcome itself.
            # Converting here as well would flip the probabilities twice for
            # away wins and reward the wrong models.
            self.bma.update(y_true, np.asarray(p_list, dtype=float))
        # persist updated weights
        self.bma.save(self.state_path)

    def compute(self, signals_df: pd.DataFrame) -> pd.DataFrame:
        """
        signals_df must contain columns:
          - 'p_xgb'
          - 'p_book_raw'
          - 'p_polymarket'
        Returns a new DataFrame (the input is not modified) with added columns:
          - 'p_book'      (from normalize_book_odds applied to 'p_book_raw')
          - 'fair_p'      (BMA-weighted probability)
          - 'fair_odds'   (1 / fair_p)

        Raises ValueError if the number of BMA weights differs from the
        number of signal columns.
        """
        df = signals_df.copy()
        df['p_book'] = df['p_book_raw'].apply(normalize_book_odds)

        # Column order must match the order used in the update history.
        signal_cols = ['p_xgb', 'p_book', 'p_polymarket']
        weights = np.asarray(self.bma.w, dtype=float)
        if weights.shape != (len(signal_cols),):
            raise ValueError(
                f"expected {len(signal_cols)} BMA weights, got shape {weights.shape}"
            )

        # One matrix-vector product replaces the per-row dot product and
        # also handles an empty frame.
        probs = df[signal_cols].to_numpy(dtype=float)
        df['fair_p']    = probs @ weights
        df['fair_odds'] = 1 / df['fair_p']
        return df

#Example usage
if __name__ == "__main__":
    # Sample upcoming signals.  'p_book_raw' is the home team's AMERICAN
    # moneyline (e.g. +120 or -150), because that is what
    # moneyline_to_prob() expects.  +120 / +130 are the moneyline forms of
    # decimal odds 2.20 / 2.30; feeding the decimal value itself would be
    # read as a "+2.2" line and yield an implied probability near 0.98.
    data = [
        {'game_id': '2025-09-07_BALvsPIT',
         'p_xgb': 0.55, 'p_book_raw': 120, 'p_polymarket': 0.50},
        {'game_id': '2025-09-07_NYGvsDAL',
         'p_xgb': 0.60, 'p_book_raw': -150, 'p_polymarket': 0.62},
    ]
    signals_df = pd.DataFrame(data)

    # sample history: (outcome, [p_xgb, p_book, p_poly])
    history = [
        (1, [0.55, normalize_book_odds(120), 0.50]),
        (0, [0.60, normalize_book_odds(-150), 0.62]),
        (1, [0.53, normalize_book_odds(130), 0.48]),
    ]

    # NOTE: the engine reloads "bma_state.pkl" when it exists and rewrites it
    # after the update, so re-running this example keeps compounding the
    # same history into the weights.  Delete the file for a fresh start.
    engine = FairValueEngine(num_models=3, alpha=0.98, state_path="bma_state.pkl")
    engine.update_from_history(history)
    result_df = engine.compute(signals_df)
    print(result_df)


               game_id  p_xgb  p_book_raw  p_polymarket    p_book    fair_p  \
0  2025-09-07_BALvsPIT   0.55         2.2          0.50  0.978474  0.813116   
1  2025-09-07_NYGvsDAL   0.60      -150.0          0.62  0.600000  0.603373   

   fair_odds  
0   1.229837  
1   1.657350  


In [None]:
import os
os.environ.pop('MPLBACKEND', None)
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score, brier_score_loss
from sklearn.calibration import calibration_curve
import xgboost as xgb
import matplotlib.pyplot as plt
import pickle

# Seed NumPy's global generator so the pseudo data set is reproducible.
np.random.seed(42)
n_samples = 500

TARGET = 'home_win'

# Synthetic matchup features.  The draws stay in this order (normal, randint,
# binomial) so the seeded random stream is consumed the same way every run.
df = pd.DataFrame(
    dict(
        team_stat_diff=np.random.normal(loc=0, scale=1, size=n_samples),
        rest_day_diff=np.random.randint(low=-1, high=3, size=n_samples),
        home_field_advantage=np.random.binomial(n=1, p=0.6, size=n_samples),
    )
)

# Linear score -> sigmoid -> Bernoulli draw gives the 0/1 outcome column.
logit = (
    0.5 * df['team_stat_diff']
    + 0.3 * df['rest_day_diff']
    + 0.8 * df['home_field_advantage']
)
prob = 1 / (1 + np.exp(-logit))  # sigmoid
df[TARGET] = np.random.binomial(1, prob)

# Every column except the target is a model input.
feature_cols = df.columns.drop(TARGET).tolist()
X = df[feature_cols]
y = df[TARGET]

# Hold out 20% of the rows (shuffled, fixed seed) as a validation split; the
# remaining 80% are used for training.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

# Wrap both splits in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dval = xgb.DMatrix(data=X_val, label=y_val)

# Booster hyper-parameters.
params = {
    'objective': 'binary:logistic',  # predict P(home_win = 1)
    'eval_metric': 'logloss',
    'max_depth': 4,
    'eta': 0.1,  # learning rate
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'lambda': 1,  # L2 regularization
    'seed': 42,
}

# Both splits are monitored; the validation split is listed last, which is
# the entry XGBoost uses for early stopping (stop after 10 rounds without
# improvement, at most 100 rounds).
watchlist = [(dtrain, 'train'), (dval, 'validation')]
model = xgb.train(
    params,
    dtrain,
    num_boost_round=100,
    evals=watchlist,
    early_stopping_rounds=10,
    verbose_eval=False,
)

# Score the held-out split with the trained booster.
y_pred_val = model.predict(dval)
# Log-loss: lower is better.  Always predicting 0.5 scores ln(2) ~= 0.693, so
# ~0.6 is a modest gain on this noisy pseudo data; <0.5 would be ideal.
print("Validation Log-loss:", log_loss(y_val, y_pred_val))
print("ROC AUC:", roc_auc_score(y_val, y_pred_val))
print("Brier Score:", brier_score_loss(y_val, y_pred_val))

# Reliability diagram: observed win rate vs. mean predicted probability in
# 10 bins, with the diagonal marking perfect calibration.
prob_true, prob_pred = calibration_curve(y_val, y_pred_val, n_bins=10)
fig = plt.figure()
plt.plot(prob_pred, prob_true, marker='o', linewidth=1)
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('Predicted Probability')
plt.ylabel('True Probability')
plt.title('Calibration Curve on Pseudo Data')
plt.savefig('calibration_curve.png')
# Release the figure once it is on disk; pyplot otherwise keeps every figure
# alive, so re-running the cell would accumulate them.
plt.close(fig)

# NOTE(review): a pickle of the booster is tied to the installed xgboost
# version; consider model.save_model(...) if the file must outlive upgrades.
with open('xgb_pseudo_model.pkl', 'wb') as f:
    pickle.dump(model, f)

print("Pseudo-model saved to xgb_pseudo_model.pkl")

Validation Log-loss: 0.6169348241026441
ROC AUC: 0.6854838709677419
Brier Score: 0.2140811476917003
Pseudo-model saved to xgb_pseudo_model.pkl
