<a href="https://colab.research.google.com/github/alexzkhan07/CFB-Total-Score-Projection/blob/main/Data_mining_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, brier_score_loss, accuracy_score
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')

class CFBBettingModel:
    """Over/under model for college-football game totals.

    Intended call order: ``load_data`` -> ``create_features`` -> ``train`` ->
    ``evaluate_betting`` / ``plot``.

    Attributes populated as the pipeline runs:
        data: game-level frame read by ``load_data``, sorted by date.
        features: one row per modelled game, built by ``create_features``.
        model: the raw ``XGBClassifier`` fitted by ``train``.
        calibrated_model: isotonic-calibrated wrapper fitted by ``train``.
    """

    # Columns emitted by create_features (before 'pace_factor' is appended).
    # Kept explicit so an empty result still has a well-formed schema.
    _FEATURE_COLUMNS = (
        'ou_line', 'spread',
        'home_avg_pts', 'home_avg_pts_allowed', 'home_avg_total', 'home_std_total',
        'away_avg_pts', 'away_avg_pts_allowed', 'away_avg_total', 'away_std_total',
        'combined_avg_total', 'went_over',
    )

    def __init__(self):
        self.data = None
        self.features = None
        self.model = None
        self.calibrated_model = None

    def load_data(self, file_path):
        """Read the games CSV and derive the total and over/under label.

        The CSV must contain ``home_points``, ``away_points``, ``ou_line`` and
        ``date``; ``create_features`` additionally reads ``home_team`` and
        ``away_team`` (and ``spread`` when present).

        Args:
            file_path: anything ``pandas.read_csv`` accepts (path or buffer).

        Note:
            ``went_over`` is a plain 0/1 column, so rows with a missing score
            or line end up as 0 here; ``create_features`` excludes those rows
            from the modelling set instead of treating them as "Under".
        """
        self.data = pd.read_csv(file_path)
        self.data['total_points'] = self.data['home_points'] + self.data['away_points']
        self.data['went_over'] = (self.data['total_points'] > self.data['ou_line']).astype(int)
        self.data['date'] = pd.to_datetime(self.data['date'])
        self.data = self.data.sort_values('date').reset_index(drop=True)

    @staticmethod
    def _summarize_form(form):
        """Return (avg_for, avg_against, avg_total, std_total) for recent games.

        ``form`` is a non-empty sequence of (points_for, points_against, total).
        """
        pts_for, pts_against, totals = zip(*form)
        return np.mean(pts_for), np.mean(pts_against), np.mean(totals), np.std(totals)

    def create_features(self, window=5, min_games=2):
        """Build rolling-form features for every gradable game.

        For each game, both teams are summarised over their last ``window``
        completed games played on a strictly earlier date. A game becomes a
        feature row only when:

        * both teams have at least ``min_games`` such prior games,
        * the game has a final score and a posted line, and
        * the total did not land exactly on the line (a push is neither an
          Over nor an Under, so it cannot be a binary training label).

        Unplayed games are never added to a team's history, so they cannot
        turn the rolling averages into NaN.

        Args:
            window: maximum number of prior games per team to average over.
            min_games: minimum prior games each team needs for the game to be
                modelled.

        Raises:
            ValueError: if no data is loaded, or ``min_games`` / ``window``
                are inconsistent.

        The result is stored in ``self.features`` (possibly empty, but always
        with the full column set).
        """
        if self.data is None:
            raise ValueError("No data loaded; call load_data() first.")
        if min_games < 1 or window < min_games:
            raise ValueError("Require 1 <= min_games <= window.")

        team_log = {}  # team -> [(points_for, points_against, total), ...] in date order
        rows = []

        # Walk the schedule one date at a time. Results are committed to the
        # team log only after the whole date is processed, which preserves the
        # "strictly earlier date" rule for same-day games.
        for _, day_games in self.data.groupby('date', sort=True):
            finished = []
            for game in day_games.to_dict('records'):
                home, away = game['home_team'], game['away_team']
                total, line = game['total_points'], game['ou_line']
                was_played = pd.notna(total)

                home_form = [] if pd.isna(home) else team_log.get(home, [])[-window:]
                away_form = [] if pd.isna(away) else team_log.get(away, [])[-window:]

                gradable = was_played and pd.notna(line) and total != line
                if gradable and len(home_form) >= min_games and len(away_form) >= min_games:
                    h_for, h_against, h_total, h_std = self._summarize_form(home_form)
                    a_for, a_against, a_total, a_std = self._summarize_form(away_form)
                    rows.append({
                        'ou_line': line, 'spread': float(game.get('spread', 0.0)),
                        'home_avg_pts': h_for, 'home_avg_pts_allowed': h_against,
                        'home_avg_total': h_total, 'home_std_total': h_std,
                        'away_avg_pts': a_for, 'away_avg_pts_allowed': a_against,
                        'away_avg_total': a_total, 'away_std_total': a_std,
                        'combined_avg_total': (h_total + a_total) / 2,
                        'went_over': game['went_over'],
                    })

                if was_played:
                    home_pts, away_pts = game['home_points'], game['away_points']
                    if pd.notna(home):
                        finished.append((home, (home_pts, away_pts, total)))
                    if pd.notna(away):
                        finished.append((away, (away_pts, home_pts, total)))

            for team, result in finished:
                team_log.setdefault(team, []).append(result)

        self.features = pd.DataFrame(rows, columns=list(self._FEATURE_COLUMNS))
        # How far recent scoring sits above (+) or below (-) the posted line.
        self.features['pace_factor'] = self.features['combined_avg_total'] - self.features['ou_line']

    def train(self):
        """Fit the classifier on the first 80% of rows and score the rest.

        Rows are in chronological order, so the positional split keeps the
        hold-out set strictly later than the training set.

        Returns:
            (X_test, y_test, y_pred_proba) where ``y_pred_proba`` is the
            calibrated probability of the Over for each hold-out game.

        Raises:
            ValueError: if ``create_features`` has not produced any rows.
        """
        if self.features is None or self.features.empty:
            raise ValueError("No feature rows available; call create_features() on loaded data first.")

        X = self.features.drop(columns=['went_over'])
        y = self.features['went_over']

        split_idx = int(len(X) * 0.8)
        X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
        y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

        self.model = xgb.XGBClassifier(n_estimators=100, max_depth=5, learning_rate=0.1, subsample=0.8, colsample_bytree=0.8, random_state=42, use_label_encoder=False, eval_metric='logloss')
        # Fitted directly as well so plot() can read feature_importances_;
        # the calibrator below fits its own internal copies.
        self.model.fit(X_train, y_train)

        self.calibrated_model = CalibratedClassifierCV(self.model, method='isotonic', cv=3)
        self.calibrated_model.fit(X_train, y_train)

        y_pred_proba = self.calibrated_model.predict_proba(X_test)[:, 1]

        print("\n--- Model Performance ---")
        print(f"  AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")
        print(f"  Brier Score: {brier_score_loss(y_test, y_pred_proba):.4f}")

        return X_test, y_test, y_pred_proba

    def evaluate_betting(self, X_test, y_test, y_pred_proba,
                         ev_thresholds=(0.01, 0.03, 0.05, 0.07, 0.10),
                         bet_amount=10, odds_payout=1.909):
        """Print flat-stake betting results for several EV thresholds.

        A bet is placed on the Over when its expected value exceeds
        ``ev_thresh * bet_amount``; otherwise on the Under under the same
        rule; otherwise no bet is made.

        Args:
            X_test: unused; accepted so the tuple returned by ``train`` can be
                splatted straight in.
            y_test: 0/1 outcomes (1 = went over); any array-like.
            y_pred_proba: predicted probability of the Over per game.
            ev_thresholds: minimum EV, as a fraction of the stake, to bet.
            bet_amount: flat stake per bet.
            odds_payout: decimal odds, i.e. total returned per unit staked on
                a win (1.909 corresponds to -110).
        """
        outcomes = np.asarray(y_test)
        probs = np.asarray(y_pred_proba, dtype=float)

        win_amount = bet_amount * (odds_payout - 1)
        ev_over = probs * win_amount - (1 - probs) * bet_amount
        ev_under = (1 - probs) * win_amount - probs * bet_amount

        for ev_thresh in ev_thresholds:
            min_ev = bet_amount * ev_thresh
            bet_over = ev_over > min_ev
            bet_under = ~bet_over & (ev_under > min_ev)
            won = (bet_over & (outcomes == 1)) | (bet_under & (outcomes == 0))

            n_bets = int(bet_over.sum() + bet_under.sum())

            print(f"\n--- Betting Results (EV > {ev_thresh*100:.0f}%) ---")
            if n_bets == 0:
                print("  No bets met the threshold.")
                continue

            n_wins = int(won.sum())
            total_profit = n_wins * win_amount - (n_bets - n_wins) * bet_amount
            roi = (total_profit / (n_bets * bet_amount)) * 100
            win_rate = (n_wins / n_bets) * 100

            print(f"  Bets: {n_bets} | Win Rate: {win_rate:.2f}% | ROI: {roi:.2f}% | P/L: ${total_profit:.2f}")

    def plot(self, X_test, y_test, y_pred_proba):
        """Show diagnostics: probability histogram, feature importances, calibration.

        Requires ``train`` to have run, since importances come from ``self.model``.
        """
        fig, axes = plt.subplots(2, 2, figsize=(14, 12))
        sns.histplot(y_pred_proba, bins=25, ax=axes[0, 0], kde=True).set_title('Probability Distribution')

        feat_imp = pd.DataFrame({'feature': X_test.columns, 'importance': self.model.feature_importances_}).sort_values('importance', ascending=False).head(10)
        sns.barplot(x='importance', y='feature', data=feat_imp, ax=axes[0, 1]).set_title('Top 10 Features')

        frac_pos, mean_pred = calibration_curve(y_test, y_pred_proba, n_bins=10)
        axes[1, 0].plot(mean_pred, frac_pos, marker='o', label='Model')
        axes[1, 0].plot([0, 1], [0, 1], linestyle='--', color='gray', label='Perfect')
        axes[1, 0].set_title('Calibration Plot')
        axes[1, 0].set_xlabel('Mean predicted probability')
        axes[1, 0].set_ylabel('Fraction of positives')
        # The two curves carry labels; without a legend they are never shown.
        axes[1, 0].legend()

        # Bottom-right panel is intentionally left blank.
        axes[1, 1].axis('off')
        plt.tight_layout()
        plt.show()


# Path of the games CSV handed to load_data(). The value here is a
# placeholder: it must be replaced with a real file, otherwise
# pd.read_csv inside load_data() raises FileNotFoundError.
DB = '***'

# Driver for this cell: build the model object, load and date-sort the games,
# then derive the rolling per-team features into system.features.
# train(), evaluate_betting() and plot() are not called in this cell.
system = CFBBettingModel()
system.load_data(DB)
system.create_features()



FileNotFoundError: [Errno 2] No such file or directory: 'path/to/your/dataset.csv'