# Classification NBA Model

## Configuration

## Imports

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from autogluon.tabular import TabularPredictor
from nba_ou.data_preparation.missing_data.handle_missing_data import (
    apply_missing_policy,
)
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    confusion_matrix,
    f1_score,
    make_scorer,
    precision_score,
    recall_score,
)
from sklearn.model_selection import KFold, cross_validate, train_test_split
from xgboost import XGBClassifier


## Load Data

In [2]:
# Location of the training CSV.
# NOTE(review): this is an absolute, machine-specific path — consider a configurable DATA_DIR.
data_path = "/home/adrian_alvarez/Projects/NBA_over_under_predictor/data/train_data/"
name = "all_odds_training_data_until_20260110.csv"

path = data_path + name

# Read only the header row to discover the column names; this avoids parsing the
# whole file twice just to decide which columns must be loaded as strings.
header_columns = pd.read_csv(path, nrows=0).columns

# Every column whose name contains "ID" (case-insensitive) is loaded as str.
# NOTE(review): this substring match also catches names that merely contain the
# letters "id" — confirm that is acceptable for this dataset.
dtype_dict = {col: str for col in header_columns if "ID" in col.upper()}

df_stats = pd.read_csv(
    path,
    dtype=dtype_dict
)
# Normalise GAME_DATE to a 'YYYY-MM-DD' string.
df_stats['GAME_DATE'] = pd.to_datetime(df_stats['GAME_DATE']).dt.strftime('%Y-%m-%d')

  df_stats = pd.read_csv(path)
  df_stats = pd.read_csv(


In [3]:
# #print column names with name in lower case
# for col in df_stats.columns:
#     if not col.isupper():
#         print(col)

In [4]:
# Project-local cleaning pipeline; the log printed below the cell reports the row/column
# pruning and the missing-data policy it applied.
from nba_ou.data_preparation.missing_data.clean_df_for_training import (
    clean_dataframe_for_training
)
# Build the modelling frame from the raw stats.
# NOTE(review): the exact meaning of nan_threshold=4 is defined inside the helper and is
# not visible in this notebook — confirm before changing it.
df_to_train = clean_dataframe_for_training(df_stats, nan_threshold=4, drop_all_na_rows=True, verbose=1)


STARTING DATAFRAME CLEANING PIPELINE
Starting basic cleaning with 10794 rows
Basic cleaning complete: 7976 rows remaining

Starting advanced column cleaning with 1133 columns

Advanced column cleaning complete: 1133 → 660 columns (473 removed)


Applying missing data policy...

Missing Data Policy Report:
  Rows dropped: 0 (0.0%)
  Critical columns requiring data: 5
  Columns zero-filled: 132
  Infer pairs applied: 54/228
  Remaining NaN cells: 144

Dropping rows that are all NaN...
CLEANING COMPLETE
Final shape: (7957, 660)


In [5]:
# Count NAs per column of the cleaned training frame
na_counts = df_to_train.isna().sum()

# For every column that still has NAs, find the SEASON_YEAR in which those NAs are most
# frequent. The null mask is taken from df_to_train itself (the frame the counts come
# from); looking the rows up in the raw df_stats would describe a different set of rows
# and fails for columns that only exist after cleaning.
has_season_col = 'SEASON_YEAR' in df_to_train.columns
most_common_season = []
for col in df_to_train.columns:
    season_for_col = None
    if has_season_col and na_counts[col] > 0:
        season_modes = df_to_train.loc[df_to_train[col].isna(), 'SEASON_YEAR'].mode()
        # mode() is empty when SEASON_YEAR itself is NaN for all of these rows
        if len(season_modes) > 0:
            season_for_col = season_modes.iloc[0]
    most_common_season.append(season_for_col)

na_counts_df = pd.DataFrame({
    'Column': na_counts.index,
    'NA_Count': na_counts.values,
    'NA_Percentage': (na_counts.values / len(df_to_train) * 100).round(2),
    'Most_Common_Season_Year': most_common_season
}).sort_values('NA_Count', ascending=False)

# Show only columns with NAs
na_counts_df[na_counts_df['NA_Count'] > 0]

Unnamed: 0,Column,NA_Count,NA_Percentage,Most_Common_Season_Year


## Train / Test

In [None]:
# Columns that must never be used as features:
#   TOTAL_POINTS     - the regression target
#   SEASON_YEAR      - kept on df_to_train only for per-season reporting
#   IS_TRAINING_DATA - split flag that a later cell writes onto df_to_train; dropping it
#                      here keeps this cell safe to re-run after that cell has executed
NON_FEATURE_COLS = ['TOTAL_POINTS', 'SEASON_YEAR', 'IS_TRAINING_DATA']

# errors='ignore' lets the cell run whether or not the optional columns exist yet.
X = df_to_train.drop(columns=NON_FEATURE_COLS, errors='ignore')
y = df_to_train['TOTAL_POINTS']

In [14]:
# Train / Test split
# 75% train / 25% test with a fixed seed so the partition is reproducible.
# NOTE(review): the split is random rather than by game date, so train and test contain
# games from the same seasons — confirm this is intended for a betting model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=16)

In [15]:
# Flag every row of the cleaned frame: True when its index label landed in the training
# split, False otherwise. This writes a new column onto df_to_train in place.
df_to_train['IS_TRAINING_DATA'] = df_to_train.index.isin(X_train.index)
# output_name = f"{data_path}/training_data_with_missing_data_handled_from_2004-10-01_to_2026-01-10_classifier.csv"
# df_to_train.to_csv(output_name, index=False)

In [16]:
# Report the row and column counts of both splits
n_train_rows, n_train_cols = X_train.shape
n_test_rows, n_test_cols = X_test.shape

print(f"Training set size: {n_train_rows}")
print(f"Test set size: {n_test_rows}")
print(f"Number of columns in training set: {n_train_cols}")
print(f"Number of columns in test set: {n_test_cols}")

Training set size: 5896
Test set size: 1966
Number of columns in training set: 658
Number of columns in test set: 658


## Cross-validation

In [17]:
from sklearn.model_selection import KFold, cross_validate, cross_val_predict
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer, root_mean_squared_error

In [18]:
# Declare KFold
# 5 shuffled folds with a fixed seed; used as `cv=kf` by the cross-validation calls below.
# NOTE(review): the hyper-parameter search cell further down rebinds `kf` to an 8-fold
# splitter, so re-running earlier cells after it silently changes their folds.
kf = KFold(n_splits=5, shuffle=True, random_state=16)

In [19]:
# Custom scorer for over/under betting accuracy
def over_under_betting_accuracy(y_true, y_pred, betting_line):
    """
    Calculate the accuracy of over/under betting decisions.

    A bet is successful if:
    - We predict OVER (pred > line) and actual is OVER (true > line), OR
    - We predict UNDER (pred < line) and actual is UNDER (true < line)

    A row where prediction and outcome both land exactly on the line also counts
    as a match, because the signs of both differences are 0.

    Parameters:
    -----------
    y_true : array-like
        Actual total points
    y_pred : array-like
        Predicted total points
    betting_line : array-like
        Over/under betting line

    Returns:
    --------
    float : Accuracy of betting decisions (0 to 1); NaN when there are no rows
    """
    # Coerce everything to plain arrays so the arithmetic is positional. With pandas
    # inputs the subtraction would otherwise align on index labels, which yields NaN
    # (or raises) whenever the inputs carry different indexes.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    betting_line = np.asarray(betting_line, dtype=float)

    # Nothing to score: return NaN explicitly instead of relying on mean-of-empty.
    if y_true.size == 0:
        return float('nan')

    # np.sign returns -1, 0, or 1; equal signs mean the same side of the line
    same_side = np.sign(y_pred - betting_line) == np.sign(y_true - betting_line)

    return float(np.mean(same_side))


# Custom scorer class to work with sklearn
class OverUnderScorer:
    """
    Callable with the (estimator, X, y) scorer signature that reports how often the
    model's prediction falls on the same side of the betting line as the real total.
    """
    def __call__(self, estimator, X, y_true):
        """
        Score a fitted estimator on over/under betting accuracy.

        Parameters:
        -----------
        estimator : fitted estimator
            Object exposing ``predict(X)``
        X : DataFrame
            Features; must contain the 'TOTAL_OVER_UNDER_LINE' column
        y_true : array-like
            Actual total points

        Returns:
        --------
        float : Betting accuracy score
        """
        predictions = estimator.predict(X)
        return over_under_betting_accuracy(
            y_true,
            predictions,
            X['TOTAL_OVER_UNDER_LINE'].values,
        )


# Single scorer instance used in the scoring dictionary
over_under_scorer = OverUnderScorer()

In [41]:
# Metrics reported by every cross_validate call: three regression errors plus the
# custom over/under betting accuracy.
scoring = dict(
    MSE=make_scorer(mean_squared_error),
    RMSE=make_scorer(root_mean_squared_error),
    MAE=make_scorer(mean_absolute_error),
    OU_Betting_Accuracy=over_under_scorer,
)

In [42]:
def print_metrics(cv_results):
    """
    Print the mean train and validation value of every metric in a cross_validate result.

    Metric names are taken from the ``test_<name>`` keys of ``cv_results`` itself, so the
    function does not depend on any notebook-global scoring dictionary. Train lines are
    skipped when the result holds no ``train_<name>`` entry.

    Parameters:
    -----------
    cv_results : dict
        Mapping of ``'train_<name>'`` / ``'test_<name>'`` to per-fold score arrays

    Returns:
    --------
    None : output is printed
    """
    test_prefix = 'test_'
    metric_names = [key[len(test_prefix):] for key in cv_results if key.startswith(test_prefix)]

    for sc in metric_names:
        # The betting accuracy is a fraction, shown as a percentage; errors are rounded.
        as_percentage = sc == 'OU_Betting_Accuracy'
        for label, prefix in (('Train', 'train_'), ('Validation', test_prefix)):
            key = prefix + sc
            if key not in cv_results:
                continue
            mean_value = np.mean(cv_results[key])
            if as_percentage:
                print(f'{label} {sc}:', f"{mean_value:.2%}")
            else:
                print(f'{label} {sc}:', np.round(mean_value, 5))
        print()

In [43]:
def real_vs_pred(model, X_train, y_train):
    """
    Scatter cross-validated predictions against the real target.

    Predictions come from ``cross_val_predict`` using the notebook-level ``kf`` splitter;
    an orange y = x line is drawn over the range of ``y_train`` for reference.
    """
    oof_predictions = cross_val_predict(model, X_train, y_train, cv=kf, n_jobs=-1)

    lowest, highest = y_train.min(), y_train.max()
    diagonal = np.arange(lowest, highest)

    plt.scatter(y_train, oof_predictions)
    plt.plot(diagonal, diagonal, color='orange')
    plt.xlabel('Real target')
    plt.ylabel('Predicted target')
    plt.show()

## Baseline

In [44]:
from sklearn.dummy import DummyRegressor

# Baseline 1: always predict the mean of the training target.
season_bl = DummyRegressor(strategy='mean')
# Cross-validate with the shared splitter and metric dictionary, keeping train scores
# so print_metrics can show both sides.
cv_results = cross_validate(season_bl, X_train, y_train, cv=kf,
                            scoring=scoring, return_train_score=True)
# Fit once on the full training split so the baseline object is usable afterwards.
season_bl.fit(X_train, y_train)
print_metrics(cv_results)

Train MSE: 382.54941
Validation MSE: 382.92416

Train RMSE: 19.5588
Validation RMSE: 19.56739

Train MAE: 15.5792
Validation MAE: 15.58577

Train OU_Betting_Accuracy: 49.56%
Validation OU_Betting_Accuracy: 49.57%



In [45]:
# Baseline 3: Predict the betting line (TOTAL_OVER_UNDER_LINE)
# The bookmaker's line is used directly as the prediction; no model is fitted, so the
# errors below are computed on the whole training split rather than per fold.
y_pred_baseline_3 = X_train['TOTAL_OVER_UNDER_LINE']

# Evaluate
mse = mean_squared_error(y_train, y_pred_baseline_3)
mae = mean_absolute_error(y_train, y_pred_baseline_3)
rmse = root_mean_squared_error(y_train, y_pred_baseline_3)
print(f"MSE: {mse:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")

MSE: 293.36
RMSE: 17.13
MAE: 13.65


## Linear Regression

In [46]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline on all features.
lr = LinearRegression()
# Remaining NaNs are replaced by 0 for this model only (the other models in this
# notebook receive X_train unchanged).
cv_results = cross_validate(lr, X_train.fillna(0), y_train, cv=kf,
                            scoring=scoring, return_train_score=True)

print_metrics(cv_results)

Train MSE: 241.19421
Validation MSE: 322.52042

Train RMSE: 15.53039
Validation RMSE: 17.95828

Train MAE: 12.27892
Validation MAE: 14.22338

Train OU_Betting_Accuracy: 62.20%
Validation OU_Betting_Accuracy: 51.13%



In [47]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import optuna
from xgboost import XGBRegressor

# Example XGBoost regressor:
# Hand-picked (untuned) configuration: shallow trees, small learning rate, many rounds,
# with row and column subsampling. Seeded for reproducibility.
xgb_reg = XGBRegressor(
    max_depth=3,
    learning_rate=0.02,
    n_estimators=1000,
    subsample=0.5,       # Equivalent to max_samples in GBRegressor
    colsample_bytree=0.6, # Equivalent to max_features in GBRegressor
    n_jobs=-2,
    random_state=16)

# NOTE(review): n_jobs=-2 is set both on the model and on cross_validate, so folds and
# tree building compete for the same cores — confirm this is intended.
cv_results = cross_validate(
    xgb_reg, 
    X_train, y_train, 
    cv=kf, 
    scoring=scoring,      # Use your custom scoring or e.g. 'neg_mean_absolute_error'
    return_train_score=True,
    n_jobs=-2
)
# Train final model on full train set
xgb_reg.fit(X_train, y_train)

# Print metrics
print_metrics(cv_results)


Train MSE: 159.04377
Validation MSE: 287.48163

Train RMSE: 12.61123
Validation RMSE: 16.9545

Train MAE: 10.0342
Validation MAE: 13.44005

Train OU_Betting_Accuracy: 80.07%
Validation OU_Betting_Accuracy: 53.31%



In [48]:
def ou_accuracy(y_true, y_pred, line):
    """
    Fraction of rows where (pred - line) and (true - line) have the same sign.

    Note: pushes are counted as correct only if both are exactly 0 (rare with .5 lines).

    Inputs are converted to plain float arrays first, so the comparison is positional
    even when pandas objects with different indexes are passed in.

    Returns a float in [0, 1], or NaN when there are no rows to score.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    line = np.asarray(line, dtype=float)

    # Explicit empty-input result instead of a mean-of-empty warning
    if y_true.size == 0:
        return float("nan")

    pred_diff = y_pred - line
    true_diff = y_true - line
    return float(np.mean(np.sign(pred_diff) == np.sign(true_diff)))

def evaluate_ou_thresholds(
    model,
    X_test: pd.DataFrame,
    y_test: pd.Series,
    line_col: str = "TOTAL_OVER_UNDER_LINE",
    thresholds = (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
):
    """
    Tabulate over/under accuracy for bets whose predicted edge exceeds each threshold.

    For every threshold ``t`` only the games with ``|prediction - line| > t`` are kept
    (strictly greater, so a prediction exactly on the line is never selected).

    Returns a tuple ``(table, y_pred)``: one table row per threshold with the number of
    selected games, their share of the test set and their accuracy (NaN when no game is
    selected), plus the raw predictions for ``X_test``.
    """
    y_pred = model.predict(X_test)
    line = X_test[line_col].to_numpy()
    margin = np.abs(y_pred - line)
    n_total = len(y_test)

    def _summarise(t):
        # Games where the model disagrees with the line by more than t points
        selected = margin > t
        n_selected = int(selected.sum())
        if n_selected:
            accuracy = ou_accuracy(y_test.to_numpy()[selected], y_pred[selected], line[selected])
        else:
            accuracy = np.nan
        return {
            "threshold_abs_pred_minus_line_gt": t,
            "n_games": n_selected,
            "pct_of_test": (n_selected / n_total) if n_total else np.nan,
            "ou_betting_accuracy": accuracy,
        }

    return pd.DataFrame([_summarise(t) for t in thresholds]), y_pred



# Score the hand-configured XGBoost baseline on the held-out test split for edge
# thresholds 0 through 10 points.
results_df, y_pred_test = evaluate_ou_thresholds(
    model=xgb_reg,
    X_test=X_test,
    y_test=y_test,
    thresholds=range(0, 11)  # 0..10
)

# Pretty display
# Fractions are rendered as percentages; the underlying values stay numeric.
display(results_df.style.format({
    "pct_of_test": "{:.1%}",
    "ou_betting_accuracy": "{:.2%}",
}))


Unnamed: 0,threshold_abs_pred_minus_line_gt,n_games,pct_of_test,ou_betting_accuracy
0,0,1990,100.0%,54.07%
1,1,1544,77.6%,54.66%
2,2,1147,57.6%,55.36%
3,3,803,40.4%,56.66%
4,4,545,27.4%,60.73%
5,5,367,18.4%,63.76%
6,6,233,11.7%,67.81%
7,7,149,7.5%,72.48%
8,8,104,5.2%,77.88%
9,9,77,3.9%,79.22%


In [49]:
def add_ou_correctness_columns(
    model,
    X_test: pd.DataFrame,
    y_test: pd.Series,
    line_col: str = "TOTAL_OVER_UNDER_LINE",
) -> pd.DataFrame:
    """
    Return a copy of ``X_test`` extended with per-game over/under diagnostics.

    Added columns: Y_TRUE, Y_PRED, LINE, PRED_DIFF (prediction - line),
    TRUE_DIFF (actual - line), MARGIN_ABS (|PRED_DIFF|) and OU_CORRECT
    (1 when PRED_DIFF and TRUE_DIFF share the same sign, else 0).
    ``X_test`` itself is left unmodified.
    """
    predicted = model.predict(X_test)
    line_values = X_test[line_col].to_numpy()
    actual = y_test.to_numpy()

    pred_gap = predicted - line_values
    true_gap = actual - line_values
    # Same sign -> the bet implied by the prediction would have won
    same_side = np.sign(pred_gap) == np.sign(true_gap)

    return X_test.assign(
        Y_TRUE=actual,
        Y_PRED=predicted,
        LINE=line_values,
        PRED_DIFF=pred_gap,
        TRUE_DIFF=true_gap,
        MARGIN_ABS=np.abs(pred_gap),
        OU_CORRECT=same_side.astype(int),
    )

def ou_accuracy_by_season(
    model,
    X_test: pd.DataFrame,
    y_test: pd.Series,
    df_to_train: pd.DataFrame,
    line_col: str = "TOTAL_OVER_UNDER_LINE",
    threshold_abs: float = 0.0,
) -> pd.DataFrame:
    """
    Over/under accuracy per SEASON_YEAR for games whose predicted edge exceeds a threshold.

    SEASON_YEAR is looked up in ``df_to_train`` by index label (left join), and only rows
    with ``MARGIN_ABS > threshold_abs`` are aggregated. The result has one row per season
    with the number of selected games, their accuracy and the threshold that was applied.
    """
    scored = add_ou_correctness_columns(model, X_test, y_test, line_col=line_col)

    # Attach the season of each game via the shared index
    season_lookup = df_to_train[["SEASON_YEAR"]]
    scored = scored.merge(season_lookup, left_index=True, right_index=True, how="left")

    # Keep only games where the model's edge is strictly above the threshold
    selected = scored.loc[scored["MARGIN_ABS"] > threshold_abs].copy()

    per_season = selected.groupby("SEASON_YEAR").agg(
        n_games=("OU_CORRECT", "size"),
        ou_betting_accuracy=("OU_CORRECT", "mean"),
    )
    season_stats = per_season.reset_index().sort_values("SEASON_YEAR")

    season_stats["threshold_abs_pred_minus_line_gt"] = threshold_abs
    return season_stats

# Example: threshold 0 (your current approach selects all rows with margin > 0)
# Per-season accuracy of the baseline XGBoost model on the test split, using every game
# where the prediction differs from the line (margin > 0).
season_acc = ou_accuracy_by_season(
    model=xgb_reg,
    X_test=X_test,
    y_test=y_test,
    df_to_train=df_to_train,
    threshold_abs=0.0,
)

print(season_acc)


   SEASON_YEAR  n_games  ou_betting_accuracy  threshold_abs_pred_minus_line_gt
0         2018        4             0.250000                               0.0
1         2019      225             0.484444                               0.0
2         2020      292             0.500000                               0.0
3         2021      352             0.596591                               0.0
4         2022      312             0.557692                               0.0
5         2023      297             0.558923                               0.0
6         2024      363             0.490358                               0.0
7         2025      145             0.634483                               0.0


In [25]:
import numpy as np
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner
from xgboost import XGBRegressor
from sklearn.model_selection import KFold

# Name of the feature column that holds the bookmaker's over/under line.
LINE_COL = "TOTAL_OVER_UNDER_LINE"

# Rebinds the notebook-level `kf`: the search below uses 8 shuffled folds (same seed),
# replacing the 5-fold splitter declared in the cross-validation section.
kf = KFold(n_splits=8, shuffle=True, random_state=16)

def ou_accuracy(y_true, y_pred, line):
    """
    Fraction of rows where (pred - line) and (true - line) have the same sign.

    Pushes: sign(0) = 0, so a push only counts as correct when the prediction is also
    exactly on the line. With .5 lines this is rare.

    Inputs are converted to plain float arrays first, so the comparison is positional
    even when pandas objects with different indexes are passed in.

    Returns a float in [0, 1], or NaN when there are no rows to score.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    line = np.asarray(line, dtype=float)

    # Explicit empty-input result instead of a mean-of-empty warning
    if y_true.size == 0:
        return float("nan")

    pred_diff = y_pred - line
    true_diff = y_true - line
    return float(np.mean(np.sign(pred_diff) == np.sign(true_diff)))

def ou_accuracy_with_threshold(y_true, y_pred, line, threshold_abs=0.0, min_coverage=0.25):
    """
    Over/under accuracy restricted to bets whose edge ``|pred - line|`` exceeds ``threshold_abs``.

    Returns ``(score, coverage)`` where ``coverage`` is the fraction of rows selected.
    The score is forced to 0.0 when coverage is below ``min_coverage`` (a hard penalty
    for strategies that barely bet) or when no row is selected at all.
    """
    bet_mask = np.abs(y_pred - line) > threshold_abs
    coverage = float(np.mean(bet_mask))

    barely_bets = coverage < min_coverage
    if barely_bets or bet_mask.sum() == 0:
        return 0.0, coverage

    return ou_accuracy(y_true[bet_mask], y_pred[bet_mask], line[bet_mask]), coverage

def _predict_best(model, X):
    # Use best iteration if early stopping happened
    if getattr(model, "best_iteration", None) is not None:
        # Newer XGBoost
        try:
            return model.predict(X, iteration_range=(0, model.best_iteration + 1))
        except TypeError:
            # Older compatibility path
            ntree_limit = getattr(model, "best_ntree_limit", None)
            if ntree_limit is not None:
                return model.predict(X, ntree_limit=ntree_limit)
    return model.predict(X)

def objective(trial, X, y):
    """
    Optuna objective: mean thresholded over/under accuracy across the `kf` folds.

    Each trial samples a betting threshold together with the XGBoost hyper-parameters,
    fits one model per fold and scores it with ``ou_accuracy_with_threshold``. The running
    mean is reported after every fold so the pruner can stop weak trials early.

    Parameters:
    -----------
    trial : optuna trial
        Source of the sampled values; also receives the "mean_coverage" user attribute
    X : DataFrame
        Features; must contain the LINE_COL column
    y : Series
        Target values, indexed positionally via ``.iloc``

    Returns:
    --------
    float : mean of the per-fold scores (folds below the coverage floor contribute 0.0)

    Raises:
    -------
    optuna.TrialPruned : when the pruner decides to stop the trial
    """
    # The edge threshold is tuned jointly with the model, in 0.5-point steps.
    threshold_abs = trial.suggest_float("threshold_abs", 0.0, 10.0, step=0.5)
    # Minimum fraction of games that must be selected for a fold score to count.
    min_coverage = 0.25

    params = {
        "booster": "gbtree",
        "tree_method": "hist",
        "max_depth": trial.suggest_int("max_depth", 2, 7),
        "min_child_weight": trial.suggest_float("min_child_weight", 1e-2, 30.0, log=True),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "learning_rate": trial.suggest_float("learning_rate", 0.0075, 0.2, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-6, 50.0, log=True),

        "n_estimators": trial.suggest_int("n_estimators", 300, 2500, step=100),
        "early_stopping_rounds": 200,   # passed to the estimator constructor, not to fit()
        "eval_metric": "rmse",          # metric monitored on eval_set for early stopping
        "random_state": 16,
        "n_jobs": -1,
    }

    fold_scores = []
    fold_coverages = []

    # Fold numbers start at 1 and double as the step index reported to the pruner.
    for fold, (tr_idx, va_idx) in enumerate(kf.split(X), start=1):
        X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
        y_tr = y.iloc[tr_idx].to_numpy()
        y_va = y.iloc[va_idx].to_numpy()

        model = XGBRegressor(**params)

        # NOTE(review): the validation fold is used both as the early-stopping eval_set
        # and for scoring below, so fold scores may be optimistic — confirm acceptable.
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)],
            verbose=False,
        )

        y_pred = _predict_best(model, X_va)
        line = X_va[LINE_COL].to_numpy()

        score, coverage = ou_accuracy_with_threshold(
            y_true=y_va,
            y_pred=y_pred,
            line=line,
            threshold_abs=threshold_abs,
            min_coverage=min_coverage,
        )

        fold_scores.append(score)
        fold_coverages.append(coverage)

        # Report the running mean so far and let the pruner abort unpromising trials.
        trial.report(float(np.mean(fold_scores)), step=fold)
        if trial.should_prune():
            raise optuna.TrialPruned()

    # Only reached by trials that completed every fold.
    trial.set_user_attr("mean_coverage", float(np.mean(fold_coverages)))
    return float(np.mean(fold_scores))

# ----------------------------
# Run the search (time-boxed; see the timeout below)
# ----------------------------
# X_train must include TOTAL_OVER_UNDER_LINE, since the objective reads it for the metric.
# X_train, y_train are the current train split.
# Seeded TPE sampler for reproducible suggestions; the median pruner only starts
# pruning after its 2 warm-up steps.
study = optuna.create_study(
    direction="maximize",
    sampler=TPESampler(seed=16),
    pruner=MedianPruner(n_warmup_steps=2),
)

# Time budget: 4 hours (timeout is in seconds). Lower it, e.g. to 2*3600, for a shorter run.
study.optimize(lambda t: objective(t, X_train, y_train), timeout=4 * 3600, n_jobs=1)

print("Best value (CV OU success):", study.best_value)
print("Best params:")
for k, v in study.best_params.items():
    print(k, v)

# ----------------------------
# Build the final model from the best trial (it is fitted in the next cell)
# ----------------------------
best_params = study.best_params.copy()
# threshold_abs belongs to the betting strategy, not to XGBoost, so it is split off here.
best_threshold = best_params.pop("threshold_abs")  # strategy threshold

# NOTE(review): unlike the search, these parameters carry no early_stopping_rounds or
# eval_metric, so the final model trains all n_estimators rounds — confirm this matches
# what the cross-validated score measured.
final_params = {
    "booster": "gbtree",
    "tree_method": "hist",
    "random_state": 16,
    "n_jobs": -1,
    **best_params,
}

final_model = XGBRegressor(**final_params)

[32m[I 2026-02-05 23:46:28,844][0m A new study created in memory with name: no-name-4e7a048f-0ed4-4989-9506-0ddde34ecef3[0m
[32m[I 2026-02-05 23:48:13,472][0m Trial 0 finished with value: 0.5623390331428632 and parameters: {'threshold_abs': 2.0, 'max_depth': 5, 'min_child_weight': 0.8219695696031055, 'gamma': 0.22800975066403162, 'subsample': 0.6803644176738863, 'colsample_bytree': 0.6115404708456444, 'learning_rate': 0.071971944311379, 'reg_alpha': 2.975656697106555e-07, 'reg_lambda': 3.478796625225186e-06, 'n_estimators': 2400}. Best is trial 0 with value: 0.5623390331428632.[0m
[32m[I 2026-02-05 23:49:13,815][0m Trial 1 finished with value: 0.0 and parameters: {'threshold_abs': 5.5, 'max_depth': 2, 'min_child_weight': 3.2561820715318097, 'gamma': 0.7922608676158321, 'subsample': 0.6251406533720116, 'colsample_bytree': 0.6467436280510988, 'learning_rate': 0.07385949950651723, 'reg_alpha': 0.0001507914760359121, 'reg_lambda': 4.526442350789216e-05, 'n_estimators': 1300}. Best 

Best value (CV OU success): 0.6008348976729305
Best params:
threshold_abs 3.0
max_depth 5
min_child_weight 6.764645919375405
gamma 0.5555438491302327
subsample 0.8997434347491664
colsample_bytree 0.8777753673030719
learning_rate 0.008198390348212792
reg_alpha 1.5541242347086836e-06
reg_lambda 0.00014480660336286608
n_estimators 2200


In [26]:
import joblib
# Fit the tuned model on the full training split and persist it twice, both relative to
# the current working directory: once in XGBoost's own format, once as a joblib bundle.
final_model.fit(X_train, y_train, verbose=False)
final_model.save_model("xgb_ou_model.json")   # or .ubj in newer versions


# The bundle keeps what is needed to reproduce the betting decision: the model, the
# tuned edge threshold, the training feature order and the name of the line column.
bundle = {
    "model": final_model,
    "threshold_abs": best_threshold,
    "feature_names": list(X_train.columns),
    "line_col": LINE_COL,
}
joblib.dump(bundle, "ou_xgb_bundle.joblib")


['ou_xgb_bundle.joblib']

## Check in Test set

In [27]:
# Score the tuned model on the held-out test split for edge thresholds 0 through 10.
# Note: this rebinds y_pred_test, which an earlier cell set from the baseline model.
results_df_final, y_pred_test = evaluate_ou_thresholds(
    model=final_model,
    X_test=X_test,
    y_test=y_test,
    thresholds=range(0, 11)  # 0..10
)

# Pretty display
# Fractions are rendered as percentages; the underlying values stay numeric.
display(results_df_final.style.format({
    "pct_of_test": "{:.1%}",
    "ou_betting_accuracy": "{:.2%}",
}))


Unnamed: 0,threshold_abs_pred_minus_line_gt,n_games,pct_of_test,ou_betting_accuracy
0,0,1994,100.0%,52.51%
1,1,1589,79.7%,53.43%
2,2,1217,61.0%,53.41%
3,3,895,44.9%,55.08%
4,4,660,33.1%,56.52%
5,5,465,23.3%,57.42%
6,6,307,15.4%,59.93%
7,7,209,10.5%,62.68%
8,8,146,7.3%,65.75%
9,9,111,5.6%,68.47%


In [30]:
# Rank the features of the tuned model by importance, highest first.
# Note: `feature_importances` starts as the raw importance array and is then rebound to
# the resulting DataFrame.
feature_importances = final_model.feature_importances_
# Positions of the features ordered from most to least important
important_features = np.argsort(feature_importances)[::-1]  
feature_importances = pd.DataFrame({
    'Feature': X_train.columns[important_features],
    'Importance': feature_importances[important_features]
}).sort_values(by="Importance", ascending=False)
feature_importances

Unnamed: 0,Feature,Importance
0,total_draftkings_line_over,0.042220
1,total_fanduel_line_over,0.020681
2,odds_total_line_books_mean,0.019866
3,TOTAL_OVER_UNDER_LINE,0.005470
4,IMPLIED_PTS_HOME,0.003097
...,...,...
686,spread_betmgm_price_away__is_missing,0.000000
687,spread_betmgm_line_home__is_missing,0.000000
688,spread_betmgm_price_home__is_missing,0.000000
737,total_fanduel_line_over__is_missing,0.000000
