# Classification NBA Model

## Configuration

## Imports

In [25]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from autogluon.tabular import TabularPredictor
from nba_ou.data_preparation.missing_data.handle_missing_data import (
    apply_missing_policy,
)
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    confusion_matrix,
    f1_score,
    make_scorer,
    precision_score,
    recall_score,
)
from sklearn.model_selection import KFold, cross_validate, train_test_split
from xgboost import XGBClassifier


## Load Data

In [26]:
data_path = "/home/adrian_alvarez/Projects/NBA_over_under_predictor/data/train_data/"
name = "all_odds_training_data_until_20260110.csv"

path = data_path + name

df_stats = pd.read_csv(path)

dtype_dict = {col: str for col in df_stats.columns if "ID" in col.upper()}

df_stats = pd.read_csv(
    path,
    dtype=dtype_dict
)
df_stats['GAME_DATE'] = pd.to_datetime(df_stats['GAME_DATE']).dt.strftime('%Y-%m-%d')

  df_stats = pd.read_csv(path)
  df_stats = pd.read_csv(


In [27]:
# #print column names with name in lower case
# for col in df_stats.columns:
#     if not col.isupper():
#         print(col)

In [28]:
from nba_ou.data_preparation.missing_data.clean_df_for_training import (
    clean_dataframe_for_training
)
df_to_train = clean_dataframe_for_training(df_stats, nan_threshold=4, drop_all_na_rows=True, verbose=1)


STARTING DATAFRAME CLEANING PIPELINE
Starting basic cleaning with 10794 rows
Basic cleaning complete: 7976 rows remaining

Starting advanced column cleaning with 1133 columns

Advanced column cleaning complete: 1133 → 660 columns (473 removed)


Applying missing data policy...

Missing Data Policy Report:
  Rows dropped: 0 (0.0%)
  Critical columns requiring data: 5
  Columns zero-filled: 132
  Infer pairs applied: 54/228
  Remaining NaN cells: 144

Dropping rows that are all NaN...
CLEANING COMPLETE
Final shape: (7957, 660)


In [29]:
# Count NAs per column
na_counts = df_to_train.isna().sum()

# Get most common SEASON_YEAR for nulls in each column
most_common_season = []
for col in df_to_train.columns:
    if na_counts[col] > 0:
        # Get rows where this column is null
        null_rows = df_stats[df_stats[col].isna()]
        if len(null_rows) > 0 and 'SEASON_YEAR' in df_stats.columns:
            # Find most common SEASON_YEAR for these null rows
            common_season = null_rows['SEASON_YEAR'].mode()
            most_common_season.append(common_season.iloc[0] if len(common_season) > 0 else None)
        else:
            most_common_season.append(None)
    else:
        most_common_season.append(None)

na_counts_df = pd.DataFrame({
    'Column': na_counts.index,
    'NA_Count': na_counts.values,
    'NA_Percentage': (na_counts.values / len(df_to_train) * 100).round(2),
    'Most_Common_Season_Year': most_common_season
}).sort_values('NA_Count', ascending=False)

# Show only columns with NAs
na_counts_df[na_counts_df['NA_Count'] > 0]

Unnamed: 0,Column,NA_Count,NA_Percentage,Most_Common_Season_Year


In [30]:
df_to_train['LINE_ERROR'] = df_to_train['TOTAL_POINTS'] - df_to_train['TOTAL_OVER_UNDER_LINE']

In [31]:
df_to_train = df_to_train[df_to_train['LINE_ERROR'] != 0]

## Train / Test

In [32]:
X = df_to_train.drop(['TOTAL_POINTS', 'LINE_ERROR', 'SEASON_YEAR'], axis=1, errors='ignore')
y = df_to_train['LINE_ERROR']

In [33]:
# Train / Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=16)

In [34]:
df_to_train['IS_TRAINING_DATA'] = False

# Mark True for the rows in the training set
df_to_train.loc[X_train.index, 'IS_TRAINING_DATA'] = True
# output_name = f"{data_path}/training_data_with_missing_data_handled_from_2004-10-01_to_2026-01-10_classifier.csv"
# df_to_train.to_csv(output_name, index=False)

In [35]:
print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")
# Check number of coulmns
print(f"Number of columns in training set: {X_train.shape[1]}")
print(f"Number of columns in test set: {X_test.shape[1]}")

Training set size: 5896
Test set size: 1966
Number of columns in training set: 658
Number of columns in test set: 658


## Cross-validation

In [36]:
from sklearn.model_selection import KFold, cross_validate, cross_val_predict
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer, root_mean_squared_error

In [37]:
# Declare KFold
kf = KFold(n_splits=5, shuffle=True, random_state=16)

In [43]:
import numpy as np


def over_under_betting_accuracy(true_error: np.ndarray, pred_error: np.ndarray) -> float:
    true_sign = np.sign(true_error)
    pred_sign = np.sign(pred_error)

    valid = (true_sign != 0) & (pred_sign != 0)
    if not np.any(valid):
        return np.nan

    return float(np.mean(true_sign[valid] == pred_sign[valid]))

class OverUnderScorer:
    """
    Custom sklearn-compatible scorer for over/under betting accuracy.
    """

    def __call__(self, estimator, X, y_true):
        y_pred = estimator.predict(X)

        # betting_line = X["TOTAL_OVER_UNDER_LINE"].to_numpy(dtype=float)

        return over_under_betting_accuracy(
            true_error=np.asarray(y_true, dtype=float),
            pred_error=np.asarray(y_pred, dtype=float),
            # betting_line=betting_line,
        )
over_under_scorer = OverUnderScorer()


In [44]:
# Declare scores to be used
scoring = {
    'MSE': make_scorer(mean_squared_error),
    'RMSE': make_scorer(root_mean_squared_error),
    'MAE': make_scorer(mean_absolute_error),
    'OU_Betting_Accuracy': over_under_scorer,
}

In [45]:
def print_metrics(cv_results):
    for sc in scoring.keys():
        if sc == 'OU_Betting_Accuracy':
            print(f'Train {sc}:', f"{cv_results[f'train_{sc}'].mean():.2%}")
            print(f'Validation {sc}:', f"{cv_results[f'test_{sc}'].mean():.2%}")
        else:
            print(f'Train {sc}:', cv_results[f'train_{sc}'].mean().round(5))
            print(f'Validation {sc}:', cv_results[f'test_{sc}'].mean().round(5))
        print()

In [46]:
def real_vs_pred(model, X_train, y_train):
    preds = cross_val_predict(model, X_train, y_train, cv=kf, n_jobs=-1)
    x_line = np.arange(y_train.min(), y_train.max())
    plt.scatter(y_train, preds)
    plt.plot(x_line, x_line, color='orange')
    plt.xlabel('Real target')
    plt.ylabel('Predicted target')
    plt.show()

## Baseline

In [47]:
from sklearn.dummy import DummyRegressor

season_bl = DummyRegressor(strategy='mean')
cv_results = cross_validate(season_bl, X_train, y_train, cv=kf,
                            scoring=scoring, return_train_score=True)
season_bl.fit(X_train, y_train)
print_metrics(cv_results)

Train MSE: 300.70843
Validation MSE: 300.82604

Train RMSE: 17.34076
Validation RMSE: 17.34124

Train MAE: 13.90963
Validation MAE: 13.91101

Train OU_Betting_Accuracy: 51.31%
Validation OU_Betting_Accuracy: 51.31%



In [48]:
# Baseline 3: Predict the betting line (TOTAL_OVER_UNDER_LINE)
y_pred_baseline_3 = X_train['TOTAL_OVER_UNDER_LINE']

# Evaluate
mse = mean_squared_error(y_train, y_pred_baseline_3)
mae = mean_absolute_error(y_train, y_pred_baseline_3)
rmse = root_mean_squared_error(y_train, y_pred_baseline_3)
print(f"MSE: {mse:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")

MSE: 51527.06
RMSE: 227.00
MAE: 226.14


## Logistic Regression

In [49]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
cv_results = cross_validate(lr, X_train.fillna(0), y_train, cv=kf,
                            scoring=scoring, return_train_score=True)

print_metrics(cv_results)

Train MSE: 248.04488
Validation MSE: 332.61069

Train RMSE: 15.74924
Validation RMSE: 18.2338

Train MAE: 12.49327
Validation MAE: 14.46717

Train OU_Betting_Accuracy: 62.98%
Validation OU_Betting_Accuracy: 51.98%



In [54]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import optuna
from xgboost import XGBRegressor

# Example XGBoost regressor:
xgb_reg = XGBRegressor(
    max_depth=3,
    learning_rate=0.02,
    n_estimators=500,
    subsample=0.5,       # Equivalent to max_samples in GBRegressor
    colsample_bytree=0.6, # Equivalent to max_features in GBRegressor
    n_jobs=-2,
    random_state=16)

cv_results = cross_validate(
    xgb_reg, 
    X_train, y_train, 
    cv=kf, 
    scoring=scoring,      # Use your custom scoring or e.g. 'neg_mean_absolute_error'
    return_train_score=True,
    n_jobs=-2
)
# Train final model on full train set
xgb_reg.fit(X_train, y_train)

# Print metrics
print_metrics(cv_results)


Train MSE: 215.46693
Validation MSE: 291.79678

Train RMSE: 14.67873
Validation RMSE: 17.08077

Train MAE: 11.72612
Validation MAE: 13.62263

Train OU_Betting_Accuracy: 75.95%
Validation OU_Betting_Accuracy: 53.27%



In [57]:
feature_importances = xgb_reg.feature_importances_
important_features = np.argsort(feature_importances)[::-1]  
feature_importances = pd.DataFrame({
    'Feature': X_train.columns[important_features],
    'Importance': feature_importances[important_features]
}).sort_values(by="Importance", ascending=False)
feature_importances

Unnamed: 0,Feature,Importance
0,total_fanduel_line_over,0.003397
1,spread_draftkings_price_away,0.003245
2,odds_total_line_move_draftkings,0.003147
3,odds_total_line_move_fanduel,0.002996
4,OFF_RATING_LAST_ALL_10_MATCHES_BEFORE_TEAM_AWAY,0.002992
...,...,...
626,WINS_BEFORE_THIS_GAME_TEAM_AWAY,0.000000
638,TOTAL_OVER_UNDER_LINE_LAST_ALL_10_MATCHES_BEFO...,0.000000
639,IS_OVER_LINE_LAST_ALL_10_MATCHES_BEFORE_TEAM_AWAY,0.000000
656,odds_ml_away_prob_novig_draftkings,0.000000


In [64]:
n_features = X_train.shape[1]
important_features = feature_importances['Feature'].tolist()[:int(n_features*0.9)]

In [65]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import optuna
from xgboost import XGBRegressor

# Example XGBoost regressor:
xgb_reg_impt = XGBRegressor(
    max_depth=3,
    learning_rate=0.02,
    n_estimators=500,
    subsample=0.5,       # Equivalent to max_samples in GBRegressor
    colsample_bytree=0.6, # Equivalent to max_features in GBRegressor
    n_jobs=-2,
    random_state=16)

cv_results = cross_validate(
    xgb_reg_impt, 
    X_train[important_features], y_train, 
    cv=kf, 
    scoring=scoring,      # Use your custom scoring or e.g. 'neg_mean_absolute_error'
    return_train_score=True,
    n_jobs=-2
)
# Train final model on full train set
xgb_reg_impt.fit(X_train[important_features], y_train)

# Print metrics
print_metrics(cv_results)


Train MSE: 215.8605
Validation MSE: 291.58093

Train RMSE: 14.6921
Validation RMSE: 17.0744

Train MAE: 11.74219
Validation MAE: 13.60972

Train OU_Betting_Accuracy: 75.63%
Validation OU_Betting_Accuracy: 53.02%



In [66]:
import numpy as np
import pandas as pd


def error_sign_accuracy(y_true_error, y_pred_error) -> float:
    """
    Same sign of (predicted error) and (true error).

    Push handling:
    - If either sign is 0, exclude that sample from scoring.
      (If you prefer to count (0,0) as correct, I can change it.)
    """
    y_true_error = np.asarray(y_true_error, dtype=float)
    y_pred_error = np.asarray(y_pred_error, dtype=float)

    true_sign = np.sign(y_true_error)
    pred_sign = np.sign(y_pred_error)

    valid = (true_sign != 0) & (pred_sign != 0)
    if not np.any(valid):
        return np.nan

    return float(np.mean(true_sign[valid] == pred_sign[valid]))


def evaluate_error_thresholds(
    model,
    X_test: pd.DataFrame,
    y_test_error: pd.Series,
    thresholds=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
):
    """
    Evaluate directional accuracy at different confidence thresholds.

    Threshold rule:
    - Include a game if abs(predicted_error) > t
    """
    # Predict error directly
    y_pred_error = np.asarray(model.predict(X_test), dtype=float)

    margin = np.abs(y_pred_error)

    rows = []
    n_total = len(y_test_error)

    y_true_error_np = y_test_error.to_numpy(dtype=float)

    for t in thresholds:
        mask = margin > t
        n = int(mask.sum())

        acc = (
            np.nan
            if n == 0
            else error_sign_accuracy(y_true_error_np[mask], y_pred_error[mask])
        )

        rows.append(
            {
                "threshold_abs_pred_error_gt": t,
                "n_games": n,
                "pct_of_test": (n / n_total) if n_total else np.nan,
                "directional_accuracy": acc,
            }
        )

    return pd.DataFrame(rows), y_pred_error
results_df, y_pred_test_error = evaluate_error_thresholds(
    model=xgb_reg,
    X_test=X_test,
    y_test_error=y_test,      # y_test must be the REAL ERROR series
    thresholds=range(0, 11),
)

display(
    results_df.style.format(
        {"pct_of_test": "{:.1%}", "directional_accuracy": "{:.2%}"}
    )
)


Unnamed: 0,threshold_abs_pred_error_gt,n_games,pct_of_test,directional_accuracy
0,0,1966,100.0%,54.02%
1,1,1316,66.9%,56.99%
2,2,780,39.7%,58.72%
3,3,419,21.3%,61.34%
4,4,231,11.7%,66.23%
5,5,132,6.7%,72.73%
6,6,80,4.1%,73.75%
7,7,58,3.0%,77.59%
8,8,37,1.9%,89.19%
9,9,32,1.6%,93.75%


In [67]:
results_df_important, _ = evaluate_error_thresholds(
    model= xgb_reg_impt,
    X_test=X_test[important_features],
    y_test_error=y_test,      # y_test must be the REAL ERROR series
    thresholds=range(0, 11),
)

display(
    results_df_important.style.format(
        {"pct_of_test": "{:.1%}", "directional_accuracy": "{:.2%}"}
    )
)

Unnamed: 0,threshold_abs_pred_error_gt,n_games,pct_of_test,directional_accuracy
0,0,1966,100.0%,54.53%
1,1,1341,68.2%,57.20%
2,2,804,40.9%,58.33%
3,3,450,22.9%,63.11%
4,4,230,11.7%,64.35%
5,5,123,6.3%,68.29%
6,6,83,4.2%,71.08%
7,7,62,3.2%,75.81%
8,8,45,2.3%,82.22%
9,9,34,1.7%,91.18%


# Optuna

In [25]:
import numpy as np
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner
from xgboost import XGBRegressor
from sklearn.model_selection import KFold

LINE_COL = "TOTAL_OVER_UNDER_LINE"

# Keep your KFold
kf = KFold(n_splits=8, shuffle=True, random_state=16)

def ou_accuracy(y_true, y_pred, line):
    pred_diff = y_pred - line
    true_diff = y_true - line
    # pushes: sign(0)=0. With .5 lines it is rare anyway.
    return float(np.mean(np.sign(pred_diff) == np.sign(true_diff)))

def ou_accuracy_with_threshold(y_true, y_pred, line, threshold_abs=0.0, min_coverage=0.25):
    margin = np.abs(y_pred - line)
    mask = margin > threshold_abs
    coverage = float(np.mean(mask))
    if coverage < min_coverage:
        # Hard penalty if the strategy barely bets
        return 0.0, coverage
    if mask.sum() == 0:
        return 0.0, coverage
    return ou_accuracy(y_true[mask], y_pred[mask], line[mask]), coverage

def _predict_best(model, X):
    # Use best iteration if early stopping happened
    if getattr(model, "best_iteration", None) is not None:
        # Newer XGBoost
        try:
            return model.predict(X, iteration_range=(0, model.best_iteration + 1))
        except TypeError:
            # Older compatibility path
            ntree_limit = getattr(model, "best_ntree_limit", None)
            if ntree_limit is not None:
                return model.predict(X, ntree_limit=ntree_limit)
    return model.predict(X)

def objective(trial, X, y):
    threshold_abs = trial.suggest_float("threshold_abs", 0.0, 10.0, step=0.5)
    min_coverage = 0.25

    params = {
        "booster": "gbtree",
        "tree_method": "hist",
        "max_depth": trial.suggest_int("max_depth", 2, 7),
        "min_child_weight": trial.suggest_float("min_child_weight", 1e-2, 30.0, log=True),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "learning_rate": trial.suggest_float("learning_rate", 0.0075, 0.2, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-6, 50.0, log=True),

        "n_estimators": trial.suggest_int("n_estimators", 300, 2500, step=100),
        "early_stopping_rounds": 200,   # moved here
        "eval_metric": "rmse",          # optional, but explicit is good
        "random_state": 16,
        "n_jobs": -1,
    }

    fold_scores = []
    fold_coverages = []

    for fold, (tr_idx, va_idx) in enumerate(kf.split(X), start=1):
        X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
        y_tr = y.iloc[tr_idx].to_numpy()
        y_va = y.iloc[va_idx].to_numpy()

        model = XGBRegressor(**params)

        model.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)],
            verbose=False,
        )

        y_pred = _predict_best(model, X_va)
        line = X_va[LINE_COL].to_numpy()

        score, coverage = ou_accuracy_with_threshold(
            y_true=y_va,
            y_pred=y_pred,
            line=line,
            threshold_abs=threshold_abs,
            min_coverage=min_coverage,
        )

        fold_scores.append(score)
        fold_coverages.append(coverage)

        trial.report(float(np.mean(fold_scores)), step=fold)
        if trial.should_prune():
            raise optuna.TrialPruned()

    trial.set_user_attr("mean_coverage", float(np.mean(fold_coverages)))
    return float(np.mean(fold_scores))

# ----------------------------
# Run the search (2 to 3 hours)
# ----------------------------
# Make sure X_train includes TOTAL_OVER_UNDER_LINE, since you use it in the metric.
# X_train, y_train are your current train split.
study = optuna.create_study(
    direction="maximize",
    sampler=TPESampler(seed=16),
    pruner=MedianPruner(n_warmup_steps=2),
)

# Time budget: 3 hours. You can adjust to 2*3600 if you want.
study.optimize(lambda t: objective(t, X_train, y_train), timeout=4 * 3600, n_jobs=1)

print("Best value (CV OU success):", study.best_value)
print("Best params:")
for k, v in study.best_params.items():
    print(k, v)

# ----------------------------
# Train final model on full train set
# ----------------------------
best_params = study.best_params.copy()
best_threshold = best_params.pop("threshold_abs")  # strategy threshold

final_params = {
    "booster": "gbtree",
    "tree_method": "hist",
    "random_state": 16,
    "n_jobs": -1,
    **best_params,
}

final_model = XGBRegressor(**final_params)

[32m[I 2026-02-05 23:46:28,844][0m A new study created in memory with name: no-name-4e7a048f-0ed4-4989-9506-0ddde34ecef3[0m
[32m[I 2026-02-05 23:48:13,472][0m Trial 0 finished with value: 0.5623390331428632 and parameters: {'threshold_abs': 2.0, 'max_depth': 5, 'min_child_weight': 0.8219695696031055, 'gamma': 0.22800975066403162, 'subsample': 0.6803644176738863, 'colsample_bytree': 0.6115404708456444, 'learning_rate': 0.071971944311379, 'reg_alpha': 2.975656697106555e-07, 'reg_lambda': 3.478796625225186e-06, 'n_estimators': 2400}. Best is trial 0 with value: 0.5623390331428632.[0m
[32m[I 2026-02-05 23:49:13,815][0m Trial 1 finished with value: 0.0 and parameters: {'threshold_abs': 5.5, 'max_depth': 2, 'min_child_weight': 3.2561820715318097, 'gamma': 0.7922608676158321, 'subsample': 0.6251406533720116, 'colsample_bytree': 0.6467436280510988, 'learning_rate': 0.07385949950651723, 'reg_alpha': 0.0001507914760359121, 'reg_lambda': 4.526442350789216e-05, 'n_estimators': 1300}. Best 

Best value (CV OU success): 0.6008348976729305
Best params:
threshold_abs 3.0
max_depth 5
min_child_weight 6.764645919375405
gamma 0.5555438491302327
subsample 0.8997434347491664
colsample_bytree 0.8777753673030719
learning_rate 0.008198390348212792
reg_alpha 1.5541242347086836e-06
reg_lambda 0.00014480660336286608
n_estimators 2200


In [26]:
import joblib
final_model.fit(X_train, y_train, verbose=False)
final_model.save_model("xgb_ou_model.json")   # or .ubj in newer versions


bundle = {
    "model": final_model,
    "threshold_abs": best_threshold,
    "feature_names": list(X_train.columns),
    "line_col": LINE_COL,
}
joblib.dump(bundle, "ou_xgb_bundle.joblib")


['ou_xgb_bundle.joblib']

## Check in Test set

In [27]:
results_df_final, y_pred_test = evaluate_ou_thresholds(
    model=final_model,
    X_test=X_test,
    y_test=y_test,
    thresholds=range(0, 11)  # 0..10
)

# Pretty display
display(results_df_final.style.format({
    "pct_of_test": "{:.1%}",
    "ou_betting_accuracy": "{:.2%}",
}))


Unnamed: 0,threshold_abs_pred_minus_line_gt,n_games,pct_of_test,ou_betting_accuracy
0,0,1994,100.0%,52.51%
1,1,1589,79.7%,53.43%
2,2,1217,61.0%,53.41%
3,3,895,44.9%,55.08%
4,4,660,33.1%,56.52%
5,5,465,23.3%,57.42%
6,6,307,15.4%,59.93%
7,7,209,10.5%,62.68%
8,8,146,7.3%,65.75%
9,9,111,5.6%,68.47%


In [30]:
feature_importances = final_model.feature_importances_
important_features = np.argsort(feature_importances)[::-1]  
feature_importances = pd.DataFrame({
    'Feature': X_train.columns[important_features],
    'Importance': feature_importances[important_features]
}).sort_values(by="Importance", ascending=False)
feature_importances

Unnamed: 0,Feature,Importance
0,total_draftkings_line_over,0.042220
1,total_fanduel_line_over,0.020681
2,odds_total_line_books_mean,0.019866
3,TOTAL_OVER_UNDER_LINE,0.005470
4,IMPLIED_PTS_HOME,0.003097
...,...,...
686,spread_betmgm_price_away__is_missing,0.000000
687,spread_betmgm_line_home__is_missing,0.000000
688,spread_betmgm_price_home__is_missing,0.000000
737,total_fanduel_line_over__is_missing,0.000000
