# Classification NBA Model

## Configuration

In [4]:
from pathlib import Path

save_models_path = "/home/adrian_alvarez/Projects/NBA_over_under_predictor/models/classifier_autoglue_1/"
Path(save_models_path).mkdir(parents=True, exist_ok=True)


## Imports

In [5]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from autogluon.tabular import TabularPredictor
from nba_ou.data_preparation.missing_data.handle_missing_data import (
    apply_missing_policy,
)
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    confusion_matrix,
    f1_score,
    make_scorer,
    precision_score,
    recall_score,
)
from sklearn.model_selection import KFold, cross_validate, train_test_split
from xgboost import XGBClassifier


## Load Data

In [6]:
data_path = "/home/adrian_alvarez/Projects/NBA_over_under_predictor/data/train_data/"
name = "all_odds_training_data_until_20260110.csv"

path = data_path + name

df_stats = pd.read_csv(path)

dtype_dict = {col: str for col in df_stats.columns if "ID" in col.upper()}

df_stats = pd.read_csv(
    path,
    dtype=dtype_dict
)
df_stats['GAME_DATE'] = pd.to_datetime(df_stats['GAME_DATE']).dt.strftime('%Y-%m-%d')

  df_stats = pd.read_csv(path)
  df_stats = pd.read_csv(


In [7]:
from nba_ou.data_preparation.missing_data.clean_df_for_training import (
    clean_dataframe_for_training
)
df_to_train = clean_dataframe_for_training(df_stats, nan_threshold=4, drop_all_na_rows=True, verbose=1)


STARTING DATAFRAME CLEANING PIPELINE
Starting basic cleaning with 10794 rows
Basic cleaning complete: 7976 rows remaining

Starting advanced column cleaning with 1133 columns

Advanced column cleaning complete: 1133 → 660 columns (473 removed)


Applying missing data policy...

Missing Data Policy Report:
  Rows dropped: 0 (0.0%)
  Critical columns requiring data: 5
  Columns zero-filled: 132
  Infer pairs applied: 54/228
  Remaining NaN cells: 144

Dropping rows that are all NaN...
CLEANING COMPLETE
Final shape: (7957, 660)


In [8]:
# Count NAs per column
na_counts = df_to_train.isna().sum()

# Get most common SEASON_YEAR for nulls in each column
most_common_season = []
for col in df_to_train.columns:
    if na_counts[col] > 0:
        # Get rows where this column is null
        null_rows = df_stats[df_stats[col].isna()]
        if len(null_rows) > 0 and 'SEASON_YEAR' in df_stats.columns:
            # Find most common SEASON_YEAR for these null rows
            common_season = null_rows['SEASON_YEAR'].mode()
            most_common_season.append(common_season.iloc[0] if len(common_season) > 0 else None)
        else:
            most_common_season.append(None)
    else:
        most_common_season.append(None)

na_counts_df = pd.DataFrame({
    'Column': na_counts.index,
    'NA_Count': na_counts.values,
    'NA_Percentage': (na_counts.values / len(df_to_train) * 100).round(2),
    'Most_Common_Season_Year': most_common_season
}).sort_values('NA_Count', ascending=False)

# Show only columns with NAs
na_counts_df[na_counts_df['NA_Count'] > 0]

Unnamed: 0,Column,NA_Count,NA_Percentage,Most_Common_Season_Year


In [9]:
df_to_train = df_to_train[df_to_train['TOTAL_POINTS'] != df_to_train['TOTAL_OVER_UNDER_LINE']]

In [10]:
df_to_train['OVER_UNDER'] = np.where(df_to_train['TOTAL_POINTS'] >= df_to_train['TOTAL_OVER_UNDER_LINE'], 1, 0) # 1 if over, 0 if under

In [11]:
#count 1 and 0 in OVER_UNDER
df_to_train['OVER_UNDER'].value_counts()

OVER_UNDER
0    4070
1    3792
Name: count, dtype: int64

In [12]:
df_to_train= df_to_train[df_to_train['SEASON_YEAR'] != 2018]

## Train / Test

In [13]:
X = df_to_train.drop(['TOTAL_POINTS', 'OVER_UNDER', 'SEASON_YEAR'], axis=1, errors='ignore')
y = df_to_train['OVER_UNDER']

In [14]:
# Train / Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=16)

In [15]:
df_to_train['IS_TRAINING_DATA'] = False

# Mark True for the rows in the training set
df_to_train.loc[X_train.index, 'IS_TRAINING_DATA'] = True
# output_name = f"{data_path}/training_data_with_missing_data_handled_from_2004-10-01_to_2026-01-10_classifier.csv"
# df_to_train.to_csv(output_name, index=False)

In [16]:
print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")
# Check number of coulmns
print(f"Number of columns in training set: {X_train.shape[1]}")
print(f"Number of columns in test set: {X_test.shape[1]}")

Training set size: 5883
Test set size: 1962
Number of columns in training set: 658
Number of columns in test set: 658


## Cross-validation

In [17]:
# Declare KFold
kf = KFold(n_splits=5, shuffle=True, random_state=16)

In [18]:
# Declare scores to be used
scoring = {
    'Acc': make_scorer(accuracy_score),
    'Prec': make_scorer(precision_score),
    'Rec': make_scorer(recall_score),
    'Bacc': make_scorer(balanced_accuracy_score),
}

In [19]:

def print_classification_metrics(cv_results, scoring):
    """
    Prints the mean cross-validation scores for each metric on train and validation folds.
    """
    for sc in scoring.keys():
        print(f"Train {sc}:", cv_results[f'train_{sc}'].mean().round(5))
        print(f"Validation {sc}:", cv_results[f'test_{sc}'].mean().round(5))
        print()
    

## Baseline

In [20]:
dummy_clf = DummyClassifier(strategy='most_frequent')
cv_results = cross_validate(
    dummy_clf, 
    X_train, 
    y_train, 
    cv=kf, 
    scoring=scoring, 
    return_train_score=True
)
dummy_clf.fit(X_train, y_train)
print("=== Dummy (Most Frequent) Classifier ===")
print_classification_metrics(cv_results, scoring)

=== Dummy (Most Frequent) Classifier ===
Train Acc: 0.50518
Validation Acc: 0.50518

Train Prec: 0.0
Validation Prec: 0.0

Train Rec: 0.0
Validation Rec: 0.0

Train Bacc: 0.5
Validation Bacc: 0.5



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Logistic Regression

In [21]:
lr = LogisticRegression(max_iter=1000)
cv_results = cross_validate(
    lr, 
    X_train, 
    y_train, 
    cv=kf, 
    scoring=scoring, 
    return_train_score=True,
    n_jobs=-1
)
lr.fit(X_train, y_train)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [22]:
print("=== Logistic Regression ===")
print_classification_metrics(cv_results, scoring)

=== Logistic Regression ===
Train Acc: 0.63148
Validation Acc: 0.53068

Train Prec: 0.62813
Validation Prec: 0.52602

Train Rec: 0.62551
Validation Rec: 0.52804

Train Bacc: 0.63137
Validation Bacc: 0.53094



In [24]:
xgb_clf = XGBClassifier(
    max_depth=5,
    learning_rate=0.025,
    n_estimators=350,
    subsample=0.6,
    colsample_bytree=0.8,
    n_jobs=-1,
    random_state=16
)

cv_results = cross_validate(
    xgb_clf, 
    X_train, 
    y_train, 
    cv=kf, 
    scoring=scoring, 
    return_train_score=True,
    n_jobs=-1
)

xgb_clf.fit(X_train, y_train)

print("=== XGBClassifier ===")
print_classification_metrics(cv_results, scoring)

=== XGBClassifier ===
Train Acc: 0.98955
Validation Acc: 0.51997

Train Prec: 0.99046
Validation Prec: 0.51556

Train Rec: 0.98841
Validation Rec: 0.51312

Train Bacc: 0.98954
Validation Bacc: 0.52021



In [26]:
# Calcualte it in test set
y_pred = xgb_clf.predict(X_test)

#Predict test set with XGBClassifier
y_pred = xgb_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
balan_acc = balanced_accuracy_score(y_test, y_pred)
print("=== XGBClassifier Test Set Performance ===")
print(f"Accuracy: {accuracy:.5f}")
print(f"Precision: {precision:.5f}")    
print(f"Recall: {recall:.5f}")
print(f"F1 Score: {f1:.5f}")
print(f"Balanced Accuracy: {balan_acc:.5f}")

#groupby season_year and calculate accuracy per season
df_test = X_test.copy()
df_test['TRUE_LABEL'] = y_test
df_test['PRED_LABEL'] = y_pred
df_test['CORRECT_PREDICTION'] = np.where(df_test['TRUE_LABEL'] == df_test['PRED_LABEL'], 1, 0)
#merge 'SEASON_YEAR' from df_to_train
df_test = df_test.merge(df_to_train[['SEASON_YEAR']], left_index=True, right_index=True, how='left')
season_accuracy = df_test.groupby('SEASON_YEAR')['CORRECT_PREDICTION'].mean().reset_index()
season_accuracy.rename(columns={'CORRECT_PREDICTION': 'ACCURACY'}, inplace=True)
print("=== Test Set Accuracy by Season ===")
print(season_accuracy)

=== XGBClassifier Test Set Performance ===
Accuracy: 0.52345
Precision: 0.46638
Recall: 0.50115
F1 Score: 0.48314
Balanced Accuracy: 0.52122
=== Test Set Accuracy by Season ===
   SEASON_YEAR  ACCURACY
0         2019  0.504505
1         2020  0.547619
2         2021  0.528875
3         2022  0.543689
4         2023  0.493243
5         2024  0.476839
6         2025  0.627586


## AutoGluon (Hyperparameter Tuning)

In [24]:
predictor = TabularPredictor(
    label="target",
    problem_type="binary",
    eval_metric="balanced_accuracy",
    path=save_models_path,
).fit(
    train_data=X_train.assign(target=y_train),
    time_limit=14_000,
    presets="good_quality",
    hyperparameter_tune_kwargs="auto",
    num_cpus=12,
    # hyperparameters={
    #     "XGB": {},  # only XGBoost models
    # },
)


2026-02-03 23:36:07,864	INFO timeout.py:54 -- Reached timeout of 936.6289839831265 seconds. Stopping all trials.
2026-02-03 23:36:07,880	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/home/adrian_alvarez/Projects/NBA_over_under_predictor/models/classifier_autoglue_1/models/NeuralNetTorch_BAG_L1' in 0.0128s.
- 61e10f1f: FileNotFoundError('Could not fetch metrics for 61e10f1f: both result.json and progress.csv were not found at /home/adrian_alvarez/Projects/NBA_over_under_predictor/models/classifier_autoglue_1/models/NeuralNetTorch_BAG_L1/61e10f1f')
- 3f7e583d: FileNotFoundError('Could not fetch metrics for 3f7e583d: both result.json and progress.csv were not found at /home/adrian_alvarez/Projects/NBA_over_under_predictor/models/classifier_autoglue_1/models/NeuralNetTorch_BAG_L1/3f7e583d')
- 35f2f9bb: FileNotFoundError('Could not fetch metrics for 35f2f9bb: both result.json and progress.csv were not found at /home/adrian_alvarez/Projects/NBA_o

In [25]:
lb = predictor.leaderboard(silent=True)
lb


Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,XGBoost_BAG_L1/T2,0.558067,balanced_accuracy,0.173583,72.707236,0.173583,72.707236,1,False,67
1,WeightedEnsemble_L2,0.558067,balanced_accuracy,0.174298,72.803975,0.000715,0.096740,2,False,89
2,NeuralNetTorch_BAG_L1/0de333c1,0.557884,balanced_accuracy,0.132553,58.573261,0.132553,58.573261,1,False,80
3,XGBoost_BAG_L1/T4,0.556623,balanced_accuracy,0.179575,42.524199,0.179575,42.524199,1,False,69
4,NeuralNetTorch_BAG_L1/6bb37649,0.555477,balanced_accuracy,0.884679,83.666442,0.884679,83.666442,1,False,86
...,...,...,...,...,...,...,...,...,...,...
173,LightGBMXT_BAG_L1/T10_FULL,,balanced_accuracy,,3.474485,,3.474485,1,True,99
174,LightGBMLarge_BAG_L1_FULL,,balanced_accuracy,,14.147013,,14.147013,1,True,177
175,CatBoost_BAG_L1/T3_FULL,,balanced_accuracy,,28.464873,,28.464873,1,True,124
176,CatBoost_BAG_L1/T2_FULL,,balanced_accuracy,,1.886889,,1.886889,1,True,123


In [26]:
# Keep only the best model (+ its dependencies), delete the rest from disk
predictor.delete_models(models_to_keep="best", delete_from_disk=True)

# (Optional) shrink further by removing auxiliary artifacts not needed for prediction
predictor.save_space(remove_data=True)

# Save back to disk (same folder); after this, the directory is much smaller
predictor.save()

Deleting model LightGBMXT_BAG_L1/T1. All files under /home/adrian_alvarez/Projects/NBA_over_under_predictor/models/classifier_autoglue_1/models/LightGBMXT_BAG_L1/T1 will be removed.
Deleting model LightGBMXT_BAG_L1/T2. All files under /home/adrian_alvarez/Projects/NBA_over_under_predictor/models/classifier_autoglue_1/models/LightGBMXT_BAG_L1/T2 will be removed.
Deleting model LightGBMXT_BAG_L1/T3. All files under /home/adrian_alvarez/Projects/NBA_over_under_predictor/models/classifier_autoglue_1/models/LightGBMXT_BAG_L1/T3 will be removed.
Deleting model LightGBMXT_BAG_L1/T4. All files under /home/adrian_alvarez/Projects/NBA_over_under_predictor/models/classifier_autoglue_1/models/LightGBMXT_BAG_L1/T4 will be removed.
Deleting model LightGBMXT_BAG_L1/T5. All files under /home/adrian_alvarez/Projects/NBA_over_under_predictor/models/classifier_autoglue_1/models/LightGBMXT_BAG_L1/T5 will be removed.
Deleting model LightGBMXT_BAG_L1/T6. All files under /home/adrian_alvarez/Projects/NBA_ove

## Test Set Evaluation (Best Model)

In [27]:

y_test_pred = predictor.predict(X_test)

metrics = {
    "accuracy": accuracy_score(y_test, y_test_pred),
    "balanced_accuracy": balanced_accuracy_score(y_test, y_test_pred),
    "precision": precision_score(y_test, y_test_pred),
    "recall": recall_score(y_test, y_test_pred),
    "f1": f1_score(y_test, y_test_pred),
    "error_rate": 1 - accuracy_score(y_test, y_test_pred),
}

pd.Series(metrics).round(4)


accuracy             0.5213
balanced_accuracy    0.5182
precision            0.4908
recall               0.4651
f1                   0.4776
error_rate           0.4787
dtype: float64