# Classification NBA Model

## Configuration

In [1]:
from pathlib import Path

save_models_path = "/home/adrian_alvarez/Projects/NBA_over_under_predictor/models/classifier_autoglue_1/"
Path(save_models_path).mkdir(parents=True, exist_ok=True)


## Imports

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from autogluon.tabular import TabularPredictor
from nba_ou.data_preparation.missing_data.handle_missing_data import (
    apply_missing_policy,
)
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    confusion_matrix,
    f1_score,
    make_scorer,
    precision_score,
    recall_score,
)
from sklearn.model_selection import KFold, cross_validate, train_test_split
from xgboost import XGBClassifier


## Load Data

In [3]:
data_path = "/home/adrian_alvarez/Projects/NBA_over_under_predictor/data/train_data/"
name = "all_odds_training_data_until_20260120.csv"

path = data_path + name

df_stats = pd.read_csv(path)

dtype_dict = {col: str for col in df_stats.columns if "ID" in col.upper()}

df_stats = pd.read_csv(
    path,
    dtype=dtype_dict
)
df_stats['GAME_DATE'] = pd.to_datetime(df_stats['GAME_DATE']).dt.strftime('%Y-%m-%d')

  df_stats = pd.read_csv(path)
  df_stats = pd.read_csv(


In [10]:
from nba_ou.data_preparation.missing_data.clean_df_for_training import (
    clean_dataframe_for_training
)
# df_to_train = clean_dataframe_for_training(df_stats, nan_threshold=4, drop_all_na_rows=True, verbose=1)
df_to_train = clean_dataframe_for_training(df_stats, nan_threshold=3, drop_all_na_rows=True, verbose=1)


STARTING DATAFRAME CLEANING PIPELINE
Starting basic cleaning with 10870 rows
Basic cleaning complete: 8050 rows remaining

Starting advanced column cleaning with 1805 columns

Advanced column cleaning complete: 1805 → 816 columns (989 removed)


Dropping NA rows for SEASON_YEAR 2017...
   Removed 0 rows with NaN values from 2017 season

Applying missing data policy...

Missing Data Policy Report:
  Rows dropped: 0 (0.0%)
  Critical columns requiring data: 5
  Columns zero-filled: 132
  Infer pairs applied: 54/228
  Remaining NaN cells: 14638

Dropping rows that are all NaN...
CLEANING COMPLETE
Final shape: (7194, 816)


In [11]:
# Count NAs per column
na_counts = df_to_train.isna().sum()

# Get most common SEASON_YEAR for nulls in each column
most_common_season = []
for col in df_to_train.columns:
    if na_counts[col] > 0:
        # Get rows where this column is null
        null_rows = df_stats[df_stats[col].isna()]
        if len(null_rows) > 0 and 'SEASON_YEAR' in df_stats.columns:
            # Find most common SEASON_YEAR for these null rows
            common_season = null_rows['SEASON_YEAR'].mode()
            most_common_season.append(common_season.iloc[0] if len(common_season) > 0 else None)
        else:
            most_common_season.append(None)
    else:
        most_common_season.append(None)

na_counts_df = pd.DataFrame({
    'Column': na_counts.index,
    'NA_Count': na_counts.values,
    'NA_Percentage': (na_counts.values / len(df_to_train) * 100).round(2),
    'Most_Common_Season_Year': most_common_season
}).sort_values('NA_Count', ascending=False)

# Show only columns with NAs
na_counts_df[na_counts_df['NA_Count'] > 0]

Unnamed: 0,Column,NA_Count,NA_Percentage,Most_Common_Season_Year


In [12]:
df_to_train = df_to_train[df_to_train['TOTAL_POINTS'] != df_to_train['TOTAL_OVER_UNDER_LINE']]

In [13]:
df_to_train['OVER_UNDER'] = np.where(df_to_train['TOTAL_POINTS'] >= df_to_train['TOTAL_OVER_UNDER_LINE'], 1, 0) # 1 if over, 0 if under

In [14]:
#count 1 and 0 in OVER_UNDER
df_to_train['OVER_UNDER'].value_counts()

OVER_UNDER
0    3695
1    3413
Name: count, dtype: int64

In [15]:
# df_to_train= df_to_train[df_to_train['SEASON_YEAR'] != 2018]

## Train / Test

In [16]:
X = df_to_train.drop(['TOTAL_POINTS', 'OVER_UNDER', 'SEASON_YEAR'], axis=1, errors='ignore')
y = df_to_train['OVER_UNDER']

In [17]:
# Train / Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=16)

In [18]:
df_to_train['IS_TRAINING_DATA'] = False

# Mark True for the rows in the training set
df_to_train.loc[X_train.index, 'IS_TRAINING_DATA'] = True
# output_name = f"{data_path}/training_data_with_missing_data_handled_from_2004-10-01_to_2026-01-10_classifier.csv"
# df_to_train.to_csv(output_name, index=False)

In [19]:
print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")
# Check number of coulmns
print(f"Number of columns in training set: {X_train.shape[1]}")
print(f"Number of columns in test set: {X_test.shape[1]}")

Training set size: 5331
Test set size: 1777
Number of columns in training set: 814
Number of columns in test set: 814


## Cross-validation

In [20]:
# Declare KFold
kf = KFold(n_splits=5, shuffle=True, random_state=16)

In [21]:
# Declare scores to be used
scoring = {
    'Acc': make_scorer(accuracy_score),
    'Prec': make_scorer(precision_score),
    'Rec': make_scorer(recall_score),
    'Bacc': make_scorer(balanced_accuracy_score),
}

In [22]:

def print_classification_metrics(cv_results, scoring):
    """
    Prints the mean cross-validation scores for each metric on train and validation folds.
    """
    for sc in scoring.keys():
        print(f"Train {sc}:", cv_results[f'train_{sc}'].mean().round(5))
        print(f"Validation {sc}:", cv_results[f'test_{sc}'].mean().round(5))
        print()
    

## Baseline

In [23]:
dummy_clf = DummyClassifier(strategy='most_frequent')
cv_results = cross_validate(
    dummy_clf, 
    X_train, 
    y_train, 
    cv=kf, 
    scoring=scoring, 
    return_train_score=True
)
dummy_clf.fit(X_train, y_train)
print("=== Dummy (Most Frequent) Classifier ===")
print_classification_metrics(cv_results, scoring)

=== Dummy (Most Frequent) Classifier ===
Train Acc: 0.52129
Validation Acc: 0.52129

Train Prec: 0.0
Validation Prec: 0.0

Train Rec: 0.0
Validation Rec: 0.0

Train Bacc: 0.5
Validation Bacc: 0.5



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Logistic Regression

In [24]:
lr = LogisticRegression(max_iter=1000)
cv_results = cross_validate(
    lr, 
    X_train, 
    y_train, 
    cv=kf, 
    scoring=scoring, 
    return_train_score=True,
    n_jobs=-1
)
lr.fit(X_train, y_train)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [25]:
print("=== Logistic Regression ===")
print_classification_metrics(cv_results, scoring)

=== Logistic Regression ===
Train Acc: 0.65063
Validation Acc: 0.53236

Train Prec: 0.64387
Validation Prec: 0.51212

Train Rec: 0.60438
Validation Rec: 0.49254

Train Bacc: 0.64871
Validation Bacc: 0.53074



In [28]:
xgb_clf = XGBClassifier(
    max_depth=4,
    learning_rate=0.035,
    n_estimators=300,
    subsample=0.6,
    colsample_bytree=0.8,
    n_jobs=-1,
    random_state=16,
    reg_lambda=1.0,
)

cv_results = cross_validate(
    xgb_clf, 
    X_train, 
    y_train, 
    cv=kf, 
    scoring=scoring, 
    return_train_score=True,
    n_jobs=-1
)

xgb_clf.fit(X_train, y_train)

print("=== XGBClassifier ===")
print_classification_metrics(cv_results, scoring)

=== XGBClassifier ===
Train Acc: 0.96619
Validation Acc: 0.51847

Train Prec: 0.97498
Validation Prec: 0.49676

Train Rec: 0.95385
Validation Rec: 0.43028

Train Bacc: 0.96568
Validation Bacc: 0.51497



In [29]:
# Calcualte it in test set
y_pred = xgb_clf.predict(X_test)

#Predict test set with XGBClassifier
y_pred = xgb_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
balan_acc = balanced_accuracy_score(y_test, y_pred)
print("=== XGBClassifier Test Set Performance ===")
print(f"Accuracy: {accuracy:.5f}")
print(f"Precision: {precision:.5f}")    
print(f"Recall: {recall:.5f}")
print(f"F1 Score: {f1:.5f}")
print(f"Balanced Accuracy: {balan_acc:.5f}")

#groupby season_year and calculate accuracy per season
df_test = X_test.copy()
df_test['TRUE_LABEL'] = y_test
df_test['PRED_LABEL'] = y_pred
df_test['CORRECT_PREDICTION'] = np.where(df_test['TRUE_LABEL'] == df_test['PRED_LABEL'], 1, 0)
#merge 'SEASON_YEAR' from df_to_train
df_test = df_test.merge(df_to_train[['SEASON_YEAR']], left_index=True, right_index=True, how='left')
season_accuracy = df_test.groupby('SEASON_YEAR')['CORRECT_PREDICTION'].mean().reset_index()
season_accuracy.rename(columns={'CORRECT_PREDICTION': 'ACCURACY'}, inplace=True)
print("=== Test Set Accuracy by Season ===")
print(season_accuracy)

=== XGBClassifier Test Set Performance ===
Accuracy: 0.50985
Precision: 0.49356
Recall: 0.44483
F1 Score: 0.46793
Balanced Accuracy: 0.50790
=== Test Set Accuracy by Season ===
   SEASON_YEAR  ACCURACY
0         2019  0.491667
1         2020  0.449275
2         2021  0.590476
3         2022  0.507194
4         2023  0.493197
5         2024  0.541547
6         2025  0.427586


## AutoGluon (Hyperparameter Tuning)

In [22]:
predictor = TabularPredictor(
    label="target",
    problem_type="binary",
    eval_metric="balanced_accuracy",
    path=save_models_path,
).fit(
    train_data=X_train.assign(target=y_train),
    time_limit=3*3600,  # 3 hours
    presets="good_quality",
    hyperparameter_tune_kwargs="auto",
    num_cpus=12,
    # hyperparameters={
    #     "XGB": {},  # only XGBoost models
    # },
)


2026-02-08 21:34:06,017	INFO timeout.py:54 -- Reached timeout of 480.8754911899567 seconds. Stopping all trials.
2026-02-08 21:34:06,029	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/home/adrian_alvarez/Projects/NBA_over_under_predictor/models/classifier_autoglue_1/models/NeuralNetTorch_BAG_L2' in 0.0092s.
- 1271de59: FileNotFoundError('Could not fetch metrics for 1271de59: both result.json and progress.csv were not found at /home/adrian_alvarez/Projects/NBA_over_under_predictor/models/classifier_autoglue_1/models/NeuralNetTorch_BAG_L2/1271de59')
- 8964517e: FileNotFoundError('Could not fetch metrics for 8964517e: both result.json and progress.csv were not found at /home/adrian_alvarez/Projects/NBA_over_under_predictor/models/classifier_autoglue_1/models/NeuralNetTorch_BAG_L2/8964517e')
- 634e5bce: FileNotFoundError('Could not fetch metrics for 634e5bce: both result.json and progress.csv were not found at /home/adrian_alvarez/Projects/NBA_o

In [23]:
lb = predictor.leaderboard(silent=True)
lb


Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L3,0.603549,balanced_accuracy,10.317663,1671.205706,0.000692,0.223487,3,False,87
1,NeuralNetTorch_BAG_L2/1271de59,0.601382,balanced_accuracy,8.013409,1123.554455,1.110602,107.525533,2,False,83
2,NeuralNetTorch_BAG_L2/8964517e,0.589686,balanced_accuracy,8.311312,1313.265086,1.408505,297.236163,2,False,84
3,LightGBMXT_BAG_L2/T5,0.572185,balanced_accuracy,7.344771,1076.216113,0.441964,60.187191,2,False,52
4,LightGBM_BAG_L2/T2,0.565058,balanced_accuracy,7.104512,1174.885820,0.201705,158.856898,2,False,55
...,...,...,...,...,...,...,...,...,...,...
169,CatBoost_BAG_L2/T2_FULL,,balanced_accuracy,,78.609262,,1.053865,2,True,147
170,CatBoost_BAG_L2/T1_FULL,,balanced_accuracy,,79.782542,,2.227145,2,True,146
171,CatBoost_BAG_L1/T3_FULL,,balanced_accuracy,,13.497491,,13.497491,1,True,105
172,CatBoost_BAG_L1/T2_FULL,,balanced_accuracy,,2.455070,,2.455070,1,True,104


In [24]:
# Keep only the best model (+ its dependencies), delete the rest from disk
predictor.delete_models(models_to_keep="best", delete_from_disk=True)

# (Optional) shrink further by removing auxiliary artifacts not needed for prediction
predictor.save_space(remove_data=True)

# Save back to disk (same folder); after this, the directory is much smaller
predictor.save()

Deleting model LightGBMXT_BAG_L1/T1. All files under /home/adrian_alvarez/Projects/NBA_over_under_predictor/models/classifier_autoglue_1/models/LightGBMXT_BAG_L1/T1 will be removed.
Deleting model LightGBMXT_BAG_L1/T2. All files under /home/adrian_alvarez/Projects/NBA_over_under_predictor/models/classifier_autoglue_1/models/LightGBMXT_BAG_L1/T2 will be removed.
Deleting model LightGBMXT_BAG_L1/T3. All files under /home/adrian_alvarez/Projects/NBA_over_under_predictor/models/classifier_autoglue_1/models/LightGBMXT_BAG_L1/T3 will be removed.
Deleting model LightGBMXT_BAG_L1/T4. All files under /home/adrian_alvarez/Projects/NBA_over_under_predictor/models/classifier_autoglue_1/models/LightGBMXT_BAG_L1/T4 will be removed.
Deleting model LightGBMXT_BAG_L1/T5. All files under /home/adrian_alvarez/Projects/NBA_over_under_predictor/models/classifier_autoglue_1/models/LightGBMXT_BAG_L1/T5 will be removed.
Deleting model LightGBMXT_BAG_L1/T6. All files under /home/adrian_alvarez/Projects/NBA_ove

## Test Set Evaluation (Best Model)

In [25]:

y_test_pred = predictor.predict(X_test)

metrics = {
    "accuracy": accuracy_score(y_test, y_test_pred),
    "balanced_accuracy": balanced_accuracy_score(y_test, y_test_pred),
    "precision": precision_score(y_test, y_test_pred),
    "recall": recall_score(y_test, y_test_pred),
    "f1": f1_score(y_test, y_test_pred),
    "error_rate": 1 - accuracy_score(y_test, y_test_pred),
}

pd.Series(metrics).round(4)


accuracy             0.5253
balanced_accuracy    0.5261
precision            0.5046
recall               0.5452
f1                   0.5241
error_rate           0.4747
dtype: float64