In [1]:
import pandas as pd
import numpy as np
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from nba_api.stats.static import teams as static_teams
from nba_api.stats.endpoints import BoxScoreFourFactorsV3, TeamGameLog, LeagueStandingsV3
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import plotly.express as px
from requests.exceptions import ReadTimeout, ConnectionError
import random, math
import pathlib

# Linear Regression
### Goal: Predict Final Record of NBA Teams
First, I will be using the Four Factors statistics (Effective Field Goal Percentage, Free Throw Rate, Turnover Percentage, Offensive Rebound Rate,
Opponent Effective Field Goal Percentage, Opponent Free Throw Rate, Opponent Turnover Percentage, Opponent rebound Rate), which are established to
be excellent predictors of team success, with a Linear Regression model trained on the first n games of a season, and then test on a different season
given the Four Factor stats from the first n games of that season. 

In [2]:
COMMON_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/124.0 Safari/537.36",
    "Accept": "application/json, text/plain, */*",
    "Origin": "https://www.nba.com",
    "Referer": "https://www.nba.com/",
}

CACHE_DIR = pathlib.Path("cache_pkl")
CACHE_DIR.mkdir(exist_ok=True)
NETWORK_CALLS = 0

# Reusable method to pull data from an endpoint of NBA_API, checks if the data has already been saved locally
def endpoint(endpoint_cls, *, cache_key=None, max_retries=3, base_sleep=1.5, **kwargs):
    global NETWORK_CALLS

    # Check if the data has already been retrieved
    if cache_key:
        path = CACHE_DIR / f"{cache_key}.pkl"
        if path.exists():
            frames = pd.read_pickle(path)

            stub = type("Stub", (), {})()
            stub.get_data_frames = lambda f=frames: f
            stub.team_stats = type("DS", (), {
                "get_data_frame": staticmethod(lambda df=frames[1] if len(frames) > 1 else frames[0]: df)
            })()
            stub._from_cache = True
            return stub
    
    # If not pull it from the API
    for attempt in range(max_retries):
        try:
            obj = endpoint_cls(
                headers=COMMON_HEADERS,
                timeout=60,
                **kwargs
            )
            NETWORK_CALLS += 1
            # Save the data to speed up subsequent program runs
            if cache_key:
                pd.to_pickle(obj.get_data_frames(),
                             CACHE_DIR / f"{cache_key}.pkl")
                    
            obj._from_cache = False
            return obj
        
        except (ReadTimeout, ConnectionError):
            wait = base_sleep * (2 ** attempt) + random.uniform(0, 0.5)
            print(f"{endpoint_cls.__name__} retry {attempt+1}/{max_retries}"
                  f" - sleeping {wait:.1f}s")
            time.sleep(wait)
            
    # Error if we hit max retries
    raise RuntimeError(f"{endpoint_cls.__name__} failed after {max_retries} retries")
            

In [3]:
# Function to get the game IDs from the training set
def get_game_ids(team_id, season, n_games):
    games = endpoint(
        TeamGameLog,
        cache_key=f"gamelog_{team_id}_{season}",
        team_id=team_id,
        season=season,
        season_type_all_star="Regular Season"
        ).get_data_frames()[0]
    return games.sort_values("GAME_DATE")["Game_ID"].head(n_games).tolist()

In [4]:


# Features we will be using
FEATURES = [
    "effectiveFieldGoalPercentage",
    "freeThrowAttemptRate",
    "teamTurnoverPercentage",
    "offensiveReboundPercentage",
    "oppEffectiveFieldGoalPercentage",
    "oppFreeThrowAttemptRate",
    "oppTeamTurnoverPercentage",
    "oppOffensiveReboundPercentage"
]

# Speed up data retrieval
MAX_THREADS = 4
PER_REQUEST_SLEEP = 0.5


# Get the features from the API endpoint
def get_features(game_id, team_id):
        stats = endpoint(
            BoxScoreFourFactorsV3,
            cache_key=f"bsff_{game_id}",
            game_id=game_id,
            start_period=1, start_range=0,
            end_period=1, end_range=0,
            range_type=0)
        team_df = stats.team_stats.get_data_frame()
        row = team_df.loc[team_df["teamId"] == team_id, FEATURES]
        # Avoid making the API mad
        if not getattr(stats, "_from_cache", False):
            time.sleep(PER_REQUEST_SLEEP)
        return row.squeeze().astype(float)
    


In [5]:
def combine_rows(team_id, season, n_games):
    ids = get_game_ids(team_id, season, n_games)
    rows = []
    with ThreadPoolExecutor(max_workers=MAX_THREADS) as pool:
        futures = [pool.submit(get_features, id, team_id) for id in ids]
        for future in as_completed(futures):
            rows.append(future.result())
    
    # Return the mean features for n_games
    return pd.concat(rows, axis=1).mean(axis=1)

In [6]:
def generate_df(season, n_games):
    records = []
    for team in static_teams.get_teams():
        row = combine_rows(team["id"], season, n_games)
        row["TEAM_NAME"] = team["full_name"]
        row["TEAM_ID"] = team["id"]
        records.append(row)
    return pd.DataFrame(records)

In [7]:
# Get the number of wins for each team
def get_wins(season, season_type="Regular Season"):
    df = endpoint(
        LeagueStandingsV3,
        cache_key=f"stand_{season}",
        season=season,
        season_type=season_type
    ).get_data_frames()[0]
    
    df = df.rename(columns={"TeamID" : "TEAM_ID",
                            "TeamName" : "TEAM_NAME",
                            "WINS" : "W"})
    return df.drop(columns=["TEAM_NAME"])

In [8]:
def generate_season_matrix(season, n_games):
    features = generate_df(season, n_games)
    wins = get_wins(season)
    df = features.merge(wins, on="TEAM_ID", how="inner")
    df.dropna(subset=FEATURES, inplace=True)
    return df
    

In [9]:
# Function to fit a linear regression model to the training data
def train_lr_model(X_train, y_train):
    model = LinearRegression()
    model.fit(X_train, y_train)
    return model

In [10]:
# Function to valuate the fit of the Linear Regression model
def evaluate_model_lr(model, X_test, y_test):
    score = model.score(X_test, y_test)
    print(f"R^2 Stat on test set: {score: .4f}")

In [24]:
# Plot with Plotly for interactivity
def plotly_plot_lr(df, actual_col="Actual_Wins", predicted_col="Predicted_Wins"):
    fig = px.scatter(
        df,
        x=actual_col,
        y=predicted_col,
        hover_data=["TEAM_NAME"],
        labels={actual_col: "Actual Wins", predicted_col: "Predicted Wins"},
        title="Actual Wins vs. Predicted Wins (Test Season: 2023-24)"
    )
    
    # Plot regression
    min_val = min(df[actual_col].min(), df[predicted_col].min())
    max_val = max(df[actual_col].max(), df[predicted_col].max())
    fig.add_shape(
        type="line",
        x0=min_val, y0=min_val, x1=max_val, y1=max_val,
        line=dict(color="red", dash="dash")
    )

    fig.show()

In [12]:
# Generate final team record predictions from Linear Regression Model
def predict_final_record_lr(model, df, features=FEATURES):

    X_current = df[features]
    predicted_wins = model.predict(X_current)

    results_df = pd.DataFrame({
        "TEAM_NAME": df["TEAM_NAME"],
        "Predicted_Wins": predicted_wins
    })

    # Round prediction to nearest integer
    results_df["Predicted_Wins"] = results_df["Predicted_Wins"].round().astype(int)
    results_df["Predicted_Losses"] = 82 - results_df["Predicted_Wins"]
    
    return results_df

In [None]:
N_GAMES = 20
# Train Model
train_df = generate_season_matrix("2022-23", N_GAMES)
X_train = train_df[FEATURES]
y_train = train_df["W"]

model = train_lr_model(X_train, y_train)

test_df = generate_season_matrix("2023-24", N_GAMES)
X_test = test_df[FEATURES]
y_test = test_df["W"]


# Generate Predictions
y_pred = model.predict(X_test).round()

result = test_df[["TEAM_NAME", "W"]].copy()
result["Predicted_Wins"] = y_pred.astype(int)
result["Predicted_Losses"] = 82 - result["Predicted_Wins"]

print("Network Calls: ", NETWORK_CALLS)
# Get R^2
r2 = model.score(X_test, y_test)

print(f"R2 on 2023-2024 data: {r2:.2f}")

# Generate plot
plotly_plot_lr(result, actual_col="W", predicted_col="Predicted_Wins")

# Print chart of Predicted vs. Actual Wins
show_df = result.rename(columns={"W" : "Actual_Wins"}).sort_values("Actual_Wins", ascending=False)
print(show_df.to_string(index=False))


Network Calls:  0
R2 on 2023-2024 data: 0.7625


             TEAM_NAME  Actual_Wins  Predicted_Wins  Predicted_Losses
        Boston Celtics           64              69                13
        Denver Nuggets           57              60                22
 Oklahoma City Thunder           57              59                23
Minnesota Timberwolves           56              53                29
  Los Angeles Clippers           51              50                32
      Dallas Mavericks           50              46                36
       New York Knicks           50              43                39
          Phoenix Suns           49              44                38
       Milwaukee Bucks           49              54                28
  New Orleans Pelicans           49              48                34
   Cleveland Cavaliers           48              43                39
        Indiana Pacers           47              47                35
    Los Angeles Lakers           47              51                31
    Philadelphia 76e

In [29]:
def fit_model_and_evaluate(name, model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    r2 = model.score(X_test, y_test)
    print(f"{name}:  R^2 Score: {r2:.3f}")
    result_df = predict_final_record_lr(model, test_df)
    result_df["Actual_Wins"] = y_test.values

    plotly_plot_lr(result_df)
    
    return model

In [30]:
ols = fit_model_and_evaluate("Linear Regression", LinearRegression(), X_train, y_train, X_test, y_test)


Linear Regression:  R^2 Score: 0.762


### Linear Model Evaluation
It appears that, at least with a linear model, Effective Field Goal Percentage is by far the most important feature being used in the predictions, followed by Opponent Effective Field Goal Percentage. This makes sense as points scored vs points allowed would be the most directly linked variable related to the outcome of games, and efficiency in scoring and defending should make a big difference in that.

The coefficients show that the features are strongly correlated with wins (with the exception of free throw rates).

In [None]:
import statsmodels.api as sm

X_sm = sm.add_constant(train_df[FEATURES])
ols = sm.OLS(train_df["W"], X_sm).fit()
print(ols.summary())



                            OLS Regression Results                            
Dep. Variable:                      W   R-squared:                       0.761
Model:                            OLS   Adj. R-squared:                  0.670
Method:                 Least Squares   F-statistic:                     8.362
Date:                Wed, 30 Apr 2025   Prob (F-statistic):           4.59e-05
Time:                        14:39:29   Log-Likelihood:                -89.735
No. Observations:                  30   AIC:                             197.5
Df Residuals:                      21   BIC:                             210.1
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
const     

In [20]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score

def evaluate_model(model, name, cv=5, verbose=True):
    cv_r2 = cross_val_score(model, X_train, y_train, cv=cv, scoring="r2").mean()
    model.fit(X_train, y_train)
    test_r2 = r2_score(y_test, model.predict(X_test))
    if verbose:
        print(f"{name or type(model[-1]).__name__:12s}"
              f"CV R2 = {cv_r2:.3f}\n"
              f"Test R2 = {test_r2:.3f}")
    return cv_r2, test_r2

In [21]:
ols = make_pipeline(StandardScaler(), LinearRegression())
evaluate_model(ols, "OLS")

OLS         CV R2 = 0.236
Test R2 = 0.762


(np.float64(0.23560107832573762), 0.762494687728835)

# Ridge Regression
Now I will fit a Ridge Regression model to compare to the OLS model. This will standardize the features and correct for large amounts of colinearity.

In [None]:
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import make_pipeline
# Streamline Model evaluation using sklearn Pipelines
pipelines = {
    "OLS" : make_pipeline(StandardScaler(), LinearRegression()),
    "Ridge" : make_pipeline(StandardScaler(), RidgeCV(alphas=[0.1,1,10], cv=5))
}
# Generate our Ridge Regression model, find the best alpha
ridge = make_pipeline(
    StandardScaler(),
    RidgeCV(alphas=[0.01, 0.1, 1, 10, 100], cv=5)
).fit(train_df[FEATURES], train_df["W"])

print