In [137]:
import pandas as pd
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from nba_api.stats.static import teams as static_teams
from nba_api.stats.endpoints import BoxScoreFourFactorsV3, TeamGameLog, LeagueStandingsV3
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import plotly.express as px
from requests.exceptions import ReadTimeout, ConnectionError
import random, math

In [138]:
COMMON_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/124.0 Safari/537.36",
    "Accept": "application/json, text/plain, */*",
    "Origin": "https://www.nba.com",
    "Referer": "https://www.nba.com/",
}

def endpoint(endpoint_cls, max_retries=3, base_sleep=1.5, **kwargs):
    for attempt in range(max_retries):
        try:
            return endpoint_cls(
                headers=COMMON_HEADERS,
                timeout=60,
                **kwargs
            )
        except (ReadTimeout, ConnectionError):
            wait = base_sleep * (2 ** attempt) + random.uniform(0, 0.5)
            print(f"{endpoint_cls.__name__} retry {attempt+1}/{max_retries}"
                  f" - sleeping {wait:.1f}s")
            time.sleep(wait)
            
        # Error if we hit max retries
        raise RuntimeError(f"{endpoint_cls.__name__} failed after {max_retries} retries")
            

In [139]:
# Function to get the game IDs from the training set
def get_game_ids(team_id, season, n_games):
    games = endpoint(
        TeamGameLog,
        team_id=team_id,
        season=season,
        season_type_all_star="Regular Season"
        ).get_data_frames()[0]
    return games.sort_values("GAME_DATE")["Game_ID"].head(n_games).tolist()

In [140]:


# Features we will be using
FEATURES = [
    "effectiveFieldGoalPercentage",
    "freeThrowAttemptRate",
    "teamTurnoverPercentage",
    "offensiveReboundPercentage",
    "oppEffectiveFieldGoalPercentage",
    "oppFreeThrowAttemptRate",
    "oppTeamTurnoverPercentage",
    "oppOffensiveReboundPercentage"
]

# Speed up data retrieval
MAX_THREADS = 4
PER_REQUEST_SLEEP = 0.5


# Get the features from the API endpoint
def get_features(game_id, team_id):
    try:
        stats = endpoint(
            BoxScoreFourFactorsV3,
            game_id=game_id,
            start_period=1, start_range=0,
            end_period=1, end_range=0,
            range_type=0)
        team_df = stats.team_stats.get_data_frame()
        row = team_df.loc[team_df["teamId"] == team_id, FEATURES]
        return row.squeeze().astype(float)
    finally:
        # Avoid making the API mad
        time.sleep(PER_REQUEST_SLEEP)


In [141]:
def combine_rows(team_id, season, n_games):
    ids = get_game_ids(team_id, season, n_games)
    rows = []
    with ThreadPoolExecutor(max_workers=MAX_THREADS) as pool:
        futures = [pool.submit(get_features, id, team_id) for id in ids]
        for future in as_completed(futures):
            rows.append(future.result())
        #time.sleep(0.6)
    
    # Return the mean features for n_games
    return pd.concat(rows, axis=1).mean(axis=1)

In [142]:
def generate_df(season, n_games):
    records = []
    for team in static_teams.get_teams():
        row = combine_rows(team["id"], season, n_games)
        row["TEAM_NAME"] = team["full_name"]
        row["TEAM_ID"] = team["id"]
        records.append(row)
    return pd.DataFrame(records)

In [143]:
# Get the number of wins for each team
def get_wins(season, season_type="Regular Season"):
    df = endpoint(
        LeagueStandingsV3,
        season=season,
        season_type=season_type
    ).get_data_frames()[0]
    
    df = df.rename(columns={"TeamID" : "TEAM_ID",
                            "TeamName" : "TEAM_NAME",
                            "WINS" : "W"})
    return df[["TEAM_ID", "TEAM_NAME", "W"]].astype({"W" : int})

In [144]:
def generate_season_matrix(season, n_games):
    features = generate_df(season, n_games)
    wins = get_wins(season)
    df = features.merge(wins, on="TEAM_ID", how="inner")
    df.dropna(subset=FEATURES, inplace=True)
    return df
    

In [145]:
# Function to fit a linear regression model to the training data
def train_lr_model(X_train, y_train):
    model = LinearRegression()
    model.fit(X_train, y_train)
    return model

In [146]:
# Function to valuate the fit of the Linear Regression model
def evaluate_model_lr(model, X_test, y_test):
    score = model.score(X_test, y_test)
    print(f"R^2 Stat on test set: {score: .4f}")

In [147]:
# Plot with Plotly for interactivity
def plotly_plot_lr(df, actual_col="W", predicted_col="Predicted_Wins"):
    fig = px.scatter(
        df,
        x=actual_col,
        y=predicted_col,
        hover_data=["TEAM_NAME"],
        labels={actual_col: "Actual Wins", predicted_col: "Predicted Wins"},
        title="Actual Wins vs. Predicted Wins (Test Season: 2023-24)"
    )
    
    # Plot regression
    min_val = min(df[actual_col].min(), df[predicted_col].min())
    max_val = max(df[actual_col].max(), df[predicted_col].max())
    fig.add_shape(
        type="line",
        x0=min_val, y0=min_val, x1=max_val, y1=max_val,
        line=dict(color="red", dash="dash")
    )

    fig.show()

In [148]:
# Generate final team record predictions from Linear Regression Model
def predict_final_record_lr(model, df, features=FEATURES):

    X_current = df[features]
    predicted_wins = model.predict(X_current)

    results_df = pd.DataFrame({
        "TEAM_NAME": df["TEAM_NAME"],
        "Predicted_Wins": predicted_wins
    })

    # Round prediction to nearest integer
    results_df["Predicted_Wins"] = results_df["Predicted_Wins"].round().astype(int)
    results_df["Predicted_Losses"] = 82 - results_df["Predicted_Wins"]
    
    return results_df

In [149]:
N_GAMES = 20
train_df = generate_season_matrix("2022-23", N_GAMES)
X_train = train_df[FEATURES]
y_train = train_df["W"]

model = train_lr_model(X_train, y_train)

test_df = generate_season_matrix("2023-24", N_GAMES)
X_test = test_df[FEATURES]
y_test = test_df["W"]


y_pred = model.predict(X_test).round()

result = test_df[["TEAM_NAME", "W"]].copy()
result["Predicted_Wins"] = y_pred.astype(int)
result["Predicted_Losses"] = 82 - result["Predicted_Wins"]

r2 = model.score(X_test, y_test)

print(f"R2 on 2023-2024 data: {r2:.4f}")

plotly_plot_lr(result, actual_col="W", predicted_col="Predicted_Wins")

show_df = result.rename(columns={"W" : "Actual_Wins"}).sort_values("Actual_Wins", ascending=False)
print(show_df.to_string(index=False))


KeyError: "['TEAM_NAME'] not in index"