CS850 Final Project

In [None]:
import pandas as pd
from nba_api.stats.static import teams as static_teams
from nba_api.stats.endpoints import LeagueDashTeamStats, TeamGameLog,
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import plotly.express as px

DEFAULT_FEATURES = [
    "FG_PCT", 
    "FG3_PCT",
    "FT_PCT",
    "REB",
    "AST",
    "TOV",
    "FG3M",
    "DEF_RATING",
    "OFF_RATING"
]

# Function to pull team season data from the api
def get_nba_data(season='2022-23', season_type='Regular Season'):
    team_stats = LeagueDashTeamStats(
        season=season,
        season_type_all_star=season_type,
        per_mode_detailed='PerGame'
    ).get_data_frames()[0]
        
    team_stats_adv = LeagueDashTeamStats(
        season=season,
        season_type_all_star=season_type,
        per_mode_detailed="PerGame",
        measure_type_detailed_defense="Advanced"
    ).get_data_frames()[0][["TEAM_ID", "DEF_RATING", "OFF_RATING"]]
    

    # Get data frame of team stats
    df = team_stats.merge(team_stats_adv, on="TEAM_ID", how="left")
    return df

# Function to clean the data
def clean_data(df):
    #float_cols = ["FG_PCT", "FG3_PCT", "FT_PCT", "REB", "AST", "TOV", "PLUS_MINUS"]
    float_cols = ["FG_PCT", "FG3_PCT", "FT_PCT", "REB", "AST", "TOV", "FG3M", "DEF_RATING", "OFF_RATING"]
    # Ensure data is numerical
    for col in float_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Remove missing values
    df.dropna(subset=float_cols, inplace=True)
    return df

# Function to split the data into training and test sets
def split_data(df, feature_cols, target_col="W"):
    X = df[feature_cols]
    y = df[target_col]

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    return X_train, X_test, y_train, y_test

# Function to fit a linear regression model to the training data
def train_lr_model(X_train, y_train):
    model = LinearRegression()
    model.fit(X_train, y_train)
    return model

# Function to valuate the fit of the Linear Regression model
def evaluate_model_lr(model, X_test, y_test):
    score = model.score(X_test, y_test)
    print(f"R^2 Stat on test set: {score: .4f}")

# Generate final team record predictions from Linear Regression Model
def predict_final_record_lr(model, df, feature_cols):
    features_cols = [
        "FG_PCT", 
        "FG3_PCT", 
        "FT_PCT", 
        "REB", 
        "AST", 
        "TOV", 
        "FG3M",
        "DEF_RATING",
        "OFF_RATING"
    ]
    X_current = df[features_cols]
    predicted_wins = model.predict(X_current)

    results_df = pd.DataFrame({
        "TEAM_NAME": df["TEAM_NAME"],
        "Predicted_Wins": predicted_wins
    })

    # Round prediction to nearest integer
    results_df["Predicted_Wins"] = results_df["Predicted_Wins"].round().astype(int)
    results_df["Predicted_Losses"] = 82 - results_df["Predicted_Wins"]
    
    return results_df

# Function to plot the Linear Regression model
def plot_lr_fit(y_actual, y_hat):
    plt.figure(figsize=(8,6))

    #Display actual vs predicted datapoints
    plt.scatter(y_actual, y_hat, alpha=0.7, label="Predicted Wins", color="blue")

    # Plot the LR
    min_val = min(y_actual.min(), y_hat.min())
    max_val = max(y_actual.max(), y_hat.max())
    plt.plot([min_val, max_val], [min_val, max_val], color="red", lw=2, label="Perfect Fit (y = x)")

    plt.xlabel("Actual Wins")
    plt.ylabel("Predicted Wins")
    plt.title("Actual Wins vs. Predicted Wins")
    plt.legend()
    plt.show()

# Plot with Plotly for interactivity
def plotly_plot_lr(df, actual_col="W", predicted_col="Predicted_Wins"):
    fig = px.scatter(
        df,
        x=actual_col,
        y=predicted_col,
        hover_data=["TEAM_NAME"],
        labels={actual_col: "Actual Wins", predicted_col: "Predicted Wins"},
        title="Actual Wins vs. Predicted Wins (Test Season: 2023-24)"
    )
    
    # Plot regression
    min_val = min(df[actual_col].min(), df[predicted_col].min())
    max_val = max(df[actual_col].max(), df[predicted_col].max())
    fig.add_shape(
        type="line",
        x0=min_val, y0=min_val, x1=max_val, y1=max_val,
        line=dict(color="red", dash="dash")
    )

    fig.show()
    



In [23]:
# Get the stats from training year and run training
nba_stats_2023 = get_nba_data(season="2022-23", season_type="Regular Season")
nba_stats_2023 = clean_data(nba_stats_2023)
#feature_cols = ["FG_PCT", "FG3_PCT", "FT_PCT", "REB", "AST", "TOV", "PLUS_MINUS"]
feature_cols = DEFAULT_FEATURES
X_train = nba_stats_2023[feature_cols]
y_train = nba_stats_2023["W"]
model_lr = train_lr_model(X_train, y_train)

# Get different season to predict on
nba_stats_2024 = get_nba_data(season="2023-24", season_type="Regular Season")
nba_stats_2024 = clean_data(nba_stats_2024)
# Get actual wins and features from test season
test_actual = nba_stats_2024[["TEAM_NAME", "W"]]
y_actual = nba_stats_2024["W"]
X_test = nba_stats_2024[feature_cols]
y_pred = model_lr.predict(X_test)
y_pred = [round(val) for val in y_pred]

# Generate a DF for plot
predictions_df = test_actual.copy()
predictions_df["Predicted_Wins"] = y_pred

# Generate Plotly plot
plotly_plot_lr(predictions_df)

# Evaluate Model
r2_score = model_lr.score(X_test, y_actual)
print(f"R^2 Score on 2023-24: {r2_score: .4f}")
print(f"Features Used: {feature_cols}")

# Print table of predictions vs actual win totals
predictions_table_lr = predictions_df[["TEAM_NAME", "W", "Predicted_Wins"]]
predictions_table_lr = predictions_table_lr.rename(columns={"W": "Actual Wins", "Predicted_Wins": "Predicted Wins"})
predictions_table_lr = predictions_table_lr.sort_values("Actual Wins", ascending=False)
print(predictions_table_lr)



R^2 Score on 2023-24:  0.9442
Features Used: ['FG_PCT', 'FG3_PCT', 'FT_PCT', 'REB', 'AST', 'TOV', 'FG3M', 'DEF_RATING', 'OFF_RATING']
                 TEAM_NAME  Actual Wins  Predicted Wins
1           Boston Celtics           64              70
7           Denver Nuggets           57              55
20   Oklahoma City Thunder           57              58
17  Minnesota Timberwolves           56              56
12             LA Clippers           51              48
19         New York Knicks           50              52
6         Dallas Mavericks           50              47
18    New Orleans Pelicans           49              53
23            Phoenix Suns           49              49
16         Milwaukee Bucks           49              49
5      Cleveland Cavaliers           48              48
22      Philadelphia 76ers           47              47
11          Indiana Pacers           47              49
13      Los Angeles Lakers           47              44
21           Orlando Magic

Plus-Minus increases R^2 score from 0.6579 to 0.9532.

By adding Offensive Rating and Defensive Rating (advanced stats measuring the efficiency of a team's offense and defense), I was able to get the R^2 score back up to an impressive 0.9442 without giving the model Plus/Minus.

Now I will train on the first n games of a season and predict on the rest.

In [24]:
TRAIN_GAMES = 20
BASE_COLS = ["FG_PCT", "FG3_PCT", "FT_PCT", "REB", "AST", "TOV", "FG3M"]
ADV_COLS = ["DEF_RATING", "OFF_RATING"]


def get_stats(season, n_games=TRAIN_GAMES):
    # Get the stats from every game from every team
    df_standard, df_adv = [], []
    for team in static_teams.get_teams():
        team_id = team["id"]
        team_name = team["full_name"]
        
        
        game_logs_standard = TeamGameLog(team_id=team_id,
                                         season=season,
                                         season_type_all_star="Regular Season").get_data_frames()[0].sort_values("GAME_DATE").head(n_games)
        standard_row = game_logs_standard[BASE_COLS].mean().to_dict()
        standard_row.update({"TEAM_ID" : team_id, "TEAM_NAME" : team_name})
        df_standard.append(
            pd.DataFrame([standard_row])
        )

        # Pull just the training games we want

        # Get advanced stats from same training data
        game_logs_adv= TeamGameLog(
            team_id=team_id, season=season, season_type_all_star="Regular Season",
            measure_type_detailed="Advanced"
        ).get_data_frames()[0].sort_values("GAME_DATE").head(n_games)

        adv_row = game_logs_adv[ADV_COLS].mean().to_dict()
        adv_row.update({"TEAM_ID" : team_id})
        df_adv.append(
            pd.DataFrame([adv_row])
        )
        
    df_standard = pd.concat(df_standard, ignore_index=True)
    df_adv = pd.concat(df_adv, ignore_index=True)
    df = df_standard.merge(df_adv, on="TEAM_ID", how="left")
    
    return df


Use same LR methods on this data

In [25]:
train_data_2023 = get_stats("2022-23")
train_data_2023 = clean_data(train_data_2023)

X_train = train_data_2023[DEFAULT_FEATURES]
# y_train and y_actual are the same from previous work
model_lr = train_lr_model(X_train, y_train)

test_data_2024 = get_stats("2023-24")
test_data_2024 = clean_data(test_data_2024)

X_test = test_data_2024[DEFAULT_FEATURES]
y_pred = model_lr.predict(X_test).round()

r2 = model_lr.score(X_test, y_actual)
print(f"R^2 using stats after {TRAIN_GAMES} games: {r2:.3f}")

TypeError: TeamGameLog.__init__() got an unexpected keyword argument 'measure_type_detailed'