CS850 Final Project

In [1]:
import pandas as pd
from nba_api.stats.endpoints import LeagueDashTeamStats
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import plotly.express as px

# Function to pull team season data from the api
def get_nba_data(season='2022-23', season_type='Regular Season'):
    """Fetch per-game team statistics for a single season from nba_api.

    Args:
        season: Season identifier forwarded to ``LeagueDashTeamStats`` as
            ``season`` (default ``'2022-23'``).
        season_type: Value forwarded as ``season_type_all_star``
            (default ``'Regular Season'``).

    Returns:
        The first data frame returned by the endpoint's
        ``get_data_frames()`` call.
    """
    # Per-game averages are always requested; only the season and the
    # season type are configurable by the caller.
    request_kwargs = {
        "season": season,
        "season_type_all_star": season_type,
        "per_mode_detailed": "PerGame",
    }
    frames = LeagueDashTeamStats(**request_kwargs).get_data_frames()
    return frames[0]

# Function to clean the data
def clean_data(df):
    """Return a cleaned copy of ``df`` with numeric, non-missing model columns.

    The columns used by the model (``FG_PCT``, ``FG3_PCT``, ``FT_PCT``,
    ``REB``, ``AST``, ``TOV``, ``PLUS_MINUS``) are converted with
    ``pd.to_numeric(..., errors='coerce')`` so unparsable entries become NaN,
    and every row that has a NaN in any of those columns is dropped.

    The input frame is NOT modified: a new frame is returned. This keeps
    notebook cells idempotent (re-running a cell never sees a half-cleaned
    frame) and avoids surprising callers that still hold the raw data.

    Args:
        df: DataFrame containing at least the columns listed above.

    Returns:
        A new DataFrame with the original index labels of the surviving rows.

    Raises:
        KeyError: If any of the required columns is missing from ``df``.
    """
    float_cols = ["FG_PCT", "FG3_PCT", "FT_PCT", "REB", "AST", "TOV", "PLUS_MINUS"]

    # ``assign`` builds a new frame, so the caller's data is left untouched.
    numeric = df.assign(
        **{col: pd.to_numeric(df[col], errors='coerce') for col in float_cols}
    )

    # Drop rows where coercion produced (or the data already had) missing values.
    return numeric.dropna(subset=float_cols)

# Function to split the data into training and test sets
def split_data(df, target_col="W", feature_cols=None, test_size=0.2, random_state=42):
    """Split a team-stats frame into train/test features and targets.

    Args:
        df: DataFrame holding the feature columns and ``target_col``.
        target_col: Name of the column to predict (default ``"W"``).
        feature_cols: Optional sequence of feature column names. When
            ``None`` the default feature set (``FG_PCT``, ``FG3_PCT``,
            ``FT_PCT``, ``REB``, ``AST``, ``TOV``, ``PLUS_MINUS``) is used.
        test_size: Forwarded to ``train_test_split`` (default ``0.2``).
        random_state: Forwarded to ``train_test_split`` so the split is
            reproducible (default ``42``).

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    if feature_cols is None:
        feature_cols = [
            "FG_PCT",
            "FG3_PCT",
            "FT_PCT",
            "REB",
            "AST",
            "TOV",
            "PLUS_MINUS",
        ]

    X = df[list(feature_cols)]
    y = df[target_col]

    # Unpack explicitly so the function keeps returning a tuple.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

# Function to fit a linear regression model to the training data
def train_lr_model(X_train, y_train):
    """Fit a default ``LinearRegression`` on the training data and return it.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.

    Returns:
        The fitted ``LinearRegression`` estimator.
    """
    # ``fit`` returns the estimator itself, so the fitted model can be
    # returned directly.
    return LinearRegression().fit(X_train, y_train)

# Function to evaluate the fit of the Linear Regression model
def evaluate_model_lr(model, X_test, y_test):
    """Print the value of ``model.score(X_test, y_test)`` to four decimals.

    Args:
        model: Object exposing a ``score(X, y)`` method.
        X_test: Held-out feature matrix passed to ``score``.
        y_test: Held-out target values passed to ``score``.

    Returns:
        None. The result is only printed.
    """
    r_squared = model.score(X_test, y_test)
    # The space flag in the format spec reserves a column for the sign.
    print(f"R^2 Stat on test set: {r_squared: .4f}")

# Generate final team record predictions from Linear Regression Model
def predict_final_record_lr(model, df, games_per_season=82):
    """Predict each team's final win/loss record with a fitted model.

    Predictions are rounded to whole games and clipped to the range
    ``[0, games_per_season]``. A linear model can extrapolate outside that
    range, which would otherwise yield impossible records such as more wins
    than games played or a negative loss count.

    Args:
        model: Fitted estimator exposing ``predict(X)``.
        df: DataFrame with a ``TEAM_NAME`` column and the feature columns
            ``FG_PCT``, ``FG3_PCT``, ``FT_PCT``, ``REB``, ``AST``, ``TOV``
            and ``PLUS_MINUS``.
        games_per_season: Number of games in the season, used for the upper
            clip bound and for deriving losses (default ``82``).

    Returns:
        DataFrame with columns ``TEAM_NAME``, ``Predicted_Wins`` and
        ``Predicted_Losses``, indexed like ``df``.
    """
    features_cols = [
        "FG_PCT",
        "FG3_PCT",
        "FT_PCT",
        "REB",
        "AST",
        "TOV",
        "PLUS_MINUS"
    ]
    X_current = df[features_cols]
    predicted_wins = model.predict(X_current)

    results_df = pd.DataFrame({
        "TEAM_NAME": df["TEAM_NAME"],
        "Predicted_Wins": predicted_wins
    })

    # Round to the nearest whole game, then keep the value inside the range
    # of records that can actually occur before converting to int.
    results_df["Predicted_Wins"] = (
        results_df["Predicted_Wins"]
        .round()
        .clip(lower=0, upper=games_per_season)
        .astype(int)
    )
    results_df["Predicted_Losses"] = games_per_season - results_df["Predicted_Wins"]

    return results_df

# Function to plot the Linear Regression model
def plot_lr_fit(y_actual, y_hat):
    """Show a static scatter plot of predicted wins against actual wins.

    A red diagonal (y = x) is drawn as a reference: points on the line are
    perfect predictions.

    Args:
        y_actual: Actual win totals; must provide ``.min()`` and ``.max()``.
        y_hat: Predicted win totals; must provide ``.min()`` and ``.max()``.
    """
    plt.figure(figsize=(8, 6))

    # One point per observation: x is the actual value, y the prediction.
    plt.scatter(y_actual, y_hat, alpha=0.7, label="Predicted Wins", color="blue")

    # Reference diagonal spanning the combined range of both series.
    # This is the y = x line, not the fitted regression line.
    lower = min(y_actual.min(), y_hat.min())
    upper = max(y_actual.max(), y_hat.max())
    diagonal = [lower, upper]
    plt.plot(diagonal, diagonal, color="red", lw=2, label="Perfect Fit (y = x)")

    plt.title("Actual Wins vs. Predicted Wins")
    plt.xlabel("Actual Wins")
    plt.ylabel("Predicted Wins")
    plt.legend()
    plt.show()

# Plot with Plotly for interactivity
def plotly_plot_lr(
    df,
    actual_col="W",
    predicted_col="Predicted_Wins",
    title="Actual Wins vs. Predicted Wins (Test Season: 2023-24)",
):
    """Show an interactive scatter plot of predicted wins against actual wins.

    Hovering a point reveals the team name. A dashed red diagonal (y = x)
    marks where a perfect prediction would fall.

    Args:
        df: DataFrame containing ``TEAM_NAME``, ``actual_col`` and
            ``predicted_col``.
        actual_col: Column holding the actual win totals (default ``"W"``).
        predicted_col: Column holding the predicted win totals
            (default ``"Predicted_Wins"``).
        title: Figure title. Defaults to the 2023-24 test-season title so
            existing calls render exactly as before; pass a different title
            when plotting another season.
    """
    fig = px.scatter(
        df,
        x=actual_col,
        y=predicted_col,
        hover_data=["TEAM_NAME"],
        labels={actual_col: "Actual Wins", predicted_col: "Predicted Wins"},
        title=title
    )

    # Reference diagonal (y = x) across the combined range of both columns.
    # This is not the fitted regression line.
    min_val = min(df[actual_col].min(), df[predicted_col].min())
    max_val = max(df[actual_col].max(), df[predicted_col].max())
    fig.add_shape(
        type="line",
        x0=min_val, y0=min_val, x1=max_val, y1=max_val,
        line=dict(color="red", dash="dash")
    )

    fig.show()
    



In [6]:
# --- Train on the 2022-23 regular season ------------------------------------
feature_cols = ["FG_PCT", "FG3_PCT", "FT_PCT", "REB", "AST", "TOV", "PLUS_MINUS"]
nba_stats_2023 = clean_data(
    get_nba_data(season="2022-23", season_type="Regular Season")
)
X_train, y_train = nba_stats_2023[feature_cols], nba_stats_2023["W"]
model_lr = train_lr_model(X_train, y_train)

# --- Predict a different season (2023-24) to check generalisation -----------
nba_stats_2024 = clean_data(
    get_nba_data(season="2023-24", season_type="Regular Season")
)
test_actual = nba_stats_2024[["TEAM_NAME", "W"]]
y_actual = nba_stats_2024["W"]
X_test = nba_stats_2024[feature_cols]
# Round each prediction to a whole number of wins.
y_pred = list(map(round, model_lr.predict(X_test)))

# Combine actual and predicted wins into one frame for plotting.
predictions_df = test_actual.copy()
predictions_df["Predicted_Wins"] = y_pred

# Interactive actual-vs-predicted scatter plot.
plotly_plot_lr(predictions_df)

# R^2 is computed from the model's raw (unrounded) predictions.
r2_score = model_lr.score(X_test, y_actual)
print(f"R^2 Score on 2023-24: {r2_score: .4f}")

# Table of actual vs. predicted win totals, best record first.
predictions_table_lr = (
    predictions_df[["TEAM_NAME", "W", "Predicted_Wins"]]
    .rename(columns={"W": "Actual Wins", "Predicted_Wins": "Predicted Wins"})
    .sort_values("Actual Wins", ascending=False)
)
print(predictions_table_lr)



R^2 Score on 2023-24:  0.9540
                 TEAM_NAME  Actual Wins  Predicted Wins
1           Boston Celtics           64              68
7           Denver Nuggets           57              55
20   Oklahoma City Thunder           57              59
17  Minnesota Timberwolves           56              58
12             LA Clippers           51              49
6         Dallas Mavericks           50              47
19         New York Knicks           50              52
16         Milwaukee Bucks           49              48
23            Phoenix Suns           49              49
18    New Orleans Pelicans           49              53
5      Cleveland Cavaliers           48              48
22      Philadelphia 76ers           47              47
13      Los Angeles Lakers           47              44
11          Indiana Pacers           47              50
21           Orlando Magic           47              45
25        Sacramento Kings           46              47
9    Golden State 