# CPEN 355 Final Project - NBA MVP Prediction 

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split

## Loading and Exploring Data

In [3]:
df = pd.read_csv("./data/nba_team_stats_00_to_23.csv")
df.head()

Unnamed: 0,teamstatspk,Team,games_played,wins,losses,win_percentage,Min,points,field_goals_made,field_goals_attempted,...,rebounds,assists,turnovers,steals,blocks,blocks_attempted,personal_fouls,personal_fouls_drawn,plus_minus,season
0,0,Boston Celtics,82,64,18,0.78,3966,9887,3601,7396,...,3799,2207,979,557,538,304,1326,1416,930,2023-24
1,1,Denver Nuggets,82,57,25,0.695,3941,9418,3610,7279,...,3643,2415,1036,585,456,394,1489,1467,431,2023-24
2,2,Oklahoma City Thunder,82,57,25,0.695,3961,9847,3653,7324,...,3447,2223,1039,694,538,419,1545,1548,608,2023-24
3,3,Minnesota Timberwolves,82,56,26,0.683,3961,9264,3383,6974,...,3577,2184,1162,647,497,371,1544,1630,529,2023-24
4,4,LA Clippers,82,51,31,0.622,3941,9481,3473,7108,...,3523,2097,1078,640,413,384,1519,1537,269,2023-24


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 716 entries, 0 to 715
Data columns (total 29 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   teamstatspk               716 non-null    int64  
 1   Team                      716 non-null    object 
 2   games_played              716 non-null    int64  
 3   wins                      716 non-null    int64  
 4   losses                    716 non-null    int64  
 5   win_percentage            716 non-null    float64
 6   Min                       716 non-null    int64  
 7   points                    716 non-null    int64  
 8   field_goals_made          716 non-null    int64  
 9   field_goals_attempted     716 non-null    int64  
 10  field_goal_percentage     716 non-null    float64
 11  three_pointers_made       716 non-null    int64  
 12  three_pointers_attempted  716 non-null    int64  
 13  three_point_percentage    716 non-null    float64
 14  free_throw

In [5]:
df.columns

Index(['teamstatspk', 'Team', 'games_played', 'wins', 'losses',
       'win_percentage', 'Min', 'points', 'field_goals_made',
       'field_goals_attempted', 'field_goal_percentage', 'three_pointers_made',
       'three_pointers_attempted', 'three_point_percentage',
       'free_throws_made', 'free_throw_attempted', 'free_throw_percentage',
       'offensive_rebounds', 'defensive_rebounds', 'rebounds', 'assists',
       'turnovers', 'steals', 'blocks', 'blocks_attempted', 'personal_fouls',
       'personal_fouls_drawn', 'plus_minus', 'season'],
      dtype='object')

## Splitting the Data

In [6]:
X = df.drop(columns=["wins"])
y = df["wins"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train.head()

Unnamed: 0,teamstatspk,Team,games_played,losses,win_percentage,Min,points,field_goals_made,field_goals_attempted,field_goal_percentage,...,rebounds,assists,turnovers,steals,blocks,blocks_attempted,personal_fouls,personal_fouls_drawn,plus_minus,season
662,662,New Jersey Nets,82,30,0.634,3966,7889,3042,6816,44.6,...,3554,1990,1189,716,490,439,1734,9,341,2001-02
17,17,Sacramento Kings,82,36,0.561,3971,9558,3553,7455,47.7,...,3607,2324,1078,625,342,370,1632,1504,143,2023-24
266,266,Phoenix Suns,82,59,0.28,3941,8271,3051,7018,43.5,...,3676,1701,1410,632,313,449,1858,1770,-546,2015-16
318,318,New York Knicks,82,45,0.451,3971,8084,3027,6739,44.9,...,3307,1641,1063,631,367,277,1815,1611,-65,2013-14
364,364,Indiana Pacers,66,24,0.636,3198,6449,2354,5375,43.8,...,2896,1228,925,523,357,398,1430,1428,218,2011-12


In [7]:
y_train.head()

662    52
17     46
266    23
318    37
364    42
Name: wins, dtype: int64

## Preprocessing

In [8]:
numeric_features = ['games_played', 'Min', 'points', 'field_goals_made', 'field_goals_attempted', 'field_goal_percentage', 'three_pointers_made',
'three_pointers_attempted', 'three_point_percentage', 'free_throws_made', 'free_throw_attempted', 'free_throw_percentage', 'offensive_rebounds', 'defensive_rebounds', 'rebounds', 'assists', 'turnovers', 'steals', 'blocks', 'blocks_attempted', 'personal_fouls', 'personal_fouls_drawn', 'plus_minus']
categorical_features = ["Team", "season"]
drop_features = ["teamstatspk", "losses", "win_percentage"]
target_feature = "wins"

In [9]:
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="ignore")
preprocessor = make_column_transformer((numeric_transformer, numeric_features), (categorical_transformer, categorical_features), ("drop", drop_features))
preprocessor

## Hyperparameter Optimization

In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyRegressor 
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

In [11]:
# Dummy pipeline
dummy_pipeline = make_pipeline(preprocessor, DummyRegressor(strategy="mean"))

# LinearRegression pipeline
lr_pipeline = make_pipeline(preprocessor, LinearRegression())

# SVR pipeline
svr_pipeline = make_pipeline(preprocessor, SVR(kernel="rbf"))

# RandomForest pipeline
rf_pipeline = make_pipeline(preprocessor, RandomForestRegressor(random_state=0))

In [12]:
# Parameter grid for RandomForestRegressor
rf_param_grid = {
  'randomforestregressor__n_estimators': [100, 200, 500],
  'randomforestregressor__max_depth': [None, 5, 10, 20],
}

# Parameter grid for SVR
svr_param_grid = {
  'svr__C': [0.1, 1, 10, 100],
  'svr__gamma': ['scale', 0.001, 0.1, 1.0, 100]
}

In [13]:
def get_best_model_and_accuracy(model, param_grid, X_train, y_train):
  grid = GridSearchCV(model, param_grid, cv=3)
  grid.fit(X_train, y_train)

  print(f"Best accuracy: {grid.best_score_}")
  print(f"Best parameters: {grid.best_params_}")
  return grid.best_estimator_

In [14]:
best_svr_pipe = get_best_model_and_accuracy(svr_pipeline, svr_param_grid, X_train, y_train)

Best accuracy: 0.9347152659935395
Best parameters: {'svr__C': 100, 'svr__gamma': 0.001}


In [15]:
best_rf_pipe = get_best_model_and_accuracy(rf_pipeline, rf_param_grid, X_train, y_train)

Best accuracy: 0.9294080779660655
Best parameters: {'randomforestregressor__max_depth': 5, 'randomforestregressor__n_estimators': 500}


In [16]:
ohe_features = best_svr_pipe.named_steps["columntransformer"].named_transformers_["onehotencoder"].get_feature_names_out(categorical_features).tolist()

feature_names = (numeric_features + ohe_features)
feature_names[:10]

['games_played',
 'Min',
 'points',
 'field_goals_made',
 'field_goals_attempted',
 'field_goal_percentage',
 'three_pointers_made',
 'three_pointers_attempted',
 'three_point_percentage',
 'free_throws_made']

In [17]:
data = {
  "Importance": best_rf_pipe.named_steps["randomforestregressor"].feature_importances_,
}
rf_imp_df = pd.DataFrame(
  data=data,
  index=feature_names,
).sort_values(by="Importance", ascending=False)

rf_imp_df.head(10)

Unnamed: 0,Importance
plus_minus,0.953433
Min,0.009407
rebounds,0.004078
personal_fouls,0.003557
defensive_rebounds,0.003472
field_goals_attempted,0.003447
games_played,0.003338
turnovers,0.002849
field_goals_made,0.001824
offensive_rebounds,0.0017


## Model Training

In [18]:
from sklearn.metrics import root_mean_squared_error, r2_score

In [19]:
mse_scores = []
r2_scores = []

In [20]:
def model_training_and_evaluation(pipe, X_train, y_train, X_test, y_test):
  pipe.fit(X_train, y_train)
  
  y_pred = pipe.predict(X_test)
  mse = root_mean_squared_error(y_test, y_pred)
  r2 = r2_score(y_test, y_pred)

  mse_scores.append(mse)
  r2_scores.append(r2)
  print("MSE: ", mse)
  print("R2: ", r2)
  return pipe, mse, r2

In [21]:
models = [("DummyRegressor", dummy_pipeline), ("LinearRegressor", lr_pipeline), ("SVR", best_svr_pipe), ("RandomForestRegressor", best_rf_pipe)]

for name, model in models:
  print(name)
  model_training_and_evaluation(model, X_train, y_train, X_test, y_test)

DummyRegressor
MSE:  12.037692439969176
R2:  -0.050291877671502005
LinearRegressor
MSE:  3.2470727770708265
R2:  0.9235799062446571
SVR
MSE:  3.204213337301492
R2:  0.9255839920596556
RandomForestRegressor
MSE:  3.648253984244698
R2:  0.9035296949617656


In [22]:
df_scores = pd.DataFrame({"MSE": mse_scores, "R2": r2_scores}, index=[name for name, _ in models])
df_scores

Unnamed: 0,MSE,R2
DummyRegressor,12.037692,-0.050292
LinearRegressor,3.247073,0.92358
SVR,3.204213,0.925584
RandomForestRegressor,3.648254,0.90353


In [23]:
results = pd.DataFrame({"team": X_test["Team"], "season": X_test["season"], "actual": y_test, "predicted": y_pred})
results.head(10)

NameError: name 'y_pred' is not defined