In [None]:
# Only use TOT rows for players that moved teams

In [None]:
# Resources
# https://www.basketball-reference.com/leagues/NBA_2020_per_game.html
# https://www.basketball-reference.com/leagues/NBA_2020_advanced.html

For the first iteration, we will drop players based on games played (GP) and minutes per game (MPG). In the future, we should not drop these players, because GP/MPG has edge cases: a player's GP/MPG can change dramatically for reasons that we currently do not track (e.g., a new coach).

In [None]:
# Notebook-wide seed for reproducible randomness; intended to be passed as
# `random_state` to stochastic steps such as data splits and model fitting.
# NOTE(review): confirm every split/model call in the notebook actually uses it.
RANDOM_STATE = 30  # night night

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Lift pandas' display caps so DataFrames are shown in full:
# a value of None means "no limit" for the given option.
for option_name in ("display.max_rows", "display.max_columns"):
    pd.set_option(option_name, None)

# Load Data

In [None]:
# Merge per-game and advanced-stats together
# Produces `merged`: one row per (player, team) entry, carrying the per-game
# columns and the advanced-stats columns side by side.
per_game_data = pd.read_csv('data/2020_2021_all_players_per_game_stats.csv')
advanced_stats_data = pd.read_csv('data/2020_2021_all_players_advanced_stats.csv')

# Fix colliding features that represent different things
# e.g.  minutes played (MP) in per-game data is essentially minutes played per game,
#       opposed to advanced-stats representing total minutes played in the season
# Applicable features: MP
colliding_features = ["MP"]
per_game_data = per_game_data.rename(
    columns={col: f"{col}_pg" for col in colliding_features}
)
advanced_stats_data = advanced_stats_data.rename(
    columns={col: f"{col}_as" for col in colliding_features}
)

# Merge
# Players who moved teams appear on several rows (one per team plus a "TOT"
# row), so "Player-additional" alone is NOT a unique key: an outer join on it
# would pair every row of such a player with every other row (cartesian
# product) and attach the wrong team's stats. Joining on the team column as
# well keeps the rows aligned.
# NOTE(review): assumes both CSVs carry a "Tm" column - confirm against the data.
merge_keys = ["Player-additional", "Tm"]

# Columns that exist only in the per-game table; shared columns are taken from
# the advanced-stats table so they are not duplicated with _x/_y suffixes.
uniq_cols = list(per_game_data.columns.difference(advanced_stats_data.columns))
uniq_cols.extend(merge_keys)  # add merge-on targets

merged = pd.merge(
    per_game_data[uniq_cols],
    advanced_stats_data,
    on=merge_keys,
    how="outer",
    # Fail loudly instead of silently duplicating rows if the key is not unique.
    validate="one_to_one",
)
print(merged.columns)
print(merged.head())

# Model input features, selected by name from `merged`.
# Deliberately left out of the merged columns: 'Player-additional', 'Rk',
# 'Player' and 'Tm', plus the null/empty columns 'Unnamed: 19' and 'Unnamed: 24'.
cols_to_use = [
    # columns that exist only in the per-game table
    '2P', '2P%', '2PA',
    '3P', '3P%', '3PA',
    'AST', 'BLK', 'DRB',
    'FG', 'FG%', 'FGA',
    'FT', 'FT%', 'FTA',
    'GS', 'MP_pg', 'ORB', 'PF', 'PTS',
    'STL', 'TOV', 'TRB', 'eFG%',
    # columns taken from the advanced-stats table
    'Pos', 'Age', 'G', 'MP_as',
    'PER', 'TS%', '3PAr', 'FTr',
    'ORB%', 'DRB%', 'TRB%', 'AST%',
    'STL%', 'BLK%', 'TOV%', 'USG%',
    'OWS', 'DWS', 'WS', 'WS/48',
    'OBPM', 'DBPM', 'BPM', 'VORP',
]

# TODO: drop players based off MP/GP threshold

# Template for ML Pipeline

In [None]:
# Template: hold-out evaluation plus 5-fold cross-validation of an XGBoost
# regressor. The `...` lines are placeholders to fill in; `data` and
# 'target_column_name' must be supplied before this cell can run.

# Create dataframe containing only the columns/features to use
...

# Separate features and target variable
X = data.drop('target_column_name', axis=1)  # Modify 'target_column_name' to your target column
y = data['target_column_name']

# Split the data into training and testing sets
# Seeded with the notebook-wide RANDOM_STATE (instead of a second, unrelated
# hard-coded seed) so the whole notebook is reproducible from one constant.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Encode categorical cols
...

# Feature scaling (if necessary)
# The scaler is fitted on the training split only, so no statistics from the
# test split leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize XGBoost regressor
# You can adjust hyperparameters here; random_state makes the fit repeatable.
xgb = XGBRegressor(objective='reg:squarederror', random_state=RANDOM_STATE)

# Fit the model on the training data
xgb.fit(X_train_scaled, y_train)

# Make predictions on the test set
predictions = xgb.predict(X_test_scaled)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")

# Cross-validation
# Run on the preprocessed TRAINING split only: the raw X/y would bypass the
# encoding/scaling steps above and would reuse the held-out test rows, making
# the hold-out score no longer independent.
model = XGBRegressor(objective='reg:squarederror', random_state=RANDOM_STATE)
cv_scores = cross_val_score(
    model, X_train_scaled, y_train, cv=5, scoring='neg_mean_squared_error'
)

# scikit-learn reports negated MSE (higher is better); flip the sign back
cv_scores = -cv_scores

# Calculate mean and standard deviation of cross-validation scores
mean_cv_score = np.mean(cv_scores)
std_cv_score = np.std(cv_scores)

print(f"Mean Cross-Validation Score (MSE): {mean_cv_score}")
print(f"Standard Deviation of Cross-Validation Score (MSE): {std_cv_score}")
