In [1]:
# Only use TOT rows for players that moved teams

In [2]:
# Resources
# https://www.basketball-reference.com/leagues/NBA_2020_per_game.html
# https://www.basketball-reference.com/leagues/NBA_2020_advanced.html

For the first iteration, we will drop players based on games played (GP) / minutes per game (MPG). In the future, we should not drop these players, because GP/MPG has edge cases: a player's GP/MPG can change dramatically for reasons that we currently do not track, e.g. a new coach.

In [3]:
# Notebook-level random seed constant
RANDOM_STATE: int = 30  # night night

In [4]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

In [5]:
# Lift pandas' display limits so frames are shown with every row and column
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, None)

# Load Data

In [6]:
# Load the 2020-21 per-game and advanced-stats tables (they are merged in a later cell)
per_game_data = pd.read_csv('data/per_game_stats/2020_2021_all_players_per_game_stats.csv')
# Path fixed: the original had a doubled slash ('data//advanced_stats/...')
advanced_stats_data = pd.read_csv('data/advanced_stats/2020_2021_all_players_advanced_stats.csv')

# Remove extra rows of traded players
Players that get traded mid-season will have X+2 total rows (one per team plus a combined "TOT" row), i.e. X+1 extra rows, where X is the number of times they were traded  
Example: Traded once = 3 total rows (i.e. 2 extra rows)

In [7]:
# Traded players should be represented only by their "TOT" (total) row.
# 1. IDs (Player-additional) of every player that has a Tm == "TOT" row
traded_players = per_game_data.loc[per_game_data["Tm"] == "TOT", "Player-additional"]

# 2. Keep a row if its player was never traded, or if it is the "TOT" row itself
keep_per_game = ~per_game_data["Player-additional"].isin(traded_players) | (per_game_data["Tm"] == "TOT")
per_game_data = per_game_data[keep_per_game]

keep_advanced = ~advanced_stats_data["Player-additional"].isin(traded_players) | (advanced_stats_data["Tm"] == "TOT")
advanced_stats_data = advanced_stats_data[keep_advanced]

In [8]:
# Some column names exist in both tables but mean different things.
# e.g. MP in the per-game data is minutes played per game, while MP in the
#      advanced stats is total minutes played in the season.
# Suffix them per source (_pg = per-game, _as = advanced stats) so they do not collide.
# Applicable features: MP
colliding_features = ["MP"]
per_game_data = per_game_data.rename(
    columns={name: f"{name}_pg" for name in colliding_features}
)
advanced_stats_data = advanced_stats_data.rename(
    columns={name: f"{name}_as" for name in colliding_features}
)

In [9]:
# Merge
# Take from the per-game table only the columns the advanced table lacks,
# plus the key we join on, so shared columns are not duplicated.
per_game_only_cols = per_game_data.columns.difference(advanced_stats_data.columns).tolist()
uniq_cols = per_game_only_cols + ["Player-additional"]  # add merge-on target
merged = per_game_data[uniq_cols].merge(advanced_stats_data, on="Player-additional", how="outer")

In [10]:
# Features that will be used to fit the model.
# Dropped from the merged frame: 'Rk', 'Player', 'Tm' and the null/empty
# 'Unnamed: 19' / 'Unnamed: 24' columns.
# 'Player-additional' is kept (as the last column) because it is the player key.

# Columns that came from the per-game table
per_game_features = [
    '2P', '2P%', '2PA', '3P', '3P%', '3PA', 'AST', 'BLK', 'DRB', 'FG',
    'FG%', 'FGA', 'FT', 'FT%', 'FTA', 'GS', 'MP_pg', 'ORB', 'PF', 'PTS',
    'STL', 'TOV', 'TRB', 'eFG%',
]
# Columns that came from the advanced-stats table
advanced_table_features = [
    'Pos', 'Age', 'G', 'MP_as', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%',
    'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS',
    'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP',
]
cols_to_use = per_game_features + advanced_table_features + ["Player-additional"]
merged = merged.loc[:, cols_to_use]

# Create target column

In [11]:
# 1. Load next season's per-game stats
# 2. Calculate fantasy points for each row
# 3. Reduce to the player key + target so it can be merged into the previous season's df
next_season_per_game = pd.read_csv("data/per_game_stats/2021_2022_all_players_per_game_stats.csv")

# For players that got traded, use only rows where their team == "TOT" i.e. total
# Get players (Player-additional) that have Tm == "TOT"
traded_players = next_season_per_game[next_season_per_game["Tm"] == "TOT"]["Player-additional"]

# For each player found, remove rows where Tm != "TOT".
# .copy() gives an independent frame, so adding the target column below does not
# write into a filtered slice (which raises pandas' SettingWithCopyWarning).
is_partial_season_row = (
    next_season_per_game["Player-additional"].isin(traded_players)
    & (next_season_per_game["Tm"] != "TOT")
)
next_season_per_game = next_season_per_game[~is_partial_season_row].copy()

# Create target column: weighted sum of per-game box-score stats
fan_score_weights = {
    "point": 1.0,
    "rebound": 1.2,
    "assist": 1.5,
    "steal": 3.0,
    "block": 3.0,
    "turnover": -1.0,  # turnovers subtract from the score
}
next_season_per_game['avg_fantasy_points_next_season'] = (
    (next_season_per_game['PTS'] * fan_score_weights["point"])
    + (next_season_per_game['TRB'] * fan_score_weights["rebound"])
    + (next_season_per_game['AST'] * fan_score_weights["assist"])
    + (next_season_per_game['STL'] * fan_score_weights["steal"])
    + (next_season_per_game['BLK'] * fan_score_weights["block"])
    + (next_season_per_game['TOV'] * fan_score_weights["turnover"])
)
# Keep only the join key and the target
next_season_per_game = next_season_per_game[["Player-additional", "avg_fantasy_points_next_season"]]

In [12]:
# Attach the target; the inner join keeps only players that have rows in both frames
merged = pd.merge(merged, next_season_per_game, on="Player-additional", how="inner")

In [14]:
# Preview the five rows with the highest next-season fantasy average
merged.sort_values("avg_fantasy_points_next_season", ascending=False).head(5)

Unnamed: 0,2P,2P%,2PA,3P,3P%,3PA,AST,BLK,DRB,FG,FG%,FGA,FT,FT%,FTA,GS,MP_pg,ORB,PF,PTS,STL,TOV,TRB,eFG%,Pos,Age,G,MP_as,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Player-additional,avg_fantasy_points_next_season
227,8.9,0.606,14.7,1.3,0.388,3.3,8.3,0.7,8.0,10.2,0.566,18.0,4.8,0.868,5.5,72,34.6,2.8,2.7,26.4,1.3,3.1,10.8,0.602,C,25,72,2488,31.3,0.647,0.183,0.305,9.4,26.1,17.8,40.4,1.9,1.9,13.1,29.6,12.2,3.4,15.6,0.301,9.1,3.0,12.1,8.8,jokicni01,58.91
8,9.2,0.636,14.4,1.1,0.303,3.6,5.9,1.2,9.4,10.3,0.569,18.0,6.5,0.685,9.5,61,33.0,1.6,2.8,28.1,1.2,3.4,11.0,0.6,PF,26,61,2013,29.2,0.633,0.201,0.528,5.3,28.9,17.5,28.7,1.7,3.2,13.2,32.5,6.9,3.3,10.2,0.244,6.2,2.8,9.0,5.6,antetgi01,56.72
125,7.9,0.541,14.6,1.1,0.377,3.0,2.8,1.4,8.4,9.0,0.513,17.6,9.2,0.859,10.7,51,31.1,2.2,2.4,28.5,1.0,3.1,10.6,0.545,C,26,51,1585,30.3,0.636,0.171,0.61,8.0,29.1,18.7,16.2,1.5,3.9,12.2,35.3,5.6,3.2,8.8,0.266,6.3,1.2,7.5,3.8,embiijo01,55.64
111,6.9,0.567,12.2,2.9,0.35,8.3,8.6,0.5,7.2,9.8,0.479,20.5,5.2,0.73,7.1,66,34.3,0.8,2.3,27.7,1.0,4.3,8.0,0.55,PG,21,66,2262,25.3,0.587,0.406,0.349,2.7,22.9,12.8,44.1,1.4,1.5,15.3,36.0,5.1,2.6,7.7,0.163,6.1,0.7,6.8,5.1,doncilu01,53.27
217,7.1,0.591,12.0,2.3,0.365,6.3,7.8,0.6,7.0,9.4,0.513,18.3,4.0,0.698,5.7,45,33.4,0.6,1.6,25.0,1.1,3.7,7.7,0.576,PG,36,45,1504,24.2,0.602,0.346,0.31,2.2,23.6,12.9,41.8,1.6,1.5,15.2,31.9,3.0,2.6,5.6,0.179,5.9,2.3,8.1,3.8,jamesle01,53.14


In [15]:
# Data Quality Check: every player (Player-additional) must appear in at most one row
duplicate_mask = merged["Player-additional"].duplicated()
has_dupes = duplicate_mask.any()
if has_dupes:
    duplicate_error = "There are duplicate rows for a player. This likely means a player has multiple rows because they got traded, but our logic did not catch them"
    raise RuntimeError(duplicate_error)

# Template for ML Pipeline

In [None]:
# Template only: `data` and the `...` steps are placeholders to fill in before running.
# Every seeded step uses the notebook-level RANDOM_STATE so runs are reproducible.

# Create dataframe containing only the columns/features to use
...

# Separate features and target variable
X = data.drop('target_column_name', axis=1)  # Modify 'target_column_name' to your target column
y = data['target_column_name']

# Split the data into training and testing sets (80% train / 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

# Encode categorical cols
...

# Feature scaling (if necessary)
# Fit the scaler on the training split only, then apply the same transform to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize XGBoost regressor
xgb = XGBRegressor(objective='reg:squarederror', random_state=RANDOM_STATE)  # You can adjust hyperparameters here

# Fit the model on the training data
xgb.fit(X_train_scaled, y_train)

# Make predictions on the test set
predictions = xgb.predict(X_test_scaled)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")

# Cross-validation
# Define the model and perform cross-validation
# NOTE: this runs on the full, unscaled X/y rather than on the train split above
model = XGBRegressor(objective='reg:squarederror', random_state=RANDOM_STATE)
cv_scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')

# Convert scores to positive values (the scorer returns negated MSE)
cv_scores = -cv_scores

# Calculate mean and standard deviation of cross-validation scores
mean_cv_score = np.mean(cv_scores)
std_cv_score = np.std(cv_scores)

print(f"Mean Cross-Validation Score (MSE): {mean_cv_score}")
print(f"Standard Deviation of Cross-Validation Score (MSE): {std_cv_score}")
