In [None]:
# Only use TOT rows for players that moved teams

In [None]:
# Resources
# https://www.basketball-reference.com/leagues/NBA_2020_per_game.html
# https://www.basketball-reference.com/leagues/NBA_2020_advanced.html

For the first iteration, we will drop players based on games played (GP) and minutes per game (MPG). In the future, we should not drop these players, because a player's GP/MPG can change dramatically between seasons for reasons that we do not currently track (e.g. a new coach).

In [None]:
# Seed passed as `random_state` to the XGBoost regressor further down, so reruns are reproducible.
RANDOM_STATE = 30  # night night

In [None]:
# Import necessary libraries
import math

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

In [None]:
# Lift pandas' display truncation so every row and column of a frame is rendered.
for option_name in ("display.max_rows", "display.max_columns"):
    pd.set_option(option_name, None)

# Load Data

In [None]:
# Load the 2020-21 per-game and advanced-stats tables (they are merged in a later cell).
# Each CSV is expected to carry a "Player-additional" id column and a "Tm" team column,
# which the following cells rely on.
per_game_data = pd.read_csv('data/per_game_stats/2020_2021_all_players_per_game_stats.csv')
advanced_stats_data = pd.read_csv('data/advanced_stats/2020_2021_all_players_advanced_stats.csv')

# Remove extra rows of traded players
Players that get traded mid-season will have X+1 extra rows, where X is the number of times they were traded: one row per team they played for, plus a `TOT` (season total) row.  
Example: Traded once = 3 total rows (i.e. 2 extra rows)

In [None]:
# Traded players appear once per team plus once with Tm == "TOT" (season total).
# Keep only the "TOT" row for them; players who were never traded are left untouched.

# 1. Ids (Player-additional) of everyone who has a "TOT" row
traded_players = per_game_data.loc[per_game_data["Tm"] == "TOT", "Player-additional"]

# 2. Keep a row if the player was not traded OR the row is the "TOT" row
per_game_data = per_game_data[
    ~per_game_data['Player-additional'].isin(traded_players)
    | (per_game_data['Tm'] == "TOT")
]
advanced_stats_data = advanced_stats_data[
    ~advanced_stats_data['Player-additional'].isin(traded_players)
    | (advanced_stats_data['Tm'] == "TOT")
]

In [None]:
# Some column names exist in both tables but mean different things.
# Example: "MP" is minutes per game in the per-game table, but total minutes
# played over the season in the advanced-stats table.
# Suffix them per source ("_pg" / "_as") so both survive the merge.
colliding_features = ["MP"]
per_game_data = per_game_data.rename(
    columns={name: f"{name}_pg" for name in colliding_features}
)
advanced_stats_data = advanced_stats_data.rename(
    columns={name: f"{name}_as" for name in colliding_features}
)

In [None]:
# Merge the two tables on the player id. Only the per-game columns that the
# advanced-stats table does not already have are taken, plus the merge key.
uniq_cols = [
    *per_game_data.columns.difference(advanced_stats_data.columns),
    "Player-additional",  # merge-on target
]
merged = per_game_data[uniq_cols].merge(
    advanced_stats_data,
    how="outer",
    on="Player-additional",
)

In [None]:
# Columns carried forward for modelling.
# Compared with the full merged table, this leaves out 'Rk', 'Player', 'Tm' and the
# empty 'Unnamed: 19' / 'Unnamed: 24' columns. 'Player-additional' is kept (last)
# because it is still needed as a join key.
cols_to_use = [
    # per-game stats
    '2P', '2P%', '2PA', '3P', '3P%', '3PA', 'AST', 'BLK', 'DRB', 'FG',
    'FG%', 'FGA', 'FT', 'FT%', 'FTA', 'GS', 'MP_pg', 'ORB', 'PF', 'PTS',
    'STL', 'TOV', 'TRB', 'eFG%',
    # player info
    'Pos', 'Age', 'G',
    # advanced stats
    'MP_as', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%',
    'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%',
    'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP',
    # player id
    "Player-additional",
]
merged = merged.loc[:, cols_to_use]

# Create target column

In [None]:
# Plan for building the target:
#   1. Load next season's per-game stats
#   2. Calculate fantasy points for each row
#   3. Merge into the previous season's dataframe
next_season_per_game = pd.read_csv("data/per_game_stats/2021_2022_all_players_per_game_stats.csv")

# Players with a Tm == "TOT" (total) row were traded; for them keep only that row.
traded_players = next_season_per_game.loc[
    next_season_per_game["Tm"] == "TOT", "Player-additional"
]

# Keep a row if the player was not traded OR the row is the "TOT" row
next_season_per_game = next_season_per_game[
    ~next_season_per_game['Player-additional'].isin(traded_players)
    | (next_season_per_game['Tm'] == "TOT")
]

# Create target column: next season's average fantasy points per game, computed as a
# weighted sum of the per-game box-score stats.
fan_score_weights = {
    "point": 1.0,
    "rebound": 1.2,
    "assist": 1.5,
    "steal": 3.0,
    "block": 3.0,
    "turnover": -1.0,
}

# Use .assign() rather than `df[col] = ...`: next_season_per_game is the result of a
# boolean-mask filter, and setting a column on it directly can trigger pandas'
# SettingWithCopyWarning. Only the player id and the target are kept afterwards.
next_season_per_game = next_season_per_game.assign(
    avg_fantasy_points_next_season=(
        (next_season_per_game['PTS'] * fan_score_weights["point"])
        + (next_season_per_game['TRB'] * fan_score_weights["rebound"])
        + (next_season_per_game['AST'] * fan_score_weights["assist"])
        + (next_season_per_game['STL'] * fan_score_weights["steal"])
        + (next_season_per_game['BLK'] * fan_score_weights["block"])
        + (next_season_per_game['TOV'] * fan_score_weights["turnover"])
    )
)[["Player-additional", "avg_fantasy_points_next_season"]]

In [None]:
# Inspect which columns `merged` currently holds (run before the target column is joined in below).
merged.columns

In [None]:
# Attach next season's fantasy average to each player. The inner join keeps only
# players that appear in both seasons.
merged = pd.merge(
    merged,
    next_season_per_game,
    on="Player-additional",
    how="inner",
)

In [None]:
# Preview the five rows with the highest next-season fantasy average
(
    merged
    .sort_values(by="avg_fantasy_points_next_season", ascending=False)
    .head(5)
)

In [None]:
# Data quality check: each player id must appear exactly once in `merged`.
has_dupes = not merged["Player-additional"].is_unique
if has_dupes:
    raise RuntimeError(
        "There are duplicate rows for a player. This likely means a player has multiple rows because they got traded, but our logic did not catch them"
    )

# Template for ML Pipeline

In [None]:
# Decide which features are categorical: select the columns whose dtype is
# object or category (i.e. non-numeric) and list them.
non_numeric_features = merged.select_dtypes(include=['object', 'category'])
features = list(non_numeric_features.columns)

# Build the newline-joined list outside the f-string: a backslash inside an
# f-string replacement field is a SyntaxError before Python 3.12.
feature_names = "\n".join(features)
print(
    f"There are {len(features)} features with object|category dtype\n"
    f"{feature_names}"
)


Pos (i.e. Position) will definitely be OHE (One-Hot Encoded). I will also try a target encoding.  
Player-additional will be dropped before training.

In [None]:
# Save the merged feature + target table to disk (the dataframe index is written as the first column).
merged.to_csv(path_or_buf="merged.csv")

In [None]:
# Work on a copy so `merged` itself stays untouched
data = merged.copy()

# Target: next season's average fantasy points
y = data['avg_fantasy_points_next_season']

# Features: everything except the target, the player id, and the
# (not yet encoded) categorical position column
X = data.drop(columns=['avg_fantasy_points_next_season', "Player-additional", "Pos"])

In [None]:
# Split the data into training (80%) and testing (20%) sets.
# Seeded with the notebook-wide RANDOM_STATE instead of a separate hard-coded value,
# so every stochastic step in the notebook is controlled by one constant.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

In [None]:
# Standardise the features. The scaler is fitted on the training split only and
# then applied to both splits, so no test-set statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Inspect the scaled training features produced by the scaler above
X_train_scaled

In [None]:
# XGBoost regressor with a squared-error objective; remaining hyperparameters are library defaults
xgb = XGBRegressor(
    objective='reg:squarederror',
    random_state=RANDOM_STATE,
)


In [None]:
# --- Hold-out evaluation ---
# Fit the model on the (scaled) training data
xgb.fit(X_train_scaled, y_train)

# Make predictions on the (scaled) test set
predictions = xgb.predict(X_test_scaled)

# Evaluate the model on the held-out rows
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")

# --- Cross-validation ---
# A fresh, unfitted model is scored with 5-fold CV on the full (unscaled) X / y.
# It is seeded with RANDOM_STATE so it matches the hold-out model's configuration
# and the CV scores are reproducible.
model = XGBRegressor(objective='reg:squarederror', random_state=RANDOM_STATE)
cv_scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')

# scikit-learn reports negated MSE (higher is better); flip the sign back to plain MSE
cv_scores = -cv_scores

# Calculate mean and standard deviation of the per-fold MSE
mean_cv_score = np.mean(cv_scores)
std_cv_score = np.std(cv_scores)

print(f"Mean Cross-Validation Score (MSE): {mean_cv_score}")
print(f"Standard Deviation of Cross-Validation Score (MSE): {std_cv_score}")


In [None]:
# Root mean squared error (RMSE) of the hold-out predictions, in the same units as the target.
# Note: this is the square root of the mean squared error, not the mean absolute difference,
# so large misses are weighted more heavily.
print(math.sqrt(mse))

In [None]:
# Put the actual test-set targets (y_test) and the model's predictions side by side.
# The frame takes its index from y_test, so each row can be traced back to `merged`.
comparison_df = pd.DataFrame({'Actual': y_test, 'Predicted': predictions})

# Leave the frame as the cell's last expression so the notebook renders it as a
# rich table instead of plain print() text.
comparison_df
