### Overview
The purpose of this exercise is to rank NBA seasons and determine which player has had the best season ever.

To-dos:
- Add player positions
- Bias weighting by position

### Import data

In [57]:
# Import packages

import os
from concurrent.futures import ThreadPoolExecutor, as_completed

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [58]:
# Box-score columns tracked throughout the analysis, grouped by category.
# The order of the entries is preserved from the original definition.
stats = [
    'min',                          # playing time
    'fgm', 'fga',                   # field goals made / attempted
    'fg3m', 'fg3a',                 # three-pointers made / attempted
    'ftm', 'fta',                   # free throws made / attempted
    'oreb', 'dreb', 'reb',          # rebounds
    'ast', 'stl', 'tov', 'blk',     # assists, steals, turnovers, blocks
    'pts',                          # points
]

# Identifying columns for a player-season row
player_names_header = [
    'player_id',
    'player_name',
    'season_id',
]

In [59]:
# Read in per-season stats and player information.
# The data directory defaults to the original local path, but can be overridden
# by setting the NBA_DATA_DIR environment variable so the notebook can run on
# other machines without editing this cell.
data_dir = os.environ.get('NBA_DATA_DIR', '/Users/YouCanCallMeAll/code/nbadata')
career_stats_csv = os.path.join(data_dir, 'df_player_career_stats.csv')
player_info_csv = os.path.join(data_dir, 'df_player_info.csv')

# header=0: the first row of each CSV holds the column names
df_stats = pd.read_csv(career_stats_csv, header=0)
df_player_info = pd.read_csv(player_info_csv, header=0)

In [60]:
# Survey data
# df_stats
# df_player_info

### Scrub data

In [61]:
# Scrub the raw frames: normalise headers and positions, filter out small or
# early seasons, convert totals to per-game values, and attach player names.
#
# NOTE: this cell rebinds df_stats from itself (per-game division, merge), so it
# is not idempotent -- re-run the loading cell before running it a second time.

# Thresholds used by the filters below
MIN_GAMES_PLAYED = 10            # seasons with fewer games are dropped
FIRST_SEASON_START_YEAR = 1978   # compared against the year before the '-' in season_id

# Convert all headers to lowercase
df_stats.columns = df_stats.columns.str.lower()
df_player_info.columns = df_player_info.columns.str.lower()

# Rename the person_id column in df_player_info to player_id
df_player_info = df_player_info.rename(columns={'person_id': 'player_id'})

# Collapse hybrid positions onto the first-listed position.
# Any position value not present in this mapping becomes NaN after .map().
position_mapping = {
    'Guard': 'Guard',
    'Forward': 'Forward',
    'Center': 'Center',
    'Guard-Forward': 'Guard',
    'Forward-Guard': 'Forward',
    'Forward-Center': 'Forward',
    'Center-Forward': 'Center'
}

# Apply the mapping to the position column
df_player_info['position'] = df_player_info['position'].map(position_mapping)

# Drop rows where every column is null
df_stats = df_stats.dropna(how='all')

# Remove any seasons with fewer than MIN_GAMES_PLAYED games played.
# .copy() makes the filtered frame independent so the column assignments below
# do not trigger SettingWithCopyWarning.
df_stats = df_stats[df_stats['gp'] >= MIN_GAMES_PLAYED].copy()

# Remove any seasons before 1978, when they began tracking turnovers.
# NOTE(review): the comparison is on the season's *start* year, so a season_id
# of '1977-78' is excluded -- confirm this is the intended cutoff.
season_start_year = df_stats['season_id'].str.split('-').str[0].astype(int)
recent_enough = season_start_year >= FIRST_SEASON_START_YEAR

# Keep only seasons whose start year is FIRST_SEASON_START_YEAR or later
df_stats = df_stats.loc[recent_enough].copy()

# Rebuild season_id in 'YYYY-YY' form from the parsed start year
df_stats['season_id'] = season_start_year[recent_enough].map(
    lambda year: f"{year}-{str(year + 1)[-2:]}"
)

# Convert each tracked stat to per game by dividing by games played.
# Columns not listed in `stats` are left as season totals.
for stat in stats:
    if stat in df_stats.columns:
        df_stats[stat] = df_stats[stat] / df_stats['gp']

# Add player names and positions to the stats dataframe.
# The outer merge keeps players without any stats rows; those rows have a null
# season_id and are removed by the final dropna below.
df_stats = (
    pd.merge(
        df_player_info[['player_id', 'display_first_last', 'position']],
        df_stats,
        on=['player_id'],
        how='outer',
    )
    .drop_duplicates()
    .reset_index(drop=True)
    .dropna(how='all')
    .dropna(subset=['season_id'])
)

df_stats


Unnamed: 0,player_id,display_first_last,position,season_id,league_id,team_id,team_abbreviation,player_age,gp,gs,...,ft_pct,oreb,dreb,reb,ast,stl,blk,tov,pf,pts
0,2,Byron Scott,Guard,1983-84,0.0,1.610613e+09,LAL,23.0,74.0,49.0,...,0.806,0.675676,1.540541,2.216216,2.391892,1.094595,0.256757,1.567568,174.0,10.648649
1,2,Byron Scott,Guard,1984-85,0.0,1.610613e+09,LAL,24.0,81.0,65.0,...,0.820,0.703704,1.888889,2.592593,3.012346,1.234568,0.209877,1.703704,197.0,15.987654
2,2,Byron Scott,Guard,1985-86,0.0,1.610613e+09,LAL,25.0,76.0,62.0,...,0.784,0.723684,1.763158,2.486842,2.157895,1.118421,0.197368,1.447368,167.0,15.447368
3,2,Byron Scott,Guard,1986-87,0.0,1.610613e+09,LAL,26.0,82.0,82.0,...,0.892,0.768293,2.719512,3.487805,3.426829,1.524390,0.219512,1.756098,163.0,17.036585
4,2,Byron Scott,Guard,1987-88,0.0,1.610613e+09,LAL,27.0,81.0,81.0,...,0.858,0.938272,3.172840,4.111111,4.135802,1.913580,0.333333,1.987654,204.0,21.654321
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23515,1642377,Jaylen Wells,Forward,2024-25,0.0,1.610613e+09,MEM,21.0,35.0,30.0,...,0.807,1.057143,2.200000,3.257143,1.685714,0.485714,0.142857,1.171429,64.0,11.800000
23524,1642402,Enrique Freeman,Forward,2024-25,0.0,1.610613e+09,IND,24.0,17.0,1.0,...,0.556,0.235294,0.588235,0.823529,0.352941,0.117647,0.117647,0.235294,15.0,1.529412
23525,1642403,Isaac Jones,Forward,2024-25,0.0,1.610613e+09,SAC,24.0,21.0,0.0,...,0.550,0.476190,1.000000,1.476190,0.190476,0.095238,0.380952,0.523810,22.0,3.619048
23527,1642419,Jamison Battle,Forward,2024-25,0.0,1.610613e+09,TOR,23.0,31.0,0.0,...,0.857,0.612903,1.290323,1.903226,0.967742,0.129032,0.129032,0.387097,36.0,6.000000


### Normalize data

We need to normalize our data. The modern NBA features more attempts and higher scoring totals due to the introduction of the three-point line and changes in strategy that have led to its wider adoption. Additionally, certain positions will have higher statistics than others (more rebounds for C/F, more assists for G).

We're going to:
- Calculate the mean and standard deviation for each statistic across all seasons
- For each statistic, subtract the mean and divide by the standard deviation

In [62]:
# Z-score normalization of every tracked stat across all remaining seasons.
# Restrict to stats that actually exist as columns so that the mean/std
# computation and the loop below agree (indexing with a missing column would
# otherwise raise a KeyError before the loop's own guard could apply).
available_stats = [stat for stat in stats if stat in df_stats.columns]

# Column-wise mean and standard deviation (pandas' default std is the sample
# standard deviation, ddof=1)
means = df_stats[available_stats].mean()
stds = df_stats[available_stats].std()

# Add one '<stat>_zscore' column per stat: (value - mean) / std
for stat in available_stats:
    df_stats[stat + '_zscore'] = (df_stats[stat] - means[stat]) / stds[stat]

df_stats

Unnamed: 0,player_id,display_first_last,position,season_id,league_id,team_id,team_abbreviation,player_age,gp,gs,...,ftm_zscore,fta_zscore,oreb_zscore,dreb_zscore,reb_zscore,ast_zscore,stl_zscore,tov_zscore,blk_zscore,pts_zscore
0,2,Byron Scott,Guard,1983-84,0.0,1.610613e+09,LAL,23.0,74.0,49.0,...,-0.076413,-0.157383,-0.449193,-0.637958,-0.607393,0.201887,0.869230,0.330960,-0.357310,0.302382
1,2,Byron Scott,Guard,1984-85,0.0,1.610613e+09,LAL,24.0,81.0,65.0,...,0.490159,0.373464,-0.416234,-0.442043,-0.456627,0.534524,1.175823,0.498340,-0.452048,1.195060
2,2,Byron Scott,Guard,1985-86,0.0,1.610613e+09,LAL,25.0,76.0,62.0,...,0.138975,0.090576,-0.392738,-0.512755,-0.498988,0.076437,0.921418,0.183175,-0.477325,1.104725
3,2,Byron Scott,Guard,1986-87,0.0,1.610613e+09,LAL,26.0,82.0,82.0,...,0.791616,0.513007,-0.340281,0.025109,-0.098030,0.756736,1.810642,0.562758,-0.432576,1.370441
4,2,Byron Scott,Guard,1987-88,0.0,1.610613e+09,LAL,27.0,81.0,81.0,...,1.237901,0.996332,-0.140396,0.280065,0.151649,1.136830,2.663113,0.847457,-0.202561,2.142523
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23515,1642377,Jaylen Wells,Forward,2024-25,0.0,1.610613e+09,MEM,21.0,35.0,30.0,...,-0.218373,-0.298994,-0.000611,-0.267071,-0.190427,-0.176708,-0.464445,-0.156093,-0.587483,0.494887
23524,1642402,Enrique Freeman,Forward,2024-25,0.0,1.610613e+09,IND,24.0,17.0,1.0,...,-0.945298,-0.922085,-0.967054,-1.173544,-1.165264,-0.891233,-1.270649,-1.307072,-0.638429,-1.222349
23525,1642403,Isaac Jones,Forward,2024-25,0.0,1.610613e+09,SAC,24.0,21.0,0.0,...,-0.781630,-0.682312,-0.683775,-0.941963,-0.903827,-0.978333,-1.319733,-0.952342,-0.106331,-0.872963
23527,1642419,Jamison Battle,Forward,2024-25,0.0,1.610613e+09,TOR,23.0,31.0,0.0,...,-0.879045,-0.966188,-0.523009,-0.778683,-0.732768,-0.561626,-1.245711,-1.120430,-0.615421,-0.474870


### Combining Z-scores into a single score
To determine the best season of all time, we can combine the Z-scores of different statistics into a single score. We will use a weighted sum of the Z-scores, where the weights reflect the importance of each statistic.

In [72]:
# Weight applied to each z-scored statistic when building the combined score.
# Turnovers carry a negative weight so that more turnovers lower the score.
stats_weights = dict(
    fgm_zscore=0.1,
    fga_zscore=0.05,
    fg3m_zscore=0.1,
    fg3a_zscore=0.05,
    ftm_zscore=0.05,
    fta_zscore=0.05,
    oreb_zscore=0.05,
    dreb_zscore=0.1,
    ast_zscore=0.1,
    stl_zscore=0.1,
    tov_zscore=-0.05,
    blk_zscore=0.1,
    pts_zscore=0.15,
)

# Accumulate the weighted z-scores column by column, in dictionary order
weighted_total = 0
for zscore_column, weight in stats_weights.items():
    weighted_total = weighted_total + df_stats[zscore_column] * weight
df_stats['combined_score'] = weighted_total

# Row holding the highest combined score
top_index = df_stats['combined_score'].idxmax()
best_season = df_stats.loc[top_index]

# Order seasons from highest to lowest combined score
df_stats = df_stats.sort_values(by='combined_score', ascending=False)

df_stats.head(25)

Unnamed: 0,player_id,display_first_last,position,season_id,league_id,team_id,team_abbreviation,player_age,gp,gs,...,fta_zscore,oreb_zscore,dreb_zscore,reb_zscore,ast_zscore,stl_zscore,tov_zscore,blk_zscore,pts_zscore,combined_score
17172,201935,James Harden,Guard,2018-19,0.0,1610613000.0,HOU,29.0,78.0,78.0,...,5.01348,-0.248721,1.754724,1.165062,2.947313,2.908567,4.503852,0.626502,4.562548,3.262626
17173,201935,James Harden,Guard,2019-20,0.0,1610613000.0,HOU,30.0,68.0,68.0,...,5.446977,-0.033221,1.605426,1.132134,2.956208,2.498079,3.972551,0.90692,4.263266,3.116665
21958,1629029,Luka Dončić,Forward,2023-24,0.0,1610613000.0,DAL,25.0,70.0,70.0,...,3.701558,-0.252598,3.219876,2.207283,4.173513,1.569472,3.356767,0.220853,4.182828,3.029286
16621,201566,Russell Westbrook,Guard,2016-17,0.0,1610613000.0,OKC,28.0,81.0,81.0,...,4.656556,0.745186,3.543435,2.777621,4.479299,2.041155,5.052048,-0.102767,3.802132,2.755781
20159,203999,Nikola Jokić,Center,2024-25,0.0,1610613000.0,DEN,29.0,30.0,30.0,...,2.462521,2.832838,3.95101,3.779051,4.119901,2.195292,2.460988,0.268968,3.710688,2.744281
20071,203954,Joel Embiid,Center,2023-24,0.0,1610613000.0,PHI,30.0,39.0,39.0,...,5.347794,1.620718,3.32659,2.921416,1.930062,1.055176,3.132485,2.54371,4.322467,2.733637
23349,1641705,Victor Wembanyama,Forward,2024-25,0.0,1610613000.0,SAS,21.0,30.0,30.0,...,1.272073,0.872942,3.501081,2.790974,0.974671,0.735045,2.788855,6.937745,2.818958,2.687695
4190,893,Michael Jordan,Guard,1986-87,0.0,1610613000.0,CHI,24.0,82.0,82.0,...,5.497402,1.136811,0.306314,0.605414,1.384388,4.775657,2.481979,2.204376,4.722585,2.683725
21957,1629029,Luka Dončić,Forward,2022-23,0.0,1610613000.0,DAL,24.0,66.0,66.0,...,4.73863,-0.281614,2.884134,1.958269,3.216622,1.458531,2.800032,0.134246,3.938181,2.666128
3935,764,David Robinson,Center,1993-94,0.0,1610613000.0,SAN,28.0,80.0,80.0,...,5.33235,2.298766,2.812128,2.785967,1.472815,2.277431,2.291932,5.817862,3.502386,2.656225
