### Overview
The purpose of this exercise is to rank NBA seasons and determine which player has had the best season ever.

To-dos:
- Watch this video
- Add player names
- Add player positions
- Bias weighting by position

### Import data

In [72]:
# Import packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [73]:
# Box-score statistics tracked for every season; grouped here by category
stats = [
    'min',
    'fgm', 'fga',
    'fg3m', 'fg3a',
    'ftm', 'fta',
    'oreb', 'dreb',
    'ast', 'stl', 'tov', 'blk',
    'pts',
]

# Columns whose combinations define the groups used for normalization
bias_factors = [
    'season_id',
    'position',
]

# Header names identifying a player/season record
player_names_header = [
    'player_id',
    'player_name',
    'season_id',
]

In [74]:
# Paths of the two input files: per-season career stats and player information
stats_csv = 'df_player_career_stats.csv'
player_name_csv = 'df_player_info.csv'

# Load both CSVs, using the first row of each file as the column header
df_stats, df_player_info = (
    pd.read_csv(csv_path, header=0)
    for csv_path in (stats_csv, player_name_csv)
)

### Scrub data

In [75]:
# Convert all headers to lowercase
df_stats.columns = df_stats.columns.str.lower()
df_player_info.columns = df_player_info.columns.str.lower()

# Rename the person_id column in df_player_info to player_id, the key used for the merge below
df_player_info = df_player_info.rename(columns={'person_id': 'player_id'})

# Define a mapping for the positions: hybrid positions collapse to the first-listed position
position_mapping = {
    'Guard': 'Guard',
    'Forward': 'Forward',
    'Center': 'Center',
    'Guard-Forward': 'Guard',
    'Forward-Guard': 'Forward',
    'Forward-Center': 'Forward',
    'Center-Forward': 'Center'
}

# Apply the mapping to the position column (values absent from the mapping become NaN)
df_player_info['position'] = df_player_info['position'].map(position_mapping)

# Drop rows in which every value is null
df_stats = df_stats.dropna(how='all')

# Remove any seasons with fewer than 10 games played.
# .copy() makes the filtered result an independent frame, so the column
# assignments further down do not write into a slice of the loaded data
# (the pattern that raises SettingWithCopyWarning).
df_stats = df_stats[df_stats['gp'] >= 10].copy()

# Remove any seasons before 1979, when the three point shot was introduced
# Temporarily replace season_id (e.g. '1983-84') with its integer start year so it can be compared
df_stats['season_id'] = df_stats['season_id'].apply(lambda season: int(season.split('-')[0]))

# Filter the DataFrame to keep only seasons from 1979-1980 onwards
df_stats = df_stats[df_stats['season_id'] >= 1979].copy()

# Convert the season_id back to the 'YYYY-YY' format
df_stats['season_id'] = df_stats['season_id'].apply(lambda year: f"{year}-{str(year + 1)[-2:]}")

# Convert each stat to per game by dividing by games played
for stat in stats:
    if stat in df_stats.columns:
        df_stats[stat] = df_stats[stat] / df_stats['gp']

# Add player names and positions to the stats dataframe
df_stats = pd.merge(
    df_player_info[['player_id', 'display_first_last', 'position']]
    , df_stats
    , on=['player_id']
    , how='outer'
).drop_duplicates()

df_stats = df_stats.reset_index(drop=True)

# The outer join also keeps players that have no remaining stats rows; those rows
# have no season_id and are dropped here (this also covers fully-null rows)
df_stats = df_stats.dropna(subset=['season_id'])

df_stats


Unnamed: 0,player_id,display_first_last,position,season_id,league_id,team_id,team_abbreviation,player_age,gp,gs,...,ft_pct,oreb,dreb,reb,ast,stl,blk,tov,pf,pts
0,2,Byron Scott,Guard,1983-84,0.0,1.610613e+09,LAL,23.0,74.0,49.0,...,0.806,0.675676,1.540541,164.0,2.391892,1.094595,0.256757,1.567568,174.0,10.648649
1,2,Byron Scott,Guard,1984-85,0.0,1.610613e+09,LAL,24.0,81.0,65.0,...,0.820,0.703704,1.888889,210.0,3.012346,1.234568,0.209877,1.703704,197.0,15.987654
2,2,Byron Scott,Guard,1985-86,0.0,1.610613e+09,LAL,25.0,76.0,62.0,...,0.784,0.723684,1.763158,189.0,2.157895,1.118421,0.197368,1.447368,167.0,15.447368
3,2,Byron Scott,Guard,1986-87,0.0,1.610613e+09,LAL,26.0,82.0,82.0,...,0.892,0.768293,2.719512,286.0,3.426829,1.524390,0.219512,1.756098,163.0,17.036585
4,2,Byron Scott,Guard,1987-88,0.0,1.610613e+09,LAL,27.0,81.0,81.0,...,0.858,0.938272,3.172840,333.0,4.135802,1.913580,0.333333,1.987654,204.0,21.654321
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23245,1642377,Jaylen Wells,Forward,2024-25,0.0,1.610613e+09,MEM,21.0,35.0,30.0,...,0.807,1.057143,2.200000,114.0,1.685714,0.485714,0.142857,1.171429,64.0,11.800000
23254,1642402,Enrique Freeman,Forward,2024-25,0.0,1.610613e+09,IND,24.0,17.0,1.0,...,0.556,0.235294,0.588235,14.0,0.352941,0.117647,0.117647,0.235294,15.0,1.529412
23255,1642403,Isaac Jones,Forward,2024-25,0.0,1.610613e+09,SAC,24.0,21.0,0.0,...,0.550,0.476190,1.000000,31.0,0.190476,0.095238,0.380952,0.523810,22.0,3.619048
23257,1642419,Jamison Battle,Forward,2024-25,0.0,1.610613e+09,TOR,23.0,31.0,0.0,...,0.857,0.612903,1.290323,59.0,0.967742,0.129032,0.129032,0.387097,36.0,6.000000


### Normalize data

We need to normalize our data. The modern NBA features more attempts and higher scoring totals due to the introduction of the three-point line and changes in strategy that have led to its wider adoption. Additionally, certain positions will have higher statistics than others (more rebounds for C/F, more assists for G).

We're going to:
- Calculate the mean and standard deviation for each statistic within each season and position group
- For each statistic, subtract the group mean and divide by the group standard deviation

In [76]:
# Create a grouped dataframe based on bias_factors with the mean and std for each stat
df_bias_factors = df_stats.groupby(bias_factors)[stats].agg(['mean', 'std'])

# Flatten the (stat, aggregate) column pairs into single names such as 'pts_mean' / 'pts_std'.
# This is done while the group keys are still the index, so the key columns keep the
# names listed in bias_factors instead of being hard-coded here.
df_bias_factors.columns = ['_'.join(col).strip() for col in df_bias_factors.columns]

# Turn the group keys back into ordinary columns
df_bias_factors = df_bias_factors.reset_index()
df_bias_factors

Unnamed: 0,season_id,position,min_mean,min_std,fgm_mean,fgm_std,fga_mean,fga_std,fg3m_mean,fg3m_std,...,ast_mean,ast_std,stl_mean,stl_std,tov_mean,tov_std,blk_mean,blk_std,pts_mean,pts_std
0,1979-80,Center,24.133794,8.703277,4.028446,2.424825,8.082661,4.453865,0.005618,0.013447,...,1.918527,1.143833,0.681484,0.351239,1.865660,0.864340,1.051571,0.715975,10.071623,6.222841
1,1979-80,Forward,23.144746,8.857601,4.202905,2.424266,8.694114,4.763910,0.042683,0.120805,...,1.886227,1.275407,0.800935,0.472037,1.788946,0.915677,0.534893,0.468456,10.542161,6.003370
2,1979-80,Guard,22.355048,8.917220,4.035315,2.384479,8.772053,4.810455,0.133864,0.200062,...,3.134136,1.881132,1.007293,0.602034,1.766599,0.845442,0.204371,0.181464,10.104354,6.033310
3,1980-81,Center,22.826354,8.903524,3.707132,2.346863,7.463014,4.368307,0.002723,0.006899,...,1.664401,1.045011,0.581173,0.287800,1.661969,0.803098,0.962958,0.759261,9.376271,6.126573
4,1980-81,Forward,22.394301,9.617331,4.029023,2.485139,8.358143,4.793612,0.018548,0.045414,...,1.768859,1.299705,0.800741,0.502792,1.668122,0.847100,0.524230,0.479337,10.088625,6.155697
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,2023-24,Forward,19.444041,9.253820,3.251339,2.385509,6.818229,4.703230,0.937185,0.772549,...,1.684807,1.492798,0.581218,0.356753,0.978749,0.750288,0.449476,0.420432,8.749694,6.472535
134,2023-24,Guard,20.650540,9.185211,3.346332,2.300027,7.606168,4.904283,1.271671,0.899909,...,2.741346,2.022411,0.705938,0.385671,1.104862,0.771685,0.271657,0.218255,9.270839,6.493031
135,2024-25,Center,20.540214,9.074854,3.618999,2.440687,6.685312,4.558715,0.490822,0.689403,...,1.714036,1.532076,0.595965,0.358500,1.208329,0.763131,0.934000,0.552075,9.261514,6.406823
136,2024-25,Forward,21.122832,9.368980,3.572164,2.466014,7.617257,5.008177,1.139423,0.871707,...,1.856709,1.597286,0.694113,0.410069,1.143139,0.851411,0.489904,0.451084,9.777612,6.821015


In [77]:
# Attach the per-group mean/std columns to every season row by matching on the
# bias factors; the left join keeps every row of df_stats
df_stats = df_stats.merge(df_bias_factors, how='left', on=bias_factors)

In [78]:
# Normalize each statistic by subtracting the mean and dividing by the standard deviation
# as calculated by bias factors
for stat in stats:
    group_mean = df_stats[stat + '_mean']
    group_std = df_stats[stat + '_std']

    # Divide only by non-zero spreads; a zero std is replaced by NaN for the division
    # so it cannot produce inf or a 0/0 NaN
    zscore = (df_stats[stat] - group_mean) / group_std.where(group_std != 0)

    # A std of exactly zero means every value in the group equals the group mean,
    # so the deviation from the mean is zero and the z-score is set to 0.
    # Rows whose std is NaN (no usable group) keep a NaN z-score.
    df_stats[stat + '_zscore'] = zscore.mask(group_std == 0, 0.0)

In [79]:
# Remove every column whose name contains '_mean' or '_std'
baseline_cols = [
    name
    for name in df_stats.columns
    if any(tag in name for tag in ('_mean', '_std'))
]
df_stats.drop(columns=baseline_cols, inplace=True)

### Combining Z-scores into a single score
To determine the best season of all time, we can combine the Z-scores of different statistics into a single score. We will use a weighted sum of the Z-scores, where the weights reflect the importance of each statistic.

In [80]:
# Weight given to each z-scored statistic when building the combined score
stats_weights = dict(
    fgm_zscore=0.1,
    fga_zscore=0.05,
    fg3m_zscore=0.1,
    fg3a_zscore=0.05,
    ftm_zscore=0.05,
    fta_zscore=0.05,
    oreb_zscore=0.05,
    dreb_zscore=0.1,
    ast_zscore=0.1,
    stl_zscore=0.1,
    tov_zscore=-0.05,  # turnovers count against the score
    blk_zscore=0.1,
    pts_zscore=0.15,
)

# Accumulate the weighted z-scores one statistic at a time, in dictionary order
weighted_total = 0
for zscore_col, weight in stats_weights.items():
    weighted_total = weighted_total + df_stats[zscore_col] * weight
df_stats['combined_score'] = weighted_total

# Rank the seasons so that the highest combined score receives rank 1
df_stats['rank'] = df_stats['combined_score'].rank(ascending=False)

# Remove every column with 'zscore' in its name
df_stats.drop(columns=df_stats.filter(like='zscore').columns, inplace=True)

# Remove the remaining columns that are not wanted in the final table
unwanted_cols = ['league_id', 'team_id', 'min', 'reb', 'pf']
df_stats.drop(columns=unwanted_cols, inplace=True)

# Order the table from best rank to worst
df_stats.sort_values(by='rank', ascending=True, inplace=True)

# Display the top 25 seasons
df_stats.head(25)

Unnamed: 0,player_id,display_first_last,position,season_id,team_abbreviation,player_age,gp,gs,fgm,fga,...,ft_pct,oreb,dreb,ast,stl,blk,tov,pts,combined_score,rank
15398,201935,James Harden,Guard,2018-19,HOU,29.0,78.0,78.0,10.807692,24.474359,...,0.879,0.846154,5.794872,7.512821,2.025641,0.74359,4.961538,36.128205,3.679343,1.0
15399,201935,James Harden,Guard,2019-20,HOU,30.0,68.0,68.0,9.882353,22.264706,...,0.865,1.029412,5.529412,7.529412,1.838235,0.882353,4.529412,34.338235,3.231965,2.0
4183,893,Michael Jordan,Guard,1987-88,CHI,25.0,82.0,82.0,13.036585,24.365854,...,0.841,1.695122,3.780488,5.914634,3.158537,1.597561,3.073171,34.97561,3.185964,3.0
14853,201566,Russell Westbrook,Guard,2016-17,OKC,28.0,81.0,81.0,10.17284,23.962963,...,0.845,1.691358,8.975309,10.37037,1.62963,0.382716,5.407407,31.580247,3.159766,4.0
4182,893,Michael Jordan,Guard,1986-87,CHI,24.0,82.0,82.0,13.390244,27.792683,...,0.857,2.02439,3.219512,4.597561,2.878049,1.52439,3.317073,37.085366,3.029032,5.0
20057,1629029,Luka Dončić,Forward,2023-24,DAL,25.0,70.0,70.0,11.485714,23.6,...,0.786,0.842857,8.4,9.8,1.414286,0.542857,4.028571,33.857143,3.027615,6.0
15397,201935,James Harden,Guard,2017-18,HOU,28.0,72.0,72.0,9.041667,20.125,...,0.858,0.569444,4.833333,8.75,1.75,0.694444,4.375,30.430556,2.937679,7.0
15979,202326,DeMarcus Cousins,Center,2015-16,SAC,25.0,65.0,65.0,9.246154,20.492308,...,0.718,2.430769,9.061538,3.292308,1.553846,1.415385,3.830769,26.892308,2.936409,8.0
5525,1449,Larry Bird,Forward,1984-85,BOS,28.0,80.0,77.0,11.475,22.0,...,0.882,2.05,8.475,6.6375,1.6125,1.225,3.1,28.6875,2.91683,9.0
15396,201935,James Harden,Guard,2016-17,HOU,27.0,81.0,81.0,8.320988,18.925926,...,0.847,1.17284,6.962963,11.197531,1.493827,0.469136,5.728395,29.08642,2.900106,10.0


In [82]:
# Select every row whose display name matches the player being looked up
is_selected_player = df_stats['display_first_last'].eq('Giannis Antetokounmpo')
search = df_stats.loc[is_selected_player]
search

Unnamed: 0,player_id,display_first_last,position,season_id,team_abbreviation,player_age,gp,gs,fgm,fga,...,ft_pct,oreb,dreb,ast,stl,blk,tov,pts,combined_score,rank
17774,203507,Giannis Antetokounmpo,Forward,2021-22,MIL,27.0,67.0,67.0,10.283582,18.58209,...,0.722,2.0,9.61194,5.791045,1.074627,1.358209,3.268657,29.880597,2.571072,38.0
17771,203507,Giannis Antetokounmpo,Forward,2018-19,MIL,24.0,72.0,72.0,10.013889,17.319444,...,0.729,2.208333,10.263889,5.888889,1.277778,1.527778,3.722222,27.694444,2.455786,67.0
17772,203507,Giannis Antetokounmpo,Forward,2019-20,MIL,25.0,63.0,63.0,10.873016,19.650794,...,0.633,2.222222,11.365079,5.619048,0.968254,1.047619,3.650794,29.47619,2.438506,69.0
17773,203507,Giannis Antetokounmpo,Forward,2020-21,MIL,26.0,61.0,61.0,10.262295,18.032787,...,0.685,1.590164,9.409836,5.852459,1.180328,1.196721,3.393443,28.147541,2.32736,102.0
17776,203507,Giannis Antetokounmpo,Forward,2023-24,MIL,29.0,73.0,73.0,11.465753,18.753425,...,0.657,2.684932,8.835616,6.520548,1.191781,1.082192,3.424658,30.438356,2.325535,104.0
17770,203507,Giannis Antetokounmpo,Forward,2017-18,MIL,23.0,75.0,75.0,9.893333,18.693333,...,0.76,2.08,7.96,4.813333,1.453333,1.413333,2.973333,26.853333,2.324949,105.0
17769,203507,Giannis Antetokounmpo,Forward,2016-17,MIL,22.0,80.0,80.0,8.2,15.7375,...,0.77,1.775,6.975,5.425,1.6375,1.8875,2.925,22.9,2.221558,140.0
17775,203507,Giannis Antetokounmpo,Forward,2022-23,MIL,28.0,63.0,63.0,11.222222,20.285714,...,0.645,2.174603,9.603175,5.698413,0.825397,0.809524,3.904762,31.095238,2.181796,156.0
17777,203507,Giannis Antetokounmpo,Forward,2024-25,MIL,30.0,26.0,26.0,12.692308,20.961538,...,0.603,2.038462,9.615385,6.0,0.769231,1.5,3.346154,32.384615,2.056154,231.0
17768,203507,Giannis Antetokounmpo,Forward,2015-16,MIL,21.0,80.0,79.0,6.4125,12.6625,...,0.724,1.4125,6.2375,4.3125,1.175,1.4125,2.6,16.875,1.408727,1072.0
