### Overview
The purpose of this exercise is to rank NBA seasons and determine which player has had the best season ever.

To-dos:
- Watch this video
- Add player names
- Add player positions
- Bias weighting by position

### Import data

In [23]:
# Import packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [24]:
# Make a list of all the stats that will be important to track
stats = ['min', 'fgm', 'fga', 'fg3m', 'fg3a', 'ftm', 'fta', 'oreb', 'dreb', 'ast', 'stl', 'tov', 'blk', 'pts']
bias_factors = ['season_id', 'position']
player_names_header = ['player_id', 'player_name', 'season_id']

In [25]:
# Read in per-season stats and player information
stats_csv = 'df_player_career_stats.csv'
player_name_csv = 'df_player_info.csv'

df_stats = pd.read_csv(stats_csv, header=0)
df_player_info = pd.read_csv(player_name_csv, header=0)

### Scrub data

In [26]:
# Convert all headers to lowercase
df_stats.columns = df_stats.columns.str.lower()
df_player_info.columns = df_player_info.columns.str.lower()

# Rename the person_id column in df_player_info to player_id
df_player_info.rename(columns={'person_id': 'player_id'}, inplace=True)

# Define a mapping for the positions
position_mapping = {
    'Guard': 'Guard',
    'Forward': 'Forward',
    'Center': 'Center',
    'Guard-Forward': 'Guard',
    'Forward-Guard': 'Forward',
    'Forward-Center': 'Forward',
    'Center-Forward': 'Center'
}

# Apply the mapping to the position column
df_player_info['position'] = df_player_info['position'].map(position_mapping)

# Drop null value rows
df_stats.dropna(how='all', inplace=True)

# Remove any seasons with fewer than 10 games played
df_stats = df_stats[df_stats['gp'] >= 10]

# Remove any seasons before 1979, when the three point shot was introduced
# Ensure season_id is in a comparable format
df_stats['season_id'] = df_stats['season_id'].apply(lambda x: int(x.split('-')[0]))

# Filter the DataFrame to keep only seasons from 1979-1980 onwards
df_stats = df_stats[df_stats['season_id'] >= 1979]

# If you want to convert the season_id back to the original format
df_stats['season_id'] = df_stats['season_id'].apply(lambda x: f"{x}-{str(x+1)[-2:]}")

# Convert each stat to per game by dividing by games played
for stat in stats:
    if stat in df_stats.columns:
        df_stats[stat] = df_stats[stat] / df_stats['gp']

# Add player names to stats dataframe
df_stats = pd.merge(
    df_player_info[['player_id', 'display_first_last', 'position']]
    , df_stats
    , on=['player_id']
    , how='outer'
).drop_duplicates()

df_stats.reset_index(drop=True, inplace=True)

# Drop null value rows
df_stats.dropna(how='all', inplace=True)
df_stats.dropna(subset=['season_id'], inplace=True)

df_stats


Unnamed: 0,player_id,display_first_last,position,season_id,league_id,team_id,team_abbreviation,player_age,gp,gs,...,ft_pct,oreb,dreb,reb,ast,stl,blk,tov,pf,pts
0,2,Byron Scott,Guard,1983-84,0.0,1.610613e+09,LAL,23.0,74.0,49.0,...,0.806,0.675676,1.540541,164.0,2.391892,1.094595,0.256757,1.567568,174.0,10.648649
1,2,Byron Scott,Guard,1984-85,0.0,1.610613e+09,LAL,24.0,81.0,65.0,...,0.820,0.703704,1.888889,210.0,3.012346,1.234568,0.209877,1.703704,197.0,15.987654
2,2,Byron Scott,Guard,1985-86,0.0,1.610613e+09,LAL,25.0,76.0,62.0,...,0.784,0.723684,1.763158,189.0,2.157895,1.118421,0.197368,1.447368,167.0,15.447368
3,2,Byron Scott,Guard,1986-87,0.0,1.610613e+09,LAL,26.0,82.0,82.0,...,0.892,0.768293,2.719512,286.0,3.426829,1.524390,0.219512,1.756098,163.0,17.036585
4,2,Byron Scott,Guard,1987-88,0.0,1.610613e+09,LAL,27.0,81.0,81.0,...,0.858,0.938272,3.172840,333.0,4.135802,1.913580,0.333333,1.987654,204.0,21.654321
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23245,1642377,Jaylen Wells,Forward,2024-25,0.0,1.610613e+09,MEM,21.0,35.0,30.0,...,0.807,1.057143,2.200000,114.0,1.685714,0.485714,0.142857,1.171429,64.0,11.800000
23254,1642402,Enrique Freeman,Forward,2024-25,0.0,1.610613e+09,IND,24.0,17.0,1.0,...,0.556,0.235294,0.588235,14.0,0.352941,0.117647,0.117647,0.235294,15.0,1.529412
23255,1642403,Isaac Jones,Forward,2024-25,0.0,1.610613e+09,SAC,24.0,21.0,0.0,...,0.550,0.476190,1.000000,31.0,0.190476,0.095238,0.380952,0.523810,22.0,3.619048
23257,1642419,Jamison Battle,Forward,2024-25,0.0,1.610613e+09,TOR,23.0,31.0,0.0,...,0.857,0.612903,1.290323,59.0,0.967742,0.129032,0.129032,0.387097,36.0,6.000000


### Normalize data

We need to normalize our data. The modern NBA features more attempts, and higher scoring totals due to the introduction and the three-point line and changes in strategy that have led to its wider adoption. Additionally, certain positions will have higher statistics than others (more rebounds for C/F, more assists for G).

We're going to:
- Calculate the mean and standard deviation for each statistic across all seasons
- For each statistic, subtract the mean and divide by the standard deviation

In [27]:
# Create a grouped dataframe based on my bias_factors with the mean, and std for each stat

df_bias_factors = df_stats.groupby(bias_factors)[stats].agg(['mean', 'std']).reset_index(inplace=False)
df_bias_factors.columns = ['season_id', 'position'] + ['_'.join(col).strip() for col in df_bias_factors.columns.values[2:]]
df_bias_factors

Unnamed: 0,season_id,position,min_mean,min_std,fgm_mean,fgm_std,fga_mean,fga_std,fg3m_mean,fg3m_std,...,ast_mean,ast_std,stl_mean,stl_std,tov_mean,tov_std,blk_mean,blk_std,pts_mean,pts_std
0,1979-80,Center,24.133794,8.703277,4.028446,2.424825,8.082661,4.453865,0.005618,0.013447,...,1.918527,1.143833,0.681484,0.351239,1.865660,0.864340,1.051571,0.715975,10.071623,6.222841
1,1979-80,Forward,23.144746,8.857601,4.202905,2.424266,8.694114,4.763910,0.042683,0.120805,...,1.886227,1.275407,0.800935,0.472037,1.788946,0.915677,0.534893,0.468456,10.542161,6.003370
2,1979-80,Guard,22.355048,8.917220,4.035315,2.384479,8.772053,4.810455,0.133864,0.200062,...,3.134136,1.881132,1.007293,0.602034,1.766599,0.845442,0.204371,0.181464,10.104354,6.033310
3,1980-81,Center,22.826354,8.903524,3.707132,2.346863,7.463014,4.368307,0.002723,0.006899,...,1.664401,1.045011,0.581173,0.287800,1.661969,0.803098,0.962958,0.759261,9.376271,6.126573
4,1980-81,Forward,22.394301,9.617331,4.029023,2.485139,8.358143,4.793612,0.018548,0.045414,...,1.768859,1.299705,0.800741,0.502792,1.668122,0.847100,0.524230,0.479337,10.088625,6.155697
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,2023-24,Forward,19.444041,9.253820,3.251339,2.385509,6.818229,4.703230,0.937185,0.772549,...,1.684807,1.492798,0.581218,0.356753,0.978749,0.750288,0.449476,0.420432,8.749694,6.472535
134,2023-24,Guard,20.650540,9.185211,3.346332,2.300027,7.606168,4.904283,1.271671,0.899909,...,2.741346,2.022411,0.705938,0.385671,1.104862,0.771685,0.271657,0.218255,9.270839,6.493031
135,2024-25,Center,20.540214,9.074854,3.618999,2.440687,6.685312,4.558715,0.490822,0.689403,...,1.714036,1.532076,0.595965,0.358500,1.208329,0.763131,0.934000,0.552075,9.261514,6.406823
136,2024-25,Forward,21.122832,9.368980,3.572164,2.466014,7.617257,5.008177,1.139423,0.871707,...,1.856709,1.597286,0.694113,0.410069,1.143139,0.851411,0.489904,0.451084,9.777612,6.821015


In [28]:
# Merge the bias factors back into the original dataframe
df_stats = pd.merge(
    df_stats
    , df_bias_factors
    , on=bias_factors
    , how='left'
)

In [29]:
# Normalize each statistic by subtracting the mean and dividing by the standard deviation as calculated by bias factors
for stat in stats:
    df_stats[stat + '_zscore'] = (df_stats[stat] - df_stats[stat + '_mean']) / df_stats[stat + '_std']

In [30]:
# Drop the merged yearly stats columns
df_stats.drop(columns=[col for col in df_stats.columns if '_mean' in col or '_std' in col], inplace=True)

### Combining Z-scores into a single score
To determine the best season of all time, we will can combine the Z-scores of different statistics into a single score. We will can use a weighted sum of the Z-scores, where the weights reflect the importance of each statistic.

In [32]:
# Define weights for each statistic
stats_weights = {
    'fgm_zscore': 0.1,
    'fga_zscore': 0.05,
    'fg3m_zscore': 0.1,
    'fg3a_zscore': 0.05,
    'ftm_zscore': 0.05,
    'fta_zscore': 0.05,
    'oreb_zscore': 0.05,
    'dreb_zscore': 0.1,
    'ast_zscore': 0.1,
    'stl_zscore': 0.1,
    'tov_zscore': -0.05,  # Negative weight for turnovers
    'blk_zscore': 0.1,
    'pts_zscore': 0.15
}

# Calculate the combined score
df_stats['combined_score'] = sum(df_stats[stat] * weight for stat, weight in stats_weights.items())

# Sort by combined score
df_stats.sort_values(by='combined_score', ascending=False, inplace=True)

# Display the top 25 seasons
df_stats.head(25)

Unnamed: 0,player_id,display_first_last,position,season_id,league_id,team_id,team_abbreviation,player_age,gp,gs,...,ftm_zscore,fta_zscore,oreb_zscore,dreb_zscore,ast_zscore,stl_zscore,tov_zscore,blk_zscore,pts_zscore,combined_score
15398,201935,James Harden,Guard,2018-19,0.0,1610613000.0,HOU,29.0,78.0,78.0,...,6.936627,6.470833,1.249963,3.206174,2.593935,3.396823,4.725702,2.868629,4.744115,3.679343
15399,201935,James Harden,Guard,2019-20,0.0,1610613000.0,HOU,30.0,68.0,68.0,...,5.922595,5.75582,1.672856,2.760877,2.499029,2.804255,3.699688,3.634794,3.85892,3.231965
4183,893,Michael Jordan,Guard,1987-88,0.0,1610613000.0,CHI,25.0,82.0,82.0,...,5.359997,5.254407,2.225058,2.469582,0.94626,3.680146,2.036162,7.047687,4.515207,3.185964
14853,201566,Russell Westbrook,Guard,2016-17,0.0,1610613000.0,OKC,28.0,81.0,81.0,...,4.883303,4.922504,4.275388,6.208627,3.873416,2.382866,5.04668,1.018929,3.743747,3.159766
4182,893,Michael Jordan,Guard,1986-87,0.0,1610613000.0,CHI,24.0,82.0,82.0,...,5.667432,5.393566,2.404568,1.941629,0.406022,3.107091,2.064027,6.339595,4.540809,3.029032
20057,1629029,Luka Dončić,Forward,2023-24,0.0,1610613000.0,DAL,25.0,70.0,70.0,...,4.198425,4.146499,-0.204721,3.147175,5.43623,2.335138,4.06487,0.222108,3.879075,3.027615
15397,201935,James Harden,Guard,2017-18,0.0,1610613000.0,HOU,28.0,72.0,72.0,...,5.906539,5.709743,0.401558,2.432738,3.23284,2.382435,3.687264,2.388165,3.702623,2.937679
15979,202326,DeMarcus Cousins,Center,2015-16,0.0,1610613000.0,SAC,25.0,65.0,65.0,...,4.982641,4.346384,0.678132,2.146599,2.494988,2.997326,4.050633,0.813479,3.631713,2.936409
5525,1449,Larry Bird,Forward,1984-85,0.0,1610613000.0,BOS,28.0,80.0,77.0,...,1.762386,1.443614,0.66907,3.290706,3.530924,2.591646,2.000861,1.780279,2.619068,2.91683
15396,201935,James Harden,Guard,2016-17,0.0,1610613000.0,HOU,27.0,81.0,81.0,...,5.181731,5.214733,2.543483,4.395808,4.284446,2.029235,5.437178,1.520598,3.326461,2.900106
