### Overview
The purpose of this exercise is to rank NBA seasons and determine which player has had the best season ever.

To-dos:
- Add player positions
- Bias weighting by position

### Import data

In [30]:
# Import packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from concurrent.futures import ThreadPoolExecutor, as_completed

In [31]:
# Make a list of all the stats that will be important to track
stats = ['min', 'fgm', 'fga', 'fg3m', 'fg3a', 'ftm', 'fta', 'oreb', 'dreb', 'reb', 'ast', 'stl', 'tov', 'blk', 'pts'
]
player_names_header = ['player_id', 'player_name', 'season_id']

In [32]:
# Read in per-season stats and player information
career_stats_csv = '/Users/YouCanCallMeAll/code/nbadata/df_player_career_stats.csv'
player_info_csv = '/Users/YouCanCallMeAll/code/nbadata/df_player_info.csv'

df_stats = pd.read_csv(career_stats_csv, header=0)
df_player_info = pd.read_csv(player_info_csv, header=0)

In [33]:
# Survey data
# df_stats
# df_player_info

['Forward-Guard' 'Guard' 'Forward' 'Center' 'Center-Forward' nan
 'Forward-Center' 'Guard-Forward']


### Scrub data

In [34]:
# Convert all headers to lowercase
df_stats.columns = df_stats.columns.str.lower()
df_player_info.columns = df_player_info.columns.str.lower()

# Rename the person_id column in df_player_info to player_id
df_player_info.rename(columns={'person_id': 'player_id'}, inplace=True)

# Define a mapping for the positions
position_mapping = {
    'Guard': 'Guard',
    'Forward': 'Forward',
    'Center': 'Center',
    'Guard-Forward': 'Guard',
    'Forward-Guard': 'Forward',
    'Forward-Center': 'Forward',
    'Center-Forward': 'Center'
}

# Apply the mapping to the position column
df_player_info['position'] = df_player_info['position'].map(position_mapping)

# Drop null value rows
df_stats.dropna(how='all', inplace=True)

# Remove any seasons with fewer than 10 games played
stats_df = df_stats[df_stats['gp'] >= 10]

# Remove any seasons before 1974, when they began tracking orebs, drebs, steals, and blocks
# Ensure season_id is in a comparable format
df_stats['season_id'] = df_stats['season_id'].apply(lambda x: int(x.split('-')[0]))

# Filter the DataFrame to keep only seasons from 1973-74 onwards
df_stats = df_stats[df_stats['season_id'] >= 1973]

# If you want to convert the season_id back to the original format
df_stats['season_id'] = df_stats['season_id'].apply(lambda x: f"{x}-{str(x+1)[-2:]}")

# Convert each stat to per game by dividing by games played
for stat in stats:
    if stat in df_stats.columns:
        df_stats[stat] = df_stats[stat] / df_stats['gp']

# Add player names to stats dataframe
df_stats = pd.merge(
    df_player_info[['player_id', 'display_first_last', 'position']]
    , df_stats
    , on=['player_id']
    , how='outer'
).drop_duplicates()

# Drop null value rows
df_stats.dropna(how='all', inplace=True)
df_stats.dropna(subset=['season_id'], inplace=True)

df_stats


Unnamed: 0,player_id,display_first_last,position,season_id,league_id,team_id,team_abbreviation,player_age,gp,gs,...,ft_pct,oreb,dreb,reb,ast,stl,blk,tov,pf,pts
0,2,Byron Scott,Guard,1983-84,0.0,1.610613e+09,LAL,23.0,74.0,49.0,...,0.806,0.675676,1.540541,2.216216,2.391892,1.094595,0.256757,1.567568,174.0,10.648649
1,2,Byron Scott,Guard,1984-85,0.0,1.610613e+09,LAL,24.0,81.0,65.0,...,0.820,0.703704,1.888889,2.592593,3.012346,1.234568,0.209877,1.703704,197.0,15.987654
2,2,Byron Scott,Guard,1985-86,0.0,1.610613e+09,LAL,25.0,76.0,62.0,...,0.784,0.723684,1.763158,2.486842,2.157895,1.118421,0.197368,1.447368,167.0,15.447368
3,2,Byron Scott,Guard,1986-87,0.0,1.610613e+09,LAL,26.0,82.0,82.0,...,0.892,0.768293,2.719512,3.487805,3.426829,1.524390,0.219512,1.756098,163.0,17.036585
4,2,Byron Scott,Guard,1987-88,0.0,1.610613e+09,LAL,27.0,81.0,81.0,...,0.858,0.938272,3.172840,4.111111,4.135802,1.913580,0.333333,1.987654,204.0,21.654321
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27311,1642450,Daniss Jenkins,Guard,2024-25,0.0,1.610613e+09,DET,23.0,2.0,0.0,...,0.000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.0,0.000000
27312,1642461,Spencer Jones,Forward,2024-25,0.0,1.610613e+09,DEN,23.0,5.0,0.0,...,0.000,0.000000,0.600000,0.600000,0.000000,0.400000,0.000000,0.200000,1.0,0.400000
27316,1642502,Malevy Leons,Forward,2024-25,0.0,1.610613e+09,OKC,25.0,6.0,0.0,...,0.500,0.166667,0.333333,0.500000,0.166667,0.000000,0.000000,0.166667,4.0,0.333333
27318,1642505,Alex Ducas,Guard,2024-25,0.0,1.610613e+09,OKC,24.0,4.0,0.0,...,0.000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000


### Normalize data

We need to normalize our data. The modern NBA features more attempts, and higher scoring totals due to the introduction and the three-point line and changes in strategy that have led to its wider adoption. Additionally, certain positions will have higher statistics than others (more rebounds for C/F, more assists for G).

We're going to:
- Determine the min / max value for each statistic for each position and year combination
- Normalize each player season statistic against those min / max values

In [36]:
# Create our calculations for normalizing statistics across an entire dataframe

def normalize_col(col):
    normalized_input = (col - col.min()) / (col.max() - col.min())
    return normalized_input

def normalize_df(df):
    for col in stats:
        if col in df.columns:
            df['{}_norm'.format(col)] = normalize_col(df[col])
        else:
            print(f"Column '{col}' not found in DataFrame")
    return df

In [37]:
# Apply our new normalize_df function to the stats_df dataframe,
stats_df = normalize_df(stats_df).reset_index(drop=True)

stats_df.sort_values('pts_norm', ascending=False).head(10)

Unnamed: 0,player_id,season_id,league_id,team_id,team_abbreviation,player_age,gp,gs,min,fgm,...,ftm_norm,fta_norm,oreb_norm,dreb_norm,reb_norm,ast_norm,stl_norm,tov_norm,blk_norm,pts_norm
4284,76375,1961-62,0,1610612744,PHW,25.0,80,,3882.0,1597,...,0.994048,1.0,,,0.954863,0.164948,,,,1.0
4285,76375,1962-63,0,1610612744,SFW,26.0,80,,3806.0,1463,...,0.785714,0.816581,,,0.905537,0.236254,,,,0.890047
12946,893,1986-87,0,1610612741,CHI,24.0,82,82.0,3281.0,1098,...,0.991667,0.713133,0.282794,0.237624,0.200093,0.323883,0.784053,0.586207,0.274123,0.754778
4283,76375,1960-61,0,1610612744,PHW,24.0,79,,3773.0,1251,...,0.632143,0.773294,,,1.0,0.127148,,,,0.752792
4286,76375,1963-64,0,1610612744,SFW,27.0,80,,3689.0,1204,...,0.642857,0.745415,,,0.83155,0.34622,,,,0.731695
12947,893,1987-88,0,1610612741,CHI,25.0,82,82.0,3311.0,1069,...,0.860714,0.630961,0.236797,0.279028,0.208934,0.416667,0.860465,0.543103,0.287281,0.711839
3360,977,2005-06,0,1610612747,LAL,27.0,80,80.0,3277.0,978,...,0.828571,0.60088,0.120954,0.318632,0.197766,0.309278,0.488372,0.538793,0.065789,0.702904
15645,77498,1974-75,0,1610612746,BUF,23.0,82,,3539.0,1095,...,0.763095,0.584006,0.522998,0.763276,0.537459,0.15378,0.305648,,0.381579,0.702656
2,76003,1971-72,0,1610612749,MIL,25.0,81,81.0,3583.0,1159,...,0.6,0.537051,,,0.626338,0.317869,,,,0.700422
9793,201935,2018-19,0,1610612745,HOU,29.0,78,78.0,2867.0,843,...,0.897619,0.629494,0.112436,0.406841,0.241042,0.503436,0.524917,0.834052,0.127193,0.699429
