### Overview
The purpose of this exercise is to rank NBA seasons and determine which player has had the best season ever.

To-dos:
- Add player positions
- Bias weighting by position

### Import data

In [21]:
# Import packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from concurrent.futures import ThreadPoolExecutor, as_completed

In [22]:
# Make a list of all the stats that will be important to track
stats = ['min', 'fgm', 'fga', 'fg3m', 'fg3a', 'ftm', 'fta', 'oreb', 'dreb', 'ast', 'stl', 'tov', 'blk', 'pts'
]
player_names_header = ['player_id', 'player_name', 'season_id']

In [23]:
# Read in per-season stats and player information
career_stats_csv = '/Users/YouCanCallMeAll/code/nbadata/df_player_career_stats.csv'
player_info_csv = '/Users/YouCanCallMeAll/code/nbadata/df_player_info.csv'

df_stats = pd.read_csv(career_stats_csv, header=0)
df_player_info = pd.read_csv(player_info_csv, header=0)

In [24]:
# Survey data
df_stats
df_player_info

Unnamed: 0,PERSON_ID,FIRST_NAME,LAST_NAME,DISPLAY_FIRST_LAST,DISPLAY_LAST_COMMA_FIRST,DISPLAY_FI_LAST,PLAYER_SLUG,BIRTHDATE,SCHOOL,COUNTRY,LAST_AFFILIATION,HEIGHT,WEIGHT,SEASON_EXP,JERSEY,POSITION,ROSTERSTATUS,GAMES_PLAYED_CURRENT_SEASON_FLAG,TEAM_ID,TEAM_NAME,TEAM_ABBREVIATION,TEAM_CODE,TEAM_CITY,PLAYERCODE,FROM_YEAR,TO_YEAR,DLEAGUE_FLAG,NBA_FLAG,GAMES_PLAYED_FLAG,DRAFT_YEAR,DRAFT_ROUND,DRAFT_NUMBER,GREATEST_75_FLAG
0,1505,Tariq,Abdul-Wahad,Tariq Abdul-Wahad,"Abdul-Wahad, Tariq",T. Abdul-Wahad,tariq-abdul-wahad,1974-11-03T00:00:00,San Jose State,France,San Jose State/France,6-6,235.0,7,9,Forward-Guard,Inactive,N,1610612758,Kings,SAC,kings,Sacramento,tariq_abdul-wahad,1997.0,2003.0,N,Y,Y,1997,1,11,N
1,51,Mahmoud,Abdul-Rauf,Mahmoud Abdul-Rauf,"Abdul-Rauf, Mahmoud",M. Abdul-Rauf,mahmoud-abdul-rauf,1969-03-09T00:00:00,Louisiana State,USA,Louisiana State/USA,6-1,162.0,9,1,Guard,Inactive,N,1610612743,Nuggets,DEN,nuggets,Denver,mahmoud_abdul-rauf,1990.0,2000.0,N,Y,Y,1990,1,3,N
2,203518,Alex,Abrines,Alex Abrines,"Abrines, Alex",A. Abrines,alex-abrines,1993-08-01T00:00:00,FC Barcelona,Spain,FC Barcelona/Spain,6-6,200.0,3,,Guard,Inactive,N,0,,,,,alex_abrines,2016.0,2018.0,N,Y,Y,2013,2,32,N
3,76005,Tom,Abernethy,Tom Abernethy,"Abernethy, Tom",T. Abernethy,tom-abernethy,1954-05-06T00:00:00,Indiana,USA,Indiana/USA,6-7,220.0,5,5,Forward,Inactive,N,1610612744,Warriors,GOS,warriors,Golden State,HISTADD_tom_abernethy,1976.0,1980.0,N,Y,Y,1976,3,43,N
4,76002,Zaid,Abdul-Aziz,Zaid Abdul-Aziz,"Abdul-Aziz, Zaid",Z. Abdul-Aziz,zaid-abdul-aziz,1946-04-07T00:00:00,Iowa State,USA,Iowa State/USA,6-9,235.0,10,54,Center,Inactive,N,1610612745,Rockets,HOU,rockets,Houston,HISTADD_zaid_abdul-aziz,1968.0,1977.0,N,Y,Y,1968,1,5,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5028,678,George,Zidek,George Zidek,"Zidek, George",G. Zidek,george-zidek,1973-08-02T00:00:00,UCLA,USA,UCLA/USA,7-0,250.0,3,,Center,Inactive,N,0,,,,,george_zidek,1995.0,1997.0,N,Y,Y,1995,1,22,N
5029,78648,Bill,Zopf,Bill Zopf,"Zopf, Bill",B. Zopf,bill-zopf,1948-06-07T00:00:00,Duquesne,USA,Duquesne/USA,6-1,170.0,1,6,Guard,Inactive,N,1610612749,Bucks,MIL,bucks,Milwaukee,HISTADD_zip_zopf,1970.0,1970.0,N,Y,Y,1970,2,33,N
5030,78647,Jim,Zoet,Jim Zoet,"Zoet, Jim",J. Zoet,jim-zoet,1953-12-30T00:00:00,Kent State,USA,Kent State/USA,7-1,240.0,1,34,Center,Inactive,N,1610612765,Pistons,DET,pistons,Detroit,HISTADD_jim_zoet,1982.0,1982.0,N,Y,Y,Undrafted,Undrafted,Undrafted,N
5031,78650,Matt,Zunic,Matt Zunic,"Zunic, Matt",M. Zunic,matt-zunic,1919-12-19T00:00:00,George Washington,USA,George Washington/USA,6-3,195.0,1,,Guard,Inactive,N,1610610036,Capitols,WAS,capitols,Washington,HISTADD_matt_zunic,1948.0,1948.0,N,Y,Y,1947,,,N


### Scrub data

In [25]:
# Convert all headers to lowercase
df_stats.columns = df_stats.columns.str.lower()
df_player_info.columns = df_player_info.columns.str.lower()

# Rename the person_id column in df_player_info to player_id
df_player_info.rename(columns={'person_id': 'player_id'}, inplace=True)

# Drop null value rows
df_stats.dropna(how='all', inplace=True)

# Remove any seasons with fewer than 10 games played
stats_df = df_stats[df_stats['gp'] >= 10]

# Remove any seasons before 1974, when they began tracking orebs, drebs, steals, and blocks
# Ensure season_id is in a comparable format
df_stats['season_id'] = df_stats['season_id'].apply(lambda x: int(x.split('-')[0]))

# Filter the DataFrame to keep only seasons from 1973-74 onwards
df_stats = df_stats[df_stats['season_id'] >= 1973]

# If you want to convert the season_id back to the original format
df_stats['season_id'] = df_stats['season_id'].apply(lambda x: f"{x}-{str(x+1)[-2:]}")

# Convert each stat to per game by dividing by games played
for stat in stats:
    if stat in df_stats.columns:
        df_stats[stat] = df_stats[stat] / df_stats['gp']

# Add player names to stats dataframe
df_stats = pd.merge(
    df_player_info['player_id', 'display_first_last', 'position']
    , df_stats
    , on=['player_id']
    , how='outer'
).drop_duplicates()

# Drop null value rows
df_stats.dropna(how='all', inplace=True)

df_stats


Unnamed: 0,player_id,season_id,league_id,team_id,team_abbreviation,player_age,gp,gs,min,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,stl,blk,tov,pf,pts
4,76003,1973-74,0,1610612749,MIL,27.0,81,81.0,43.802469,11.703704,21.716049,0.539,,,,3.641975,5.185185,0.702,3.543210,11.000000,1178.0,4.765432,1.382716,3.493827,,238,27.049383
5,76003,1974-75,0,1610612749,MIL,28.0,65,64.0,42.261538,12.492308,24.369231,0.513,,,,5.000000,6.553846,0.763,2.984615,11.046154,912.0,4.061538,1.000000,3.261538,,205,29.984615
6,76003,1975-76,0,1610612747,LAL,29.0,82,82.0,41.207317,11.146341,21.073171,0.529,,,,5.451220,7.756098,0.703,3.317073,13.548780,1383.0,5.036585,1.451220,4.121951,,292,27.743902
7,76003,1976-77,0,1610612747,LAL,30.0,82,82.0,36.780488,10.829268,18.695122,0.579,,,,4.585366,6.536585,0.701,3.243902,10.048780,1090.0,3.890244,1.231707,3.182927,,262,26.243902
8,76003,1977-78,0,1610612747,LAL,31.0,62,62.0,36.532258,10.693548,19.435484,0.550,,,,4.419355,5.645161,0.783,3.000000,9.919355,801.0,4.338710,1.661290,2.983871,3.354839,182,25.806452
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30439,203967,2022-23,0,1610612756,PHX,29.0,37,12.0,14.405405,2.054054,4.810811,0.427,0.729730,1.864865,0.391,0.972973,1.189189,0.818,1.054054,2.783784,142.0,1.540541,0.351351,0.135135,0.972973,69,5.810811
30440,203967,2022-23,0,1610612760,OKC,29.0,20,0.0,13.650000,2.550000,4.950000,0.515,0.900000,2.300000,0.391,1.350000,1.600000,0.844,0.700000,2.550000,65.0,0.850000,0.350000,0.100000,0.950000,30,7.350000
30441,203967,2022-23,0,0,TOT,29.0,57,12.0,14.140351,2.228070,4.859649,0.458,0.789474,2.017544,0.391,1.105263,1.333333,0.829,0.929825,2.701754,207.0,1.298246,0.350877,0.122807,0.964912,99,6.350877
30442,203967,2023-24,0,1610612744,GSW,30.0,64,9.0,17.156250,2.828125,6.062500,0.466,1.156250,3.078125,0.376,1.234375,1.453125,0.849,1.140625,3.281250,283.0,2.250000,0.484375,0.156250,1.218750,112,8.046875


### Normalize data

We need to normalize our data. Scoring 22 ppg in 1993 isn't the same as scoring 22 ppg in 2023. The modern NBA features more attempts
 per game and higher rates of accuracy. Additionally, the introduction and wide-spread adoption of the three point shot has increased 
 scoring over time. In order to compare across seasons, we need to normalize data to compare performance fairly.

We're going to:
- Create a column that takes a raw statistical value
- Subtracts the minumum value from that season
- Divides by difference between max and min values from that season to normalize the stat.

In [26]:
# Create our calculations for normalizing statistics across an entire dataframe

def normalize_col(col):
    normalized_input = (col - col.min()) / (col.max() - col.min())
    return normalized_input

def normalize_df(df):
    for col in stats:
        if col in df.columns:
            df['{}_norm'.format(col)] = normalize_col(df[col])
        else:
            print(f"Column '{col}' not found in DataFrame")
    return df

In [27]:
# Apply our new normalize_df function to the stats_df dataframe,
stats_df = normalize_df(stats_df).reset_index(drop=True)

stats_df.sort_values('pts_norm', ascending=False).head(10)

Unnamed: 0,player_id,season_id,league_id,team_id,team_abbreviation,player_age,gp,gs,min,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,stl,blk,tov,pf,pts,min_norm,fgm_norm,fga_norm,fg3m_norm,fg3a_norm,ftm_norm,fta_norm,oreb_norm,dreb_norm,ast_norm,stl_norm,tov_norm,blk_norm,pts_norm
4284,76375,1961-62,0,1610612744,PHW,25.0,80,,3882.0,1597,3159,0.506,,,,835,1363,0.613,,,2052.0,192,,,,123,4029,1.0,1.0,1.0,,,0.994048,1.0,,,0.164948,,,,1.0
4285,76375,1962-63,0,1610612744,SFW,26.0,80,,3806.0,1463,2770,0.528,,,,660,1113,0.593,,,1946.0,275,,,,136,3586,0.980422,0.916093,0.876782,,,0.785714,0.816581,,,0.236254,,,,0.890047
12946,893,1986-87,0,1610612741,CHI,24.0,82,82.0,3281.0,1098,2279,0.482,12.0,66.0,0.182,833,972,0.857,166.0,264.0,430.0,377,236.0,125.0,272.0,237,3041,0.845183,0.687539,0.721254,0.029851,0.064202,0.991667,0.713133,0.282794,0.237624,0.323883,0.784053,0.586207,0.274123,0.754778
4283,76375,1960-61,0,1610612744,PHW,24.0,79,,3773.0,1251,2457,0.509,,,,531,1054,0.504,,,2149.0,148,,,,130,3033,0.971922,0.783344,0.777637,,,0.632143,0.773294,,,0.127148,,,,0.752792
4286,76375,1963-64,0,1610612744,SFW,27.0,80,,3689.0,1204,2298,0.524,,,,540,1016,0.531,,,1787.0,403,,,,182,2948,0.950283,0.753914,0.727273,,,0.642857,0.745415,,,0.34622,,,,0.731695
12947,893,1987-88,0,1610612741,CHI,25.0,82,82.0,3311.0,1069,1998,0.535,7.0,53.0,0.132,723,860,0.841,139.0,310.0,449.0,485,259.0,131.0,252.0,270,2868,0.852911,0.66938,0.632246,0.017413,0.051556,0.860714,0.630961,0.236797,0.279028,0.416667,0.860465,0.543103,0.287281,0.711839
3360,977,2005-06,0,1610612747,LAL,27.0,80,80.0,3277.0,978,2173,0.45,180.0,518.0,0.347,696,819,0.85,71.0,354.0,425.0,360,147.0,30.0,250.0,233,2832,0.844152,0.612398,0.687678,0.447761,0.503891,0.828571,0.60088,0.120954,0.318632,0.309278,0.488372,0.538793,0.065789,0.702904
15645,77498,1974-75,0,1610612746,BUF,23.0,82,,3539.0,1095,2138,0.512,,,,641,796,0.805,307.0,848.0,1155.0,179,92.0,174.0,,278,2831,0.911643,0.685661,0.676592,,,0.763095,0.584006,0.522998,0.763276,0.15378,0.305648,,0.381579,0.702656
2,76003,1971-72,0,1610612749,MIL,25.0,81,81.0,3583.0,1159,2019,0.574,,,,504,732,0.689,,,1346.0,370,,,,235,2822,0.922978,0.725736,0.638898,,,0.6,0.537051,,,0.317869,,,,0.700422
9793,201935,2018-19,0,1610612745,HOU,29.0,78,78.0,2867.0,843,1909,0.442,378.0,1028.0,0.368,754,858,0.879,66.0,452.0,518.0,586,158.0,58.0,387.0,244,2818,0.738537,0.527865,0.604054,0.940299,1.0,0.897619,0.629494,0.112436,0.406841,0.503436,0.524917,0.834052,0.127193,0.699429
