### Overview
The purpose of this exercise is to rank NBA seasons and determine which player has had the best season ever.

To-dos:
- Watch this video
- Add player names
- Add player positions
- Bias weighting by position

### Import data

In [14]:
# Import packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import requests
from nba_api.stats.static import players
from nba_api.stats.endpoints import commonplayerinfo
from nba_api.stats.endpoints import playercareerstats
from concurrent.futures import ThreadPoolExecutor, as_completed

In [9]:
# Load in header parameters to keep dataframe running
headers  = {
    'Connection': 'keep-alive',
    'Accept': 'application/json, text/plain, */*',
    'x-nba-stats-token': 'true',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    'x-nba-stats-origin': 'stats',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'cors',
    'Referer': 'https://stats.nba.com/',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
}

In [5]:
# Get a full list of players
nba_players = players.get_players()
df_players = pd.DataFrame(nba_players)

Total number of players: 5034


In [8]:
# Get a list of all player IDs from the df_players dataframe
player_ids = df_players['id'].to_list()

5034


In [15]:
# Create function that gets player info data for a list of player IDs
def get_player_info(player_ids, headers, max_workers=10):
    def fetch_data(nba_player_id):
        try:
            player_info = commonplayerinfo.CommonPlayerInfo(player_id=nba_player_id, headers=headers, timeout=100)
            df = player_info.common_player_info.get_data_frame()
            return df
        except Exception as e:
            print(f"Error fetching data for player ID {nba_player_id}: {e}")
            return None

    player_info = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(fetch_data, nba_player_id) for nba_player_id in player_ids]
        for future in as_completed(futures):
            player = future.result()
            if player is not None:
                player_info.append(player)

    df_player_info = pd.concat(player_info, ignore_index=True)
    return df_player_info

In [16]:
# Run function to get player info data for all player IDs
df_player_info = get_player_info(player_ids, headers)

Error fetching data for player ID 1626122: Expecting value: line 1 column 1 (char 0)


In [19]:
print(df_player_info.columns)

Index(['PERSON_ID', 'FIRST_NAME', 'LAST_NAME', 'DISPLAY_FIRST_LAST',
       'DISPLAY_LAST_COMMA_FIRST', 'DISPLAY_FI_LAST', 'PLAYER_SLUG',
       'BIRTHDATE', 'SCHOOL', 'COUNTRY', 'LAST_AFFILIATION', 'HEIGHT',
       'WEIGHT', 'SEASON_EXP', 'JERSEY', 'POSITION', 'ROSTERSTATUS',
       'GAMES_PLAYED_CURRENT_SEASON_FLAG', 'TEAM_ID', 'TEAM_NAME',
       'TEAM_ABBREVIATION', 'TEAM_CODE', 'TEAM_CITY', 'PLAYERCODE',
       'FROM_YEAR', 'TO_YEAR', 'DLEAGUE_FLAG', 'NBA_FLAG', 'GAMES_PLAYED_FLAG',
       'DRAFT_YEAR', 'DRAFT_ROUND', 'DRAFT_NUMBER', 'GREATEST_75_FLAG'],
      dtype='object')


In [95]:
# Make a list of all the stats that will be important to track
stats = ['min', 'fgm', 'fga', 'fg3m', 'fg3a', 'ftm', 'fta', 'oreb', 'dreb', 'ast', 'stl', 'tov', 'blk', 'pts']
player_names_header = ['player_id', 'player_name', 'season_id']

In [96]:
# Read in per-season stats and player information
stats_csv = 'nba-stats-csv/player_general_traditional_per_game_data.csv'
player_name_csv = 'nba-stats-csv/player_info.csv'

stats_df = pd.read_csv(stats_csv, header=0)
player_name_df = pd.read_csv(player_name_csv, header=0)

### Scrub data

In [97]:
# Drop null value rows
stats_df.dropna(how='all', inplace=True)

# Remove any seasons with fewer than 10 games played
stats_df = stats_df[stats_df['gp'] >= 10]

# Add player names to stats dataframe
stats_df = pd.merge(
    player_name_df
    , stats_df
    , on=['player_id', 'season_id']
    , how='outer'
).drop_duplicates()

# Drop null value rows
stats_df.dropna(how='all', inplace=True)

stats_df


Unnamed: 0,player_id,player_name,season_id,gp,age,min,fgm,fga,fg_pct,fg3m,...,ftm,fta,ft_pct,oreb,dreb,ast,tov,stl,blk,pts
0,2,Byron Scott,1996-97,79.0,36.0,18.9,2.1,4.8,0.430,0.9,...,1.6,1.9,0.841,0.3,1.2,1.3,0.7,0.6,0.2,6.7
3,3,Grant Long,1996-97,65.0,31.0,18.5,1.9,4.2,0.447,0.3,...,1.0,1.3,0.750,1.4,2.1,0.6,0.7,0.7,0.1,5.0
6,3,Grant Long,1997-98,40.0,32.0,18.6,1.3,2.9,0.427,0.0,...,1.0,1.4,0.719,1.4,2.3,0.6,0.6,0.7,0.3,3.5
9,3,Grant Long,1998-99,50.0,33.0,27.6,3.0,7.2,0.421,0.1,...,3.7,4.7,0.783,2.0,3.9,1.1,1.5,1.1,0.3,9.8
12,3,Grant Long,1999-00,42.0,34.0,22.0,1.8,4.0,0.443,0.0,...,1.3,1.7,0.775,2.0,3.5,1.0,1.2,1.1,0.2,4.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29693,1629151,,2018-19,42.0,28.0,10.8,1.4,3.0,0.452,0.9,...,0.4,0.5,0.789,0.2,1.3,0.5,0.4,0.1,0.1,4.0
29694,1629164,,2018-19,16.0,23.0,3.6,0.4,1.4,0.261,0.1,...,0.6,0.7,0.818,0.1,0.1,0.9,0.2,0.0,0.0,1.4
29695,1629234,,2018-19,23.0,22.0,4.9,0.7,1.1,0.577,0.0,...,0.5,0.6,0.846,0.3,1.2,0.3,0.3,0.1,0.2,1.8
29696,1629244,,2018-19,19.0,24.0,13.7,1.7,4.1,0.423,1.1,...,0.4,0.5,0.889,0.2,1.4,0.7,0.3,0.3,0.1,5.0


### Normalize data

We need to normalize our data. Scoring 22 ppg in 1993 isn't the same as scoring 22 ppg in 2023. The modern NBA features more attempts
 per game and higher rates of accuracy. Additionally, the introduction and wide-spread adoption of the three point shot has increased 
 scoring over time. In order to compare across seasons, we need to normalize data to compare performance fairly.

We're going to:
- Create a column that takes a raw statistical value
- Subtracts the minumum value from that season
- Divides by difference between max and min values from that season to normalize the stat.

In [98]:
# Create our calculations for normalizing statistics across an entire dataframe

def normalize_col(col):
    normalized_input = (col - col.min()) / (col.max() - col.min())
    return normalized_input

def normalize_df(df):
    for col in stats:
        if col in df.columns:
            df['{}_norm'.format(col)] = normalize_col(df[col])
        else:
            print(f"Column '{col}' not found in DataFrame")
    return df

In [100]:
# Apply our new normalize_df function to the stats_df dataframe,
stats_df = normalize_df(stats_df).reset_index(drop=True)

stats_df.sort_values('pts_norm', ascending=False).head(10)

Unnamed: 0,player_id,player_name,season_id,gp,age,min,fgm,fga,fg_pct,fg3m,...,fg3a_norm,ftm_norm,fta_norm,oreb_norm,dreb_norm,ast_norm,stl_norm,tov_norm,blk_norm,pts_norm
7999,201935,,2018-19,78.0,29.0,36.8,10.8,24.5,0.442,4.8,...,1.0,1.0,0.839695,0.117647,0.508772,0.641026,0.689655,0.877193,0.179487,1.0
2300,977,Kobe Bryant,2005-06,80.0,27.0,41.0,12.2,27.2,0.45,2.3,...,0.492424,0.896907,0.778626,0.132353,0.385965,0.384615,0.62069,0.54386,0.102564,0.980556
2036,947,Allen Iverson,2005-06,72.0,31.0,43.1,11.3,25.3,0.447,1.0,...,0.234848,0.969072,0.877863,0.088235,0.22807,0.632479,0.655172,0.596491,0.025641,0.913889
2763,1503,Tracy McGrady,2002-03,75.0,24.0,39.3,11.1,24.2,0.457,2.3,...,0.454545,0.793814,0.740458,0.235294,0.429825,0.470085,0.586207,0.45614,0.205128,0.888889
7198,201142,Kevin Durant,2013-14,81.0,25.0,38.5,10.5,20.8,0.503,2.4,...,0.462121,0.896907,0.755725,0.102941,0.587719,0.470085,0.448276,0.614035,0.179487,0.886111
7608,201566,Russell Westbrook,2016-17,81.0,28.0,34.6,10.2,24.0,0.425,2.5,...,0.545455,0.907216,0.793893,0.25,0.789474,0.888889,0.551724,0.947368,0.102564,0.875
2301,977,Kobe Bryant,2006-07,77.0,28.0,40.8,10.6,22.8,0.463,1.8,...,0.393939,0.896907,0.763359,0.147059,0.412281,0.461538,0.482759,0.578947,0.128205,0.875
5329,2544,LeBron James,2005-06,79.0,21.0,42.5,11.1,23.1,0.48,1.6,...,0.363636,0.783505,0.78626,0.132353,0.535088,0.564103,0.551724,0.578947,0.205128,0.869444
2032,947,Allen Iverson,2001-02,60.0,27.0,43.7,11.1,27.8,0.398,1.3,...,0.340909,0.814433,0.748092,0.102941,0.333333,0.470085,0.965517,0.701754,0.051282,0.869444
2031,947,Allen Iverson,2000-01,71.0,26.0,41.9,10.7,25.5,0.42,1.4,...,0.325758,0.845361,0.770992,0.102941,0.27193,0.393162,0.862069,0.578947,0.076923,0.861111
