In [17]:
#cd nba-players-stats
import logging as logger

In [11]:
import pandas as pd
player_data_df = pd.read_csv('player_data.csv')
season_stats_df = pd.read_csv('Seasons_Stats.csv', index_col=0)

In [12]:
#season_stats_df.head()
#season_stats_df.info()

player_data_df.height.fillna("0-0", inplace=True)
player_data_df.weight.fillna(0, inplace=True)
player_data_df = player_data_df.assign(height_metric=player_data_df.height.str[:1].astype(int) * 30.48 + player_data_df.height.str[2:].astype(int) * 30.48 / 12)
player_data_df = player_data_df.assign(weight_metric=player_data_df.weight * 0.453592)

In [13]:
season_stats_df[['Year', 'MP']].describe()

Unnamed: 0,Year,MP
count,24624.0,24138.0
mean,1992.594989,1209.720317
std,17.429594,941.146575
min,1950.0,0.0
25%,1981.0,340.0
50%,1996.0,1053.0
75%,2007.0,1971.0
max,2017.0,3882.0


In [18]:
# Filter season dataset by minutes played & year
season_stats_df = season_stats_df[(season_stats_df.MP > 1000) & (season_stats_df.Year > 1980)]

# Drop players' partial seasons (leave the total)
for temp_tuple in season_stats_df[season_stats_df.Tm == 'TOT'].itertuples():
    part_seasons = season_stats_df[
        (season_stats_df.Player == temp_tuple.Player) & (season_stats_df.Year == temp_tuple.Year) & (season_stats_df.Tm != 'TOT')
    ]
    if len(part_seasons) > 0:
        logger.info(f'Deleting {len(part_seasons)} partial seasons for {temp_tuple.Player} in {temp_tuple.Year}')
        season_stats_df = season_stats_df.drop(part_seasons.index)

# Assign height/weight data
season_stats_df = season_stats_df.assign(height=0)
season_stats_df = season_stats_df.assign(weight=0)

for itertup in season_stats_df.itertuples():
    pname = itertup.Player
    if pname[-1] == '*':
        pname = pname[:-1]
    player_data = player_data_df[player_data_df.name == pname]
    if len(player_data) > 0:
        season_stats_df.loc[itertup.Index, 'height'] = player_data.height_metric.values[0]
        season_stats_df.loc[itertup.Index, 'weight'] = player_data.weight_metric.values[0]
    else:
        player_data = player_data_df[
            (player_data_df.name.str.startswith(pname))
            & (player_data_df.year_start <= itertup.Year)
            & (player_data_df.year_end >= itertup.Year)
        ]
        if len(player_data) > 1:
            logger.warning(f'MULTIPLE NAMES FOUND STARTING WITH {pname}')
        logger.info(f'Populating player data for {pname} in {itertup.Year} with {player_data.name}')
        season_stats_df.loc[itertup.Index, 'height'] = player_data.height_metric.values[0]
        season_stats_df.loc[itertup.Index, 'weight'] = player_data.weight_metric.values[0]

# Clean up the rest of the data & reset the index
season_stats_df.GS = season_stats_df.GS.fillna(0)
season_stats_df['3P%'] = season_stats_df['3P%'].fillna(0)
season_stats_df = season_stats_df.assign(pos_simple=season_stats_df.Pos.str[:2])
season_stats_df.pos_simple = season_stats_df.pos_simple.str.replace('-', '')
season_stats_df = season_stats_df.drop(['blanl', 'blank2'], axis=1)
season_stats_df = season_stats_df.reset_index(drop=True)

season_stats_df.to_csv('Seasons_Stats_proc.csv')

In [19]:
#dowloaded dataset directly 
proc_stats_df = pd.read_csv('Seasons_Stats_proc.csv', index_col=0)

In [20]:
proc_stats_df = proc_stats_df.assign(bmi=proc_stats_df.weight / ((proc_stats_df.height/100) ** 2))

In [21]:
import plotly.express as px
import numpy as np

In [32]:
recent_stats_df = proc_stats_df[proc_stats_df['Year'] > 2014]

ht_limits = [0, 190, 200, 210, np.inf]
ht_labels = [str(i) + '_' + str(ht_limits[i]) + ' to ' + str(ht_limits[i+1]) for i in range(len(ht_limits)-1)]
recent_stats_df = recent_stats_df.assign(
    height_bins=pd.cut(recent_stats_df.height, bins=ht_limits, labels=ht_labels, right=False)
)
bmi_limits = [0, 22.5, 24, 25.5, np.inf]
bmi_labels = [str(i) + '_' + str(bmi_limits[i]) + ' to ' + str(bmi_limits[i+1]) for i in range(len(bmi_limits)-1)]
recent_stats_df = recent_stats_df.assign(
    bmi_bins=pd.qcut(recent_stats_df.bmi, q=4)
)

In [34]:
fig = px.scatter(
    recent_stats_df, x='height', y='bmi',
    color='pos_simple', category_orders=dict(pos_simple=['PG', 'SG', 'SF', 'PF', 'C']),
    marginal_x="histogram",  hover_name='Player')
fig.show()

In [35]:
ht_n_seasons = recent_stats_df.groupby('height_bins').count()['Year']
ht_n_unique_pl = recent_stats_df.groupby('height_bins').nunique('Player')['Player']
print(ht_n_seasons / ht_n_unique_pl)
bmi_n_seasons = recent_stats_df.groupby('bmi_bins').count()['Year']
bmi_n_unique_pl = recent_stats_df.groupby('bmi_bins').nunique('Player')['Player']
print(bmi_n_seasons / bmi_n_unique_pl)

height_bins
0_0 to 190      2.177778
1_190 to 200    2.067797
2_200 to 210    2.125786
3_210 to inf    2.208955
dtype: float64
bmi_bins
(19.930999999999997, 23.716]    2.058252
(23.716, 24.827]                2.170213
(24.827, 25.91]                 2.049505
(25.91, 30.54]                  2.252747
dtype: float64


In [36]:
fig = px.scatter(
    recent_stats_df, x='bmi', y='PER', hover_name='Player'
    , color='Year', color_continuous_scale=px.colors.sequential.Teal,
)
fig.show()

In [45]:
fig = px.box(
    proc_stats_df, x='height_bins', y='PER', color='bmi_bins', hover_name='Player',
    category_orders=dict(height_bins=ht_labels, bmi_bins=bmi_labels))
fig.show()

In [26]:
fig = px.histogram(
    proc_stats_df, x='pos_simple', facet_row='bmi_bins', facet_col='height_bins', color='pos_simple',
    category_orders=dict(height_bins=ht_labels, bmi_bins=bmi_labels, pos_simple=['PG', 'SG', 'SF', 'PF', 'C']))
fig.show()

In [27]:
age_limits = [0, 23, 25, 27, 29, 31, np.inf]
age_labels = [str(i) + '_' + str(age_limits[i]) + ' to ' + str(age_limits[i+1]) for i in range(len(age_limits)-1)]
proc_stats_df = proc_stats_df.assign(
    age_bins=pd.cut(proc_stats_df.Age, bins=age_limits, labels=age_labels, right=False)
)

In [28]:
fig = px.box(
    proc_stats_df, x='age_bins', y='PER', color='bmi_bins', hover_name='Player',
    category_orders=dict(height_bins=ht_labels, bmi_bins=bmi_labels))
fig.show()

In [56]:
import plotly.graph_objects as go

In [61]:
fig = px.box(
    proc_stats_df, x='age_bins', y='PER', color='bmi_bins', hover_name='Player', facet_row='pos_simple',
    category_orders=dict(bmi_bins=bmi_labels, age_bins=age_labels, pos_simple=['PG', 'SG', 'SF', 'PF', 'C'])
)
fig.update_layout(
    autosize=False,
    width=1000,
    height=1000,
    margin=dict(
        l=50,
        r=50,
        b=100,
        t=100,
        pad=4
    ),
    paper_bgcolor="LightSteelBlue",
)
fig.show()

In [62]:
fig = px.box(
    proc_stats_df, x='height_bins', y='PER', color='age_bins', hover_name='Player',
    category_orders=dict(height_bins=ht_labels, age_bins=age_labels)
)
fig.show()

In [63]:
yr_limits = [0, 1995, 2007, np.inf]
yr_labels = [str(i) + '_' + str(yr_limits[i]) + ' to ' + str(yr_limits[i+1]) for i in range(len(yr_limits)-1)]
proc_stats_df = proc_stats_df.assign(
    year_bins=pd.cut(proc_stats_df.Year, bins=yr_limits, labels=yr_labels, right=False)
)

In [64]:
fig = px.box(
    proc_stats_df, x='height_bins', y='PER', color='age_bins', hover_name='Player', facet_row='year_bins'
    , category_orders=dict(height_bins=ht_labels, age_bins=age_labels, year_bins=yr_labels)
)
fig.show()