In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nba-basketball-player-statistics/player.csv
/kaggle/input/nba-basketball-player-statistics/teams.csv
/kaggle/input/nba-basketball-player-statistics/mvps.csv


Questions for 2020-2021 season

Individual
1. Which players had the highest PPG for each position (PG, SG, SF, PF, C)?
2. What is the relationship between AST and TOV for players with 24 + MP?
3. Which players led the league in STOCKS (stl + blk)?
4. How do age and PTS relate to each other?
5. Which players led the league in opportunities created (FTA + ORB)?


<br />






Team
1. What is the average age for each team of the players playing 20+ minutes per game?
2. Which team had the most players average over 24 minutes per game?
3. What is the average number of assists per game for each team?
4. What is the Herfindahl Index for assists for each team?
5. Of the top 3 scorers (total points) on each team, what percentage of team PPG did they contribute?


<br />



Deep Dive
1. Are FT% and 3P% correlated?
2. Group players based on PPG, REBS, ASTS, STL, BLK, FG% to find some pattern in players (use players who average 24+ mins/game) 



In [2]:
import plotly.express as px

In [3]:
# Load the player table, drop the 'Unnamed: 0' column (presumably a saved
# row index -- confirm against the CSV) and keep only rows with Year == 2021.
df = pd.read_csv('../input/nba-basketball-player-statistics/player.csv')
df = df.drop(columns=['Unnamed: 0'])

# .copy() makes the filtered frame independent, so later column assignments
# do not operate on a slice of the full table.
df = df[df["Year"] == 2021].copy()

In [4]:
def update_position(position):
    """Return the first part of a hyphenated position label.

    A label such as "SG-PG" yields "SG"; a label without a hyphen is
    returned unchanged.
    """
    return position.split("-")[0] if "-" in position else position
# Collapse hyphenated labels (e.g. "SG-PG") to their first part.
df['position'] = df['Pos'].apply(update_position)

# Drop rows whose position is the literal text 'Pos' (presumably repeated
# header rows embedded in the data -- confirm). .copy() keeps later column
# assignments off a filtered slice.
df = df[df['position'] != 'Pos'].copy()

def is_float(x):
    """Return True when ``float(x)`` succeeds, False otherwise.

    Besides unparseable strings (ValueError), values such as ``None`` or a
    list (TypeError) and integers too large for a float (OverflowError) are
    reported as non-numeric instead of propagating the exception.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

def is_str(x):
    """Return True when ``str(x)`` succeeds; False if it raises ValueError."""
    try:
        str(x)
        return True
    except ValueError:
        return False

def convert_to_numeric(df, col):
    """Return a copy of ``df`` keeping only rows whose ``col`` value passes
    ``is_float``, with ``col`` cast to float.

    Parameters:
        df: input DataFrame; it is not modified.
        col: name of the column to filter on and convert.

    Returns:
        A new DataFrame with the unparseable rows removed and ``col`` as float.
    """
    # astype(bool) keeps the mask boolean even when the frame is empty.
    parseable = df[col].apply(is_float).astype(bool)
    # Explicit copy: the assignment below writes to an independent frame
    # rather than to a filtered slice (avoids SettingWithCopyWarning).
    converted = df.loc[parseable].copy()
    converted[col] = converted[col].astype(float)
    return converted

def convert_to_string(df, col):
    """Return a copy of ``df`` keeping only rows whose ``col`` value passes
    ``is_str``, with ``col`` cast to ``str``.

    Parameters:
        df: input DataFrame; it is not modified.
        col: name of the column to filter on and convert.

    Returns:
        A new DataFrame with ``col`` converted via ``astype(str)``.
    """
    # astype(bool) keeps the mask boolean even when the frame is empty.
    convertible = df[col].apply(is_str).astype(bool)
    # Explicit copy: the assignment below writes to an independent frame
    # rather than to a filtered slice (avoids SettingWithCopyWarning).
    converted = df.loc[convertible].copy()
    converted[col] = converted[col].astype(str)
    return converted




#updating variable types

# Each column is listed once ("FTA" was previously converted twice).
numeric_columns = ["PTS", "MP", "STL", "BLK", "Age", "FTA", "ORB", "AST", "TOV", "G", "FG%", "TRB"]
string_columns = ["position", "Player", "Tm"]

# Every conversion filters on and casts only its own column, so the result
# does not depend on the order in which columns are processed.
for column in numeric_columns:
    df = convert_to_numeric(df, column)
for column in string_columns:
    df = convert_to_string(df, column)



In [5]:
def top_players_pos_cat(df, cat):
    """Return the row with the highest ``cat`` value for each position.

    Parameters:
        df: DataFrame with a 'position' column and a sortable ``cat`` column.
        cat: name of the column to rank by (descending).

    Returns:
        A DataFrame with at most one row per position, in the order
        PG, SG, SF, PF, C, keeping the original index labels. Positions with
        no rows in ``df`` are skipped; if none match, an empty frame with the
        same columns is returned.
    """
    positions = ["PG", "SG", "SF", "PF", "C"]
    ranked = df.sort_values(by=cat, ascending=False)
    # head(1) yields an empty frame (instead of raising) when a position
    # has no players.
    leaders = [ranked[ranked['position'] == pos].head(1) for pos in positions]
    leaders = [leader for leader in leaders if not leader.empty]
    if not leaders:
        return ranked.iloc[0:0]
    # Single concat replaces repeated DataFrame.append (removed in pandas 2.0).
    return pd.concat(leaders)
        
        

<a id='another_cell'></a>

Which players had the highest PPG for each position (PG, SG, SF, PF, C)?

In [6]:
# Highest-PTS row for each of the five positions (PG, SG, SF, PF, C).
players_pts_positions = top_players_pos_cat(df, "PTS")

In [7]:
# One bar per positional scoring leader, colored by position.
fig = px.bar(players_pts_positions, x='Player', y='PTS', color="position", title="PPG Leaders by Position")
fig.show()

In [8]:
# AST and TOV are already converted to float in the type-conversion cell, so
# only the minutes filter is needed here. .copy() keeps this subset
# independent of df.
asts_tos = df[df['MP'] >= 24].copy()


In [9]:
# Scatter of AST against TOV for players with MP >= 24, colored by position.
fig = px.scatter(asts_tos, x = 'AST', y = 'TOV', color = 'position', hover_data = ['Player', 'AST', 'TOV'], title= "AST + TOV w/ 24 + minutes played")
# Reference lines at the mean TOV (horizontal) and mean AST (vertical) of the plotted players.
fig.add_hline(y=asts_tos['TOV'].mean())
fig.add_vline(x =asts_tos['AST'].mean())
fig.show()

Players averaging more turnovers than assists are primarily non-PGs, and most of them are big men. Of particular note is the outlier Russell Westbrook, who averages both the most assists and the most turnovers among players playing at least 24 minutes per game. Chris Paul and Jimmy Butler are both high-assist, low-turnover players. On closer inspection, we can also see Nikola Jokic averaging 3+ more assists per game than the next-highest center, Karl-Anthony Towns (KAT).

In [10]:
#3. 
# STOCKS = steals + blocks per game.
df['STOCKS'] = df['STL'] + df['BLK']
# df itself is re-sorted here, so later cells see it in descending STOCKS order.
df = df.sort_values('STOCKS', ascending=False)
# Top 15 by STOCKS among players with MP >= 24.
top_stocks = df.loc[df['MP'] >= 24].head(15)
fig = px.bar(top_stocks, x='Player', y='STOCKS', color="position", title="STOCKS Leaders")
fig.show()

In [11]:
# Scatter of PTS against Age for every player row; the title lets the
# figure stand on its own when the notebook is skimmed.
fig = px.scatter(df, y = 'PTS', x = 'Age', hover_data = ["Player"], title = "PTS vs. Age")
fig.show()

There is little pattern or symmetry in this distribution. Zion Williamson and Luka Doncic stand out as significant outliers among players aged 21 and under, while LeBron James is the only player aged 35 or older averaging over 17 PPG; he is averaging a whopping 25 PPG.

In [12]:
# shots_created = free-throw attempts + offensive rebounds per game.
df['shots_created'] = df['FTA'] + df['ORB']
# df itself is re-sorted here, so later cells see it in descending shots_created order.
df = df.sort_values(by = 'shots_created', ascending = False)
# Separate name so the STOCKS leaderboard in `top_stocks` is not overwritten
# by an unrelated table.
top_shots_created = df[0:10]
fig = px.bar(top_shots_created, x='Player', y='shots_created', color="position", title="Shots Created Leaders")
fig.show()

In [13]:
#constructing team stats array
# One DataFrame per team code, in order of first appearance in df; rows
# labeled 'TOT' are left out.
team_stats = [
    df[df['Tm'] == team_code]
    for team_code in df['Tm'].unique()
    if team_code != 'TOT'
]

What is the average age for each team of the players playing 20+ minutes per game?

In [14]:
players = df['Player'].unique()
# Keep each player's first row only, so a player with several rows is counted
# once. This replaces the per-player DataFrame.append loop (removed in
# pandas 2.0 and quadratic in the number of players).
first_rows = df.drop_duplicates(subset='Player')
age_distribution = first_rows[['Age']].reset_index(drop=True)
# League-wide mean age over unique players.
age = age_distribution['Age'].mean()
print(round(age, 1))
fig = px.histogram(age_distribution, x = 'Age', title='Player Age Distribution of NBA 2020-2021 season')
fig.show()

25.6


It would be interesting to see this distribution over the years of the league. It does appear that this is a younger league with the average age being 25.6.

In [15]:
# Mean age per team of players with MP >= 20, rounded to one decimal.
# Rows are collected in a list and the frame is built once, replacing
# DataFrame.append in a loop (removed in pandas 2.0).
q1_rows = []
for team in team_stats:
    regulars = team[team['MP'] >= 20]
    q1_rows.append({
        'Tm': team['Tm'].iloc[0],
        'Age': round(regulars['Age'].mean(), 1)
    })
q1_team = pd.DataFrame(q1_rows, columns = ['Tm', 'Age'])

q1_team = q1_team.sort_values(by='Age', ascending = True)
fig = px.bar(q1_team, x = 'Tm', y='Age', title='avg Age of Players having 20+ mins per game')
fig.show()
    

2. Which teams had the most players average over 20 minutes per game?

In [16]:
# Number of players per team with MP >= 20. Rows are collected in a list and
# the frame is built once, replacing DataFrame.append in a loop (removed in
# pandas 2.0).
q2_rows = [
    {
        'Tm': team['Tm'].iat[0],
        'num_Players': int((team['MP'] >= 20).sum())
    }
    for team in team_stats
]
q2_team = pd.DataFrame(q2_rows, columns = ['Tm', 'num_Players'])

q2_team = q2_team.sort_values(by='num_Players', ascending = False)
fig = px.bar(q2_team, x = 'Tm', y='num_Players', title="# of Players that Avg 20+ mins/game")
fig.show()
    

In [17]:
# Number of players per team with MP >= 30. Rows are collected in a list and
# the frame is built once, replacing DataFrame.append in a loop (removed in
# pandas 2.0).
q2_rows_30 = [
    {
        'Tm': team['Tm'].iat[0],
        'num_Players': int((team['MP'] >= 30).sum())
    }
    for team in team_stats
]
q2_team1 = pd.DataFrame(q2_rows_30, columns = ['Tm', 'num_Players'])

q2_team1 = q2_team1.sort_values(by='num_Players', ascending = False)
fig = px.bar(q2_team1, x = 'Tm', y='num_Players', title="# of Players that Avg 30+ mins/game")
fig.show()

5. Of the top 3 scorers (total points) on each team, what percentage of team PPG did they contribute?

In [18]:
# For each team: total points per player (PTS * G), the three highest totals,
# and each one's share of the team total.
top3_frames = []
for team in team_stats:
    teamcopy = team.copy()
    teamcopy['tot_pts'] = teamcopy['PTS'] * teamcopy['G']
    tot_pts_team = teamcopy['tot_pts'].sum()
    # .copy() so the column assignments below do not write into a slice.
    top3 = teamcopy.sort_values(by ='tot_pts', ascending = False).head(3).copy()
    top3['percent_total'] = (top3['tot_pts']/tot_pts_team).round(2)
    top3 = top3[['Player', 'percent_total', 'tot_pts', 'Tm']]
    # reset_index gives positions 0..2, used to label rank "1".."3".
    top3 = top3.reset_index()
    top3['rank'] = (top3.index.values + 1).astype(str)
    # Combined share of the top three (sum of the rounded shares).
    top3['top_total'] = top3['percent_total'].sum()
    top3_frames.append(top3)

# Single concat replaces DataFrame.append in a loop (removed in pandas 2.0).
q5_team = pd.concat(top3_frames)

q5_team = q5_team.sort_values(['top_total', 'tot_pts'])
fig = px.bar(q5_team, x = 'Tm', y = 'percent_total', color = 'rank', hover_data = ['Player'])
fig.show()


    
    
    
    

In [19]:
def astpergame(teams=None, games=72):
    """Compute team assists per game and annotate each team frame.

    Parameters:
        teams: list of per-team DataFrames with 'Tm', 'AST' and 'G' columns.
            Defaults to the notebook-level ``team_stats`` list.
        games: number of team games to divide by. The default 72 is the value
            that was previously hard-coded (presumably the length of the
            2020-21 regular season -- confirm).

    Returns:
        DataFrame with columns 'Tm' and 'AST' (team total assists / games,
        rounded to 2 decimals), sorted by 'AST' descending.

    Side effects:
        Each entry of ``teams`` is replaced by a copy that also has the
        columns 'player_tot_ast' (AST * G) and 'tm_tot_ast' (team total).
    """
    if teams is None:
        teams = team_stats
    rows = []
    for i, team in enumerate(teams):
        teamcopy = team.copy()
        teamcopy['player_tot_ast'] = teamcopy['AST'] * teamcopy['G']
        team_total = teamcopy['player_tot_ast'].sum()
        teamcopy['tm_tot_ast'] = team_total
        teams[i] = teamcopy
        rows.append({
            'Tm': teamcopy['Tm'].iat[0],
            'AST': round(team_total / games, 2)
        })
    # Build the frame once instead of DataFrame.append in a loop (removed in pandas 2.0).
    ast_game = pd.DataFrame(rows, columns = ['Tm', 'AST'])
    return ast_game.sort_values(by='AST', ascending = False)
    

In [20]:
# Team assists per game, sorted descending; the call also adds the
# 'player_tot_ast' and 'tm_tot_ast' columns to every frame in team_stats.
ast_game = astpergame()
fig = px.bar(ast_game, x = 'Tm', y = 'AST', title = "AST/game")
fig.show()

In [21]:
def construct_h_indices(teams=None):
    """Compute a Herfindahl index of assists for each team.

    The index is the sum of squared assist shares, where a player's share is
    that player's total assists (AST * G) divided by the team total.

    Parameters:
        teams: list of per-team DataFrames with 'Tm', 'AST' and 'G' columns.
            Defaults to the notebook-level ``team_stats`` list.

    Returns:
        DataFrame with columns 'Tm' and 'HI', sorted by 'HI' descending.

    Side effects:
        Each entry of ``teams`` is replaced by a copy that also has an
        'ast_percentage' column.
    """
    if teams is None:
        teams = team_stats
    rows = []
    for i, team in enumerate(teams):
        ts = team.copy()
        # Totals are derived from AST and G here, so the function no longer
        # requires astpergame() to have added its helper columns first.
        player_totals = ts['AST'] * ts['G']
        ts['ast_percentage'] = player_totals / player_totals.sum()
        teams[i] = ts
        rows.append({
            'Tm': ts['Tm'].iat[0],
            'HI': (ts['ast_percentage'] ** 2).sum()
        })
    # Build the frame once instead of DataFrame.append in a loop (removed in pandas 2.0).
    herf_ind = pd.DataFrame(rows, columns = ['Tm', 'HI'])
    return herf_ind.sort_values(by='HI', ascending = False)

In [22]:
# Per-team Herfindahl index of assists, sorted from highest to lowest.
hi = construct_h_indices()
# Show the five teams with the highest index.
print(hi.head(5))
fig = px.bar(hi, x = 'Tm', y='HI', title="AST concentration")
fig.show()

     Tm        HI
7   WAS  0.214437
13  DAL  0.166945
4   ATL  0.160178
25  PHO  0.158364
15  POR  0.157997


High usage players like Westbrook, Doncic, Trae Young, CP3 and Lillard round out the top of the list.

Are FT% and 3P% related to each other?

In [23]:
df = convert_to_numeric(df, "FT%")
df = convert_to_numeric(df, "3P%")
# Pearson correlation between the two percentages (Series.corr skips rows
# where either value is missing); shown in the title so the figure answers
# the question directly.
ft_3p_corr = df['FT%'].corr(df['3P%'])
fig = px.scatter(df, x = 'FT%', y = '3P%', title = f"FT% vs 3P% (Pearson r = {ft_3p_corr:.2f})")
fig.show()


Use K-Means to Find Clusters of players with Similar Playing Styles

In [24]:
#player classification

#ppg, rbpg, stl, blk, ast/tov ratio, fg%
# df['AST_TOV'] = df['AST']/( df['TOV'] + 0.0001)

# From here on df holds only players with MP >= 24.
df = df.loc[df['MP'] >= 24]
# Identity columns plus the six stats to compare; rows with a missing value
# in any of these columns are dropped.
q1_dd = df.loc[:, ['position', 'Player', 'AST', 'FG%', 'TRB', 'PTS', 'STL', 'BLK']].dropna()

In [25]:
# Mean of each stat by position. numeric_only=True excludes the string
# 'Player' column, which otherwise makes .mean() raise on pandas >= 2.0.
sub = q1_dd.groupby('position').mean(numeric_only=True).round(2).reset_index()
sub.head()

Unnamed: 0,position,AST,FG%,TRB,PTS,STL,BLK
0,C,2.18,0.54,8.6,14.31,0.77,1.2
1,PF,2.81,0.46,5.83,13.61,0.85,0.62
2,PG,5.96,0.45,4.36,17.29,1.13,0.35
3,SF,2.62,0.45,4.9,14.74,0.92,0.43
4,SG,3.35,0.43,3.85,15.8,0.98,0.4


In [26]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# (column in `sub`, legend label, subplot row, subplot column), listed in the
# order the traces are added to the figure.
panels = [
    ('PTS', 'PTS', 1, 1),
    ('BLK', 'BLK', 3, 2),
    ('STL', 'STL', 3, 1),
    ('AST', 'AST', 1, 2),
    ('TRB', 'REBS', 2, 1),
    ('FG%', 'FG%', 2, 2),
]

# 3x2 grid with one bar chart of per-position means per stat.
fig = make_subplots(rows=3, cols=2)
for stat, label, grid_row, grid_col in panels:
    fig.add_trace(
        go.Bar(x=sub['position'], y=sub[stat], name=label),
        row=grid_row, col=grid_col
    )

fig.update_layout(legend_title_text = "Per Game")

fig.show()

In [27]:
# Feature matrix for clustering: the six numeric stats only. q1_dd was already
# built with dropna() over these columns, so this dropna() is a safeguard.
km = q1_dd[['AST', 'FG%', 'TRB', 'PTS', 'STL', 'BLK']].dropna()

In [28]:
from sklearn.cluster import KMeans
# random_state makes the clustering reproducible across runs. n_init is set
# explicitly because its default differs between scikit-learn releases.
# NOTE(review): the features are clustered on their raw scales, so PTS (the
# largest values in the per-position table) likely dominates the distance --
# consider standardizing; confirm intent.
Kmean = KMeans(n_clusters=2, n_init=10, random_state=0)
Kmean.fit(km)

Kmean.cluster_centers_

array([[ 2.61144578,  0.46239759,  4.96024096, 11.99457831,  0.86566265,
         0.59578313],
       [ 5.13026316,  0.47710526,  6.35131579, 22.18947368,  1.09078947,
         0.57631579]])

The above output shows the two centroids found by k-means (k = 2) on the six quantitative stats above. There is little difference in blocks, steals and FG% between the two centroids; points and assists differ the most. This k-means output is a simple way of demonstrating that players who average more points and assists are meaningfully separated from those who do not. The variability in these statistics is what drives the separation into two distinct centroids. Note that the stats were not standardized before clustering, so PTS, which has the largest numeric range, carries the most weight in the distance calculation.