In [2]:
import pandas as pd

# Retrieve HTML table data
url = 'https://www.basketball-reference.com/leagues/NBA_2023_per_game.html'
html = pd.read_html(url, header = 0)
df2023 = html[0]

In [3]:
raw = df2023.drop(df2023[df2023.Age == 'Age'].index)
raw

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1,Precious Achiuwa,C,23,TOR,55,12,20.7,3.6,7.3,...,.702,1.8,4.1,6.0,0.9,0.6,0.5,1.1,1.9,9.2
1,2,Steven Adams,C,29,MEM,42,42,27.0,3.7,6.3,...,.364,5.1,6.5,11.5,2.3,0.9,1.1,1.9,2.3,8.6
2,3,Bam Adebayo,C,25,MIA,75,75,34.6,8.0,14.9,...,.806,2.5,6.7,9.2,3.2,1.2,0.8,2.5,2.8,20.4
3,4,Ochai Agbaji,SG,22,UTA,59,22,20.5,2.8,6.5,...,.812,0.7,1.3,2.1,1.1,0.3,0.3,0.7,1.7,7.9
4,5,Santi Aldama,PF,22,MEM,77,20,21.8,3.2,6.8,...,.750,1.1,3.7,4.8,1.3,0.6,0.6,0.8,1.9,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
700,535,Thaddeus Young,PF,34,TOR,54,9,14.7,2.0,3.7,...,.692,1.3,1.8,3.1,1.4,1.0,0.1,0.8,1.6,4.4
701,536,Trae Young,PG,24,ATL,73,73,34.8,8.2,19.0,...,.886,0.8,2.2,3.0,10.2,1.1,0.1,4.1,1.4,26.2
702,537,Omer Yurtseven,C,24,MIA,9,0,9.2,1.8,3.0,...,.833,0.9,1.7,2.6,0.2,0.2,0.2,0.4,1.8,4.4
703,538,Cody Zeller,C,30,MIA,15,2,14.5,2.5,3.9,...,.686,1.7,2.6,4.3,0.7,0.2,0.3,0.9,2.2,6.5


**Column Descriptions**
Column | Description
---|---
Rk | Rank
Pos | Position
Age | Player's age on February 1 of the season
Tm | Team
G | Games
GS | Games Started
MP | Minutes Played Per Game
FG | Field Goals Per Game
FGA | Field Goal Attempts Per Game
FG% | Field Goal Percentage
3P | 3-Point Field Goals Per Game
3PA | 3-Point Field Goal Attempts Per Game
3P% | FG% on 3-Pt FGAs.
2P | 2-Point Field Goals Per Game
2PA | 2-Point Field Goal Attempts Per Game
2P% | FG% on 2-Pt FGAs.
eFG% | Effective Field Goal Percentage
FT | Free Throws Per Game
FTA | Free Throw Attempts Per Game
FT% | Free Throw Percentage
ORB | Offensive Rebounds Per Game
DRB | Defensive Rebounds Per Game
TRB | Total Rebounds Per Game
AST | Assists Per Game
STL | Steals Per Game
BLK | Blocks Per Game
TOV | Turnovers Per Game
PF | Personal Fouls Per Game
PTS | Points Per Game

**Data Cleaning**

In [4]:
# Data Dimension
raw.shape

(679, 30)

In [5]:
# Dataframe contents
raw.head()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1,Precious Achiuwa,C,23,TOR,55,12,20.7,3.6,7.3,...,0.702,1.8,4.1,6.0,0.9,0.6,0.5,1.1,1.9,9.2
1,2,Steven Adams,C,29,MEM,42,42,27.0,3.7,6.3,...,0.364,5.1,6.5,11.5,2.3,0.9,1.1,1.9,2.3,8.6
2,3,Bam Adebayo,C,25,MIA,75,75,34.6,8.0,14.9,...,0.806,2.5,6.7,9.2,3.2,1.2,0.8,2.5,2.8,20.4
3,4,Ochai Agbaji,SG,22,UTA,59,22,20.5,2.8,6.5,...,0.812,0.7,1.3,2.1,1.1,0.3,0.3,0.7,1.7,7.9
4,5,Santi Aldama,PF,22,MEM,77,20,21.8,3.2,6.8,...,0.75,1.1,3.7,4.8,1.3,0.6,0.6,0.8,1.9,9.0


In [6]:
# Check for missing values
raw.isnull().sum()

Rk         0
Player     0
Pos        0
Age        0
Tm         0
G          0
GS         0
MP         0
FG         0
FGA        0
FG%        3
3P         0
3PA        0
3P%       24
2P         0
2PA        0
2P%        7
eFG%       3
FT         0
FTA        0
FT%       37
ORB        0
DRB        0
TRB        0
AST        0
STL        0
BLK        0
TOV        0
PF         0
PTS        0
dtype: int64

In [7]:
# Replace missing values with 0
df = raw.fillna(0)
df.isnull().sum()

Rk        0
Player    0
Pos       0
Age       0
Tm        0
G         0
GS        0
MP        0
FG        0
FGA       0
FG%       0
3P        0
3PA       0
3P%       0
2P        0
2PA       0
2P%       0
eFG%      0
FT        0
FTA       0
FT%       0
ORB       0
DRB       0
TRB       0
AST       0
STL       0
BLK       0
TOV       0
PF        0
PTS       0
dtype: int64

In [8]:
# Drop "Rank" column
df = df.drop(["Rk"], axis = 1)
df

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Precious Achiuwa,C,23,TOR,55,12,20.7,3.6,7.3,.485,...,.702,1.8,4.1,6.0,0.9,0.6,0.5,1.1,1.9,9.2
1,Steven Adams,C,29,MEM,42,42,27.0,3.7,6.3,.597,...,.364,5.1,6.5,11.5,2.3,0.9,1.1,1.9,2.3,8.6
2,Bam Adebayo,C,25,MIA,75,75,34.6,8.0,14.9,.540,...,.806,2.5,6.7,9.2,3.2,1.2,0.8,2.5,2.8,20.4
3,Ochai Agbaji,SG,22,UTA,59,22,20.5,2.8,6.5,.427,...,.812,0.7,1.3,2.1,1.1,0.3,0.3,0.7,1.7,7.9
4,Santi Aldama,PF,22,MEM,77,20,21.8,3.2,6.8,.470,...,.750,1.1,3.7,4.8,1.3,0.6,0.6,0.8,1.9,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
700,Thaddeus Young,PF,34,TOR,54,9,14.7,2.0,3.7,.545,...,.692,1.3,1.8,3.1,1.4,1.0,0.1,0.8,1.6,4.4
701,Trae Young,PG,24,ATL,73,73,34.8,8.2,19.0,.429,...,.886,0.8,2.2,3.0,10.2,1.1,0.1,4.1,1.4,26.2
702,Omer Yurtseven,C,24,MIA,9,0,9.2,1.8,3.0,.593,...,.833,0.9,1.7,2.6,0.2,0.2,0.2,0.4,1.8,4.4
703,Cody Zeller,C,30,MIA,15,2,14.5,2.5,3.9,.627,...,.686,1.7,2.6,4.3,0.7,0.2,0.3,0.9,2.2,6.5


In [9]:
# Write to csv file
df.to_csv("nba2023.csv", index=False)

**Exploratory Data Analysis**

In [10]:
# Read data
df = pd.read_csv("nba2023.csv")
df

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Precious Achiuwa,C,23,TOR,55,12,20.7,3.6,7.3,0.485,...,0.702,1.8,4.1,6.0,0.9,0.6,0.5,1.1,1.9,9.2
1,Steven Adams,C,29,MEM,42,42,27.0,3.7,6.3,0.597,...,0.364,5.1,6.5,11.5,2.3,0.9,1.1,1.9,2.3,8.6
2,Bam Adebayo,C,25,MIA,75,75,34.6,8.0,14.9,0.540,...,0.806,2.5,6.7,9.2,3.2,1.2,0.8,2.5,2.8,20.4
3,Ochai Agbaji,SG,22,UTA,59,22,20.5,2.8,6.5,0.427,...,0.812,0.7,1.3,2.1,1.1,0.3,0.3,0.7,1.7,7.9
4,Santi Aldama,PF,22,MEM,77,20,21.8,3.2,6.8,0.470,...,0.750,1.1,3.7,4.8,1.3,0.6,0.6,0.8,1.9,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
674,Thaddeus Young,PF,34,TOR,54,9,14.7,2.0,3.7,0.545,...,0.692,1.3,1.8,3.1,1.4,1.0,0.1,0.8,1.6,4.4
675,Trae Young,PG,24,ATL,73,73,34.8,8.2,19.0,0.429,...,0.886,0.8,2.2,3.0,10.2,1.1,0.1,4.1,1.4,26.2
676,Omer Yurtseven,C,24,MIA,9,0,9.2,1.8,3.0,0.593,...,0.833,0.9,1.7,2.6,0.2,0.2,0.2,0.4,1.8,4.4
677,Cody Zeller,C,30,MIA,15,2,14.5,2.5,3.9,0.627,...,0.686,1.7,2.6,4.3,0.7,0.2,0.3,0.9,2.2,6.5


In [11]:
# Overview of data types of each column in the dataframe
df.dtypes

Player     object
Pos        object
Age         int64
Tm         object
G           int64
GS          int64
MP        float64
FG        float64
FGA       float64
FG%       float64
3P        float64
3PA       float64
3P%       float64
2P        float64
2PA       float64
2P%       float64
eFG%      float64
FT        float64
FTA       float64
FT%       float64
ORB       float64
DRB       float64
TRB       float64
AST       float64
STL       float64
BLK       float64
TOV       float64
PF        float64
PTS       float64
dtype: object

Which player scored the most points Per Game (PTS)?

In [12]:
player_max_pts = df[df.PTS == df.PTS.max()]
player_max_pts

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
184,Joel Embiid,C,28,PHI,66,66,34.6,11.0,20.1,0.548,...,0.857,1.7,8.4,10.2,4.2,1.0,1.7,3.4,3.1,33.1


In [14]:
# What team is the player from? 
player_max_pts.Tm

# Which position is the player playing at?
player_max_pts.Pos

# How many games did the player played in the season?
player_max_pts.G


184    66
Name: G, dtype: int64

In [15]:
# Which players scored more than 20 Points Per Game (PTS)?
df[df.PTS > 20]


Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
2,Bam Adebayo,C,25,MIA,75,75,34.6,8.0,14.9,0.54,...,0.806,2.5,6.7,9.2,3.2,1.2,0.8,2.5,2.8,20.4
12,Giannis Antetokounmpo,PF,28,MIL,63,63,32.1,11.2,20.3,0.553,...,0.645,2.2,9.6,11.8,5.7,0.8,0.8,3.9,3.1,31.1
24,LaMelo Ball,PG,21,CHO,36,36,35.2,8.2,20.0,0.411,...,0.836,1.2,5.3,6.4,8.4,1.3,0.3,3.6,3.3,23.3
29,Desmond Bane,SG,24,MEM,58,58,31.7,7.8,16.2,0.479,...,0.883,0.7,4.3,5.0,4.4,1.0,0.4,2.2,2.6,21.5
44,Bradley Beal,SG,29,WAS,50,50,33.5,8.9,17.6,0.506,...,0.842,0.8,3.1,3.9,5.4,0.9,0.7,2.9,2.1,23.2
63,Bojan Bogdanović,PF,33,DET,59,59,32.1,7.3,14.9,0.488,...,0.884,0.6,3.2,3.8,2.6,0.6,0.1,2.3,1.9,21.6
66,Devin Booker,SG,26,PHO,53,53,34.6,9.9,20.1,0.494,...,0.855,0.9,3.7,4.5,5.5,1.0,0.3,2.7,3.0,27.8
77,Mikal Bridges,SF-SG,26,TOT,83,83,35.7,7.1,15.3,0.468,...,0.895,1.0,3.4,4.4,3.3,1.1,0.7,1.5,1.9,20.1
79,Mikal Bridges,SG,26,BRK,27,27,34.2,8.9,18.6,0.475,...,0.894,0.9,3.6,4.5,2.7,1.0,0.6,1.8,1.6,26.1
85,Jaylen Brown,SF,26,BOS,67,67,35.9,10.1,20.6,0.491,...,0.765,1.2,5.7,6.9,3.5,1.1,0.4,2.9,2.6,26.6


In [17]:
# Which player had the highest 3-Point Field Goals Per Game (3P)?
df[df["3P"] == df["3P"].max()]

# Which player had the highest Assists Per Game (AST)?
df[df["AST"] == df["AST"].max()]

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
245,James Harden,PG,33,PHI,58,58,36.8,6.4,14.5,0.441,...,0.867,0.7,5.4,6.1,10.7,1.2,0.5,3.4,1.9,21.0


**GroupBy() function**

In [18]:
# Which player scored the highest (PTS) in the Los Angeles Lakers?
la_lakers = df.groupby("Tm").get_group("LAL")
la_lakers[la_lakers.PTS == la_lakers.PTS.max()]

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
306,LeBron James,PF,38,LAL,55,54,35.5,11.1,22.2,0.5,...,0.768,1.2,7.1,8.3,6.8,0.9,0.6,3.2,1.6,28.9


Of the 5 positions, which position scores the most points?

In [19]:
# Group players by their positions
df.groupby("Pos").PTS.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Pos,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
C,139.0,7.614388,5.542413,0.0,3.75,6.1,10.25,33.1
PF,121.0,9.400826,7.131602,0.0,4.5,7.6,11.2,31.1
PF-C,1.0,6.4,,6.4,6.4,6.4,6.4,6.4
PF-SF,1.0,7.5,,7.5,7.5,7.5,7.5,7.5
PG,116.0,10.349138,7.785276,0.0,4.525,7.95,14.45,32.4
PG-SG,2.0,16.05,15.62706,5.0,10.525,16.05,21.575,27.1
SF,133.0,8.33609,6.22746,0.0,3.9,6.7,11.1,30.1
SF-PF,1.0,6.6,,6.6,6.6,6.6,6.6,6.6
SF-SG,2.0,12.1,11.313708,4.1,8.1,12.1,16.1,20.1
SG,161.0,8.757143,6.247607,0.0,4.5,7.3,11.1,28.3


Of the 5 positions, which position scores the most points?

In [20]:
# Group players by their positions
df.groupby('Pos').PTS.describe()

# Show only the 5 traditional positions (those having combo positions will be removed from the analysis)
positions = ['C','PF','SF','PG','SG']
pos = df[df["Pos"].isin(positions)]
pos

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Precious Achiuwa,C,23,TOR,55,12,20.7,3.6,7.3,0.485,...,0.702,1.8,4.1,6.0,0.9,0.6,0.5,1.1,1.9,9.2
1,Steven Adams,C,29,MEM,42,42,27.0,3.7,6.3,0.597,...,0.364,5.1,6.5,11.5,2.3,0.9,1.1,1.9,2.3,8.6
2,Bam Adebayo,C,25,MIA,75,75,34.6,8.0,14.9,0.540,...,0.806,2.5,6.7,9.2,3.2,1.2,0.8,2.5,2.8,20.4
3,Ochai Agbaji,SG,22,UTA,59,22,20.5,2.8,6.5,0.427,...,0.812,0.7,1.3,2.1,1.1,0.3,0.3,0.7,1.7,7.9
4,Santi Aldama,PF,22,MEM,77,20,21.8,3.2,6.8,0.470,...,0.750,1.1,3.7,4.8,1.3,0.6,0.6,0.8,1.9,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
674,Thaddeus Young,PF,34,TOR,54,9,14.7,2.0,3.7,0.545,...,0.692,1.3,1.8,3.1,1.4,1.0,0.1,0.8,1.6,4.4
675,Trae Young,PG,24,ATL,73,73,34.8,8.2,19.0,0.429,...,0.886,0.8,2.2,3.0,10.2,1.1,0.1,4.1,1.4,26.2
676,Omer Yurtseven,C,24,MIA,9,0,9.2,1.8,3.0,0.593,...,0.833,0.9,1.7,2.6,0.2,0.2,0.2,0.4,1.8,4.4
677,Cody Zeller,C,30,MIA,15,2,14.5,2.5,3.9,0.627,...,0.686,1.7,2.6,4.3,0.7,0.2,0.3,0.9,2.2,6.5


In [21]:
# Take a look at the descriptive statistics
pos.groupby("Pos").describe()

Unnamed: 0_level_0,Age,Age,Age,Age,Age,Age,Age,Age,G,G,...,PF,PF,PTS,PTS,PTS,PTS,PTS,PTS,PTS,PTS
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Pos,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
C,139.0,26.215827,4.242235,19.0,23.0,25.0,29.0,42.0,139.0,43.503597,...,2.4,3.6,139.0,7.614388,5.542413,0.0,3.75,6.1,10.25,33.1
PF,121.0,26.512397,4.837554,19.0,23.0,26.0,30.0,39.0,121.0,45.07438,...,2.2,5.0,121.0,9.400826,7.131602,0.0,4.5,7.6,11.2,31.1
PG,116.0,26.87931,4.736822,19.0,23.0,26.0,31.0,37.0,116.0,42.413793,...,2.125,3.3,116.0,10.349138,7.785276,0.0,4.525,7.95,14.45,32.4
SF,133.0,25.37594,3.508866,19.0,23.0,25.0,27.0,36.0,133.0,43.015038,...,2.1,3.4,133.0,8.33609,6.22746,0.0,3.9,6.7,11.1,30.1
SG,161.0,25.254658,4.114425,19.0,22.0,25.0,28.0,36.0,161.0,41.714286,...,1.9,3.3,161.0,8.757143,6.247607,0.0,4.5,7.3,11.1,28.3


**Histograms**

In [22]:
# Create a subset dataframe
pts = df[["Pos", "PTS"]]

positions = ['C','PF','SF','PG','SG']
pts = pts[ pts['Pos'].isin(positions)]

pts

Unnamed: 0,Pos,PTS
0,C,9.2
1,C,8.6
2,C,20.4
3,SG,7.9
4,PF,9.0
...,...,...
674,PF,4.4
675,PG,26.2
676,C,4.4
677,C,6.5


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

h = sns.FacetGrid(pts, col = )