In [254]:
import numpy as np
import pandas as pd

#raw data as a csv file
filepath = 'data/NBASTATS2023.csv' 

#display all rows/columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

#read csv into specific dataframe
df = pd.read_csv(filepath)

#shows how many players and fields we are working with

print(df.shape) 

#uncomment below to see full dataset
#print(df)

#check for missing values in our dataframe
#ensure dataset was formatted and parsed ok

missing_values = df.isnull().sum()

print(missing_values)

#retrieve values for a specific player
print("Bam Adebayo Stats \n")
df.loc[df['NAME'] == 'Bam Adebayo']




(609, 29)
RANK     609
NAME       0
TEAM       0
POS        0
AGE        0
GP         0
MPG        0
USG%       0
TO%        1
FTA        0
FT%        0
2PA        0
2P%        0
3PA        0
3P%        0
eFG%       3
TS%        3
PPG        0
RPG        0
APG        0
SPG        0
BPG        0
TPG        0
P+R        0
P+A        0
P+R+A      0
VI         0
ORtg      24
DRtg      23
dtype: int64
Bam Adebayo Stats 



Unnamed: 0,RANK,NAME,TEAM,POS,AGE,GP,MPG,USG%,TO%,FTA,FT%,2PA,2P%,3PA,3P%,eFG%,TS%,PPG,RPG,APG,SPG,BPG,TPG,P+R,P+A,P+R+A,VI,ORtg,DRtg
2,,Bam Adebayo,Mia,C-F,25.7,75,34.6,25.3,14.4,402,0.806,1102,0.545,12,0.083,0.541,0.592,20.4,9.2,3.2,1.2,0.8,2.5,29.6,23.6,32.8,9.8,116.0,105.6


CLEANING

We can see that rank shows as invalid(NaN) for all players(if we uncomment line 19) , so lets drop rank from our dataset

We also can do further cleaning of our dataset. In this case, players whos stats cannot be calculated for some fields(NaN) likely have played not enough games, or have not gotten >0 in a certain stat. We can trim our dataset by either filtering players out who have played very few minutes/games(option 1), or we can just remove any player that has a null value for any stat(option 2), or input filler numbers for our NaN values(option 3). If you dont know exactly how some of the advanced stats are calculated, option 3 is risky, so we will focus on the first two options

In [256]:
df = df.drop('RANK', axis=1)
#show new shape, should have 1 less column than previously
df.shape

(609, 28)

In [257]:


#Option 1
#removes players who ahve played less than 10 games and average less than 
#minutes per game
option_1_df = df.loc[(df['GP']>10) & (df['MPG']>5)]
print(option_1_df.shape)

missing_values = option_1_df.isnull().sum()

print(missing_values)



(503, 28)
NAME     0
TEAM     0
POS      0
AGE      0
GP       0
MPG      0
USG%     0
TO%      0
FTA      0
FT%      0
2PA      0
2P%      0
3PA      0
3P%      0
eFG%     0
TS%      0
PPG      0
RPG      0
APG      0
SPG      0
BPG      0
TPG      0
P+R      0
P+A      0
P+R+A    0
VI       0
ORtg     0
DRtg     0
dtype: int64


We can see that we have trimmed 106 players from our list using the minutes/games restriction, and have in fact gotten
rid of all the players with missing values. However, this is likely overkill as some players have played less than 10 games and less than 5 minutes per game while having analyzable values.


In [259]:
#Option 2
#remove only players who have an invalid value
o2_df= df.loc[(df['ORtg'].notnull()) & (df['eFG%'].notnull())]
print(o2_df.shape)
#we can add in all the other or clauses, but these 2 do the trick due to how these stats overlap

missing_values = o2_df.isnull().sum()

print(missing_values)

(585, 28)
NAME     0
TEAM     0
POS      0
AGE      0
GP       0
MPG      0
USG%     0
TO%      0
FTA      0
FT%      0
2PA      0
2P%      0
3PA      0
3P%      0
eFG%     0
TS%      0
PPG      0
RPG      0
APG      0
SPG      0
BPG      0
TPG      0
P+R      0
P+A      0
P+R+A    0
VI       0
ORtg     0
DRtg     0
dtype: int64


Now we can see that we have elimnated only 24 players and still have all analyzable values

In [261]:
#I would like to see players who score over 20 points per game
#lets store this as a new dataframe and we can do some transformations
pts_over_20 = o2_df.loc[o2_df['PPG'] >= 20]
print(pts_over_20.shape)

#From this we can see that there are 60 players in the 2022-23 NBA season who averaged over 20 points each game



(60, 28)


To do some more advanced filtering: 
I want to separate elite shooters(1) from strong defensive
players(2), from those who are neither(3), or both(4). We are going to create a new field to tag our players with, and set some conditions for each category. 

In order to qualify as an elite shooter:
- over 50 games played(large sample size,82 games is max)
- score over 20 points per game(ppg)
- High efficiency(Free throw% over 80%, Field goal percentage over 35%, True shooting % over 55%)

In order to qualify as a strong defensive player:
- over 50 games played
- 107 or lower defensive rating(league average is 110, lower is better)
- 1 block or steal per game




In [263]:
elite_shooters = pts_over_20.loc[(pts_over_20['GP']>50) & (pts_over_20['TS%']> .55) 
                                 & (pts_over_20['eFG%'] >.35) & (pts_over_20['FT%']>.80)]
print(elite_shooters.shape)
elite_shooters

(31, 28)


Unnamed: 0,NAME,TEAM,POS,AGE,GP,MPG,USG%,TO%,FTA,FT%,2PA,2P%,3PA,3P%,eFG%,TS%,PPG,RPG,APG,SPG,BPG,TPG,P+R,P+A,P+R+A,VI,ORtg,DRtg
2,Bam Adebayo,Mia,C-F,25.7,75,34.6,25.3,14.4,402,0.806,1102,0.545,12,0.083,0.541,0.592,20.4,9.2,3.2,1.2,0.8,2.5,29.6,23.6,32.8,9.8,116.0,105.6
26,Desmond Bane,Mem,G,24.8,58,31.8,26.1,11.8,205,0.883,532,0.534,407,0.408,0.568,0.606,21.5,5.0,4.4,1.0,0.4,2.2,26.5,25.9,30.9,9.8,118.8,107.0
53,Bojan Bogdanovic,Det,F,34.0,59,32.1,25.9,13.3,303,0.884,529,0.539,353,0.411,0.57,0.627,21.6,3.8,2.6,0.6,0.1,2.3,25.4,24.2,27.9,7.4,117.7,117.5
57,Devin Booker,Pho,G,26.4,53,34.6,31.8,12.0,358,0.855,751,0.554,316,0.351,0.546,0.601,27.8,4.5,5.5,1.0,0.3,2.7,32.3,33.3,37.8,10.2,118.8,109.0
77,Jalen Brunson,Nyk,G,26.6,68,35.0,27.2,10.6,392,0.829,873,0.519,322,0.416,0.547,0.597,24.0,3.5,6.2,0.9,0.2,2.1,27.6,30.2,33.8,9.2,124.6,112.6
85,Jimmy Butler,Mia,F,33.6,64,33.4,25.6,10.2,555,0.85,785,0.564,103,0.35,0.56,0.647,22.9,5.8,5.3,1.8,0.3,1.6,28.7,28.2,34.0,10.7,135.9,111.3
102,Jordan Clarkson,Uta,G,30.8,61,32.6,27.8,15.3,245,0.816,572,0.53,459,0.338,0.519,0.558,20.8,4.0,4.4,0.5,0.2,3.0,24.9,25.3,29.3,8.8,108.6,115.8
116,Stephen Curry,Gol,G,35.1,56,34.7,31.0,13.6,281,0.915,494,0.579,639,0.427,0.614,0.656,29.4,6.1,6.3,0.9,0.4,3.2,35.5,35.7,41.8,12.0,123.4,112.1
125,DeMar DeRozan,Chi,G-F,33.7,74,36.2,27.8,10.5,523,0.872,1161,0.526,142,0.324,0.522,0.592,24.5,4.6,5.1,1.1,0.5,2.1,29.2,29.6,34.3,9.2,118.8,109.1
154,Joel Embiid,Phi,C-F,29.1,66,34.6,37.0,14.5,771,0.857,1128,0.587,200,0.33,0.573,0.655,33.1,10.2,4.2,1.0,1.7,3.4,43.2,37.2,47.4,13.0,124.4,104.1


We can see we have a pool of 32 players who meet our criteria for elite shooters

In [265]:
strong_defense = o2_df.loc[o2_df['GP']>20]
strong_defense = strong_defense.loc[(strong_defense['DRtg']<107) 
                                    & ((strong_defense['SPG']>=1)|(strong_defense['BPG']>=1))]
print(strong_defense.shape)
strong_defense

(47, 28)


Unnamed: 0,NAME,TEAM,POS,AGE,GP,MPG,USG%,TO%,FTA,FT%,2PA,2P%,3PA,3P%,eFG%,TS%,PPG,RPG,APG,SPG,BPG,TPG,P+R,P+A,P+R+A,VI,ORtg,DRtg
1,Steven Adams,Mem,C,29.7,42,27.0,14.6,23.1,129,0.364,262,0.599,1,0.0,0.597,0.564,8.6,11.5,2.3,0.9,1.1,1.9,20.1,10.9,22.5,9.0,118.1,103.2
2,Bam Adebayo,Mia,C-F,25.7,75,34.6,25.3,14.4,402,0.806,1102,0.545,12,0.083,0.541,0.592,20.4,9.2,3.2,1.2,0.8,2.5,29.6,23.6,32.8,9.8,116.0,105.6
8,Jarrett Allen,Cle,C,25.0,68,32.6,16.5,12.9,221,0.733,616,0.653,10,0.1,0.645,0.67,14.3,9.8,1.7,0.8,1.2,1.4,24.0,15.9,25.7,7.6,133.1,105.0
9,Jose Alvarado,Nor,G,25.0,61,21.5,19.8,14.2,80,0.813,242,0.488,247,0.336,0.496,0.525,9.0,2.3,3.0,1.1,0.2,1.3,11.3,12.1,14.4,7.4,106.6,106.0
23,Mo Bamba,Orl,C,24.9,40,17.0,16.3,9.3,51,0.686,106,0.594,108,0.398,0.596,0.613,7.3,4.6,1.1,0.3,1.0,0.6,11.9,8.3,12.9,7.8,125.3,102.7
45,Patrick Beverley,Chi,G,34.7,22,27.5,10.5,13.1,15,0.533,38,0.579,81,0.309,0.5,0.506,5.8,4.9,3.5,1.0,0.7,0.8,10.6,9.2,14.1,6.7,114.3,103.9
51,Bismack Biyombo,Pho,C,30.6,61,14.3,14.0,19.8,70,0.357,206,0.578,0,0.0,0.578,0.555,4.3,4.2,0.9,0.3,1.4,0.8,8.6,5.2,9.5,7.1,109.3,101.6
73,Jaylen Brown,Bos,G-F,26.5,67,35.9,31.5,12.5,344,0.765,896,0.576,487,0.335,0.55,0.581,26.6,6.9,3.5,1.1,0.4,2.9,33.5,30.1,36.9,9.6,109.8,106.5
94,Alex Caruso,Chi,G,29.1,67,23.5,11.1,21.2,73,0.808,135,0.556,151,0.364,0.551,0.588,5.6,2.9,2.9,1.5,0.7,1.1,8.5,8.5,11.4,6.1,115.1,104.0
103,Nic Claxton,Bro,F-C,24.0,76,29.9,15.5,14.1,246,0.541,585,0.708,2,0.0,0.705,0.691,12.6,9.2,1.9,0.9,2.5,1.3,21.9,14.5,23.8,8.1,133.4,105.2


We have 47 players who meet our criteria as strong defenders

Next lets move our criteria to value for fantasy games. Fantasy basketball for the league I am in uses the following modifiers: 

Points Scored:1 Points

Total Rebounds:1.2 Points

Assists:1.5 Points

Steals:3 Points

Blocked Shots:3 Points

Turnovers:-1 Points

Missed shot = -.3 Points

(in order to accurately calculate efficiency based on points scored/missed shots, we will use a weighted formula based on accuracy of 2 pointers, 3 pointers, and freethrows)

In [307]:
#Lets create a fantasy points category and append it to our dataframe, maxing out at two decimal points

from math import trunc

#truncate our values so that its rounded to 2 decimals
def truncate(num):
    num = trunc(num*100)/100
    return num
fp = []

for index, row in o2_df.iterrows():
    fanpoints = truncate((1.2 * row['RPG']) + (1.5 * row['APG']) +
                         (3*(row['SPG'] +row['BPG'])) - row['TPG'] + 
                         #formula calculating points scored and missed shots
                         (2*row['2P%']) +(3*row['3P%']) -(.3*(1-(row['2P%'])+(1-row['3P%']))) +(row['FTA']*row['FT%']/row['GP']))
    
    fp.append(fanpoints)


  


In [328]:
o2_df['FANPOINTS'] = fp
o2_df.iloc[0]
#o2_df[['NAME','FANPOINTS']].sort_values(by='FANPOINTS', ascending = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  o2_df['FANPOINTS'] = fp


NAME         Precious Achiuwa
TEAM                      Tor
POS                         F
AGE                      23.6
GP                         55
MPG                      20.7
USG%                     19.3
TO%                      12.7
FTA                       124
FT%                     0.702
2PA                       296
2P%                     0.564
3PA                       108
3P%                     0.269
eFG%                    0.521
TS%                     0.554
PPG                       9.2
RPG                       6.0
APG                       0.9
SPG                       0.6
BPG                       0.5
TPG                       1.1
P+R                      15.2
P+A                      10.1
P+R+A                    16.1
VI                        7.1
ORtg                    112.3
DRtg                    106.9
FANPOINTS               13.91
Name: 0, dtype: object

In [311]:
import pandas as pd

#raw data as a csv file
filepath = 'NBASTATS2023.csv' 

#display all rows/columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
rankings = pd.read_csv('data/FantasyPros_2023_Overall_NBA_ADP_Rankings.csv')

In [341]:
print(rankings.shape)
rankings.iloc[1]
rankings[['NAME','AVG']].sort_values(by='AVG', ascending = True)

(257, 8)


<bound method NDFrame.head of                          NAME    AVG
0                Nikola Jokic    1.0
1                 Luka Doncic    2.7
2       Giannis Antetokounmpo    3.3
3                 Joel Embiid    4.0
4                Jayson Tatum    4.7
5     Shai Gilgeous-Alexander    5.3
6               Stephen Curry    8.0
7           Tyrese Haliburton    8.3
8              Damian Lillard    9.3
9                Kevin Durant   10.7
10           Domantas Sabonis   12.0
11              Anthony Davis   14.0
12            Anthony Edwards   14.3
13                 Trae Young   14.3
14                LaMelo Ball   15.3
15               Devin Booker   15.7
16               LeBron James   16.3
17           Donovan Mitchell   19.0
18          Victor Wembanyama   20.0
19               Kyrie Irving   21.3
20              Mikal Bridges   21.7
21              Pascal Siakam   24.0
22               Jimmy Butler   25.0
23         Karl-Anthony Towns   26.7
24               De'Aaron Fox   29.0
25      

This is a table that contains the average pick each player is taken in fantasy drafts across different websites.

In fantasy basketball, higher average draft pick means that the player has higher value when you are picking(opponent may select). Lets combine our rankings and stats dataframes and create a new column for weighted fantasy value(Fantasy points per game /(1.002 ** AVG draft pick), because a earlier avg draft pick with same points is slightly higher priority.

We will do an inner join on the name of the player to combine our data from our two tables, and then add our final column, WFPG(Weighted Fan Points/Game)

In [338]:
weighted = o2_df.copy(deep = True)
print(weighted.shape)
weightedFP = []
rankings = rankings.rename(columns = {'Player':'NAME'})
weighted = weighted.drop_duplicates('NAME')
rankings = rankings.drop_duplicates('NAME')
#print(rankings.head)
result = rankings.merge(weighted, how = 'inner', on = ['NAME'])
print(result.shape)
#208 overlapping players
#print(result.iloc[0])


for index,row in result.iterrows():
    wfpg = ((row['GP']/82)*(row['FANPOINTS']))/(1.002**row['AVG'])
    weightedFP.append(wfpg)
    
result['WFPG'] = weightedFP
result.shape   
result[['NAME','WFPG','GP']].sort_values(by='WFPG', ascending = False)

(585, 29)
(217, 36)


Unnamed: 0,NAME,WFPG,GP
0,Nikola Jokic,32.264495,69
10,Domantas Sabonis,30.964239,79
3,Joel Embiid,28.018351,66
1,Luka Doncic,27.298678,66
4,Jayson Tatum,25.899303,74
2,Giannis Antetokounmpo,25.332058,63
5,Shai Gilgeous-Alexander,25.108315,68
13,Trae Young,24.00842,73
11,Anthony Davis,22.532529,56
29,Nikola Vucevic,22.473871,82


Our rankings data contains 258 players, and not all of them match the data in our previous dataframe so in this case where our data is mismatched size, we will compute weighted fan point values only for those players that are contained in both datasets(217).  

The index for each player shows their original placement based on the rankings data(0 based indexing), we can see how rankings have shifted around based on our league's point calculations

We have now merged our dataframes, and created an effective ranking system for my fantasy draft!
