In [1]:
import pandas as pd
import unidecode
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import re

### This exercise uses xgboost to build a decision tree predicting which players should've been selected for the 2021 NBA All-Star game. I'm using xgboost to experiment with how it does with a lot of variables

#### Load excel tables of player stats by season. Then, make a list of the players who made the all-star game that year, and add a dummy variable to each row indicating if the player made the team.

#### 2015 - 2016 Season

In [2]:
nba_16 = pd.read_excel(r'NBA 15-16.xlsx')
nba_16.head()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1,Quincy Acy\acyqu01,PF,25,SAC,59,29,14.8,2.0,3.6,...,0.735,1.1,2.1,3.2,0.5,0.5,0.4,0.5,1.7,5.2
1,2,Jordan Adams\adamsjo01,SG,21,MEM,2,0,7.5,1.0,3.0,...,0.6,0.0,1.0,1.0,1.5,1.5,0.0,1.0,1.0,3.5
2,3,Steven Adams\adamsst01,C,22,OKC,80,80,25.2,3.3,5.3,...,0.582,2.7,3.9,6.7,0.8,0.5,1.1,1.1,2.8,8.0
3,4,Arron Afflalo\afflaar01,SG,30,NYK,71,57,33.4,5.0,11.3,...,0.84,0.3,3.4,3.7,2.0,0.4,0.1,1.2,2.0,12.8
4,5,Alexis Ajinça\ajincal01,C,27,NOP,59,17,14.6,2.5,5.3,...,0.839,1.3,3.3,4.6,0.5,0.3,0.6,0.9,2.3,6.0


In [3]:
all_stars_16 = ['DeMar DeRozan','Lebron James', 'Isaiah Thomas', 'Dwyane Wade','Jimmy Butler','Chris Bosh',\
'John Wall','Andre Drummond','Al Horford','Carmelo Anthony', 'Kyle Lowry','Paul George','Paul Millsap', \
'Stephen Curry','James Harden','Kobe Bryant','Lamarcus Aldridge','Kevin Durant','Kawhi Leonard','Anthony Davis','Russell Westbrook', \
'Klay Thompson','Draymond Green','DeMarcus Cousins','Pau Gasol']
#lowercase all names
all_stars_16 = [x.lower() for x in all_stars_16]
#get rid of accents in names
all_stars_16 = [unidecode.unidecode(x) for x in all_stars_16]

nba_16['allstar'] = nba_16['Player'].apply(lambda x: 1 if (any(s.upper() in x.upper() for s in all_stars_16)) else 0)

#### 2016 - 2017 Season

In [4]:
nba_17 = pd.read_excel(r'NBA 16-17.xlsx')

all_stars_17 = ['Kyrie Irving', 'DeMar DeRozan','Lebron James', 'jimmy Butler','Giannis Antetokounmpo','Isaiah Thomas', \
'John Wall','Kevin love','Carmelo Anthony', 'Kyle Lowry','Paul George','Kemba Walker','Paul Millsap', \
'Stephen Curry','James Harden','Kevin Durant','Kawhi Leonard','Anthony Davis','Russell Westbrook', \
'Klay Thompson','Draymond Green','DeMarcus Cousins','Marc Gasol',' Deandre Jordan','Gordon Hayward']

all_stars_17 = [x.lower() for x in all_stars_17]
all_stars_17 = [unidecode.unidecode(x) for x in all_stars_17]
nba_17['allstar'] = nba_17['Player'].apply(lambda x: 1 if (any(s.upper() in x.upper() for s in all_stars_17)) else 0)

#### 2017 - 2018 Season

In [5]:
nba_18 = pd.read_excel(r'NBA 17-18.xlsx')

all_stars_18 = ['Kemba Walker','Kyrie Irving','Giannis Antetokounmpo','Joel Embiid','Demar Derozan',\
'Goran Dragic','Al Horford','Kevin Love','Kyle Lowry','Victor Oladipo','John Wall','Bradley Beal',\
'Kristaps Porzingis','Andre Drummond','Stephen Curry', 'James Harden','Kevin Durant','Demarcus Cousins',\
'Paul George','Lebron James','Russell Westbrook','Damian Lillard','Klay Thompson','Anthony Davis',\
    'LaMarcus Aldrige','Draymond Green','Karl-Anthony Towns','Jimmy Butler']
    
all_stars_18 = [x.lower() for x in all_stars_18]
all_stars_18 = [unidecode.unidecode(x) for x in all_stars_18]

nba_18['allstar'] = nba_18['Player'].apply(lambda x: 1 if (any(s.upper() in x.upper() for s in all_stars_18)) else 0)

#### 2018 - 2019 Season

In [6]:
nba_19 = pd.read_excel(r'NBA 18-19.xlsx')

all_stars_19 = ['Kemba Walker','Kyrie Irving','Kawhi Leonard','Giannis Antetokounmpo','Joel Embiid',\
'Kyle Lowry','Victor Oladipo','Khris Middleton','Bradley Beal','Ben Simmons','Blake Griffin',\
'Nikola Vučević', 'Dwyane Wade','D''Angelo Russell','Stephen Curry', 'James Harden','Kevin Durant',\
'Paul George','Lebron James','Russell Westbrook','Damian Lillard','Klay Thompson','Anthony Davis',\
    'LaMarcus Aldrige','Nikola Jokic','Karl-Anthony Towns','Dirk Nowitzki']

all_stars_19 = [x.lower() for x in all_stars_19]
all_stars_19 = [unidecode.unidecode(x) for x in all_stars_19]

nba_19['allstar'] = nba_19['Player'].apply(lambda x: 1 if (any(s.upper() in x.upper() for s in all_stars_19)) else 0)

#### 2019 - 2020 Season

In [7]:
nba_20 = pd.read_excel(r'NBA 19-20.xlsx')

all_stars_20 = ['Kemba Walker','Kyrie Irving','Kawhi Leonard','Giannis Antetokounmpo','Joel Embiid',\
'Kyle Lowry','Victor Oladipo','Khris Middleton','Bradley Beal','Ben Simmons','Blake Griffin',\
'Nikola Vučević', 'Dwyane Wade','D''Angelo Russell','Stephen Curry', 'James Harden','Kevin Durant',\
'Paul George','Lebron James','Russell Westbrook','Damian Lillard','Klay Thompson','Anthony Davis',\
    'LaMarcus Aldrige','Nikola Jokic','Karl-Anthony Towns','Dirk Nowitzki']

all_stars_20 = [x.lower() for x in all_stars_20]
all_stars_20 = [unidecode.unidecode(x) for x in all_stars_20]

nba_20['allstar'] = nba_20['Player'].apply(lambda x: 1 if (any(s.upper() in x.upper() for s in all_stars_20)) else 0)

#### Merge

In [8]:
nba = nba_16.append([nba_17,nba_18,nba_19,nba_20])
nba.head()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,allstar
0,1,Quincy Acy\acyqu01,PF,25,SAC,59,29,14.8,2.0,3.6,...,1.1,2.1,3.2,0.5,0.5,0.4,0.5,1.7,5.2,0
1,2,Jordan Adams\adamsjo01,SG,21,MEM,2,0,7.5,1.0,3.0,...,0.0,1.0,1.0,1.5,1.5,0.0,1.0,1.0,3.5,0
2,3,Steven Adams\adamsst01,C,22,OKC,80,80,25.2,3.3,5.3,...,2.7,3.9,6.7,0.8,0.5,1.1,1.1,2.8,8.0,0
3,4,Arron Afflalo\afflaar01,SG,30,NYK,71,57,33.4,5.0,11.3,...,0.3,3.4,3.7,2.0,0.4,0.1,1.2,2.0,12.8,0
4,5,Alexis Ajinça\ajincal01,C,27,NOP,59,17,14.6,2.5,5.3,...,1.3,3.3,4.6,0.5,0.3,0.6,0.9,2.3,6.0,0


#### Check All-Stars

In [9]:
nba[nba.allstar==1]

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,allstar
6,7,LaMarcus Aldridge\aldrila01,PF,30,SAS,74,74,30.6,7.2,14.1,...,2.4,6.2,8.5,1.5,0.5,1.1,1.3,2.0,18.0,1
22,21,Carmelo Anthony\anthoca01,SF,31,NYK,72,72,35.1,7.9,18.2,...,1.4,6.4,7.7,4.2,0.9,0.5,2.4,2.5,21.8,1
61,58,Chris Bosh\boshch01,PF,31,MIA,53,53,33.5,6.8,14.5,...,0.9,6.5,7.4,2.4,0.7,0.6,1.5,1.9,19.1,1
69,66,Kobe Bryant\bryanko01,SF,37,LAL,66,66,28.2,6.0,16.9,...,0.6,3.1,3.7,2.8,0.9,0.2,2.0,1.7,17.6,1
77,72,Jimmy Butler\butleji01,SG,26,CHI,67,67,36.9,7.0,15.4,...,1.2,4.2,5.3,4.8,1.6,0.6,2.0,1.9,20.9,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
373,294,Kawhi Leonard,SF,28,LAC,57,57,1848.0,532.0,1133.0,...,54.0,348.0,402.0,280.0,103.0,33.0,149.0,113.0,1543.0,1
376,297,Damian Lillard,PG,29,POR,66,66,2474.0,624.0,1349.0,...,33.0,251.0,284.0,530.0,70.0,22.0,194.0,114.0,1978.0,1
382,303,Kyle Lowry,PG,33,TOR,58,58,2098.0,334.0,803.0,...,32.0,260.0,292.0,433.0,82.0,26.0,178.0,190.0,1126.0,1
418,336,Khris Middleton,SF,28,MIL,62,59,1853.0,471.0,947.0,...,44.0,338.0,382.0,265.0,53.0,7.0,137.0,142.0,1297.0,1


#### You can see the names of the players have codes, which we dont want. Lets Clean the player column so that it only has the names to the left of the slash

In [10]:
nba['Index'] = nba['Player'].str.find('\\')
nba['Player_clean'] = nba.apply(lambda x: x['Player'][0:x['Index']], axis=1)
print(nba[['Player', 'Player_clean']].head())

                    Player   Player_clean
0       Quincy Acy\acyqu01     Quincy Acy
1   Jordan Adams\adamsjo01   Jordan Adams
2   Steven Adams\adamsst01   Steven Adams
3  Arron Afflalo\afflaar01  Arron Afflalo
4  Alexis Ajinça\ajincal01  Alexis Ajinça


#### Now, let's get rid of the accents

In [11]:
nba['Player_clean'] = nba['Player_clean'].apply(lambda x: unidecode.unidecode(x))
print(nba[['Player', 'Player_clean']].sample(10))

                       Player     Player_clean
548    Dion Waiters\waitedi01     Dion Waiters
256          Tim Hardaway Jr.  Tim Hardaway Jr
134             Chris Clemons     Chris Clemon
320           Amile Jefferson   Amile Jefferso
628  Caleb Swanigan\swanica01   Caleb Swanigan
75       Alec Burks\burksal01       Alec Burks
637    Lou Williams\willilo02     Lou Williams
115       Ian Clark\clarkia01        Ian Clark
181     Luka Dončić\doncilu01      Luka Doncic
490             Elfrid Payton     Elfrid Payto


## 2021 Season

#### Make a list of the names of the 2021 all-stars (info I got from Wikipedia): this will just be for reference, since we'll be trying to predict who should've been a 2021 all-star

In [12]:
all_stars_21 = ['Kyrie Irving','Kawhi Leonard','Giannis Antetokounmpo','Joel Embiid',\
'Bradley Beal','Ben Simmons','Nikola Vučević', 'Stephen Curry', 'James Harden','Kevin Durant',\
'Paul George','LeBron James','Damian Lillard','Anthony Davis','Jaylen Brown','Zach LaVine','Julius Randle',\
'Domantas Sabonis','Jayson Tatum','Luka Doncic','Devin Booker','Mike Conley Jr.','Donovan Mitchell','Chris Paul',\
 'Zion Williamson','Rudy Gobert','Nikola Jokic']

all_stars_21 = [x.lower() for x in all_stars_21]
all_stars_21 = [unidecode.unidecode(x) for x in all_stars_21]

In [13]:
nba_21 = pd.read_excel(r'NBA 20-21.xlsx')
nba_21['Index'] = nba_21['Player'].str.find('\\')
nba_21['Player_clean'] = nba_21.apply(lambda x: x['Player'][0:x['Index']], axis=1)
nba_21['Player_clean'] = nba_21['Player_clean'].apply(lambda x: unidecode.unidecode(x))

nba_21['allstar'] =nba_21['Player'].apply(lambda x: 1 if (any(s.upper() in x.upper() for s in all_stars_21)) else 0)

#### Let's check to see if it worked: we'll view the whole row of Lebron James, who was named an all-star. If it worked, then his "allstar" column should be 1

In [14]:
print(nba[nba['Player_clean']=='LeBron James'])

      Rk                  Player Pos  Age   Tm   G  GS    MP    FG   FGA  ...  \
264  221  LeBron James\jamesle01  SF   31  CLE  76  76  35.6   9.7  18.6  ...   
269  220  LeBron James\jamesle01  SF   32  CLE  74  74  37.8   9.9  18.2  ...   
303  248  LeBron James\jamesle01  PF   33  CLE  82  82  36.9  10.5  19.3  ...   
332  258  LeBron James\jamesle01  SF   34  LAL  55  55  35.2  10.1  19.9  ...   

     TRB  AST  STL  BLK  TOV   PF   PTS  allstar  Index  Player_clean  
264  7.4  6.8  1.4  0.6  3.3  1.9  25.3        1     12  LeBron James  
269  8.6  8.7  1.2  0.6  4.1  1.8  26.4        1     12  LeBron James  
303  8.6  9.1  1.4  0.9  4.2  1.7  27.5        1     12  LeBron James  
332  8.5  8.3  1.3  0.6  3.6  1.7  27.4        1     12  LeBron James  

[4 rows x 33 columns]


In [15]:
#fill nas with 0 so we can train the model
nba['allstar'] = nba.allstar.fillna(0)

## Train

#### Now, lets drop columns that we don't want, and define our x variables and y variable

In [16]:
df = nba[['Age','G', 'GS', 'MP', 'FG%', '3P%', 'eFG%', 'FTA', 'FT%','TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS','allstar']]

X = df[['Age','G', 'GS', 'MP', 'FG%', '3P%', 'eFG%', 'FTA', 'FT%','TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']]
Y = df[['allstar']]

#### Split data into train and test sets: test size will be 33%

In [17]:
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

#### Fit model to our data

In [18]:
model = XGBClassifier()
model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

#### Make predictions!

In [19]:
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

#### Evaluate said predictions

In [20]:
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 98.21%


#### Looks like an accurate model, wow! Before we get too excited, let's test it with new data: the 2021 season

In [21]:
#2020
print(nba_21.head())

   Rk                       Player Pos  Age   Tm   G  GS    MP   FG   FGA  \
0   1   Precious Achiuwa\achiupr01  PF   21  MIA  38   2  13.4  2.3   3.9   
1   2       Jaylen Adams\adamsja01  PG   24  MIL   7   0   2.6  0.1   1.1   
2   3       Steven Adams\adamsst01   C   27  NOP  36  36  27.4  3.6   5.7   
3   4        Bam Adebayo\adebaba01   C   23  MIA  33  33  33.9  7.2  12.7   
4   5  LaMarcus Aldridge\aldrila01   C   35  SAS  21  18  25.9  5.5  11.8   

   ...  TRB  AST  STL  BLK  TOV   PF   PTS  Index       Player_clean  allstar  
0  ...  3.8  0.5  0.4  0.5  0.9  1.6   5.7     16   Precious Achiuwa        0  
1  ...  0.4  0.3  0.0  0.0  0.0  0.1   0.3     12       Jaylen Adams        0  
2  ...  9.2  2.0  0.9  0.6  1.5  1.8   8.3     12       Steven Adams        0  
3  ...  9.5  5.4  0.9  1.0  3.0  2.4  19.2     11        Bam Adebayo        0  
4  ...  4.5  1.7  0.4  0.9  1.0  1.7  13.7     17  LaMarcus Aldridge        0  

[5 rows x 33 columns]


#### First, lets see who the model predicts will be an all-star

In [22]:
X_21 = nba_21[['Age','G', 'GS', 'MP', 'FG%', '3P%', 'eFG%', 'FTA', 'FT%','TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']]

allstar_predictions_21 = model.predict(X_21)
nba_21['predict'] = allstar_predictions_21
print(nba_21[nba_21.predict==1])

      Rk                   Player Pos  Age   Tm   G  GS    MP   FG   FGA  ...  \
79    78   Jimmy Butler\butleji01  SF   31  MIA  25  25  33.4  6.8  14.6  ...   
112  109  Stephen Curry\curryst01  PG   32  GSW  37  37  34.0  9.7  20.4  ...   
132  129   Kevin Durant\duranke01  PF   32  BRK  19  18  35.7  9.9  18.9  ...   
190  185   James Harden\hardeja01  PG   31  BRK  25  25  38.4  8.2  16.7  ...   

      AST  STL  BLK  TOV   PF   PTS  Index   Player_clean  allstar  predict  
79    8.0  2.2  0.4  2.2  1.2  21.4     12   Jimmy Butler        0        1  
112   6.3  1.3  0.1  3.1  1.8  29.3     13  Stephen Curry        1        1  
132   5.3  0.7  1.4  3.5  2.3  29.0     12   Kevin Durant        1        1  
190  11.2  1.3  0.8  4.3  2.4  25.3     12   James Harden        1        1  

[4 rows x 34 columns]


#### It only predicts four players will be all-stars, and one (Jimmy Butler) wasn't named one!

### Why is our model performing poorly?

#### For one, we're not being selective with variables: since the all-star game just occurred and I don't want the season's second half data to influence who gets the all-star nod, I'm being lazy with variables. To build a better model, we should look at an AUC curve to see which variables are most important.

#### Second, our data could use some resampling: we'd likely see better results if we had more all-star observations in the dataset. 

### For now, let's rely on the predict_proba attribute of our model: it'll rank the top players who should be an all-star. The probabilities will likely be low, but it'll give us a rough hierarchy of who (the model thinks) should be selected 

In [23]:
proba_21 = model.predict_proba(X_21)
nba_21['proba'] = proba_21.T[1]
print(nba_21[['Player_clean','allstar','proba']].sort_values(by=['proba','allstar'], ascending=False)[:50])

                Player_clean  allstar     proba
190             James Harden        1  0.681808
132             Kevin Durant        1  0.672264
79              Jimmy Butler        0  0.535551
112            Stephen Curry        1  0.520354
236             LeBron James        1  0.449985
486           Nikola Vucevic        1  0.449899
188             James Harden        1  0.426362
138              Joel Embiid        1  0.410265
279            Kawhi Leonard        1  0.391306
285           Damian Lillard        1  0.318925
491                John Wall        0  0.280085
189             James Harden        1  0.214133
38              Bradley Beal        1  0.160426
13     Giannis Antetokounmpo        1  0.152935
499        Russell Westbrook        0  0.134119
163              Paul George        1  0.105345
114            Anthony Davis        1  0.087201
166  Shai Gilgeous-Alexander        0  0.076169
197            Tobias Harris        0  0.066939
475       Karl-Anthony Towns        0  0

### First thing I notice: the probabilities quickly drop off. As we know, this means our model could be MUCH stronger, but using predict_proba will still give us that good hierarchy of who is most deserving. Since at the end of the day, the top players--regardless of if they're on average performing better or worse than prior years--will be selected, I'll accept this method.

### Second, it looks like our model thinks Jimmy Butler, John Wall, and Russell Westbrook we're snubbed, while it thinks Zion Williamson & Donovan Mitchell weren't deserving.

#### What our model doesn't consider is win percentage: the teams that Butler, Wall, and Westbrook play for--the Heat, Rockets, and Wizards--have all struggled at some point this season. That may be a factor.

#### Another interesting observation (especially for Philly fans): our model thinks Tobias Harris is more deserving of a nod than Ben Simmons. 

#### Our model also doesn't consider intangibles. While Zion's having a great season, it's hard to measure the boost he gets from his flashy highlights (like his emphatic dunks).