In [49]:
import pandas as pd
import numpy as np
import random

In [2]:
season_stats = pd.read_csv('Seasons_Stats.csv')
all_stars = pd.read_csv('allstars.csv')

## Season_stats comes from https://www.kaggle.com/drgilermo/nba-players-stats 

In [3]:
season_stats.head()

Unnamed: 0.1,Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,0,1950.0,Curly Armstrong,G-F,31.0,FTW,63.0,,,,...,0.705,,,,176.0,,,,217.0,458.0
1,1,1950.0,Cliff Barker,SG,29.0,INO,49.0,,,,...,0.708,,,,109.0,,,,99.0,279.0
2,2,1950.0,Leo Barnhorst,SF,25.0,CHS,67.0,,,,...,0.698,,,,140.0,,,,192.0,438.0
3,3,1950.0,Ed Bartels,F,24.0,TOT,15.0,,,,...,0.559,,,,20.0,,,,29.0,63.0
4,4,1950.0,Ed Bartels,F,24.0,DNN,13.0,,,,...,0.548,,,,20.0,,,,27.0,59.0


# all star data comes from https://data.world/datasets/nba and includes 2000-2016

In [4]:
all_stars.head()

Unnamed: 0,Year,Player,Pos,HT,WT,Team,Selection Type,NBA Draft Status,Nationality,Unnamed: 9,...,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24
0,2016.0,Stephen Curry,G,6-3,190.0,Golden State Warriors,Western All-Star Fan Vote Selection,2009 Rnd 1 Pick 7,United States,,...,,,,,,,,,,
1,2016.0,James Harden,SG,6-5,220.0,Houston Rockets,Western All-Star Fan Vote Selection,2009 Rnd 1 Pick 3,United States,,...,,,,,,,,,,
2,2016.0,Kevin Durant,SF,6-9,240.0,Golden State Warriors,Western All-Star Fan Vote Selection,2007 Rnd 1 Pick 2,United States,,...,,,,,,,,,,
3,2016.0,Kawhi Leonard,F,6-7,230.0,San Antonio Spurs,Western All-Star Fan Vote Selection,2011 Rnd 1 Pick 15,United States,,...,,,,,,,,,,
4,2016.0,Anthony Davis,PF,6-11,253.0,New Orleans Pelicans,Western All-Star Fan Vote Selection,2012 Rnd 1 Pick 1,United States,,...,,,,,,,,,,


In [5]:
[season_stats.shape, all_stars.shape]

[(24691, 53), (998, 25)]

In [6]:
all_stars.columns

Index(['Year', 'Player', 'Pos', 'HT', 'WT', 'Team', 'Selection Type',
       'NBA Draft Status', 'Nationality', 'Unnamed: 9', 'Unnamed: 10',
       'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14',
       'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18',
       'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 22',
       'Unnamed: 23', 'Unnamed: 24'],
      dtype='object')

In [7]:
# Some extra columns were accidentally brought over in excel conversion. Programmatically get rid of them
real_cols = [s for s in all_stars.columns if 'Unnamed' not in s]

In [8]:
all_stars = all_stars[real_cols] #get rid of unnecessary columns

In [9]:
all_stars = all_stars[all_stars.Player.notnull()]

In [10]:
all_stars['Player']

0              Stephen Curry
1               James Harden
2               Kevin Durant
3              Kawhi Leonard
4              Anthony Davis
5          Russell Westbrook
6           DeMarcus Cousins
7              Klay Thompson
8             Draymond Green
9                 Marc Gasol
10            DeAndre Jordan
11            Gordon Hayward
12              Kyrie Irving
13             DeMar DeRozan
14              LeBron James
15     Giannis Antetokounmpo
16              Jimmy Butler
17             Isaiah Thomas
18                 John Wall
19                Kevin Love
20                Kyle Lowry
21               Paul George
22              Kemba Walker
23              Paul Millsap
24              LeBron James
25               Dwyane Wade
26               Paul George
27           Carmelo Anthony
28                Kyle Lowry
29              Jimmy Butler
               ...          
409             Chris Webber
410              Elton Brand
411            Allen Iverson
412           

In [11]:
all_star_seaons = season_stats.merge(all_stars, on = ['Player', 'Year'], how = 'left')

In [12]:
all_star_seaons.shape

(24691, 60)

In [13]:
all_star_seaons.tail()

Unnamed: 0.1,Unnamed: 0,Year,Player,Pos_x,Age,Tm,G,GS,MP,PER,...,TOV,PF,PTS,Pos_y,HT,WT,Team,Selection Type,NBA Draft Status,Nationality
24686,24686,2017.0,Cody Zeller,PF,24.0,CHO,62.0,58.0,1725.0,16.7,...,65.0,189.0,639.0,,,,,,,
24687,24687,2017.0,Tyler Zeller,C,27.0,BOS,51.0,5.0,525.0,13.0,...,20.0,61.0,178.0,,,,,,,
24688,24688,2017.0,Stephen Zimmerman,C,20.0,ORL,19.0,0.0,108.0,7.3,...,3.0,17.0,23.0,,,,,,,
24689,24689,2017.0,Paul Zipser,SF,22.0,CHI,44.0,18.0,843.0,6.9,...,40.0,78.0,240.0,,,,,,,
24690,24690,2017.0,Ivica Zubac,C,19.0,LAL,38.0,11.0,609.0,17.0,...,30.0,66.0,284.0,,,,,,,


In [14]:
#using Team != NaN as factor for whether season stats merged with all star
all_star_seaons['AllStar']= pd.notnull(all_star_seaons.Team) 

In [15]:
#Because we have statistics for all players from 1960- and all star data from only 2000-, let's limit the set
all_star_seasons_select = all_star_seaons[all_star_seaons.Year > 1999]

In [16]:
all_star_seasons_select.columns

Index(['Unnamed: 0', 'Year', 'Player', 'Pos_x', 'Age', 'Tm', 'G', 'GS', 'MP',
       'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%',
       'BLK%', 'TOV%', 'USG%', 'blanl', 'OWS', 'DWS', 'WS', 'WS/48', 'blank2',
       'OBPM', 'DBPM', 'BPM', 'VORP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%',
       '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB',
       'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Pos_y', 'HT', 'WT', 'Team',
       'Selection Type', 'NBA Draft Status', 'Nationality', 'AllStar'],
      dtype='object')

In [20]:
all_star_seasons_select['PPG'] = all_star_seasons_select.PTS / all_star_seasons_select.G

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [21]:
features = ['Age', 'PER', 'TS%', '3PAr', 'DBPM', 'FG%', '3P', '3P%', 'TRB', 'AST', 'STL', 'BLK', 'PPG']

In [22]:
all_star_features = all_star_seasons_select[features].fillna(0)

In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [26]:
rf_model = RandomForestClassifier(n_estimators=100, max_depth=4, random_state= 100)

In [27]:
rf_model.fit(all_star_features, all_star_seasons_select['AllStar'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=100, verbose=0, warm_start=False)

In [29]:
for i in range(0, len(features)):
    print(features[i] + ": " + str(rf_model.feature_importances_[i]))

Age: 0.006720906978852828
PER: 0.29325494314898526
TS%: 0.026331158862596054
3PAr: 0.025356182138057803
DBPM: 0.011404477068328032
FG%: 0.020342949329441674
3P: 0.024272412512572596
3P%: 0.00850184026797669
TRB: 0.06778895402057385
AST: 0.09030736611483116
STL: 0.06293240107447573
BLK: 0.038118586622755427
PPG: 0.324667821860553


In [53]:
sorted_feats = rf_model.feature_importances_

In [56]:
type(rf_model.feature_importances_)

numpy.ndarray

In [30]:
rf_model.score(all_star_features, all_star_seasons_select['AllStar'])

0.9692277538220305

In [31]:
all_star_predictions = rf_model.predict(all_star_features)

In [33]:
all_star_seasons_select.reset_index(drop=True, inplace=True)

In [34]:
results = pd.concat([pd.Series(all_star_predictions), pd.Series(all_star_seasons_select['AllStar'])], axis=1, ignore_index=True)

In [35]:
results.columns = ['Predicted', 'Real']
results.groupby(["Predicted", "Real"]).size().reset_index()

Unnamed: 0,Predicted,Real,0
0,False,False,9750
1,False,True,278
2,True,False,36
3,True,True,140


In [36]:
precision = 141 / (141 + 36)
recall = 141 / (141 + 277)

In [37]:
[precision, recall]

[0.7966101694915254, 0.3373205741626794]

In [38]:
#explore some of the wrong ones
all_star_seasons_select[(results.Predicted == False) & (results.Real == True)]

Unnamed: 0.1,Unnamed: 0,Year,Player,Pos_x,Age,Tm,G,GS,MP,PER,...,PTS,Pos_y,HT,WT,Team,Selection Type,NBA Draft Status,Nationality,AllStar,PPG
5,14475,2000.0,Ray Allen,SG,24.0,MIL,82.0,82.0,3070.0,20.6,...,1809.0,SG,6-5,205.0,Milwaukee Bucks,Eastern All-Star Coaches Selection,1996 Rnd 1 Pick 5,United States,True,22.060976
118,14588,2000.0,Antonio Davis,C,31.0,TOR,79.0,78.0,2479.0,15.1,...,910.0,FC,6-9,245.0,Toronto Raptors,Eastern All-Star Replacement Selection,1990 Rnd 2 Pick 18,United States,True,11.518987
132,14602,2000.0,Vlade Divac,C,31.0,SAC,82.0,81.0,2374.0,18.0,...,1005.0,C,7-1,260.0,Sacramento Kings,Western All-Star Replacement Selection,1989 Rnd 1 Pick 26,Serbia,True,12.256098
152,14622,2000.0,Michael Finley,SF,26.0,DAL,82.0,82.0,3464.0,19.2,...,1855.0,G-F,6-7,225.0,Dallas Mavericks,Western All-Star Coaches Selection,1995 Rnd 1 Pick 21,United States,True,22.621951
207,14677,2000.0,Allan Houston,SG,28.0,NYK,82.0,82.0,3169.0,16.5,...,1614.0,G,6-6,205.0,New York Knicks,Eastern All-Star Coaches Selection,1993 Rnd 1 Pick 11,United States,True,19.682927
251,14721,2000.0,Jason Kidd,PG,26.0,PHO,67.0,67.0,2616.0,18.4,...,959.0,PG,6-4,210.0,Phoenix Suns,Western All-Star Fan Vote Selection,1994 Rnd 1 Pick 2,United States,True,14.313433
286,14756,2000.0,Stephon Marbury,PG,22.0,NJN,74.0,74.0,2881.0,20.7,...,1640.0,G,6-2,180.0,New Jersey Nets,Eastern All-Star Coaches Selection,1996 Rnd 1 Pick 4,United States,True,22.162162
293,14763,2000.0,Anthony Mason,SF,33.0,CHH,82.0,81.0,3133.0,14.4,...,948.0,PF,6-7,250.0,Miami Heat,Eastern All-Star Coaches Selection,1988 Rnd 3 Pick 3,United States,True,11.560976
299,14769,2000.0,Antonio McDyess,PF,25.0,DEN,81.0,81.0,2698.0,19.6,...,1551.0,F,6-9,245.0,Denver Nuggets,Western All-Star Coaches Selection,1995 Rnd 1 Pick 2,United States,True,19.148148
300,14770,2000.0,Tracy McGrady,SF,20.0,TOR,79.0,34.0,2462.0,20.0,...,1213.0,GF,6-8,210.0,Orlando Magic,Eastern All-Star Fan Vote Selection,1997 Rnd 1 Pick 9,United States,True,15.354430


In [39]:
all_star_seasons_select.Pos_x.value_counts()

PF       2080
PG       2073
C        2035
SG       1974
SF       1887
PG-SG      25
C-PF       23
SF-SG      18
SG-SF      18
SG-PG      18
PF-SF      17
SF-PF      16
PF-C       15
SG-PF       3
PG-SF       1
C-SF        1
Name: Pos_x, dtype: int64

In [40]:
position_dict = {'C-SF': 'SF', 'PG-SG': 'PG', 'C-PF': 'C', 'SG-PG': 'SG', 'SG-SF': 'SG',
                'SF-SG': 'SF', 'PF-SF': 'PF', 'SF-PF': 'SF', 'PF-C': 'PF', 'SG-PF': 'SG', 'PG-SF': 'PG'}

In [41]:
all_star_seasons_select['Pos1'] = all_star_seasons_select.Pos_x.map(position_dict).fillna(all_star_seasons_select.Pos_x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


# Create new dataframe with only 5 positions

In [42]:
all_star_features2 = all_star_features
all_star_features2['Pos'] = all_star_seasons_select.Pos1

In [43]:
all_star_features2 = pd.get_dummies(all_star_features2, columns = ['Pos'])

In [44]:
rf_position = RandomForestClassifier(n_estimators=100, max_depth=4, random_state= 100)
rf_position.fit(all_star_features2, all_star_seasons_select['AllStar'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=100, verbose=0, warm_start=False)

In [45]:
for i in range(0, all_star_features2.shape[1]):
    print(all_star_features2.columns[i] + ": " + str(rf_position.feature_importances_[i]))

Age: 0.006720906978852828
PER: 0.29325494314898526
TS%: 0.026331158862596054
3PAr: 0.025356182138057803
DBPM: 0.011404477068328032
FG%: 0.020342949329441674
3P: 0.024272412512572596
3P%: 0.00850184026797669
TRB: 0.06778895402057385
AST: 0.09030736611483116
STL: 0.06293240107447573
BLK: 0.038118586622755427
PPG: 0.324667821860553


In [46]:
all_star_predictions2 = rf_position.predict(all_star_features2)

In [47]:
results2 = pd.concat([pd.Series(all_star_predictions2), pd.Series(all_star_seasons_select['AllStar'])], axis=1, ignore_index=True)
results2.columns = ['Predicted', 'Real']
results2.groupby(["Predicted", "Real"]).size().reset_index()

Unnamed: 0,Predicted,Real,0
0,False,False,9750
1,False,True,278
2,True,False,36
3,True,True,140


In [50]:
random.seed()