In [1]:
import pandas as pd
import numpy as np
import pandas as pd
import numpy as np
from scipy.stats import multivariate_normal
import matplotlib.pyplot as plt
import seaborn as sns
class LDA():
    def __init__(self, dataset, class_var, priors = None):
        n_class = len(dataset[class_var].unique())
        if priors is None:
            priors = np.repeat(1/n_class, n_class)
        self.priors =  np.asarray(priors)
        self.means = dataset.groupby(class_var).mean()
        self.sigma = dataset.cov()
        self.class_var = class_var
        self.training_data = dataset
    def predict_probs(self, data = None):
        if data is None:
            data = self.training_data
        data_temp = data.drop(self.class_var, axis = 1)
        dens_list = []
        col_names = []
        for ind, row in self.means.iterrows():
            col_names.append(ind)
            dens_list.append(multivariate_normal.pdf(data_temp, mean = np.asarray(row), cov = self.sigma))
        dens_list = pd.DataFrame(np.transpose(np.vstack(dens_list)),columns= col_names)
        dens_list = dens_list.mul(self.priors, axis=1)
        dens_list = dens_list.div(dens_list.sum(axis=1), axis=0)
        ## Change to accept values due to index mismatch error
        dens_list['True Class'] = data[self.class_var].values
        return dens_list 
    def predict_MAP(self, data = None):        
        if data is None:
            data = self.training_data
        dens_list = self.predict_probs(data).drop('True Class', axis = 1)
        map_list = dens_list.idxmax(axis = 1)
        maps = {'MAP Class': map_list}
        maps = pd.DataFrame(maps)
        maps['True Class'] = data[self.class_var].values
        return maps
    def misclass_rate(self, data = None):
        if data is None:
            data = self.training_data
        maps = self.predict_MAP(data = data)
        
        
        maps['Mis_class'] = maps['MAP Class']  == maps['True Class']
        
        mis_class =  1 - maps['Mis_class'].mean()
        
        return mis_class
    def misclass_xtabs(self, data = None):
        if data is None:
            data = self.training_data
        maps = self.predict_MAP(data = data)
        
        xtabs = pd.crosstab(maps['MAP Class'], maps['True Class'])        
        return xtabs
    def misclass_pairplot(self, data = None):
        if data is None:
            data = self.training_data
        maps = self.predict_MAP(data = data)
        temp_dat = data.copy(deep = True)
        temp_dat['Mis-Classified'] = maps['MAP Class']  != maps['True Class']
        plot = sns.pairplot(temp_dat,hue="Mis-Classified", height = 1.5, aspect = 1.5)   
        return plot

In [2]:
full = pd.read_csv("Data/full_data.csv")
full.head()

Unnamed: 0,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,FTr,...,All_NBA_Pos,All_NBA_Team,All_NBA_Pts_Won,All_NBA_Pts_Max,All_NBA_Share,All_NBA_1st_Team_Votes,All_NBA_2nd_Team_Votes,All_NBA_3rd_Team_Votes,year,All_NBA_Boolean
0,Kareem Abdul-Jabbar*,C,41,LAL,74,1695,12.9,0.511,0.005,0.25,...,,,,,,,,,1988-1989,False
1,Mark Acres,C,26,BOS,62,632,8.2,0.507,0.009,0.421,...,,,,,,,,,1988-1989,False
2,Michael Adams,PG,26,DEN,77,2787,17.5,0.567,0.431,0.363,...,,,,,,,,,1988-1989,False
3,Mark Alarie,PF,25,WSB,74,1141,13.3,0.531,0.088,0.202,...,,,,,,,,,1988-1989,False
4,Randy Allen,SG,24,SAC,7,43,6.9,0.428,0.053,0.105,...,,,,,,,,,1988-1989,False


In [3]:
df_20_21 = full[full["year"] == '2020-2021']
df_20_21

Unnamed: 0,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,FTr,...,All_NBA_Pos,All_NBA_Team,All_NBA_Pts_Won,All_NBA_Pts_Max,All_NBA_Share,All_NBA_1st_Team_Votes,All_NBA_2nd_Team_Votes,All_NBA_3rd_Team_Votes,year,All_NBA_Boolean
14298,Precious Achiuwa,PF,21,MIA,61,737,14.2,0.550,0.004,0.482,...,,,,,,,,,2020-2021,False
14299,Jaylen Adams,PG,24,MIL,7,18,-6.5,0.125,0.250,0.000,...,,,,,,,,,2020-2021,False
14300,Steven Adams,C,27,NOP,58,1605,15.1,0.596,0.010,0.438,...,,,,,,,,,2020-2021,False
14301,Bam Adebayo,C,23,MIA,64,2143,22.7,0.626,0.010,0.443,...,C,ORV,32.0,500.0,0.064,0.0,5.0,17.0,2020-2021,False
14302,Ty-Shon Alexander,SG,22,PHO,15,47,4.2,0.349,0.750,0.167,...,,,,,,,,,2020-2021,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14833,Moritz Wagner,C,23,TOT,45,722,12.5,0.580,0.397,0.380,...,,,,,,,,,2020-2021,False
14834,Brad Wanamaker,PG,31,TOT,61,1053,9.5,0.489,0.284,0.338,...,,,,,,,,,2020-2021,False
14835,Lou Williams,PG,34,TOT,66,1423,14.0,0.519,0.268,0.268,...,,,,,,,,,2020-2021,False
14836,D.J. Wilson,PF,24,TOT,35,433,11.5,0.511,0.518,0.149,...,,,,,,,,,2020-2021,False


In [4]:
df_19_20 = full[full["year"] == '2019-2020']
df_19_20

Unnamed: 0,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,FTr,...,All_NBA_Pos,All_NBA_Team,All_NBA_Pts_Won,All_NBA_Pts_Max,All_NBA_Share,All_NBA_1st_Team_Votes,All_NBA_2nd_Team_Votes,All_NBA_3rd_Team_Votes,year,All_NBA_Boolean
13769,Steven Adams,C,26,OKC,63,1680,20.5,0.604,0.006,0.421,...,,,,,,,,,2019-2020,False
13770,Bam Adebayo,PF,22,MIA,72,2417,20.3,0.598,0.018,0.484,...,C,ORV,26.0,500.0,0.052,0.0,0.0,26.0,2019-2020,False
13771,LaMarcus Aldridge,C,34,SAS,53,1754,19.7,0.571,0.198,0.241,...,,,,,,,,,2019-2020,False
13772,Kyle Alexander,C,23,MIA,2,13,4.7,0.500,0.000,0.000,...,,,,,,,,,2019-2020,False
13773,Nickeil Alexander-Walker,SG,21,NOP,47,591,8.9,0.473,0.500,0.139,...,,,,,,,,,2019-2020,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14293,Dion Waiters,SG,28,TOT,10,207,11.5,0.521,0.475,0.172,...,,,,,,,,,2019-2020,False
14294,Derrick Walton,PG,24,TOT,26,248,8.7,0.617,0.615,0.231,...,,,,,,,,,2019-2020,False
14295,Paul Watson,SF,25,TOT,10,87,12.6,0.517,0.462,0.346,...,,,,,,,,,2019-2020,False
14296,Andrew Wiggins,SF,24,TOT,54,1858,16.4,0.536,0.342,0.274,...,,,,,,,,,2019-2020,False


## Train based on 19-20 Data
Train a QDA and LDA model with 19-20 data then test set will be for 2020-2021 Data

In [5]:
df_19_20_train = df_19_20.loc[:, ~df_19_20.columns.str.startswith('All_')]
df_19_20_train = df_19_20_train.select_dtypes(exclude=['object'])
df_19_20_train["All_NBA_Boolean"] = df_19_20["All_NBA_Boolean"].astype(str)
df_19_20_train
df_19_20_train

Unnamed: 0,Age,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,...,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,All_NBA_Boolean
13769,26,63,1680,20.5,0.604,0.006,0.421,14.0,24.0,19.2,...,17.3,3.8,2.7,6.5,0.185,1.9,1.1,2.9,2.1,False
13770,22,72,2417,20.3,0.598,0.018,0.484,8.5,24.9,17.0,...,21.2,4.6,3.9,8.5,0.168,1.4,2.0,3.4,3.3,False
13771,34,53,1754,19.7,0.571,0.198,0.241,6.3,17.8,12.0,...,23.4,3.0,1.4,4.5,0.122,1.8,-0.5,1.4,1.5,False
13772,23,2,13,4.7,0.500,0.000,0.000,17.9,8.3,12.9,...,10.2,0.0,0.0,0.0,-0.003,-6.4,-3.7,-10.1,0.0,False
13773,21,47,591,8.9,0.473,0.500,0.139,1.6,13.5,7.5,...,23.3,-0.7,0.4,-0.2,-0.020,-3.2,-1.4,-4.6,-0.4,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14293,28,10,207,11.5,0.521,0.475,0.172,1.1,11.5,6.4,...,25.2,-0.1,0.2,0.1,0.030,-0.7,-0.6,-1.2,0.0,False
14294,24,26,248,8.7,0.617,0.615,0.231,0.9,5.8,3.4,...,8.1,0.4,0.2,0.6,0.111,-2.2,1.3,-1.0,0.1,False
14295,25,10,87,12.6,0.517,0.462,0.346,2.5,17.9,10.3,...,16.1,0.0,0.1,0.2,0.097,-1.1,0.9,-0.2,0.0,False
14296,24,54,1858,16.4,0.536,0.342,0.274,3.6,12.3,7.8,...,27.4,0.8,1.1,1.9,0.050,1.1,-1.4,-0.3,0.8,False


In [6]:
df_19_20_train.sort_values("All_NBA_Boolean",ascending=False)

Unnamed: 0,Age,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,...,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,All_NBA_Boolean
14168,25,60,2110,17.9,0.554,0.331,0.274,3.3,18.3,11.0,...,28.1,1.8,3.6,5.4,0.123,1.3,0.5,1.8,2.0,True
13918,27,68,2333,21.7,0.699,0.000,0.724,11.2,31.3,21.6,...,16.2,6.5,4.2,10.7,0.221,1.7,1.9,3.6,3.3,True
13937,30,68,2483,29.1,0.626,0.557,0.528,2.9,16.0,9.4,...,36.3,9.9,3.2,13.1,0.254,8.1,1.6,9.6,7.3,True
13841,30,58,1959,23.6,0.585,0.157,0.693,6.3,15.4,11.0,...,25.1,6.3,2.7,9.0,0.221,4.0,1.5,5.4,3.7,True
13993,24,73,2336,24.9,0.605,0.238,0.281,8.0,26.1,17.1,...,26.6,6.7,3.1,9.8,0.202,5.5,2.0,7.4,5.5,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13943,26,43,484,17.8,0.559,0.255,0.303,4.8,15.3,9.8,...,18.1,0.7,0.8,1.5,0.153,0.4,2.4,2.8,0.6,False
13942,27,72,2469,17.2,0.556,0.304,0.184,3.1,19.1,11.1,...,23.9,3.5,2.7,6.2,0.121,1.3,-0.5,0.8,1.8,False
13941,28,69,2123,13.3,0.606,0.518,0.114,3.2,11.2,7.3,...,18.3,2.5,1.8,4.3,0.097,0.4,-0.6,-0.2,1.0,False
13940,25,56,1780,9.7,0.519,0.405,0.177,1.6,8.6,5.1,...,15.1,0.5,1.4,1.9,0.052,-2.8,0.3,-2.5,-0.2,False


In [7]:
# flat_priors_lda = LDA(df_19_20_train.select_dtypes(exclude=['object']), 'All_NBA_Boolean')
flat_priors_lda = LDA(df_19_20_train, 'All_NBA_Boolean')

In [8]:
flat_priors_lda.predict_probs()

Unnamed: 0,False,True,True Class
0,0.957328,0.042672,False
1,0.066748,0.933252,False
2,0.997177,0.002823,False
3,0.999841,0.000159,False
4,0.999955,0.000045,False
...,...,...,...
524,0.999976,0.000024,False
525,0.999702,0.000298,False
526,0.999896,0.000104,False
527,0.999005,0.000995,False


In [9]:
flat_priors_lda.misclass_xtabs()

True Class,False,True
MAP Class,Unnamed: 1_level_1,Unnamed: 2_level_1
False,502,2
True,10,13


In [10]:
flat_priors_lda.misclass_rate()

0.026465028355387554

## Test for 2020-2019

In [11]:
df_20_21_test = df_20_21.loc[:, ~df_20_21.columns.str.startswith('All_')]
df_20_21_test = df_20_21_test.select_dtypes(exclude=['object'])
df_20_21_test["All_NBA_Boolean"] = df_20_21["All_NBA_Boolean"].astype(str)
df_20_21_test

Unnamed: 0,Age,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,...,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,All_NBA_Boolean
14298,21,61,737,14.2,0.550,0.004,0.482,11.5,20.6,16.1,...,19.5,0.3,1.0,1.3,0.085,-3.6,-0.5,-4.1,-0.4,False
14299,24,7,18,-6.5,0.125,0.250,0.000,0.0,16.9,8.8,...,18.6,-0.1,0.0,-0.1,-0.252,-15.1,-4.6,-19.8,-0.1,False
14300,27,58,1605,15.1,0.596,0.010,0.438,14.4,20.4,17.4,...,11.7,2.3,1.7,4.0,0.119,-0.4,0.1,-0.3,0.7,False
14301,23,64,2143,22.7,0.626,0.010,0.443,7.7,22.6,15.3,...,23.7,5.6,3.2,8.8,0.197,2.9,2.0,4.9,3.7,False
14302,22,15,47,4.2,0.349,0.750,0.167,4.9,19.0,12.1,...,15.0,-0.1,0.0,0.0,-0.048,-4.8,-1.7,-6.5,-0.1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14833,23,45,722,12.5,0.580,0.397,0.380,3.9,17.7,10.7,...,18.1,0.5,0.8,1.2,0.082,-2.7,0.2,-2.6,-0.1,False
14834,31,61,1053,9.5,0.489,0.284,0.338,1.8,8.8,5.3,...,16.9,-0.2,1.0,0.9,0.039,-4.4,0.3,-4.0,-0.5,False
14835,34,66,1423,14.0,0.519,0.268,0.268,1.7,8.9,5.4,...,25.7,0.5,1.2,1.7,0.056,-1.0,-1.5,-2.5,-0.2,False
14836,24,35,433,11.5,0.511,0.518,0.149,4.8,23.2,13.7,...,19.7,-0.2,0.5,0.3,0.032,-2.7,-0.6,-3.3,-0.1,False


In [12]:
flat_priors_lda.misclass_xtabs(data = df_20_21_test)

True Class,False,True
MAP Class,Unnamed: 1_level_1,Unnamed: 2_level_1
False,509,0
True,15,15


In [13]:
flat_priors_lda.misclass_rate(data = df_20_21_test)

0.029629629629629672

In [14]:
flat_priors_lda.misclass_rate(data = df_20_21_test)

0.029629629629629672

## All Combinations of length 3

In [15]:
import itertools
error_rates_combinations = []
for i in itertools.combinations(df_19_20_train.columns[:-1], 3):
    combination = list(i) + ["All_NBA_Boolean"]
    flat_priors_lda = LDA(df_19_20_train[combination], 'All_NBA_Boolean')
    training_error = flat_priors_lda.misclass_rate()
    testing_error = flat_priors_lda.misclass_rate(data = df_20_21_test[combination])
    error_rates_combinations.append([",".join(combination[:-1]), training_error, testing_error])
df_error_rates_combinations = pd.DataFrame(error_rates_combinations)
df_error_rates_combinations.columns = ["combinations",\
                                       "training error",\
                                       "testing error"]
df_error_rates_combinations

Unnamed: 0,combinations,training error,testing error
0,"Age,G,MP",0.170132,0.179630
1,"Age,G,PER",0.128544,0.111111
2,"Age,G,TS%",0.381853,0.351852
3,"Age,G,3PAr",0.291115,0.268519
4,"Age,G,FTr",0.185255,0.172222
...,...,...,...
1766,"WS/48,BPM,VORP",0.024575,0.038889
1767,"OBPM,DBPM,BPM",0.103970,0.094444
1768,"OBPM,DBPM,VORP",0.022684,0.037037
1769,"OBPM,BPM,VORP",0.022684,0.037037


In [16]:
df_error_rates_combinations.sort_values("testing error").head(10)

Unnamed: 0,combinations,training error,testing error
1729,"OWS,WS/48,VORP",0.024575,0.027778
1732,"OWS,OBPM,VORP",0.022684,0.027778
1735,"OWS,BPM,VORP",0.022684,0.027778
780,"PER,OWS,VORP",0.024575,0.027778
609,"MP,OWS,VORP",0.024575,0.02963
1725,"OWS,WS,VORP",0.026465,0.02963
1754,"WS,WS/48,VORP",0.026465,0.02963
40,"Age,MP,VORP",0.022684,0.02963
1757,"WS,OBPM,VORP",0.022684,0.02963
1760,"WS,BPM,VORP",0.024575,0.02963


In [17]:
df_error_rates_combinations.sort_values("testing error")["testing error"].values[100] - \
df_error_rates_combinations.sort_values("testing error")["testing error"].values[0]

0.007407407407407418

The top 100 models had a difference of testing errors of 0.007407407407407418.

We can see which column of the data appeared the most.

In [18]:
lda_top100 = df_error_rates_combinations.sort_values("testing error").head(100)
lda_top100

Unnamed: 0,combinations,training error,testing error
1729,"OWS,WS/48,VORP",0.024575,0.027778
1732,"OWS,OBPM,VORP",0.022684,0.027778
1735,"OWS,BPM,VORP",0.022684,0.027778
780,"PER,OWS,VORP",0.024575,0.027778
609,"MP,OWS,VORP",0.024575,0.029630
...,...,...,...
439,"G,DBPM,VORP",0.028355,0.035185
425,"G,DWS,VORP",0.028355,0.035185
939,"TS%,DWS,VORP",0.026465,0.035185
349,"G,DRB%,VORP",0.024575,0.035185


In [19]:
counter_dict = {}
for i in lda_top100["combinations"]:
    for j in i.split(','):
        if j in counter_dict.keys():
            counter_dict[j] = counter_dict[j] + 1
        else:
            counter_dict[j] = 1
pd.DataFrame(counter_dict.items(), columns = ['Variable', 'Occurrence']).sort_values("Occurrence").sort_values("Occurrence", ascending=False)

Unnamed: 0,Variable,Occurrence
2,VORP,100
7,WS,21
0,OWS,19
6,MP,19
19,G,15
11,TS%,12
4,BPM,10
5,PER,10
8,Age,9
14,TOV%,9


In [20]:
combs = ["VORP", "WS", "OWS", "MP", "G", "TS%", "BPM", "PER", "Age", "TOV%"] + ["All_NBA_Boolean"]
flat_priors_lda1 = LDA(df_19_20_train[combs], 'All_NBA_Boolean')
flat_priors_lda1.misclass_rate(data = df_20_21_test[combs])

0.029629629629629672

In [21]:
flat_priors_lda = LDA(df_19_20_train, 'All_NBA_Boolean')
flat_priors_lda.misclass_rate(data = df_20_21_test)

0.029629629629629672

In [22]:
flat_priors_lda = LDA(df_19_20_train, 'All_NBA_Boolean', priors=[0.1, 0.9])
flat_priors_lda.misclass_rate(data = df_20_21_test)

0.061111111111111116

In [23]:
flat_priors_lda = LDA(df_19_20_train, 'All_NBA_Boolean', priors=[0.9, 0.1])
flat_priors_lda.misclass_rate(data = df_20_21_test)

0.022222222222222254

In [24]:
## Using proportions
np.unique(df_19_20_train["All_NBA_Boolean"], return_counts=True)

(array(['False', 'True'], dtype=object), array([514,  15]))

In [25]:
prop_true = np.unique(df_19_20_train["All_NBA_Boolean"], return_counts=True)[1][1] / \
(np.unique(df_19_20_train["All_NBA_Boolean"], return_counts=True)[1][0] + \
 np.unique(df_19_20_train["All_NBA_Boolean"], return_counts=True)[1][1])
prop_true

0.02835538752362949

In [26]:
flat_priors_lda = LDA(df_19_20_train, 'All_NBA_Boolean', priors=[1-prop_true, prop_true])
flat_priors_lda.misclass_rate(data = df_20_21_test)

0.024074074074074026

## Models for each position
There are three categories of Positions. 

In [27]:
df_19_20_train = df_19_20.loc[:, ~df_19_20.columns.str.startswith('All_')]
df_19_20_train = df_19_20_train.select_dtypes(exclude=['object'])
df_19_20_train["All_NBA_Boolean"] = df_19_20["All_NBA_Boolean"].astype(str)
df_19_20_train["Pos"] = df_19_20["Pos"].astype(str)
df_19_20_train

Unnamed: 0,Age,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,...,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,All_NBA_Boolean,Pos
13769,26,63,1680,20.5,0.604,0.006,0.421,14.0,24.0,19.2,...,3.8,2.7,6.5,0.185,1.9,1.1,2.9,2.1,False,C
13770,22,72,2417,20.3,0.598,0.018,0.484,8.5,24.9,17.0,...,4.6,3.9,8.5,0.168,1.4,2.0,3.4,3.3,False,PF
13771,34,53,1754,19.7,0.571,0.198,0.241,6.3,17.8,12.0,...,3.0,1.4,4.5,0.122,1.8,-0.5,1.4,1.5,False,C
13772,23,2,13,4.7,0.500,0.000,0.000,17.9,8.3,12.9,...,0.0,0.0,0.0,-0.003,-6.4,-3.7,-10.1,0.0,False,C
13773,21,47,591,8.9,0.473,0.500,0.139,1.6,13.5,7.5,...,-0.7,0.4,-0.2,-0.020,-3.2,-1.4,-4.6,-0.4,False,SG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14293,28,10,207,11.5,0.521,0.475,0.172,1.1,11.5,6.4,...,-0.1,0.2,0.1,0.030,-0.7,-0.6,-1.2,0.0,False,SG
14294,24,26,248,8.7,0.617,0.615,0.231,0.9,5.8,3.4,...,0.4,0.2,0.6,0.111,-2.2,1.3,-1.0,0.1,False,PG
14295,25,10,87,12.6,0.517,0.462,0.346,2.5,17.9,10.3,...,0.0,0.1,0.2,0.097,-1.1,0.9,-0.2,0.0,False,SF
14296,24,54,1858,16.4,0.536,0.342,0.274,3.6,12.3,7.8,...,0.8,1.1,1.9,0.050,1.1,-1.4,-0.3,0.8,False,SF


In [28]:
full['Pos'].unique()

array(['C', 'PG', 'PF', 'SG', 'SF', 'SF-SG', 'PF-C', 'PF-SF', 'SG-SF',
       'PG-SG', 'SG-PG', 'SF-PF', 'C-PF', 'PG-SF', 'SG-PF', 'SF-C'],
      dtype=object)

In [29]:
center_19_20 = df_19_20_train[df_19_20_train["Pos"].str.contains('C')]
center_19_20

Unnamed: 0,Age,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,...,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,All_NBA_Boolean,Pos
13769,26,63,1680,20.5,0.604,0.006,0.421,14.0,24.0,19.2,...,3.8,2.7,6.5,0.185,1.9,1.1,2.9,2.1,False,C
13771,34,53,1754,19.7,0.571,0.198,0.241,6.3,17.8,12.0,...,3.0,1.4,4.5,0.122,1.8,-0.5,1.4,1.5,False,C
13772,23,2,13,4.7,0.500,0.000,0.000,17.9,8.3,12.9,...,0.0,0.0,0.0,-0.003,-6.4,-3.7,-10.1,0.0,False,C
13775,21,70,1852,20.7,0.664,0.013,0.581,12.3,24.9,18.7,...,5.2,2.9,8.2,0.212,1.4,1.0,2.3,2.0,False,C
13780,31,2,14,9.4,0.357,0.714,0.000,0.0,52.9,26.2,...,0.0,0.0,0.0,-0.037,-3.3,0.5,-2.8,0.0,False,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14269,29,40,600,17.0,0.578,0.211,0.269,8.5,21.8,15.1,...,0.6,0.8,1.4,0.109,-1.2,2.1,0.9,0.4,False,C
14273,32,32,619,14.8,0.572,0.423,0.223,4.3,16.3,10.3,...,0.3,0.8,1.1,0.089,-1.1,1.2,0.0,0.3,False,PF-C
14277,26,55,970,17.7,0.600,0.185,0.329,11.1,25.2,18.0,...,1.5,1.0,2.5,0.122,-0.5,-0.3,-0.7,0.3,False,C
14287,22,27,288,7.6,0.597,0.043,0.362,9.9,26.4,18.3,...,-0.2,0.2,0.0,0.004,-6.3,0.2,-6.0,-0.3,False,PF-C


In [30]:
center_19_20['Pos'].unique()

array(['C', 'PF-C', 'SF-C', 'C-PF'], dtype=object)

In [31]:
center_19_20.drop(['Pos'], axis=1, inplace=True)
center_19_20

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,Age,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,...,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,All_NBA_Boolean
13769,26,63,1680,20.5,0.604,0.006,0.421,14.0,24.0,19.2,...,17.3,3.8,2.7,6.5,0.185,1.9,1.1,2.9,2.1,False
13771,34,53,1754,19.7,0.571,0.198,0.241,6.3,17.8,12.0,...,23.4,3.0,1.4,4.5,0.122,1.8,-0.5,1.4,1.5,False
13772,23,2,13,4.7,0.500,0.000,0.000,17.9,8.3,12.9,...,10.2,0.0,0.0,0.0,-0.003,-6.4,-3.7,-10.1,0.0,False
13775,21,70,1852,20.7,0.664,0.013,0.581,12.3,24.9,18.7,...,14.9,5.2,2.9,8.2,0.212,1.4,1.0,2.3,2.0,False
13780,31,2,14,9.4,0.357,0.714,0.000,0.0,52.9,26.2,...,23.7,0.0,0.0,0.0,-0.037,-3.3,0.5,-2.8,0.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14269,29,40,600,17.0,0.578,0.211,0.269,8.5,21.8,15.1,...,16.3,0.6,0.8,1.4,0.109,-1.2,2.1,0.9,0.4,False
14273,32,32,619,14.8,0.572,0.423,0.223,4.3,16.3,10.3,...,19.4,0.3,0.8,1.1,0.089,-1.1,1.2,0.0,0.3,False
14277,26,55,970,17.7,0.600,0.185,0.329,11.1,25.2,18.0,...,18.1,1.5,1.0,2.5,0.122,-0.5,-0.3,-0.7,0.3,False
14287,22,27,288,7.6,0.597,0.043,0.362,9.9,26.4,18.3,...,13.5,-0.2,0.2,0.0,0.004,-6.3,0.2,-6.0,-0.3,False


In [32]:
df_20_21_test = df_20_21.loc[:, ~df_20_21.columns.str.startswith('All_')]
df_20_21_test = df_20_21_test.select_dtypes(exclude=['object'])
df_20_21_test["All_NBA_Boolean"] = df_20_21["All_NBA_Boolean"].astype(str)
df_20_21_test["Pos"] = df_20_21["Pos"].astype(str)
df_20_21_test

Unnamed: 0,Age,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,...,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,All_NBA_Boolean,Pos
14298,21,61,737,14.2,0.550,0.004,0.482,11.5,20.6,16.1,...,0.3,1.0,1.3,0.085,-3.6,-0.5,-4.1,-0.4,False,PF
14299,24,7,18,-6.5,0.125,0.250,0.000,0.0,16.9,8.8,...,-0.1,0.0,-0.1,-0.252,-15.1,-4.6,-19.8,-0.1,False,PG
14300,27,58,1605,15.1,0.596,0.010,0.438,14.4,20.4,17.4,...,2.3,1.7,4.0,0.119,-0.4,0.1,-0.3,0.7,False,C
14301,23,64,2143,22.7,0.626,0.010,0.443,7.7,22.6,15.3,...,5.6,3.2,8.8,0.197,2.9,2.0,4.9,3.7,False,C
14302,22,15,47,4.2,0.349,0.750,0.167,4.9,19.0,12.1,...,-0.1,0.0,0.0,-0.048,-4.8,-1.7,-6.5,-0.1,False,SG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14833,23,45,722,12.5,0.580,0.397,0.380,3.9,17.7,10.7,...,0.5,0.8,1.2,0.082,-2.7,0.2,-2.6,-0.1,False,C
14834,31,61,1053,9.5,0.489,0.284,0.338,1.8,8.8,5.3,...,-0.2,1.0,0.9,0.039,-4.4,0.3,-4.0,-0.5,False,PG
14835,34,66,1423,14.0,0.519,0.268,0.268,1.7,8.9,5.4,...,0.5,1.2,1.7,0.056,-1.0,-1.5,-2.5,-0.2,False,PG
14836,24,35,433,11.5,0.511,0.518,0.149,4.8,23.2,13.7,...,-0.2,0.5,0.3,0.032,-2.7,-0.6,-3.3,-0.1,False,PF


In [33]:
center_20_21 = df_20_21_test[df_20_21_test["Pos"].str.contains('C')]
center_20_21

Unnamed: 0,Age,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,...,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,All_NBA_Boolean,Pos
14300,27,58,1605,15.1,0.596,0.010,0.438,14.4,20.4,17.4,...,2.3,1.7,4.0,0.119,-0.4,0.1,-0.3,0.7,False,C
14301,23,64,2143,22.7,0.626,0.010,0.443,7.7,22.6,15.3,...,5.6,3.2,8.8,0.197,2.9,2.0,4.9,3.7,False,C
14315,22,69,2115,20.3,0.653,0.029,0.252,12.4,26.3,19.5,...,5.3,3.1,8.4,0.191,1.1,0.1,1.2,1.7,False,C
14316,21,15,57,10.3,0.597,0.000,1.111,7.8,16.0,12.1,...,0.0,0.1,0.1,0.119,-5.5,1.7,-3.8,0.0,False,C
14321,22,46,725,19.3,0.556,0.392,0.219,10.5,28.0,19.0,...,0.7,1.1,1.8,0.117,0.2,0.0,0.1,0.4,False,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14812,29,70,1997,16.4,0.610,0.528,0.251,4.8,22.5,13.6,...,2.5,2.6,5.1,0.121,0.3,0.9,1.2,1.6,False,C-PF
14814,27,13,84,11.6,0.567,0.000,0.400,8.1,16.5,12.4,...,0.1,0.1,0.2,0.112,-4.5,1.9,-2.6,0.0,False,C
14826,28,65,1601,15.0,0.609,0.326,0.236,6.5,18.3,12.4,...,2.4,1.8,4.3,0.128,-0.7,0.7,0.0,0.8,False,C
14832,30,70,2348,22.9,0.560,0.315,0.112,6.5,30.9,18.5,...,3.3,3.0,6.3,0.129,4.9,0.4,5.3,4.4,False,C


In [34]:
center_20_21.drop(['Pos'], axis=1, inplace=True)
center_20_21

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,Age,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,...,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,All_NBA_Boolean
14300,27,58,1605,15.1,0.596,0.010,0.438,14.4,20.4,17.4,...,11.7,2.3,1.7,4.0,0.119,-0.4,0.1,-0.3,0.7,False
14301,23,64,2143,22.7,0.626,0.010,0.443,7.7,22.6,15.3,...,23.7,5.6,3.2,8.8,0.197,2.9,2.0,4.9,3.7,False
14315,22,69,2115,20.3,0.653,0.029,0.252,12.4,26.3,19.5,...,18.2,5.3,3.1,8.4,0.191,1.1,0.1,1.2,1.7,False
14316,21,15,57,10.3,0.597,0.000,1.111,7.8,16.0,12.1,...,12.4,0.0,0.1,0.1,0.119,-5.5,1.7,-3.8,0.0,False
14321,22,46,725,19.3,0.556,0.392,0.219,10.5,28.0,19.0,...,21.8,0.7,1.1,1.8,0.117,0.2,0.0,0.1,0.4,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14812,29,70,1997,16.4,0.610,0.528,0.251,4.8,22.5,13.6,...,19.7,2.5,2.6,5.1,0.121,0.3,0.9,1.2,1.6,False
14814,27,13,84,11.6,0.567,0.000,0.400,8.1,16.5,12.4,...,10.8,0.1,0.1,0.2,0.112,-4.5,1.9,-2.6,0.0,False
14826,28,65,1601,15.0,0.609,0.326,0.236,6.5,18.3,12.4,...,15.6,2.4,1.8,4.3,0.128,-0.7,0.7,0.0,0.8,False
14832,30,70,2348,22.9,0.560,0.315,0.112,6.5,30.9,18.5,...,29.3,3.3,3.0,6.3,0.129,4.9,0.4,5.3,4.4,False


In [35]:
flat_priors_lda_center = LDA(center_19_20, 'All_NBA_Boolean')
flat_priors_lda.misclass_rate(data = center_20_21)

0.03960396039603964

In [36]:
flat_priors_lda.misclass_rate()

0.013232514177693777

In [37]:
flat_priors_lda.misclass_xtabs(data = center_20_21)

True Class,False,True
MAP Class,Unnamed: 1_level_1,Unnamed: 2_level_1
False,95,1
True,3,2


#### Forwards

In [38]:
forward_19_20 = df_19_20_train[df_19_20_train["Pos"].str.contains('F')]
print(forward_19_20['Pos'].unique())
forward_19_20.drop(['Pos'], axis=1, inplace=True)
forward_19_20

['PF' 'SF' 'PF-SF' 'PF-C' 'SF-SG' 'SF-C' 'SF-PF' 'SG-SF' 'C-PF']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,Age,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,...,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,All_NBA_Boolean
13770,22,72,2417,20.3,0.598,0.018,0.484,8.5,24.9,17.0,...,21.2,4.6,3.9,8.5,0.168,1.4,2.0,3.4,3.3,False
13777,29,18,380,7.6,0.395,0.419,0.337,6.6,18.4,12.3,...,13.2,-0.4,0.5,0.1,0.014,-4.4,1.4,-3.0,-0.1,False
13779,26,67,1330,12.5,0.534,0.257,0.236,4.7,17.6,11.3,...,13.5,0.9,1.7,2.5,0.091,-1.5,2.0,0.5,0.9,False
13781,25,63,1917,31.9,0.613,0.237,0.508,7.7,34.8,22.1,...,37.5,6.1,5.0,11.1,0.279,7.4,4.1,11.5,6.6,True
13782,22,5,20,16.3,0.902,0.000,0.667,11.1,5.4,8.2,...,10.3,0.1,0.0,0.1,0.193,0.2,-0.9,-0.7,0.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14290,26,7,53,8.8,0.503,0.500,0.111,4.0,8.6,6.1,...,14.8,0.0,0.0,0.0,0.018,-2.2,-2.4,-4.6,0.0,False
14291,20,11,46,7.6,0.676,0.125,0.250,7.3,14.5,10.9,...,14.9,-0.1,0.1,0.0,-0.026,-6.8,0.5,-6.3,-0.1,False
14295,25,10,87,12.6,0.517,0.462,0.346,2.5,17.9,10.3,...,16.1,0.0,0.1,0.2,0.097,-1.1,0.9,-0.2,0.0,False
14296,24,54,1858,16.4,0.536,0.342,0.274,3.6,12.3,7.8,...,27.4,0.8,1.1,1.9,0.050,1.1,-1.4,-0.3,0.8,False


In [39]:
forward_20_21 = df_20_21_test[df_20_21_test["Pos"].str.contains('F')]
print(forward_20_21['Pos'].unique())
forward_20_21.drop(['Pos'], axis=1, inplace=True)
forward_20_21

['PF' 'SF' 'PF-SF' 'SF-PF' 'SG-SF' 'SF-SG' 'PF-C' 'C-PF']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,Age,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,...,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,All_NBA_Boolean
14298,21,61,737,14.2,0.550,0.004,0.482,11.5,20.6,16.1,...,19.5,0.3,1.0,1.3,0.085,-3.6,-0.5,-4.1,-0.4,False
14305,27,69,1887,17.2,0.578,0.397,0.280,2.9,19.5,11.1,...,18.5,2.9,2.7,5.6,0.143,1.4,1.9,3.3,2.5,False
14306,26,61,2013,29.2,0.633,0.201,0.528,5.3,28.9,17.5,...,32.5,6.9,3.3,10.2,0.244,6.2,2.8,9.0,5.6,True
14307,23,15,56,0.0,0.382,0.000,1.300,8.1,29.9,19.1,...,20.7,-0.3,0.1,-0.2,-0.174,-13.3,0.8,-12.6,-0.1,False
14308,28,57,551,10.3,0.523,0.212,0.358,10.7,12.9,11.8,...,15.3,0.0,0.6,0.7,0.057,-3.7,0.2,-3.5,-0.2,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14817,26,41,905,12.1,0.540,0.474,0.232,2.3,15.3,8.8,...,19.6,0.2,0.8,1.0,0.052,-1.2,-0.6,-1.8,0.0,False
14819,25,5,40,6.5,0.417,0.667,0.000,5.1,8.3,6.7,...,19.2,0.0,0.0,0.0,-0.039,-3.5,-4.6,-8.1,-0.1,False
14823,24,15,92,11.5,0.708,0.063,0.938,8.9,24.4,16.8,...,15.9,0.1,0.1,0.2,0.101,-6.7,1.1,-5.5,-0.1,False
14831,35,52,1356,5.5,0.510,0.672,0.158,3.9,12.3,8.0,...,7.2,0.1,0.9,1.0,0.034,-4.6,0.5,-4.1,-0.7,False


In [40]:
flat_priors_lda_forward = LDA(forward_19_20, 'All_NBA_Boolean')
flat_priors_lda.misclass_rate(data = forward_20_21)

0.014018691588784993

In [41]:
flat_priors_lda.misclass_rate()

0.013232514177693777

In [42]:
flat_priors_lda.misclass_xtabs(data = forward_20_21)

True Class,False,True
MAP Class,Unnamed: 1_level_1,Unnamed: 2_level_1
False,207,1
True,2,4


#### guard

In [43]:
guard_19_20 = df_19_20_train[df_19_20_train["Pos"].str.contains('G')]
print(guard_19_20['Pos'].unique())
guard_19_20.drop(['Pos'], axis=1, inplace=True)
guard_19_20

['SG' 'PG' 'SF-SG' 'SG-SF' 'SG-PG' 'PG-SG']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,Age,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,...,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,All_NBA_Boolean
13773,21,47,591,8.9,0.473,0.500,0.139,1.6,13.5,7.5,...,23.3,-0.7,0.4,-0.2,-0.020,-3.2,-1.4,-4.6,-0.4,False
13774,24,38,718,12.0,0.609,0.562,0.179,1.2,11.1,6.2,...,17.6,0.8,0.4,1.2,0.082,-0.1,-1.2,-1.3,0.1,False
13776,27,10,117,14.0,0.512,0.364,0.250,1.8,6.6,4.1,...,20.6,0.1,0.1,0.1,0.053,-0.3,-0.7,-0.9,0.0,False
13778,26,10,107,3.2,0.352,0.763,0.105,1.0,18.9,10.2,...,17.1,-0.2,0.1,-0.1,-0.054,-6.8,-1.0,-7.7,-0.2,False
13786,25,58,930,9.0,0.551,0.627,0.173,2.3,11.2,6.5,...,12.5,0.7,0.7,1.4,0.071,-2.7,-0.1,-2.8,-0.2,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14285,22,63,979,7.4,0.476,0.494,0.184,1.4,12.8,7.0,...,16.4,-0.6,0.8,0.2,0.009,-3.8,-0.2,-3.9,-0.5,False
14286,23,45,1452,18.7,0.556,0.509,0.235,1.2,12.3,6.6,...,31.5,1.1,0.7,1.8,0.060,3.8,-1.9,1.9,1.4,False
14288,31,59,1464,14.6,0.557,0.239,0.403,1.9,8.6,5.2,...,19.5,1.9,0.6,2.5,0.082,-0.2,-1.3,-1.5,0.2,False
14293,28,10,207,11.5,0.521,0.475,0.172,1.1,11.5,6.4,...,25.2,-0.1,0.2,0.1,0.030,-0.7,-0.6,-1.2,0.0,False


In [44]:
guard_20_21 = df_20_21_test[df_20_21_test["Pos"].str.contains('G')]
print(guard_20_21['Pos'].unique())
guard_20_21.drop(['Pos'], axis=1, inplace=True)
guard_20_21

['PG' 'SG' 'SG-SF' 'SF-SG' 'SG-PG' 'PG-SG']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,Age,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,...,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,All_NBA_Boolean
14299,24,7,18,-6.5,0.125,0.250,0.000,0.0,16.9,8.8,...,18.6,-0.1,0.0,-0.1,-0.252,-15.1,-4.6,-19.8,-0.1,False
14302,22,15,47,4.2,0.349,0.750,0.167,4.9,19.0,12.1,...,15.0,-0.1,0.0,0.0,-0.048,-4.8,-1.7,-6.5,-0.1,False
14303,22,46,1007,12.5,0.522,0.478,0.144,1.4,14.1,7.8,...,23.2,-0.3,1.0,0.7,0.035,-1.4,0.1,-1.3,0.2,False
14304,25,50,1259,12.8,0.586,0.662,0.220,1.6,12.0,6.7,...,16.8,1.5,1.2,2.7,0.101,-0.2,0.1,-0.2,0.6,False
14310,20,47,1273,12.1,0.496,0.312,0.237,3.0,15.4,9.0,...,24.3,-0.9,0.9,0.0,-0.001,-1.9,-1.4,-3.3,-0.4,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14829,26,21,217,6.6,0.414,0.500,0.174,1.4,7.4,4.4,...,11.1,-0.2,0.2,0.0,0.009,-5.7,0.9,-4.7,-0.2,False
14830,22,58,1802,12.1,0.534,0.546,0.135,1.6,7.7,4.6,...,20.9,1.0,0.7,1.7,0.045,-0.5,-1.7,-2.3,-0.1,False
14834,31,61,1053,9.5,0.489,0.284,0.338,1.8,8.8,5.3,...,16.9,-0.2,1.0,0.9,0.039,-4.4,0.3,-4.0,-0.5,False
14835,34,66,1423,14.0,0.519,0.268,0.268,1.7,8.9,5.4,...,25.7,0.5,1.2,1.7,0.056,-1.0,-1.5,-2.5,-0.2,False


In [45]:
flat_priors_lda_guard = LDA(guard_19_20, 'All_NBA_Boolean')
flat_priors_lda.misclass_rate(data = guard_20_21)

0.025751072961373356

In [46]:
flat_priors_lda.misclass_rate()

0.013232514177693777

In [47]:
flat_priors_lda.misclass_xtabs(data = guard_20_21)

True Class,False,True
MAP Class,Unnamed: 1_level_1,Unnamed: 2_level_1
False,223,3
True,2,4
