In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import LabelEncoder
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping

from scipy.stats import pearsonr

import warnings
warnings.filterwarnings("ignore")

We will begin by looking through the available data, determining which variables we will want to include in our model.

In [2]:
batting = pd.read_csv('data/core/Batting.csv')
batting

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
0,abercda01,1871,1,TRO,,1,4,0,0,0,...,0.0,0.0,0.0,0,0.0,,,,,0.0
1,addybo01,1871,1,RC1,,25,118,30,32,6,...,13.0,8.0,1.0,4,0.0,,,,,0.0
2,allisar01,1871,1,CL1,,29,137,28,40,4,...,19.0,3.0,1.0,2,5.0,,,,,1.0
3,allisdo01,1871,1,WS3,,27,133,28,44,10,...,27.0,1.0,1.0,0,2.0,,,,,0.0
4,ansonca01,1871,1,RC1,,25,120,29,39,11,...,16.0,6.0,2.0,2,1.0,,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110490,zimmejo02,2021,1,MIL,NL,2,1,0,0,0,...,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0
110491,zimmeky01,2021,1,KCA,AL,52,0,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
110492,zimmery01,2021,1,WAS,NL,110,255,27,62,16,...,46.0,0.0,0.0,16,77.0,0.0,0.0,0.0,2.0,9.0
110493,zuberty01,2021,1,KCA,AL,31,1,0,0,0,...,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0


It looks like some entries are NaN, we have to look into the history of MLB statistics tracking to understand that certain stats were not always recorded:

SH: sacrifice hits

SF: sacrifice flies

GIDP: grounded into double play

IBB: intentional walks

HBP: hit by pitch

We will set these to zero now, but will need to either drop those columns, or the players before that stat was collected before inputting this data into our models

In [3]:
batting = batting.fillna(0)

In [4]:
batting.columns

Index(['playerID', 'yearID', 'stint', 'teamID', 'lgID', 'G', 'AB', 'R', 'H',
       '2B', '3B', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 'IBB', 'HBP', 'SH',
       'SF', 'GIDP'],
      dtype='object')

In [5]:
batting = batting.drop(columns = ['teamID', 'CS', 'lgID', 'IBB', 'HBP', 'SH', 'SF', 'GIDP'])

In [6]:
# batting = batting.drop(columns = ['stint', 'teamID', 'lgID', 'IBB', 'HBP', 'SH', 'SF', 'GIDP'])
batting.columns

Index(['playerID', 'yearID', 'stint', 'G', 'AB', 'R', 'H', '2B', '3B', 'HR',
       'RBI', 'SB', 'BB', 'SO'],
      dtype='object')

In [7]:
batting

Unnamed: 0,playerID,yearID,stint,G,AB,R,H,2B,3B,HR,RBI,SB,BB,SO
0,abercda01,1871,1,1,4,0,0,0,0,0,0.0,0.0,0,0.0
1,addybo01,1871,1,25,118,30,32,6,0,0,13.0,8.0,4,0.0
2,allisar01,1871,1,29,137,28,40,4,5,0,19.0,3.0,2,5.0
3,allisdo01,1871,1,27,133,28,44,10,2,2,27.0,1.0,0,2.0
4,ansonca01,1871,1,25,120,29,39,11,3,0,16.0,6.0,2,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110490,zimmejo02,2021,1,2,1,0,0,0,0,0,0.0,0.0,0,1.0
110491,zimmeky01,2021,1,52,0,0,0,0,0,0,0.0,0.0,0,0.0
110492,zimmery01,2021,1,110,255,27,62,16,0,14,46.0,0.0,16,77.0
110493,zuberty01,2021,1,31,1,0,0,0,0,0,0.0,0.0,0,1.0


In [8]:
# Ineligible
batting = batting[batting['yearID'] <= 2016]
batting

Unnamed: 0,playerID,yearID,stint,G,AB,R,H,2B,3B,HR,RBI,SB,BB,SO
0,abercda01,1871,1,1,4,0,0,0,0,0,0.0,0.0,0,0.0
1,addybo01,1871,1,25,118,30,32,6,0,0,13.0,8.0,4,0.0
2,allisar01,1871,1,29,137,28,40,4,5,0,19.0,3.0,2,5.0
3,allisdo01,1871,1,27,133,28,44,10,2,2,27.0,1.0,0,2.0
4,ansonca01,1871,1,25,120,29,39,11,3,0,16.0,6.0,2,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102826,zimmejo02,2016,1,19,4,0,1,0,0,0,0.0,0.0,0,2.0
102827,zimmery01,2016,1,115,427,60,93,18,1,15,46.0,4.0,29,104.0
102828,zobribe01,2016,1,147,523,94,142,31,3,18,76.0,6.0,96,82.0
102829,zuninmi01,2016,1,55,164,16,34,7,0,12,31.0,0.0,21,65.0


In [9]:
years_service = batting.groupby(['playerID', 'yearID']).size().sum(level = 0)
years_service

playerID
aardsda01     9
aaronha01    23
aaronto01     7
aasedo01     13
abadan01      3
             ..
zupofr01      3
zuvelpa01     9
zuverge01    10
zwilldu01     4
zychto01      2
Length: 18919, dtype: int64

In [10]:
batting = batting.drop(columns = ['yearID', 'stint'])
by_player = batting.groupby(['playerID']).sum()
by_player['years_service'] = years_service
by_player

Unnamed: 0_level_0,G,AB,R,H,2B,3B,HR,RBI,SB,BB,SO,years_service
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
aardsda01,331,4,0,0,0,0,0,0.0,0.0,0,2.0,9
aaronha01,3298,12364,2174,3771,624,98,755,2297.0,240.0,1402,1383.0,23
aaronto01,437,944,102,216,42,6,13,94.0,9.0,86,145.0,7
aasedo01,448,5,0,0,0,0,0,0.0,0.0,0,3.0,13
abadan01,15,21,1,2,0,0,0,0.0,0.0,4,5.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...
zupofr01,16,18,3,3,1,0,0,0.0,0.0,2,6.0,3
zuvelpa01,209,491,41,109,17,2,2,20.0,2.0,34,50.0,9
zuverge01,266,142,5,21,2,1,0,7.0,0.0,9,39.0,10
zwilldu01,366,1280,167,364,76,15,30,202.0,46.0,128,155.0,4


In [11]:
by_player_eligible = by_player[by_player['years_service'] >= 10]
by_player_eligible['years_service']

playerID
aaronha01    23
aasedo01     13
abbated01    10
abbotgl01    12
abbotji01    11
             ..
zitoba01     15
zobribe01    12
zoldasa01    10
zuberbi01    12
zuverge01    10
Name: years_service, Length: 3864, dtype: int64

The way I calculated the years isn't correct, upon further research this player debuted in 1898, retired in 1924, but had several years off in between

First groupby stint, summing over all columns except year, then groupby.size() ???

In [12]:
people = pd.read_csv('data/core/People.csv')
people_feats = people[['playerID', 'bats', 'throws']]
people_feats = people_feats.set_index('playerID')
by_player_eligible = by_player_eligible.merge(people_feats, left_index = True, right_index = True)
by_player_eligible

Unnamed: 0_level_0,G,AB,R,H,2B,3B,HR,RBI,SB,BB,SO,years_service,bats,throws
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
aaronha01,3298,12364,2174,3771,624,98,755,2297.0,240.0,1402,1383.0,23,R,R
aasedo01,448,5,0,0,0,0,0,0.0,0.0,0,3.0,13,R,R
abbated01,855,3044,355,772,99,43,11,324.0,142.0,289,283.0,10,R,R
abbotgl01,248,0,0,0,0,0,0,0.0,0.0,0,0.0,12,R,R
abbotji01,263,21,0,2,0,0,0,3.0,0.0,0,10.0,11,L,L
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zitoba01,434,344,12,35,0,0,0,11.0,0.0,18,99.0,15,L,L
zobribe01,1337,4840,735,1287,296,38,145,643.0,111.0,700,839.0,12,B,R
zoldasa01,251,286,16,50,6,0,0,11.0,1.0,10,52.0,10,L,L
zuberbi01,224,229,10,31,4,5,0,11.0,0.0,10,66.0,12,R,R


In [13]:
by_player_eligible = pd.get_dummies(by_player_eligible, columns=['bats', 'throws'])
by_player_eligible

Unnamed: 0_level_0,G,AB,R,H,2B,3B,HR,RBI,SB,BB,SO,years_service,bats_B,bats_L,bats_R,throws_L,throws_R
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
aaronha01,3298,12364,2174,3771,624,98,755,2297.0,240.0,1402,1383.0,23,0,0,1,0,1
aasedo01,448,5,0,0,0,0,0,0.0,0.0,0,3.0,13,0,0,1,0,1
abbated01,855,3044,355,772,99,43,11,324.0,142.0,289,283.0,10,0,0,1,0,1
abbotgl01,248,0,0,0,0,0,0,0.0,0.0,0,0.0,12,0,0,1,0,1
abbotji01,263,21,0,2,0,0,0,3.0,0.0,0,10.0,11,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zitoba01,434,344,12,35,0,0,0,11.0,0.0,18,99.0,15,0,1,0,1,0
zobribe01,1337,4840,735,1287,296,38,145,643.0,111.0,700,839.0,12,1,0,0,0,1
zoldasa01,251,286,16,50,6,0,0,11.0,1.0,10,52.0,10,0,1,0,1,0
zuberbi01,224,229,10,31,4,5,0,11.0,0.0,10,66.0,12,0,0,1,0,1


In [14]:
fielding = pd.read_csv('data/core/Fielding.csv')
pos = pd.DataFrame(fielding.groupby('playerID')['POS'].max())
pos

Unnamed: 0_level_0,POS
playerID,Unnamed: 1_level_1
aardsda01,P
aaronha01,OF
aaronto01,OF
aasedo01,P
abadan01,OF
...,...
zupofr01,C
zuvelpa01,SS
zuverge01,P
zwilldu01,OF


In [15]:
by_player_eligible = by_player_eligible.merge(pos, left_index= True, right_index = True)
by_player_eligible

Unnamed: 0_level_0,G,AB,R,H,2B,3B,HR,RBI,SB,BB,SO,years_service,bats_B,bats_L,bats_R,throws_L,throws_R,POS
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
aaronha01,3298,12364,2174,3771,624,98,755,2297.0,240.0,1402,1383.0,23,0,0,1,0,1,OF
aasedo01,448,5,0,0,0,0,0,0.0,0.0,0,3.0,13,0,0,1,0,1,P
abbated01,855,3044,355,772,99,43,11,324.0,142.0,289,283.0,10,0,0,1,0,1,SS
abbotgl01,248,0,0,0,0,0,0,0.0,0.0,0,0.0,12,0,0,1,0,1,P
abbotji01,263,21,0,2,0,0,0,3.0,0.0,0,10.0,11,0,1,0,1,0,P
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zitoba01,434,344,12,35,0,0,0,11.0,0.0,18,99.0,15,0,1,0,1,0,P
zobribe01,1337,4840,735,1287,296,38,145,643.0,111.0,700,839.0,12,1,0,0,0,1,SS
zoldasa01,251,286,16,50,6,0,0,11.0,1.0,10,52.0,10,0,1,0,1,0,P
zuberbi01,224,229,10,31,4,5,0,11.0,0.0,10,66.0,12,0,0,1,0,1,P


I will remove the AB column, replacing it with a batting average column, that is is Hits / (AB - Walks)

In [16]:
by_player_eligible['Avg'] = by_player_eligible['H'] / (by_player_eligible['AB'] - by_player_eligible['BB'])
by_player_eligible = by_player_eligible.dropna()
temp = by_player_eligible.copy()
by_player_eligible = by_player_eligible.drop(columns = 'AB')
by_player_eligible

Unnamed: 0_level_0,G,R,H,2B,3B,HR,RBI,SB,BB,SO,years_service,bats_B,bats_L,bats_R,throws_L,throws_R,POS,Avg
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
aaronha01,3298,2174,3771,624,98,755,2297.0,240.0,1402,1383.0,23,0,0,1,0,1,OF,0.344007
aasedo01,448,0,0,0,0,0,0.0,0.0,0,3.0,13,0,0,1,0,1,P,0.000000
abbated01,855,355,772,99,43,11,324.0,142.0,289,283.0,10,0,0,1,0,1,SS,0.280218
abbotji01,263,0,2,0,0,0,3.0,0.0,0,10.0,11,0,1,0,1,0,P,0.095238
abbotku01,702,273,523,109,23,62,242.0,22.0,133,571.0,10,0,0,1,0,1,SS,0.273679
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zitoba01,434,12,35,0,0,0,11.0,0.0,18,99.0,15,0,1,0,1,0,P,0.107362
zobribe01,1337,735,1287,296,38,145,643.0,111.0,700,839.0,12,1,0,0,0,1,SS,0.310870
zoldasa01,251,16,50,6,0,0,11.0,1.0,10,52.0,10,0,1,0,1,0,P,0.181159
zuberbi01,224,10,31,4,5,0,11.0,0.0,10,66.0,12,0,0,1,0,1,P,0.141553


Now we have all the variables we think will be useful for our analysis. We will look at the data more closely.

We still need to get our target variable, we will get that from a separate dataframe.

In [17]:
hof = pd.read_csv('data/contrib/HallOfFame.csv')
hof_inducted = hof[hof['inducted'] == 'Y']
hof_inducted

Unnamed: 0,playerID,yearID,votedBy,ballots,needed,votes,inducted,category,needed_note
0,cobbty01,1936,BBWAA,226.0,170.0,222.0,Y,Player,
1,ruthba01,1936,BBWAA,226.0,170.0,215.0,Y,Player,
2,wagneho01,1936,BBWAA,226.0,170.0,215.0,Y,Player,
3,mathech01,1936,BBWAA,226.0,170.0,205.0,Y,Player,
4,johnswa01,1936,BBWAA,226.0,170.0,189.0,Y,Player,
...,...,...,...,...,...,...,...,...,...
4157,guerrvl01,2018,BBWAA,422.0,317.0,392.0,Y,Player,
4158,thomeji01,2018,BBWAA,422.0,317.0,379.0,Y,Player,
4159,hoffmtr01,2018,BBWAA,422.0,317.0,337.0,Y,Player,
4189,morrija02,2018,Veterans,,,,Y,Player,


In [18]:
idx = pd.Index(by_player_eligible.index)
hof_inductees = idx.isin(hof_inducted['playerID'])
len(hof_inductees)

3822

In [19]:
by_player_eligible['hof'] = hof_inductees
by_player_eligible['hof'].value_counts()

False    3581
True      241
Name: hof, dtype: int64

There is some discrepancy in the number of hall of famers, because the hof data frame also includes pitchers.

In [20]:
by_player_eligible

Unnamed: 0_level_0,G,R,H,2B,3B,HR,RBI,SB,BB,SO,years_service,bats_B,bats_L,bats_R,throws_L,throws_R,POS,Avg,hof
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
aaronha01,3298,2174,3771,624,98,755,2297.0,240.0,1402,1383.0,23,0,0,1,0,1,OF,0.344007,True
aasedo01,448,0,0,0,0,0,0.0,0.0,0,3.0,13,0,0,1,0,1,P,0.000000,False
abbated01,855,355,772,99,43,11,324.0,142.0,289,283.0,10,0,0,1,0,1,SS,0.280218,False
abbotji01,263,0,2,0,0,0,3.0,0.0,0,10.0,11,0,1,0,1,0,P,0.095238,False
abbotku01,702,273,523,109,23,62,242.0,22.0,133,571.0,10,0,0,1,0,1,SS,0.273679,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zitoba01,434,12,35,0,0,0,11.0,0.0,18,99.0,15,0,1,0,1,0,P,0.107362,False
zobribe01,1337,735,1287,296,38,145,643.0,111.0,700,839.0,12,1,0,0,0,1,SS,0.310870,False
zoldasa01,251,16,50,6,0,0,11.0,1.0,10,52.0,10,0,1,0,1,0,P,0.181159,False
zuberbi01,224,10,31,4,5,0,11.0,0.0,10,66.0,12,0,0,1,0,1,P,0.141553,False


In [21]:
corr_mat = by_player_eligible.corr()
fig = px.imshow(corr_mat, width = 800, height = 500)
fig.write_html("corr_heatmap.html")
fig.show()

In [22]:
fig = px.histogram(by_player_eligible, x = 'POS', color = 'hof', title = "Hall of Famers by Position", height = 500)
fig.write_html("hof_by_pos.html")
fig.show()

In [23]:
by_player_eligible = by_player_eligible[by_player_eligible['POS'] != 'P']

In [24]:
by_tenure = by_player_eligible.sort_values(by = 'years_service')
fig = px.histogram(by_tenure, x = 'hof', title = "Hall of Fame Counts")
fig.write_html("hof_counts.html")
fig.show()

In [25]:
is_hof = by_tenure[by_tenure['hof'] == True]
is_not = by_tenure[by_tenure['hof'] == False]


labels = ['Hall Of Famer', 'Not Hall of Famer']
values = [is_hof.shape[0], is_not.shape[0]]

# pull is given as a fraction of the pie radius
fig = go.Figure(data=[go.Pie(labels=labels, values=values, pull=[0.2, 0])])
fig.write_html("hof_pie.html")
fig.show()

In [26]:
is_hof = by_tenure[by_tenure['hof'] == True]
is_not = by_tenure[by_tenure['hof'] == False]

In [27]:
counts_hof = is_hof['years_service'].value_counts().sort_index()
counts_not = is_not['years_service'].value_counts().sort_index()

In [28]:
labels = counts_hof.index
values = counts_hof.values

fig = go.Figure(data=[go.Pie(labels=labels, values=values, sort = False, title = "Years Played by Hall of Famers")])
fig.update_traces(textposition='inside')
fig.update_layout(uniformtext_minsize=12, uniformtext_mode='hide')
fig.write_html("hof_pie_years.html")
fig.show()

In [29]:
labels = counts_not.index
values = counts_not.values

fig = go.Figure(data=[go.Pie(labels=labels, values=values, sort = False, title = "Years Played by Non Hall of Famers")])
fig.update_traces(textposition='inside')
fig.update_layout(uniformtext_minsize=12, uniformtext_mode='hide')
fig.write_html("not_hof_pie_years.html")
fig.show()

In [30]:
idx = pd.Index(temp.index)
hof_inductees = idx.isin(hof_inducted['playerID'])
temp['hof'] = hof_inductees

In [31]:
temp['id'] = temp.index
fig = px.scatter(temp, x="H", y="AB", color="hof", hover_data=['id'], width=1000, height=600)
fig.write_html("H_AB_scatter.html")
fig.show()

In [32]:
fig = px.scatter(temp, x="Avg", y="H", color="hof", hover_data=['id'], width=1000, height=600)
fig.write_html("H_AVG_scatter.html")
fig.show()

In [33]:
by_player_eligible = pd.get_dummies(by_player_eligible, columns=['POS'])
by_player_eligible

Unnamed: 0_level_0,G,R,H,2B,3B,HR,RBI,SB,BB,SO,...,throws_L,throws_R,Avg,hof,POS_1B,POS_2B,POS_3B,POS_C,POS_OF,POS_SS
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
aaronha01,3298,2174,3771,624,98,755,2297.0,240.0,1402,1383.0,...,0,1,0.344007,True,0,0,0,0,1,0
abbated01,855,355,772,99,43,11,324.0,142.0,289,283.0,...,0,1,0.280218,False,0,0,0,0,0,1
abbotku01,702,273,523,109,23,62,242.0,22.0,133,571.0,...,0,1,0.273679,False,0,0,0,0,0,1
abramca01,567,257,433,64,19,32,138.0,12.0,304,290.0,...,1,0,0.331293,False,0,0,0,0,1,0
abreubo01,2425,1453,2470,574,59,288,1363.0,400.0,1476,1840.0,...,0,1,0.352656,False,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zimmedo01,1095,353,773,130,22,91,352.0,45.0,246,678.0,...,0,1,0.254527,False,0,0,0,0,0,1
zimmehe01,1456,695,1566,275,105,58,796.0,175.0,242,432.0,...,0,1,0.309364,False,0,0,0,0,0,1
zimmery01,1408,793,1505,338,20,215,829.0,41.0,539,1087.0,...,0,1,0.308591,False,0,0,0,0,0,1
ziskri01,1453,681,1477,245,26,207,792.0,8.0,533,910.0,...,0,1,0.320321,False,0,0,0,0,1,0


## TRAIN TEST SPLIT / STANDARDIZATION

In [34]:
temp = by_player_eligible.copy()
labels = temp['hof']
temp = temp.drop(columns = ['hof'])
features = temp
features

Unnamed: 0_level_0,G,R,H,2B,3B,HR,RBI,SB,BB,SO,...,bats_R,throws_L,throws_R,Avg,POS_1B,POS_2B,POS_3B,POS_C,POS_OF,POS_SS
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
aaronha01,3298,2174,3771,624,98,755,2297.0,240.0,1402,1383.0,...,1,0,1,0.344007,0,0,0,0,1,0
abbated01,855,355,772,99,43,11,324.0,142.0,289,283.0,...,1,0,1,0.280218,0,0,0,0,0,1
abbotku01,702,273,523,109,23,62,242.0,22.0,133,571.0,...,1,0,1,0.273679,0,0,0,0,0,1
abramca01,567,257,433,64,19,32,138.0,12.0,304,290.0,...,0,1,0,0.331293,0,0,0,0,1,0
abreubo01,2425,1453,2470,574,59,288,1363.0,400.0,1476,1840.0,...,0,0,1,0.352656,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zimmedo01,1095,353,773,130,22,91,352.0,45.0,246,678.0,...,1,0,1,0.254527,0,0,0,0,0,1
zimmehe01,1456,695,1566,275,105,58,796.0,175.0,242,432.0,...,1,0,1,0.309364,0,0,0,0,0,1
zimmery01,1408,793,1505,338,20,215,829.0,41.0,539,1087.0,...,1,0,1,0.308591,0,0,0,0,0,1
ziskri01,1453,681,1477,245,26,207,792.0,8.0,533,910.0,...,1,0,1,0.320321,0,0,0,0,1,0


In [35]:
X = features
y = np.ravel(labels)

# shuffle by default is true, but it important because the dataframe is sorted by year
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=24, shuffle = True)

In [36]:
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# define the cross valdiator that will be used uniformly across all classifiers
# for each trial
cv = RepeatedKFold(n_splits = 3, n_repeats = 3)

In [37]:
# Code adapted from:
# https://stackoverflow.com/questions/60860121/plotly-how-to-make-an-annotated-confusion-matrix-using-a-heatmap

def plotly_conf_mtx(y_test, y_predict, label):

    conf_mx = confusion_matrix(y_test, y_predict)
    conf_mx_prop = conf_mx / np.bincount(y_test)

    conf_mx_prop = conf_mx_prop[::-1]

    x = ['NOT HOF', 'HOF']
    y =  ['HOF', 'NOT HOF']

    z_text = [[str(prop) for prop in arr] for arr in conf_mx_prop]

    fig = ff.create_annotated_heatmap(conf_mx_prop, x=x, y=y, annotation_text=z_text, colorscale='Viridis', showscale = False)

    fig.add_annotation(dict(font=dict(color="black",size=14),
                        x=0.5,
                        y=-0.15,
                        showarrow=False,
                        text="Predicted value",
                        xref="paper",
                        yref="paper"))

    fig.add_annotation(dict(font=dict(color="black",size=14),
                        x=-0.15,
                        y=0.5,
                        showarrow=False,
                        text="Real value",
                        textangle=-90,
                        xref="paper",
                        yref="paper"))

    fig.update_layout(margin=dict(t=50, l=200))

    fig['data'][0]['showscale'] = True

    fig.write_html(label)
    fig.show()

## KNN 

In [38]:
#define model 
model = KNeighborsClassifier()
# define search space
k_range = list(range(1, 13, 2))
space = dict()
space['weights'] = ['uniform', 'distance']
space['n_neighbors'] = k_range
#define search 
search = GridSearchCV(model, space, scoring = 'accuracy', n_jobs = -1, cv = cv)
#execute search 
result = search.fit(X_train, y_train)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)
print('Best Estimator: %s' % result.best_estimator_)

Best Score: 0.9414741474147414
Best Hyperparameters: {'n_neighbors': 7, 'weights': 'uniform'}
Best Estimator: KNeighborsClassifier(n_neighbors=7)


In [39]:
# train on all of taining data and use best hyperparameters to evaluate accuracy 
# on test set
optimalmodel = result.best_estimator_
optimalmodel.fit(X_train, y_train)
y_predict = optimalmodel.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_predict)
print('accuracy on test set: %s' % accuracy)

accuracy on test set: 0.9410977242302544


In [40]:
#confusion matrix for K-NN
plotly_conf_mtx(y_test = y_test, y_predict = y_predict, label = 'knn_acc.html')

## KNN- Recall

In [41]:
#define model 
model = KNeighborsClassifier()
# define search space
k_range = list(range(1, 13, 2))
space = dict()
space['weights'] = ['uniform', 'distance']
space['n_neighbors'] = k_range
#define search 
search = GridSearchCV(model, space, scoring = 'recall', n_jobs = -1, cv = cv)
#execute search 
result = search.fit(X_train, y_train)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)
print('Best Estimator: %s' % result.best_estimator_)

Best Score: 0.3506284830756235
Best Hyperparameters: {'n_neighbors': 1, 'weights': 'uniform'}
Best Estimator: KNeighborsClassifier(n_neighbors=1)


In [42]:
# train on all of taining data and use best hyperparameters to evaluate accuracy 
# on test set
optimalmodel = result.best_estimator_
optimalmodel.fit(X_train, y_train)
y_predict = optimalmodel.predict(X_test)
accuracy = metrics.recall_score(y_test, y_predict)
print('accuracy on test set: %s' % accuracy)

accuracy on test set: 0.3404255319148936


In [43]:
#confusion matrix for K-NN
plotly_conf_mtx(y_test = y_test, y_predict = y_predict, label = 'knn_recall.html')

## SVM

In [44]:
# define search space
model = svm.SVC()
# define search space
space = dict()
space['C'] = [0.001, 0.01, 0.1, 1, 10, 100]
space['gamma'] = [0.0001, 0.001, 0.01, 0.1]
space['kernel'] = ['rbf', 'linear']
#define search:
search = GridSearchCV(model, space, scoring = 'accuracy', n_jobs = -1, cv = cv)
#execute search
result = search.fit(X_train, y_train)
#summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)
print('Best Estimator: %s' % result.best_estimator_)

Best Score: 0.9511551155115511
Best Hyperparameters: {'C': 1, 'gamma': 0.0001, 'kernel': 'linear'}
Best Estimator: SVC(C=1, gamma=0.0001, kernel='linear')


In [45]:
optimalmodel = result.best_estimator_
optimalmodel.fit(X_train, y_train)
y_predict = optimalmodel.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_predict)
print('accuracy on test set: %s' % accuracy)

accuracy on test set: 0.9531459170013387


In [46]:
#confusion matrix for SVM
plotly_conf_mtx(y_test = y_test, y_predict = y_predict, label = 'svm_acc.html')

## SVM- Recall

In [47]:
# define search space
model = svm.SVC()
# define search space
space = dict()
space['C'] = [0.001, 0.01, 0.1, 1, 10, 100]
space['gamma'] = [0.0001, 0.001, 0.01, 0.1]
space['kernel'] = ['rbf', 'linear']
#define search:
search = GridSearchCV(model, space, scoring = 'recall', n_jobs = -1, cv = cv)
#execute search
result = search.fit(X_train, y_train)
#summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)
print('Best Estimator: %s' % result.best_estimator_)

Best Score: 0.5350951201828394
Best Hyperparameters: {'C': 10, 'gamma': 0.0001, 'kernel': 'linear'}
Best Estimator: SVC(C=10, gamma=0.0001, kernel='linear')


In [48]:
optimalmodel = result.best_estimator_
optimalmodel.fit(X_train, y_train)
y_predict = optimalmodel.predict(X_test)
accuracy = metrics.recall_score(y_test, y_predict)
print('accuracy on test set: %s' % accuracy)

accuracy on test set: 0.44680851063829785


In [49]:
#confusion matrix for SVM
plotly_conf_mtx(y_test = y_test, y_predict = y_predict, label = 'svm_recall.html')

## NEURAL NETWORK

In [50]:
from sklearn.preprocessing import LabelEncoder
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping

/usr/bin/sh: conda: command not found


In [51]:
model = Sequential()
model.add(Dense(16, input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary() 

model.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['accuracy'])

es = EarlyStopping(monitor='val_accuracy', mode='max', patience=10, restore_best_weights=True)

history = model.fit(X_train, y_train, callbacks=[es], epochs=80, batch_size=10,validation_split=0.2,shuffle=True,verbose=1)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 16)                384       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 401
Trainable params: 401
Non-trainable params: 0
_________________________________________________________________


2022-05-02 00:33:33.116581: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-05-02 00:33:33.116967: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-05-02 00:33:33.117154: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
2022-05-02 00:33:33.214617: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2022-05-02 00:33:33.215087: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2499995000 Hz


Epoch 1/80


  1/122 [..............................] - ETA: 1:49 - loss: 0.6935 - accuracy: 0.7000

 11/122 [=>............................] - ETA: 0s - loss: 0.6292 - accuracy: 0.7437  

 21/122 [====>.........................] - ETA: 0s - loss: 0.5974 - accuracy: 0.7599















Epoch 2/80
  1/122 [..............................] - ETA: 2s - loss: 0.1475 - accuracy: 1.0000

 10/122 [=>............................] - ETA: 0s - loss: 0.2932 - accuracy: 0.9266

 17/122 [===>..........................] - ETA: 0s - loss: 0.2957 - accuracy: 0.9294

 25/122 [=====>........................] - ETA: 0s - loss: 0.2899 - accuracy: 0.9312







































Epoch 3/80
  1/122 [..............................] - ETA: 1s - loss: 0.1106 - accuracy: 1.0000

  4/122 [..............................] - ETA: 2s - loss: 0.1372 - accuracy: 0.9604

  9/122 [=>............................] - ETA: 1s - loss: 0.1662 - accuracy: 0.9378

 11/122 [=>............................] - ETA: 2s - loss: 0.1697 - accuracy: 0.9352

 12/122 [=>............................] - ETA: 2s - loss: 0.1708 - accuracy: 0.9350

 17/122 [===>..........................] - ETA: 2s - loss: 0.1763 - accuracy: 0.9333

 25/122 [=====>........................] - ETA: 1s - loss: 0.1772 - accuracy: 0.9357

















Epoch 4/80
  1/122 [..............................] - ETA: 0s - loss: 0.1450 - accuracy: 1.0000











Epoch 5/80
  1/122 [..............................] - ETA: 0s - loss: 0.1781 - accuracy: 0.9000

 24/122 [====>.........................] - ETA: 0s - loss: 0.1536 - accuracy: 0.9405









Epoch 6/80
  1/122 [..............................] - ETA: 1s - loss: 0.1328 - accuracy: 1.0000











Epoch 7/80
  1/122 [..............................] - ETA: 1s - loss: 0.1742 - accuracy: 0.9000

 21/122 [====>.........................] - ETA: 0s - loss: 0.1724 - accuracy: 0.9374























Epoch 8/80
  1/122 [..............................] - ETA: 2s - loss: 0.2711 - accuracy: 0.9000

 26/122 [=====>........................] - ETA: 0s - loss: 0.1969 - accuracy: 0.9044













Epoch 9/80
  1/122 [..............................] - ETA: 1s - loss: 0.2076 - accuracy: 0.8000

 11/122 [=>............................] - ETA: 0s - loss: 0.1064 - accuracy: 0.9285















Epoch 10/80
  1/122 [..............................] - ETA: 2s - loss: 0.3063 - accuracy: 0.9000

 20/122 [===>..........................] - ETA: 0s - loss: 0.1087 - accuracy: 0.9638



















Epoch 11/80
  1/122 [..............................] - ETA: 1s - loss: 0.0510 - accuracy: 1.0000

 28/122 [=====>........................] - ETA: 0s - loss: 0.0729 - accuracy: 0.9796











Epoch 12/80
  1/122 [..............................] - ETA: 0s - loss: 0.0431 - accuracy: 1.0000

 11/122 [=>............................] - ETA: 0s - loss: 0.1381 - accuracy: 0.9293













In [52]:
y_predict = model.predict(X_test)
y_predict = y_predict.round().flatten().astype(int)
accuracy = metrics.accuracy_score(y_test, y_predict)
print('accuracy on test set: %s' % accuracy)

accuracy on test set: 0.9330655957161981


In [53]:
#confusion matrix for Neural Network
plotly_conf_mtx(y_test = y_test, y_predict = y_predict, label = 'nn_acc.html')

## LOGISTIC REGRESSION

In [54]:
from sklearn.linear_model import LogisticRegression

#define model
model = LogisticRegression(max_iter = 1000)
#define search space
space = dict()
space['solver'] = ['newton-cg', 'lbfgs', 'liblinear']
space['penalty'] = [0.0, 'l1', 'l2']
space['C'] = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4]
#define search
search = GridSearchCV(model, space, scoring = 'accuracy', n_jobs = -1, cv= cv, error_score = 0.0)
# execute search
result = search.fit(X_train, y_train)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)
print('Best Estimator: %s' % result.best_estimator_)

Best Score: 0.9526952695269526
Best Hyperparameters: {'C': 1.0, 'penalty': 'l2', 'solver': 'newton-cg'}
Best Estimator: LogisticRegression(max_iter=1000, solver='newton-cg')


In [55]:
# train on all of taining data and use best hyperparameters to evaluate accuracy 
# on test set
optimalmodel = result.best_estimator_
optimalmodel.fit(X_train, y_train)
y_predict = optimalmodel.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_predict)
print('accuracy on test set: %s' % accuracy)

accuracy on test set: 0.9544846050870147


In [56]:
#confusion matrix for Log. Regression
plotly_conf_mtx(y_test = y_test, y_predict = y_predict, label = 'lr_acc.html')

## LOGISTIC REGRESSION- Recall

In [57]:
#define model
model = LogisticRegression(max_iter = 1000)
#define search space
space = dict()
space['solver'] = ['newton-cg', 'lbfgs', 'liblinear']
space['penalty'] = [0.0, 'l1', 'l2']
space['C'] = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4]
#define search
search = GridSearchCV(model, space, scoring = 'recall', n_jobs = -1, cv= cv, error_score = 0.0)
# execute search
result = search.fit(X_train, y_train)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)
print('Best Estimator: %s' % result.best_estimator_)

Best Score: 0.9111962644566385
Best Hyperparameters: {'C': 1e-08, 'penalty': 'l2', 'solver': 'liblinear'}
Best Estimator: LogisticRegression(C=1e-08, max_iter=1000, solver='liblinear')


In [58]:
# train on all of taining data and use best hyperparameters to evaluate accuracy 
# on test set
optimalmodel = result.best_estimator_
optimalmodel.fit(X_train, y_train)
y_predict = optimalmodel.predict(X_test)
accuracy = metrics.recall_score(y_test, y_predict)
print('accuracy on test set: %s' % accuracy)

accuracy on test set: 0.8936170212765957


In [59]:
plotly_conf_mtx(y_test = y_test, y_predict = y_predict, label = 'lr_recall.html')