# Notebook Contents

- [Imports](#Imports)
- [Data](#Data)
- [Modeling](#Modeling)
    - [4-Seam Fastball](#4-Seam-Fastball)
    - [Slider](#Slider)
    - [Changeup](#Changeup)
    - [Cutter](#Cutter)

# Imports

In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, k_means
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.feature_selection import SelectKBest
from sklearn import metrics
from sklearn.model_selection import cross_val_score, KFold

import warnings
warnings.filterwarnings('ignore')

# Data

In [2]:
pd.set_option('max_columns', None)

data = pd.read_csv('../data/arsenal-spin.csv')
data.drop(columns = ['Unnamed: 0', 'player_id', 'pitches', 'n_pitches'], inplace = True)
data = data.sort_values(by = ['last_name_x'], ascending = True)

# Binarize pitch types
data['pitch_type'] = data['pitch_type'].map({'FS': 5, 'FC': 4, 'CH': 3, 'SL': 2, 'FF': 1})

data = pd.get_dummies(data, columns = ['pitch_hand'])

# Index by Pitch Name
ff = data.loc[data['pitch_name_y'] == '4-Seam Fastball']
sl = data.loc[data['pitch_name_y'] == 'Slider']
ch = data.loc[data['pitch_name_y'] == 'Changeup']
fc = data.loc[data['pitch_name_y'] == 'Cutter']
fs = data.loc[data['pitch_name_y'] == 'Splitter']

print(data.shape)
data.head()

(869, 43)


Unnamed: 0,last_name_x,last_name_y,first_name_x,pitch_type,pitch_name_x,pitch_name_y,pitch_usage,release_speed,spin_rate,movement_inches,spin_eff%,alan_active_spin_pct,active_spin,hawkeye_measured,movement_inferred,diff_measured_inferred,diff2,run_value_per_100,run_value,pa,ba,slg,woba,whiff_percent,k_percent,put_away,est_ba,est_slg,est_woba,hard_hit_percent,diff_measured_inferred_minutes,hawkeye_measured_clock_hh,hawkeye_measured_clock_mm,movement_inferred_clock_hh,movement_inferred_clock_mm,diff_clock_hh,diff_clock_mm,hawkeye_measured_clock_label,movement_inferred_clock_label,diff_clock_label,team_name_alt,pitch_hand_L,pitch_hand_R
0,Abreu,Abreu,Bryan,2,Slider,Slider,37.2,88.9,2526,6.2,32,0.269459,0.324769,316.257702,314.734584,1.523118,1.523118,1.9,5,63,0.246,0.456,0.338,38.0,28.6,22.5,0.253,0.423,0.332,45.0,0,7,30,7,30,0,0,7:30,7:30,0H 00M,HOU,0,1
1,Abreu,Abreu,Bryan,1,4-Seamer,4-Seam Fastball,45.0,95.7,2216,18.6,93,0.881274,0.925214,149.995293,153.032372,-3.037079,3.037079,-0.1,0,66,0.273,0.418,0.379,23.9,12.1,11.4,0.274,0.428,0.367,34.0,0,1,0,1,0,0,0,1:00,1:00,0H 00M,HOU,0,1
2,Adams,Adams,Austin,2,Slider,Slider,87.1,86.9,2837,10.8,37,0.456257,0.370953,291.664055,273.337675,18.32638,18.32638,-0.8,-7,217,0.145,0.208,0.288,35.3,33.2,28.7,0.158,0.256,0.307,23.7,-45,8,15,9,0,0,45,8:15,9:00,-0H 45M,SD,0,1
3,Akin,Akin,Keegan,3,Changeup,Changeup,19.5,81.9,1936,17.4,95,0.830559,0.953193,236.40719,246.986739,-10.579549,10.579549,2.8,9,100,0.333,0.494,0.405,29.1,16.0,17.6,0.248,0.425,0.334,38.9,15,10,0,9,45,0,15,10:00,9:45,+0H 15M,BAL,1,0
4,Akin,Akin,Keegan,1,4-Seamer,4-Seam Fastball,57.2,92.0,2337,20.7,97,0.955159,0.969171,216.480606,212.79452,3.686086,3.686086,-0.6,-6,231,0.25,0.451,0.331,20.0,22.1,14.2,0.253,0.487,0.353,43.9,-15,10,45,11,0,0,15,10:45,11:00,-0H 15M,BAL,1,0


# Modeling

### 4-Seam Fastball

In [3]:
features_ff = ff.select_dtypes([np.number])

X_ff = features_ff

ss = StandardScaler()
X_ff_scaled = ss.fit_transform(X_ff)
X_ff_scaled[:,:] *= -1

pca = PCA().fit(X_ff_scaled)
principal_components_ff = pca.transform(X_ff_scaled)

model_ff = pd.DataFrame(data = principal_components_ff, columns = X_ff.columns)

print(model_ff.shape)
model_ff.head()

(389, 34)


Unnamed: 0,pitch_type,pitch_usage,release_speed,spin_rate,movement_inches,spin_eff%,alan_active_spin_pct,active_spin,hawkeye_measured,movement_inferred,diff_measured_inferred,diff2,run_value_per_100,run_value,pa,ba,slg,woba,whiff_percent,k_percent,put_away,est_ba,est_slg,est_woba,hard_hit_percent,diff_measured_inferred_minutes,hawkeye_measured_clock_hh,hawkeye_measured_clock_mm,movement_inferred_clock_hh,movement_inferred_clock_mm,diff_clock_hh,diff_clock_mm,pitch_hand_L,pitch_hand_R
0,0.856596,1.340717,-1.788463,2.185354,0.255932,-0.441654,-0.483339,-1.895243,0.222381,0.232552,0.255435,0.540668,1.176417,0.323046,0.492821,0.707334,-0.271559,-0.712567,0.156649,0.59741,0.428784,0.070569,-0.129412,-0.339766,-0.017933,0.356661,0.034832,0.198035,-0.098744,-0.119254,0.040693,-2.0816680000000002e-17,-2.775888e-16,0.0
1,-0.203892,-4.114825,-1.043042,-0.047261,0.678314,-0.72701,0.432329,0.762623,1.396601,1.290409,0.709666,0.275063,-0.950907,0.497314,0.213387,-0.027957,0.42499,-0.762939,0.211489,-0.1094,-0.197481,0.028794,-0.68155,0.112422,0.189486,0.172447,-0.463725,-0.01005,-0.048539,0.154585,0.006717,-8.881784e-16,2.3619070000000003e-17,-4.440892e-16
2,-0.521762,1.652226,-2.580068,0.722258,-0.548206,-0.123597,-0.018856,-1.055081,0.206431,-0.885627,-0.572866,0.625059,-0.061444,1.027421,0.418261,0.460054,-0.052067,0.014003,-0.350798,0.598165,-0.144419,-0.386673,-0.051428,-0.013376,-0.453783,-0.053838,-0.149008,-0.078508,0.013957,0.006488,0.031853,-2.775558e-17,-7.893564e-17,-3.885781e-16
3,-0.763184,2.782771,-0.691527,0.968392,-0.592907,-0.164885,-0.543129,0.081023,0.637651,-0.402112,-1.40666,-0.871869,-0.006937,1.875921,-0.807315,-0.196923,0.68936,0.079387,0.0604,0.457272,-0.255166,-0.192956,0.12365,0.377465,-0.098482,0.040316,-0.151986,-0.008004,-0.060117,-0.068214,0.026926,-6.245005e-17,-3.030006e-17,-3.330669e-16
4,0.802166,-3.28621,1.483264,2.095716,0.226129,-0.276895,-1.362696,0.622928,-0.031649,-1.62232,0.538499,0.608905,-0.775703,0.172831,-0.416059,0.375214,-0.310333,0.1058,-0.097214,-0.071362,-0.344703,-0.054616,0.34071,0.032786,-0.227948,-0.080931,-0.134063,-0.486685,0.094026,-0.087732,-0.029802,-2.914335e-16,6.876298e-17,2.220446e-16


In [4]:
km_ff = KMeans(n_clusters = 3, 
               random_state = 1)

km_ff.fit(model_ff)
label_ff = km_ff.fit_predict(model_ff)

print('Number of iterations:', km_ff.n_iter_)
print('Number of features:', km_ff.n_features_in_)
print('Number of clusters:', km_ff.n_clusters)
print('Inertia:', km_ff.inertia_)

print("Predicted clusters to points: ", label_ff[:10], '\n')
centroids_ff = km_ff.cluster_centers_
#print('Location of centroids:', '\n')
#print(centroids_ff)

Number of iterations: 10
Number of features: 34
Number of clusters: 3
Inertia: 9371.3327490369
Predicted clusters to points:  [2 0 1 1 0 1 0 0 2 2] 



In [5]:
X_ff['label'] = label_ff
model_ff['label'] = label_ff

#model_ff.groupby(by = 'label').mean().T
X_ff.groupby(by = 'label').mean().T

label,0,1,2
pitch_type,1.0,1.0,1.0
pitch_usage,48.747115,48.554605,43.581203
release_speed,92.860577,94.647368,93.457143
spin_rate,2251.240385,2312.546053,2245.601504
movement_inches,17.825,18.048026,17.170677
spin_eff%,90.432692,90.092105,89.067669
alan_active_spin_pct,0.846968,0.851991,0.822677
active_spin,0.90441,0.901225,0.890461
hawkeye_measured,212.053948,148.946526,152.335483
movement_inferred,205.397475,154.542704,157.031254


### Slider

In [6]:
features_sl = sl.select_dtypes([np.number])
X_sl = features_sl

#ss = StandardScaler()
X_sl_scaled = ss.fit_transform(X_sl)
X_sl_scaled[:,:] *= -1

pca = PCA().fit(X_sl_scaled)
principal_components_sl = pca.transform(X_sl_scaled)

model_sl = pd.DataFrame(data = principal_components_sl, columns = X_sl.columns)

print(model_sl.shape)
model_sl.head()

(256, 34)


Unnamed: 0,pitch_type,pitch_usage,release_speed,spin_rate,movement_inches,spin_eff%,alan_active_spin_pct,active_spin,hawkeye_measured,movement_inferred,diff_measured_inferred,diff2,run_value_per_100,run_value,pa,ba,slg,woba,whiff_percent,k_percent,put_away,est_ba,est_slg,est_woba,hard_hit_percent,diff_measured_inferred_minutes,hawkeye_measured_clock_hh,hawkeye_measured_clock_mm,movement_inferred_clock_hh,movement_inferred_clock_mm,diff_clock_hh,diff_clock_mm,pitch_hand_L,pitch_hand_R
0,2.727047,-1.351232,0.506166,2.649978,-0.17193,1.04237,0.275752,1.267546,0.850492,-0.429238,-0.4713,-0.213581,1.282021,0.484069,0.908282,0.836646,-0.108048,-0.43909,-0.340727,-0.464762,-0.350862,-0.000115,0.193435,-0.071215,0.343935,0.161454,-0.137789,0.097621,-0.10604,-0.013216,-0.065235,-0.016003,-5.5511150000000004e-17,-3.640861e-16
1,-2.712714,-1.470881,0.025705,2.015258,1.556182,2.501167,-1.234353,-1.965534,-1.306039,2.387769,1.099611,-1.56369,-0.953707,0.454017,-0.32568,0.139344,-0.632022,-0.939361,-0.54246,-0.539368,0.277763,0.275088,0.834484,0.53474,0.020247,0.007672,0.012383,-0.143261,-0.065224,0.074001,0.014938,0.003214,-5.5511150000000004e-17,-9.098883e-18
2,7.526912,2.740727,-2.853085,-0.104024,1.063859,-0.030397,0.027702,1.031848,0.651409,0.064279,0.238541,-0.459642,0.042922,-0.63287,0.652151,1.354211,0.523198,0.00932,0.085883,-0.103142,-0.227055,0.250331,-0.152353,0.114664,0.032782,0.043794,-0.192138,0.154368,-0.092385,-0.021403,-0.01851,0.001909,2.220446e-15,-1.388075e-16
3,-0.930256,0.172649,3.436298,-1.398519,-0.803078,0.606567,0.408094,-1.478306,0.242651,-0.276902,-0.44602,-0.842371,0.555233,0.972617,-0.264374,0.0107,0.395185,0.435143,-0.289037,-0.076274,0.343451,-0.253044,-0.168033,0.004464,0.100337,0.030382,0.081841,0.076797,-0.045612,0.027706,-0.002869,0.004938,-1.110223e-16,7.528465e-18
4,-0.104662,0.628768,2.512368,2.045119,1.450783,-0.28147,-0.171846,0.037584,-1.842096,-0.819995,-0.813191,1.643472,-2.098564,1.638904,-0.701812,0.466121,0.031437,0.0094,-0.13698,-0.201394,-0.280152,0.023372,0.619285,-0.4048,-0.039195,-0.049563,-0.063507,-0.237443,0.089673,0.125097,-0.043621,-0.007143,-3.330669e-15,-5.932387e-18


In [7]:
km_sl = KMeans(n_clusters = 3, 
               random_state = 1)

km_sl.fit(model_sl)
label_sl = km_sl.fit_predict(model_sl)

print('Number of iterations:', km_sl.n_iter_)
print('Number of features:', km_sl.n_features_in_)
print('Number of clusters:', km_sl.n_clusters)
print('Inertia:', km_sl.inertia_)

print("Predicted clusters to points: ", label_sl[:10], '\n')
centroids_sl = km_sl.cluster_centers_
#print('Location of centroids:', '\n')
#print(centroids_sl)

Number of iterations: 12
Number of features: 34
Number of clusters: 3
Inertia: 6132.083503070577
Predicted clusters to points:  [0 2 1 2 0 1 0 2 0 0] 



In [8]:
X_sl['label'] = label_sl
model_sl['label'] = label_sl

#model_sl.groupby(by = 'label').mean().T
X_sl.groupby(by = 'label').mean().T

label,0,1,2
pitch_type,2.0,2.0,2.0
pitch_usage,28.926531,31.590769,35.865591
release_speed,85.27551,83.993846,84.434409
spin_rate,2357.020408,2352.769231,2543.354839
movement_inches,7.107143,7.189231,10.793548
spin_eff%,34.683673,31.523077,42.053763
alan_active_spin_pct,0.317956,0.317775,0.46411
active_spin,0.34717,0.315389,0.420828
hawkeye_measured,241.290674,101.53874,255.876564
movement_inferred,260.066584,92.886852,255.123142


### Changeup

In [9]:
features_ch = ch.select_dtypes([np.number])
X_ch = features_ch

#ss = StandardScaler()
X_ch_scaled = ss.fit_transform(X_ch)
X_ch_scaled[:,:] *= -1

pca = PCA().fit(X_ch_scaled)
principal_components_ch = pca.transform(X_ch_scaled)

model_ch = pd.DataFrame(data = principal_components_ch, columns = X_ch.columns)

print(model_ch.shape)
model_ch.head()

(124, 34)


Unnamed: 0,pitch_type,pitch_usage,release_speed,spin_rate,movement_inches,spin_eff%,alan_active_spin_pct,active_spin,hawkeye_measured,movement_inferred,diff_measured_inferred,diff2,run_value_per_100,run_value,pa,ba,slg,woba,whiff_percent,k_percent,put_away,est_ba,est_slg,est_woba,hard_hit_percent,diff_measured_inferred_minutes,hawkeye_measured_clock_hh,hawkeye_measured_clock_mm,movement_inferred_clock_hh,movement_inferred_clock_mm,diff_clock_hh,diff_clock_mm,pitch_hand_L,pitch_hand_R
0,4.690199,0.071599,-0.375902,2.004539,0.218872,-0.123674,-0.095108,0.274127,0.164318,1.375344,-1.188705,0.325777,-0.274025,0.874615,0.23999,-0.689381,-0.087588,0.078313,-0.452773,-0.396227,-0.092833,-0.348526,-0.123407,0.594198,-0.189,0.114568,-0.050574,-0.001698,-0.016378,0.006707,2.220446e-16,1.110223e-16,1.526557e-16,-7.023371e-16
1,-2.954268,1.550785,0.561084,-0.110075,-0.19562,-0.896079,1.233411,-2.761548,2.464492,-0.784236,0.223305,0.125886,-2.358038,-0.2029,0.500354,0.373489,0.002382,-0.452377,-0.148535,0.18724,0.13509,-0.006355,-0.109225,-0.164667,-0.095159,-0.028501,0.012758,0.071974,0.00845,0.010052,-3.885781e-16,6.661338e-16,-2.726985e-15,1.209562e-16
2,3.437597,-1.365347,0.914747,-0.279363,-1.260586,0.703115,-1.689891,-0.678645,0.301197,-0.076795,-0.533173,0.627661,0.473043,0.234461,-0.080096,-1.11114,-0.087845,-0.369043,0.131498,0.222561,0.232451,-0.095519,-0.198729,-0.199054,-0.075533,-0.275142,-0.027859,0.104658,-0.005287,-0.005346,5.551115e-16,-3.330669e-16,-4.087009e-15,5.2703220000000004e-18
3,4.371432,-0.476504,-1.221066,-0.164327,-0.374254,-0.372572,0.690439,0.670693,-0.871594,-0.013378,0.320228,-0.834831,0.371654,-0.249847,-0.256475,0.034717,-0.226846,-0.312917,0.32101,-0.19341,0.249193,-0.026223,0.06074,0.029265,-0.035686,-0.097696,-0.242549,0.026522,0.000255,-0.001827,0.0,1.110223e-16,1.172673e-15,-2.249891e-17
4,3.235747,-1.182737,-1.137729,0.538632,1.507373,-0.654022,0.198373,-0.385365,-0.215521,0.993643,-0.068908,-0.976832,-0.167484,0.060295,0.046625,-0.340703,0.207508,0.7261,0.056234,0.083875,0.036802,0.665886,-0.0382,-0.165415,-0.056581,0.026048,-0.000374,0.009468,-0.013153,-0.007146,-5.5511150000000004e-17,0.0,3.115563e-15,1.4835700000000002e-17


In [10]:
km_ch = KMeans(n_clusters = 3, 
               random_state = 1)

km_ch.fit(model_ch)
label_ch = km_ch.fit_predict(model_ch)

print('Number of iterations:', km_ch.n_iter_)
print('Number of features:', km_ch.n_features_in_)
print('Number of clusters:', km_ch.n_clusters)
print('Inertia:', km_ch.inertia_)

print("Predicted clusters to points: ", label_ch[:10], '\n')
centroids_ch = km_ch.cluster_centers_
#print('Location of centroids:', '\n')
#print(centroids_ch)

Number of iterations: 6
Number of features: 34
Number of clusters: 3
Inertia: 2803.693814101709
Predicted clusters to points:  [0 2 0 0 0 0 2 2 1 0] 



In [11]:
X_ch['label'] = label_ch
model_ch['label'] = label_ch

#model_ch.groupby(by = 'label').mean().T
X_ch.groupby(by = 'label').mean().T

label,0,1,2
pitch_type,3.0,3.0,3.0
pitch_usage,22.698,23.54,27.105882
release_speed,83.786,84.915,85.626471
spin_rate,1742.24,1682.75,1852.205882
movement_inches,16.426,15.475,16.861765
spin_eff%,92.46,87.5,89.882353
alan_active_spin_pct,0.836649,0.810089,0.846443
active_spin,0.925386,0.875625,0.900406
hawkeye_measured,236.146771,121.688307,123.277921
movement_inferred,247.931598,104.501343,109.892733


### Cutter

In [12]:
features_fc = fc.select_dtypes([np.number])

X_fc = features_fc

#ss = StandardScaler()
X_fc_scaled = ss.fit_transform(X_fc)
X_fc_scaled[:,:] *= -1

pca = PCA().fit(X_fc_scaled)
principal_components_fc = pca.transform(X_fc_scaled)

model_fc = pd.DataFrame(data = principal_components_fc, columns = X_fc.columns)

print(model_fc.shape)
model_fc.head()

(80, 34)


Unnamed: 0,pitch_type,pitch_usage,release_speed,spin_rate,movement_inches,spin_eff%,alan_active_spin_pct,active_spin,hawkeye_measured,movement_inferred,diff_measured_inferred,diff2,run_value_per_100,run_value,pa,ba,slg,woba,whiff_percent,k_percent,put_away,est_ba,est_slg,est_woba,hard_hit_percent,diff_measured_inferred_minutes,hawkeye_measured_clock_hh,hawkeye_measured_clock_mm,movement_inferred_clock_hh,movement_inferred_clock_mm,diff_clock_hh,diff_clock_mm,pitch_hand_L,pitch_hand_R
0,-0.461201,-2.943149,3.019625,-1.261817,-0.612532,0.887071,-0.517117,-0.773359,-0.664152,-1.578764,0.053864,0.437042,-0.929228,-0.982195,1.003094,0.229593,0.823029,-0.871284,-0.192897,-0.264408,0.246727,0.160095,-0.011954,0.278214,0.252268,0.167447,-0.024487,-0.048262,0.100559,0.007696,0.010464,-2.359224e-16,0.0,3.062442e-15
1,-3.801902,-3.809852,0.246051,-0.151972,-0.383449,0.705085,-1.684227,-0.375585,1.008889,-1.121513,0.644968,-0.71877,-1.048268,1.109714,0.649491,-0.513572,0.741193,-0.552911,0.209136,-0.119167,-0.009563,-0.037183,0.182432,-0.353337,0.168551,-0.019755,0.070021,-0.05998,0.044145,-0.034766,0.003488,-2.650657e-15,2.220446e-16,7.176734e-15
2,0.124546,-1.969866,2.39787,-2.047887,-0.741633,0.014989,0.445463,-0.541735,-0.323229,0.794135,-0.336902,1.300747,0.931848,-0.364012,-0.904957,-0.475475,-0.417932,0.021193,0.656889,-0.22344,0.183174,-0.405568,-0.154583,0.082818,-0.119427,0.065355,0.06176,0.089931,-0.049993,-0.008916,0.018362,1.290634e-15,-1.110223e-16,-4.179584e-15
3,-0.526301,-0.084616,-3.982962,-1.26544,0.387415,0.595274,0.899824,-1.233967,-1.383758,-0.161684,-1.390584,-1.216565,0.242607,-0.628509,0.427086,-0.545322,-0.031937,0.551644,0.195568,-0.558116,0.368975,-0.222908,-0.267575,0.040838,-0.485451,0.029759,-0.158077,-0.18601,-0.049821,-0.016666,-0.00256,1.061651e-15,5.551115e-16,-3.543073e-16
4,0.912096,-0.036857,-2.54916,-1.532554,1.535447,-0.362737,-0.39512,-0.637934,-0.835794,-0.326557,1.337252,-1.116545,0.117806,0.18075,0.106525,-0.155081,0.016574,0.33303,0.137904,-0.426711,0.080169,0.232295,0.05797,0.234934,-0.4233,0.26098,0.086982,0.007468,-0.046445,-0.017987,0.015813,2.414735e-15,3.330669e-16,-3.148513e-15


In [13]:
km_fc = KMeans(n_clusters = 3, 
               random_state = 1)

km_fc.fit(model_fc)
label_fc = km_fc.fit_predict(model_fc)

print('Number of iterations:', km_fc.n_iter_)
print('Number of features:', km_fc.n_features_in_)
print('Number of clusters:', km_fc.n_clusters)
print('Inertia:', km_fc.inertia_)

print("Predicted clusters to points: ", label_fc[:10], '\n')
centroids_fc = km_fc.cluster_centers_
#print('Location of centroids:', '\n')
#print(centroids_fc)

Number of iterations: 6
Number of features: 34
Number of clusters: 3
Inertia: 1899.6005890701556
Predicted clusters to points:  [1 1 1 2 0 2 0 0 1 0] 



In [14]:
X_fc['label'] = label_fc
model_fc['label'] = label_fc

#model_fc.groupby(by = 'label').mean().T
X_fc.groupby(by = 'label').mean().T

label,0,1,2
pitch_type,4.0,4.0,4.0
pitch_usage,34.202941,33.478947,27.562963
release_speed,89.261765,86.473684,88.744444
spin_rate,2476.352941,2276.315789,2294.962963
movement_inches,8.135294,7.578947,9.666667
spin_eff%,42.647059,43.842105,55.407407
alan_active_spin_pct,0.366735,0.348385,0.444431
active_spin,0.426936,0.439062,0.554136
hawkeye_measured,190.02856,183.369307,184.377515
movement_inferred,218.998196,152.755431,191.258121
