# Notebook Contents

- [Imports](#Imports)
- [Data](#Data)
- [Features](#Features)
- [Clustering](#Clustering)
    - [4-Seam](#4-Seam-Fastball)
    - [Cutter](#Cutter)
    - [Sinker](#Sinker)
    - [Slider](#Slider)
    - [Curveball](#Curveball)
    - [Changeup](#Changeup)

# Imports

In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, k_means
from sklearn import metrics

import warnings
warnings.filterwarnings('ignore')

# Data

In [2]:
data = pd.read_csv('../data/model-pitches-rv.csv', index_col = [0])

pd.set_option('max_columns', None)
print(data.shape)
data.head(5)

(705434, 65)


Unnamed: 0,player_name,p_throws,pitch_type,velo,spin_rate,spin_axis,pfx_-x,pfx_z,bauer_units,effective_speed,release_pos_-x,release_pos_x,release_pos_z,release_extension,release_pos_y,plate_-x,plate_x,plate_z,type,balls,strikes,pitch_count,stand,bb_type,description,events,hit_distance_sc,exit_velo,launch_angle,woba_value,woba_denom,xwoba,at_bat_number,pitch_number,inning,inning_topbot,home_score,away_score,post_home_score,post_away_score,on_1b,on_2b,on_3b,outs_when_up,arm_angle,slot,is_strike,is_ball,final_pitch_ab,out_to_end_inning,home_runs,away_runs,runs,re,re_change,re_end_state,re24,lin_weight_above_avg,lin_weight_above_outs,woba_scale,lin_weights_above_avg_scale,lin_weights_above_outs_scale,woba,wraa_change,rv
0,"Smith, Will",L,FF,92.3,2330.0,148.0,-8.28,16.56,25.24377,92.8,-1.4,1.4,6.8,6.5,54.03,0.69,-0.69,2.83,X,1,2,1-2,R,ground_ball,hit_into_play,field_out,13.0,95.2,-13.0,0.0,1.0,0.158,61,4,9,0,5,0,5,0,0,0,0,2,36.0,1,1,0,1,1,0,0,0,0.11,-0.11,0.0,-0.11,-0.271,0.0,1.209,-0.328,0.0,0.222,-0.152,-0.11
1,"Smith, Will",L,SL,80.6,2254.0,315.0,9.24,5.76,27.965261,81.2,-1.6,1.6,6.64,6.4,54.15,0.71,-0.71,2.62,S,1,1,1-1,R,,strike,,108.0,75.3,75.0,,,,61,3,9,0,5,0,5,0,0,0,0,2,45.8,2,1,0,0,0,0,0,0,0.11,0.0,0.11,0.0,,,1.209,0.0,0.0,0.3,-0.053,0.0
2,"Smith, Will",L,CU,75.5,1940.0,328.0,7.8,-6.12,25.695364,75.2,-1.46,1.46,6.88,6.2,54.34,0.04,-0.04,2.46,S,1,0,1-0,R,,strike,,157.0,83.5,65.0,,,,61,2,9,0,5,0,5,0,0,0,0,2,38.4,1,1,0,0,0,0,0,0,0.11,0.0,0.11,0.0,,,1.209,0.0,0.0,0.356,-0.038,0.0
3,"Smith, Will",L,CU,75.0,2017.0,330.0,8.28,-8.28,26.893333,74.5,-1.53,1.53,6.83,5.9,54.61,-2.1,2.1,3.89,B,0,0,0-0,R,,ball,,,,,,,,61,1,9,0,5,0,5,0,0,0,0,2,38.4,1,0,1,0,0,0,0,0,0.11,0.0,0.11,0.0,,,1.209,0.0,0.0,0.314,0.028,0.0
4,"Smith, Will",L,FF,91.2,2281.0,143.0,-7.56,15.36,25.010965,90.9,-1.49,1.49,6.66,6.3,54.15,0.31,-0.31,2.8,X,1,0,1-0,L,ground_ball,hit_into_play,field_out,9.0,93.3,-18.0,0.0,1.0,0.09,60,2,9,0,5,0,5,0,0,0,0,1,36.0,1,1,0,1,0,0,0,0,0.29,-0.18,0.11,-0.18,-0.271,0.0,1.209,-0.328,0.0,0.356,-0.038,-0.18


Pitch Types:

4-Seam, Cutter, Sinker, Slider, Curveball, Changeup

# Features

In [3]:
features = data[['pitch_type', 'p_throws', 'velo', 'spin_rate', 'pfx_-x', 'pfx_z', 'release_extension', 
                 'release_pos_x', 'release_pos_z', 'rv']]

In [4]:
ff = features.loc[features['pitch_type'] == 'FF']
print('4-Seam shape:', ff.shape)
ff_r = features.loc[(features['pitch_type'] == 'FF') & (features['p_throws'] == 'R')]
print('RHP 4-Seam shape:', ff_r.shape)
ff_l = features.loc[(features['pitch_type'] == 'FF') & (features['p_throws'] == 'L')]
print('LHP 4-Seam shape:', ff_l.shape, '\n')
fc = features.loc[features['pitch_type'] == 'FC']
print('Cutter shape:', fc.shape)
fc_r = features.loc[(features['pitch_type'] == 'FC') & (features['p_throws'] == 'R')]
print('RHP Cutter shape:', fc_r.shape)
fc_l = features.loc[(features['pitch_type'] == 'FC') & (features['p_throws'] == 'L')]
print('LHP Cutter shape:', fc_l.shape, '\n')
si = features.loc[features['pitch_type'] == 'SI']
print('Sinker shape:', si.shape)
si_r = features.loc[(features['pitch_type'] == 'SI') & (features['p_throws'] == 'R')]
print('RHP Sinker shape:', si_r.shape)
si_l = features.loc[(features['pitch_type'] == 'SI') & (features['p_throws'] == 'L')]
print('LHP Sinker shape:', si_l.shape, '\n')
sl = features.loc[features['pitch_type'] == 'SL']
print('Slider shape:', sl.shape)
sl_r = features.loc[(features['pitch_type'] == 'SL') & (features['p_throws'] == 'R')]
print('RHP Slider shape:', sl_r.shape)
sl_l = features.loc[(features['pitch_type'] == 'SL') & (features['p_throws'] == 'L')]
print('LHP Slider shape:', sl_l.shape, '\n')
cu = features.loc[features['pitch_type'] == 'CU']
print('Curveball shape:', cu.shape)
cu_r = features.loc[(features['pitch_type'] == 'CU') & (features['p_throws'] == 'R')]
print('RHP Curveball shape:', cu_r.shape)
cu_l = features.loc[(features['pitch_type'] == 'CU') & (features['p_throws'] == 'L')]
print('LHP Curveball shape:', cu_l.shape, '\n')
ch = features.loc[features['pitch_type'] == 'CH']
print('Changeup shape:', ch.shape)
ch_r = features.loc[(features['pitch_type'] == 'CH') & (features['p_throws'] == 'R')]
print('RHP Changeup shape:', ch_r.shape)
ch_l = features.loc[(features['pitch_type'] == 'CH') & (features['p_throws'] == 'L')]
print('LHP Changeup shape:', ch_l.shape)

4-Seam shape: (249682, 10)
RHP 4-Seam shape: (176387, 10)
LHP 4-Seam shape: (73295, 10) 

Cutter shape: (47445, 10)
RHP Cutter shape: (31183, 10)
LHP Cutter shape: (16262, 10) 

Sinker shape: (109151, 10)
RHP Sinker shape: (75681, 10)
LHP Sinker shape: (33470, 10) 

Slider shape: (135534, 10)
RHP Slider shape: (100754, 10)
LHP Slider shape: (34780, 10) 

Curveball shape: (58770, 10)
RHP Curveball shape: (39992, 10)
LHP Curveball shape: (18778, 10) 

Changeup shape: (80338, 10)
RHP Changeup shape: (50661, 10)
LHP Changeup shape: (29677, 10)


# Clustering

## 4-Seam

### RHP

In [5]:
features_ff_r = ff_r.select_dtypes([np.number])
X_ff_r = features_ff_r

ss = StandardScaler()
X_ff_r_scaled = ss.fit_transform(X_ff_r)
X_ff_r_scaled[:,:] *= -1

pca_ff_r = PCA().fit_transform(X_ff_r_scaled)
model_ff_r = pd.DataFrame(data = pca_ff_r, columns = X_ff_r.columns)

km_ff_r = KMeans(n_clusters = 2, 
               random_state = 1)
km_ff_r.fit(model_ff_r)
label_ff_r = km_ff_r.fit_predict(model_ff_r)

print('Number of iterations:', km_ff_r.n_iter_)
print('Number of features:', km_ff_r.n_features_in_)
print('Number of clusters:', km_ff_r.n_clusters)
print('Inertia:', km_ff_r.inertia_, '\n')
print('Predicted clusters to points: ', label_ff_r[:10])

Number of iterations: 11
Number of features: 8
Number of clusters: 2
Inertia: 1213894.3865452572 

Predicted clusters to points:  [0 0 0 1 1 1 1 1 1 1]


### LHP

In [6]:
features_ff_l = ff_l.select_dtypes([np.number])
X_ff_l = features_ff_l

ss = StandardScaler()
X_ff_l_scaled = ss.fit_transform(X_ff_l)
X_ff_l_scaled[:,:] *= -1

pca_ff_l = PCA().fit_transform(X_ff_l_scaled)
model_ff_l = pd.DataFrame(data = pca_ff_l, columns = X_ff_l.columns)

km_ff_l = KMeans(n_clusters = 2, 
               random_state = 1)
km_ff_l.fit(model_ff_l)
label_ff_l = km_ff_l.fit_predict(model_ff_l)

print('Number of iterations:', km_ff_l.n_iter_)
print('Number of features:', km_ff_l.n_features_in_)
print('Number of clusters:', km_ff_l.n_clusters)
print('Inertia:', km_ff_l.inertia_, '\n')
print('Predicted clusters to points: ', label_ff_l[:10])

Number of iterations: 13
Number of features: 8
Number of clusters: 2
Inertia: 501451.47215650795 

Predicted clusters to points:  [1 1 1 1 1 1 1 1 1 1]


### Cluster Labels - RHP 4-Seam

In [7]:
X_ff_r['label'] = label_ff_r
model_ff_r['label'] = label_ff_r

print(X_ff_r.groupby(by = 'label').mean().T)

label                        0            1
velo                 93.884496    94.231525
spin_rate          2221.410641  2331.288823
pfx_-x                8.920628     5.759854
pfx_z                14.051447    17.635370
release_extension     6.401837     6.394756
release_pos_x        -2.299302    -1.378883
release_pos_z         5.617284     6.095369
rv                    0.009575     0.002635


### Cluster Labels - LHP 4-Seam

In [8]:
X_ff_l['label'] = label_ff_l
model_ff_l['label'] = label_ff_l

print(X_ff_l.groupby(by = 'label').mean().T)

label                        0            1
velo                 92.507488    92.996651
spin_rate          2176.299910  2302.765937
pfx_-x               -9.293286    -6.190213
pfx_z                14.341496    17.518691
release_extension     6.394253     6.221770
release_pos_x         2.341104     1.564646
release_pos_z         5.705273     6.194861
rv                    0.007559     0.003181


## Cutter

### RHP

In [9]:
features_fc_r = fc_r.select_dtypes([np.number])
X_fc_r = features_fc_r

X_fc_r_scaled = ss.fit_transform(X_fc_r)
X_fc_r_scaled[:,:] *= -1

pca_fc_r = PCA().fit_transform(X_fc_r_scaled)
model_fc_r = pd.DataFrame(data = pca_fc_r, columns = X_fc_r.columns)

km_fc_r = KMeans(n_clusters = 2, 
               random_state = 1)

km_fc_r.fit(model_fc_r)
label_fc_r = km_fc_r.fit_predict(model_fc_r)

print('Number of iterations:', km_fc_r.n_iter_)
print('Number of features:', km_fc_r.n_features_in_)
print('Number of clusters:', km_fc_r.n_clusters)
print('Inertia:', km_fc_r.inertia_, '\n')
print("Predicted clusters to points: ", label_fc_r[:10])

Number of iterations: 17
Number of features: 8
Number of clusters: 2
Inertia: 213795.27690716175 

Predicted clusters to points:  [1 1 0 1 0 0 0 0 0 0]


### LHP

In [10]:
features_fc_l = fc_l.select_dtypes([np.number])
X_fc_l = features_fc_l

X_fc_l_scaled = ss.fit_transform(X_fc_l)
X_fc_l_scaled[:,:] *= -1

pca_fc_l = PCA().fit_transform(X_fc_l_scaled)
model_fc_l = pd.DataFrame(data = pca_fc_l, columns = X_fc_l.columns)

km_fc_l = KMeans(n_clusters = 2, 
               random_state = 1)

km_fc_l.fit(model_fc_l)
label_fc_l = km_fc_r.fit_predict(model_fc_l)

print('Number of iterations:', km_fc_l.n_iter_)
print('Number of features:', km_fc_l.n_features_in_)
print('Number of clusters:', km_fc_l.n_clusters)
print('Inertia:', km_fc_l.inertia_, '\n')
print("Predicted clusters to points: ", label_fc_l[:10])

Number of iterations: 21
Number of features: 8
Number of clusters: 2
Inertia: 107914.88531818759 

Predicted clusters to points:  [1 1 1 1 1 1 1 1 1 1]


### Cluster Labels - RHP Cutter

In [11]:
X_fc_r['label'] = label_fc_r
model_fc_r['label'] = label_fc_r

print(X_fc_r.groupby(by = 'label').mean().T)

label                        0            1
velo                 87.697869    91.242683
spin_rate          2456.643686  2387.689284
pfx_-x               -4.753178    -1.179551
pfx_z                 5.081799    10.810929
release_extension     6.215025     6.384406
release_pos_x        -1.612614    -1.900926
release_pos_z         6.012866     5.866886
rv                    0.004374     0.002366


### Cluster Labels - LHP Cutter

In [12]:
X_fc_l['label'] = label_fc_l
model_fc_l['label'] = label_fc_l

print(X_fc_l.groupby(by = 'label').mean().T)

label                        0            1
velo                 86.135976    87.144008
spin_rate          2162.481853  2333.178257
pfx_-x                0.840611     2.238806
pfx_z                 7.958930     7.933768
release_extension     6.453550     6.019890
release_pos_x         2.785064     1.696875
release_pos_z         5.502318     6.031029
rv                    0.008360     0.001597


## Sinker

### RHP

In [13]:
features_si_r = si_r.select_dtypes([np.number])
X_si_r = features_si_r

X_si_r_scaled = ss.fit_transform(X_si_r)
X_si_r_scaled[:,:] *= -1

pca_si_r = PCA().fit_transform(X_si_r_scaled)
model_si_r = pd.DataFrame(data = pca_si_r, columns = X_si_r.columns)

km_si_r = KMeans(n_clusters = 2, 
                 random_state = 1)

km_si_r.fit(model_si_r)
label_si_r = km_si_r.fit_predict(model_si_r)

print('Number of iterations:', km_si_r.n_iter_)
print('Number of features:', km_si_r.n_features_in_)
print('Number of clusters:', km_si_r.n_clusters)
print('Inertia:', km_si_r.inertia_, '\n')
print("Predicted clusters to points: ", label_si_r[:10])

Number of iterations: 11
Number of features: 8
Number of clusters: 2
Inertia: 519839.2521872819 

Predicted clusters to points:  [0 1 0 0 0 1 1 0 0 1]


### LHP

In [14]:
features_si_l = si_l.select_dtypes([np.number])
X_si_l = features_si_l

X_si_l_scaled = ss.fit_transform(X_si_l)
X_si_l_scaled[:,:] *= -1

pca_si_l = PCA().fit_transform(X_si_l_scaled)
model_si_l = pd.DataFrame(data = pca_si_l, columns = X_si_l.columns)

km_si_l = KMeans(n_clusters = 2, 
                 random_state = 1)

km_si_l.fit(model_si_l)
label_si_l = km_si_l.fit_predict(model_si_l)

print('Number of iterations:', km_si_l.n_iter_)
print('Number of features:', km_si_l.n_features_in_)
print('Number of clusters:', km_si_l.n_clusters)
print('Inertia:', km_si_l.inertia_, '\n')
print("Predicted clusters to points: ", label_si_l[:10])

Number of iterations: 6
Number of features: 8
Number of clusters: 2
Inertia: 225389.5500483192 

Predicted clusters to points:  [0 0 1 1 1 1 1 1 1 1]


### Cluster Labels - RHP Sinker

In [15]:
X_si_r['label'] = label_si_r
model_si_r['label'] = label_si_r

print(X_si_r.groupby(by = 'label').mean().T)

label                        0            1
velo                 94.370120    91.839217
spin_rate          2194.666166  2062.438001
pfx_-x               14.001226    16.015725
pfx_z                11.100466     6.089766
release_extension     6.330377     6.292280
release_pos_x        -1.588890    -2.355398
release_pos_z         5.902023     5.395513
rv                    0.004125     0.016031


### Cluster Labels - LHP Sinker

In [16]:
X_si_l['label'] = label_si_l
model_si_l['label'] = label_si_l

print(X_si_l.groupby(by = 'label').mean().T)

label                        0            1
velo                 91.320315    92.506468
spin_rate          2011.789898  2108.916866
pfx_-x              -16.344466   -14.330524
pfx_z                 6.220937    10.864621
release_extension     6.423682     6.221591
release_pos_x         2.869173     1.755551
release_pos_z         5.313150     6.010566
rv                    0.003425     0.009921


## Slider

### RHP

In [17]:
features_sl_r = sl_r.select_dtypes([np.number])
X_sl_r = features_sl_r

X_sl_r_scaled = ss.fit_transform(X_sl_r)
X_sl_r_scaled[:,:] *= -1

pca_sl_r = PCA().fit_transform(X_sl_r_scaled)
model_sl_r = pd.DataFrame(data = pca_sl_r, columns = X_sl_r.columns)

km_sl_r = KMeans(n_clusters = 2, random_state = 1)

km_sl_r.fit(model_sl_r)
label_sl_r = km_sl_r.fit_predict(model_sl_r)

print('Number of iterations:', km_sl_r.n_iter_)
print('Number of features:', km_sl_r.n_features_in_)
print('Number of clusters:', km_sl_r.n_clusters)
print('Inertia:', km_sl_r.inertia_, '\n')
print("Predicted clusters to points: ", label_sl_r[:10])

Number of iterations: 17
Number of features: 8
Number of clusters: 2
Inertia: 672045.571424603 

Predicted clusters to points:  [0 0 0 0 1 0 0 0 1 1]


### LHP

In [18]:
features_sl_l = sl_l.select_dtypes([np.number])
X_sl_l = features_sl_l

X_sl_l_scaled = ss.fit_transform(X_sl_l)
X_sl_l_scaled[:,:] *= -1

pca_sl_l = PCA().fit_transform(X_sl_l_scaled)
model_sl_l = pd.DataFrame(data = pca_sl_l, columns = X_sl_l.columns)

km_sl_l = KMeans(n_clusters = 2, random_state = 1)

km_sl_l.fit(model_sl_l)
label_sl_l = km_sl_l.fit_predict(model_sl_l)

print('Number of iterations:', km_sl_l.n_iter_)
print('Number of features:', km_sl_l.n_features_in_)
print('Number of clusters:', km_sl_l.n_clusters)
print('Inertia:', km_sl_l.inertia_, '\n')
print("Predicted clusters to points: ", label_sl_l[:10])

Number of iterations: 12
Number of features: 8
Number of clusters: 2
Inertia: 229764.19415032002 

Predicted clusters to points:  [0 0 0 1 1 1 1 1 1 1]


### Cluster Labels - RHP Slider

In [19]:
X_sl_r['label'] = label_sl_r
model_sl_r['label'] = label_sl_r

print(X_sl_r.groupby(by = 'label').mean().T)

label                        0            1
velo                 86.241145    81.962939
spin_rate          2375.703255  2575.429667
pfx_-x               -3.602967   -12.288642
pfx_z                 2.397120     0.461306
release_extension     6.232297     6.326041
release_pos_x        -1.771397    -2.316838
release_pos_z         5.943565     5.380770
rv                   -0.005570    -0.011784


### Cluster Labels - LHP Slider

In [20]:
X_sl_l['label'] = label_sl_l
model_sl_l['label'] = label_sl_l

print(X_sl_l.groupby(by = 'label').mean().T)

label                        0            1
velo                 85.353751    82.066202
spin_rate          2261.490288  2482.323906
pfx_-x                3.273650     9.014443
pfx_z                 3.532761    -1.268162
release_extension     6.349774     6.116351
release_pos_x         1.887685     2.135162
release_pos_z         6.056659     5.641745
rv                   -0.009501    -0.006512


## Curveball

### RHP

In [21]:
features_cu_r = cu_r.select_dtypes([np.number])
X_cu_r = features_cu_r

X_cu_r_scaled = ss.fit_transform(X_cu_r)
X_cu_r_scaled[:,:] *= -1

pca_cu_r = PCA().fit_transform(X_cu_r_scaled)
model_cu_r = pd.DataFrame(data = pca_cu_r, columns = X_cu_r.columns)

km_cu_r = KMeans(n_clusters = 2, random_state = 1)

km_cu_r.fit(model_cu_r)
label_cu_r = km_cu_r.fit_predict(model_cu_r)

print('Number of iterations:', km_cu_r.n_iter_)
print('Number of features:', km_cu_r.n_features_in_)
print('Number of clusters:', km_cu_r.n_clusters)
print('Inertia:', km_cu_r.inertia_, '\n')
print("Predicted clusters to points: ", label_cu_r[:10])

Number of iterations: 11
Number of features: 8
Number of clusters: 2
Inertia: 273737.521374984 

Predicted clusters to points:  [1 1 1 0 0 1 1 1 1 1]


### LHP

In [22]:
features_cu_l = cu_l.select_dtypes([np.number])
X_cu_l = features_cu_l

X_cu_l_scaled = ss.fit_transform(X_cu_l)
X_cu_l_scaled[:,:] *= -1

pca_cu_l = PCA().fit_transform(X_cu_l_scaled)
model_cu_l = pd.DataFrame(data = pca_cu_l, columns = X_cu_l.columns)

km_cu_l = KMeans(n_clusters = 2, random_state = 1)

km_cu_l.fit(model_cu_l)
label_cu_l = km_cu_l.fit_predict(model_cu_l)

print('Number of iterations:', km_cu_l.n_iter_)
print('Number of features:', km_cu_l.n_features_in_)
print('Number of clusters:', km_cu_l.n_clusters)
print('Inertia:', km_cu_l.inertia_, '\n')
print("Predicted clusters to points: ", label_cu_l[:10])

Number of iterations: 13
Number of features: 8
Number of clusters: 2
Inertia: 123054.08084347605 

Predicted clusters to points:  [0 1 0 0 0 0 0 0 0 0]


### Cluster Labels - RHP Curveball

In [23]:
X_cu_r['label'] = label_cu_r
model_cu_r['label'] = label_cu_r

print(X_cu_r.groupby(by = 'label').mean().T)

label                        0            1
velo                 78.726190    80.167136
spin_rate          2504.591520  2603.805185
pfx_-x               -8.188820   -12.469659
pfx_z               -11.470115    -5.109886
release_extension     6.251365     6.236496
release_pos_x        -1.389616    -2.176828
release_pos_z         6.142124     5.621763
rv                   -0.003701    -0.014405


### Cluster Labels - LHP Curveball

In [24]:
X_cu_l['label'] = label_cu_l
model_cu_l['label'] = label_cu_l

print(X_cu_l.groupby(by = 'label').mean().T)

label                        0            1
velo                 78.646391    77.018745
spin_rate          2237.941756  2600.724237
pfx_-x                5.419945    10.857112
pfx_z                -2.831522   -11.268908
release_extension     6.184965     6.080088
release_pos_x         2.453813     1.482822
release_pos_z         5.893891     6.079133
rv                   -0.003242    -0.012251


## Changeup

### RHP

In [25]:
features_ch_r = ch_r.select_dtypes([np.number])
X_ch_r = features_ch_r

X_ch_r_scaled = ss.fit_transform(X_ch_r)
X_ch_r_scaled[:,:] *= -1

pca_ch_r = PCA().fit_transform(X_ch_r)
model_ch_r = pd.DataFrame(data = pca_ch_r, columns = X_ch_r.columns)

km_ch_r = KMeans(n_clusters = 2, random_state = 1)

km_ch_r.fit(model_ch_r)
label_ch_r = km_ch_r.fit_predict(model_ch_r)

print('Number of iterations:', km_ch_r.n_iter_)
print('Number of features:', km_ch_r.n_features_in_)
print('Number of clusters:', km_ch_r.n_clusters)
print('Inertia:', km_ch_r.inertia_, '\n')
print("Predicted clusters to points: ", label_ch_r[:10])

Number of iterations: 9
Number of features: 8
Number of clusters: 2
Inertia: 1769855897.73446 

Predicted clusters to points:  [1 1 1 1 1 0 1 0 0 0]


### LHP

In [26]:
features_ch_l = ch_l.select_dtypes([np.number])
X_ch_l = features_ch_l

X_ch_l_scaled = ss.fit_transform(X_ch_l)
X_ch_l_scaled[:,:] *= -1

pca_ch_l = PCA().fit_transform(X_ch_l)
model_ch_l = pd.DataFrame(data = pca_ch_l, columns = X_ch_l.columns)

km_ch_l = KMeans(n_clusters = 2, random_state = 1)

km_ch_l.fit(model_ch_l)
label_ch_l = km_ch_l.fit_predict(model_ch_l)

print('Number of iterations:', km_ch_l.n_iter_)
print('Number of features:', km_ch_l.n_features_in_)
print('Number of clusters:', km_ch_l.n_clusters)
print('Inertia:', km_ch_l.inertia_, '\n')
print("Predicted clusters to points: ", label_ch_l[:10])

Number of iterations: 12
Number of features: 8
Number of clusters: 2
Inertia: 900210333.1016059 

Predicted clusters to points:  [1 1 1 1 1 1 1 1 1 1]


### Cluster Labels - RHP Changeup

In [27]:
X_ch_r['label'] = label_ch_r
model_ch_r['label'] = label_ch_r

print(X_ch_r.groupby(by = 'label').mean().T)

label                        0            1
velo                 85.956140    85.248104
spin_rate          2051.267830  1564.045441
pfx_-x               15.162111    12.986649
pfx_z                 6.325095     6.256120
release_extension     6.248504     6.404805
release_pos_x        -2.030577    -1.881358
release_pos_z         5.692579     5.821656
rv                   -0.004496    -0.002545


### Cluster Labels - LHP Changeup

In [28]:
X_ch_l['label'] = label_ch_l
model_ch_l['label'] = label_ch_l

print(X_ch_l.groupby(by = 'label').mean().T)

label                        0            1
velo                 83.838162    83.424920
spin_rate          2049.361838  1558.404507
pfx_-x              -15.570225   -12.955998
pfx_z                 8.828259     7.150670
release_extension     6.185523     6.279761
release_pos_x         1.966345     2.082765
release_pos_z         5.795536     5.867756
rv                   -0.002954    -0.002318
