In [1]:
import umap
import umap.plot
from bokeh.plotting import show, output_notebook
from sklearn import preprocessing
import hdbscan

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.types import *
import pyspark.sql.functions as F

In [4]:
sc = SparkSession.builder.getOrCreate()

# Data preparation using pyspark

The raw data we are going to use is MLB's pitch-by-pitch data scraped from the MLB website <a href="https://baseballsavant.mlb.com/">Savant</a>. The csv data in this repository includes all 3 million pitches from 2014 to 2019. The csv documentation could be found here: https://baseballsavant.mlb.com/csv-docs

In [5]:
baseball = sc.read.csv('./baseball_savant.csv', inferSchema = True, header = True)

In [6]:
baseball = baseball.withColumn('pitcher_team',F.when(baseball.inning_topbot == 'Bot', baseball.away_team).otherwise(baseball.home_team))
baseball = baseball.withColumn('batter_team',F.when(baseball.inning_topbot == 'Top', baseball.away_team).otherwise(baseball.home_team))
baseball = baseball.withColumn('game_date',baseball.game_date.cast(DateType()))
baseball = baseball.withColumn('game_year',F.year(baseball.game_date))

window = Window.partitionBy('pitcher','game_year')
baseball = baseball.withColumn('season_total_pitches',F.count('*').over(window))

In [7]:
baseball = baseball.filter(baseball.pitch_type.isNotNull())

In [8]:
baseball.groupby('pitch_type').count().show()

+----------+-------+
|pitch_type|  count|
+----------+-------+
|        FT| 401239|
|        SC|    113|
|        SL| 583237|
|        FC| 202561|
|        EP|    867|
|        FF|1296645|
|        FS|  54809|
|        PO|    630|
|        KC|  89087|
|        IN|   6390|
|        CH| 378308|
|        CU| 299183|
|        FO|    845|
|        UN|     20|
|        KN|  11453|
|        FA|     10|
|        SI| 304441|
+----------+-------+



In [9]:
# keeping pitches that are more common
valid_pitch_type = ['CH','CU','FS','KC','SL','SI','FF','FC','FT']
baseball = baseball.filter(baseball.pitch_type.isin(valid_pitch_type))

# Create arsenal data on which we wish to run UMAP on

Here we are interested in pitchers that pitched in the 2019 season with more than 200 pitches. The spark code produces the average release speed, horizontal movement, vertical movement of each pitch type for each pitcher.

In [10]:
arsenal = baseball.select('pitch_type','game_year','player_name','pitcher','p_throws','pitcher_team','season_total_pitches','pitcher','pfx_x','pfx_z','release_speed')

In [11]:
arsenal = arsenal.filter('game_year = 2019 and season_total_pitches > 200').groupBy('pitcher','pitch_type','pitcher_team').\
agg(F.first('player_name').alias('player_name'),
    F.first('p_throws').alias('p_throws'),
    F.avg('release_speed').alias('avg_speed'), 
    F.avg('pfx_z').alias('avg_z'),
    F.avg('pfx_x').alias('avg_x'),
    F.count('*').alias('count'))

In [12]:
window = Window.partitionBy('pitcher','pitcher_team')
arsenal = arsenal.withColumn('proportion', F.col('count')/F.sum('count').over(window))

In [13]:
# convert from a spark dataframe to a Pandas dataframe
arsenal = arsenal.toPandas()

In [14]:
# data for Masahiro Tanaka
arsenal[arsenal.player_name == 'Masahiro Tanaka']

Unnamed: 0,pitcher,pitch_type,pitcher_team,player_name,p_throws,avg_speed,avg_z,avg_x,count,proportion
2936,547888,SI,NYY,Masahiro Tanaka,R,90.335294,0.725872,-1.235809,136,0.045546
2937,547888,SL,NYY,Masahiro Tanaka,R,83.248861,0.108844,0.548934,1097,0.367381
2938,547888,FC,NYY,Masahiro Tanaka,R,87.46087,0.672999,-0.14172,46,0.015405
2939,547888,FS,NYY,Masahiro Tanaka,R,86.724876,0.428213,-1.064891,808,0.270596
2940,547888,CU,NYY,Masahiro Tanaka,R,76.069231,-0.750867,0.77097,91,0.030476
2941,547888,FF,NYY,Masahiro Tanaka,R,91.439851,1.334656,-0.840529,808,0.270596


In [15]:
# sometimes the MLB classify pitches incorrectly, we want to get rid of those records
arsenal2 = arsenal[arsenal.proportion > 0.01]

In [16]:
# transform the data so that each row now represents a pitcher
df = arsenal2.pivot_table(index = ['pitcher','player_name','p_throws','pitcher_team'], columns = 'pitch_type', values = ['avg_x','avg_z','avg_speed','proportion'])
df.columns = [x[0] + '_' + x[1] for x in df.columns]
df.reset_index(inplace = True)
df.fillna(0,inplace = True)

In [17]:
pd.set_option('display.max_columns', None)

In [18]:
df.sample(5)

Unnamed: 0,pitcher,player_name,p_throws,pitcher_team,avg_speed_CH,avg_speed_CU,avg_speed_FC,avg_speed_FF,avg_speed_FS,avg_speed_FT,avg_speed_KC,avg_speed_SI,avg_speed_SL,avg_x_CH,avg_x_CU,avg_x_FC,avg_x_FF,avg_x_FS,avg_x_FT,avg_x_KC,avg_x_SI,avg_x_SL,avg_z_CH,avg_z_CU,avg_z_FC,avg_z_FF,avg_z_FS,avg_z_FT,avg_z_KC,avg_z_SI,avg_z_SL,proportion_CH,proportion_CU,proportion_FC,proportion_FF,proportion_FS,proportion_FT,proportion_KC,proportion_SI,proportion_SL
388,605143,Ray Black,R,SF,0.0,0.0,0.0,99.097297,0.0,0.0,0.0,0.0,86.606667,0.0,0.0,0.0,-0.510775,0.0,0.0,0.0,0.0,0.96016,0.0,0.0,0.0,1.188236,0.0,0.0,0.0,0.0,-0.151752,0.0,0.0,0.0,0.711538,0.0,0.0,0.0,0.0,0.288462
581,641745,Brad Keller,R,KC,87.765789,0.0,0.0,93.665578,0.0,92.830334,0.0,0.0,84.944653,-0.568381,0.0,0.0,-0.002408,0.0,-0.89922,0.0,0.0,0.59946,0.584045,0.0,0.0,1.184532,0.0,0.807858,0.0,0.0,-0.406648,0.014017,0.0,0.0,0.417927,0.0,0.25415,0.0,0.0,0.313906
472,608032,Carlos Estevez,R,COL,90.481818,0.0,0.0,97.845042,0.0,97.418182,0.0,0.0,87.659644,-0.958993,0.0,0.0,-0.503077,0.0,-0.89306,0.0,0.0,0.487284,0.561836,0.0,0.0,1.229445,0.0,0.982382,0.0,0.0,0.007723,0.035484,0.0,0.0,0.675,0.0,0.017742,0.0,0.0,0.271774
509,621242,Edwin Diaz,R,NYM,0.0,0.0,0.0,97.435836,0.0,0.0,0.0,0.0,89.452342,0.0,0.0,0.0,-1.036372,0.0,0.0,0.0,0.0,0.110859,0.0,0.0,0.0,1.179203,0.0,0.0,0.0,0.0,0.462093,0.0,0.0,0.0,0.658582,0.0,0.0,0.0,0.0,0.338619
282,579328,Yusei Kikuchi,L,SEA,84.486,75.001478,0.0,92.454779,0.0,0.0,0.0,0.0,85.961957,0.955016,-0.573644,0.0,0.477225,0.0,0.0,0.0,0.0,-0.289482,0.746651,-1.18664,0.0,1.305153,0.0,0.0,0.0,0.0,0.26039,0.076075,0.154431,0.0,0.48954,0.0,0.0,0.0,0.0,0.279954


In [19]:
df_umap = df.drop(['pitcher','player_name','p_throws','pitcher_team'],axis = 1)

In [20]:
# Scale each column by its maximum absolute value
cols_to_scale = [col for col in df if col.startswith('avg')]
preprocessor = preprocessing.MaxAbsScaler().fit(df_umap[cols_to_scale])
df_umap[cols_to_scale] = preprocessor.transform(df_umap[cols_to_scale])

In [21]:
# data that are fed into the UMAP algorithm.
df_umap.iloc[190]

avg_speed_CH     0.000000
avg_speed_CU     0.928290
avg_speed_FC     0.939670
avg_speed_FF     0.975593
avg_speed_FS     0.953298
avg_speed_FT     0.000000
avg_speed_KC     0.000000
avg_speed_SI     0.000000
avg_speed_SL     0.918602
avg_x_CH         0.000000
avg_x_CU         0.378997
avg_x_FC         0.061649
avg_x_FF        -0.591736
avg_x_FS        -0.489777
avg_x_FT         0.000000
avg_x_KC         0.000000
avg_x_SI         0.000000
avg_x_SL         0.321928
avg_z_CH         0.000000
avg_z_CU        -0.457214
avg_z_FC         0.558999
avg_z_FF         0.633422
avg_z_FS         0.102651
avg_z_FT         0.000000
avg_z_KC         0.000000
avg_z_SI         0.000000
avg_z_SL         0.035081
proportion_CH    0.000000
proportion_CU    0.174842
proportion_FC    0.225475
proportion_FF    0.432753
proportion_FS    0.135285
proportion_FT    0.000000
proportion_KC    0.000000
proportion_SI    0.000000
proportion_SL    0.031646
Name: 190, dtype: float64

# Apply UMAP on pitcher arsenal

Here we apply the UMAP algorithm and project the data onto a 2 dimensional space. There are two hyper-parameters: n_neighbors, the number of nearest neighbors when constructing the graph in the original space, and min_dist, the minimum distance between points in the low-dimensional embedding space. Increasing n_neighbors would let us preserve more the global structure of the data, while decreasing n_neighbors let us focus on the local structure of the data. The second hyper-parameter min_dist is more of a aesthetic hyper-paramter.

In [22]:
# N_components is the dimension that the data is projected onto
N_COMPONENTS = 2
METRIC = 'cosine'
transformer = umap.UMAP(n_components = N_COMPONENTS, random_state = 0,n_neighbors=30,min_dist=0.0, metric = METRIC).fit(df_umap)
embedding = transformer.transform(df_umap)

# Apply HDBSCAN on the UMAP embeddings

After learning a low-dimensional representation for the pitcher data, we could apply a clustering algorithm on the embeddings to detect pitcher clusters. In particular, I chose HDBSCAN. HDBSCAN is a hierarchical density-based clustering algorithm. Unlike K-Means, HDBSCAN works well with clusters having varying densities and shapes. The main hyper-parameter we would control is the minimum cluster size, which I set to be 15.

In [23]:
# Fit HDBSCAN with minimum cluster size 15
clusterer = hdbscan.HDBSCAN(min_cluster_size=15)
clusterer.fit(embedding)

HDBSCAN(algorithm='best', allow_single_cluster=False, alpha=1.0,
        approx_min_span_tree=True, cluster_selection_epsilon=0.0,
        cluster_selection_method='eom', core_dist_n_jobs=4,
        gen_min_span_tree=False, leaf_size=40,
        match_reference_implementation=False, memory=Memory(location=None),
        metric='euclidean', min_cluster_size=15, min_samples=None, p=None,
        prediction_data=False)

In [24]:
# number of clusters that are detected by umap
clusterer.labels_.max()+1

17

In [25]:
embedding_dat = pd.DataFrame(np.column_stack([embedding,clusterer.labels_,df[['player_name','p_throws','pitcher_team']]]),
                             columns = ['x{}'.format(i) for i in range(1,N_COMPONENTS+1)]+['cluster','player_name','p_throws','team'])\
                            .sort_values('cluster')

In [26]:
embedding_dat.sample(10)

Unnamed: 0,x1,x2,cluster,player_name,p_throws,team
30,9.25185,8.94594,2,Pat Neshek,R,PHI
657,4.51528,3.0731,-1,Jacob Webb,R,ATL
625,1.83419,5.32443,16,Darwinzon Hernandez,L,BOS
606,-1.84067,13.3157,7,Jose Rodriguez,R,LAA
673,1.36,0.142716,3,Jose Suarez,L,LAA
99,0.964296,10.0601,11,Adam Ottavino,R,NYY
357,9.01959,9.33002,2,Blake Treinen,R,OAK
153,3.17031,3.31642,-1,Neil Ramirez,R,CLE
485,6.2588,3.62341,14,Chris Stratton,R,LAA
168,-7.29119,1.70421,0,Martin Perez,L,MIN


# Visualization of the embeddings

## Points colored by handedness:
we are able to separate the left-handed and right-handed pitchers pretty well

In [27]:
p = umap.plot.interactive(transformer, hover_data=embedding_dat,labels = embedding_dat.p_throws, point_size=3, theme = 'fire')
output_notebook()
show(p)

## Points colored by cluster membership:

In [28]:
# cluster -1 represents points that are considered noise by the HDBSCAN algorithm 
p = umap.plot.interactive(transformer, hover_data=embedding_dat,labels = embedding_dat.cluster, point_size=3, theme = 'fire')
output_notebook()
show(p)

If one is interested in exploring the characteristics of the clusters, one could find the medoid (or points close to the medoid) of each cluster, then look up for those points in the original data (or apply inverse transform on the embeddings).