In [1]:
import numpy as np 
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt
import random
from sklearn.cluster import KMeans

In [2]:
training_data = pd.read_csv("training_data.csv")
training_data = training_data.dropna(subset = ['exit_velocity','distance','hang_time','direction','play_result'])
batters = training_data.groupby("batter_name", as_index=False)
batter_dict = dict(iter(batters))
Seymour_Bobby = batter_dict['Kjerstad, Heston']  
Seymour_Bobby = Seymour_Bobby.reset_index(drop=True)
data = pd.DataFrame(Seymour_Bobby[['distance','direction']])
data = data.reset_index(drop=True)


In [3]:
data_scaled = preprocessing.scale(data)

In [4]:
print(np.array([110,34]))

[110  34]


In [5]:
class K_Means:
    def __init__(self, k,tol=0.0,max_iter = 300):
        self.k = k
        self.tol = tol
        self.max_iter = max_iter
        
    def fit(self,data):
        self.centroids = {}
        for i in range(self.k):
            self.centroids[i] = data[i]
            print("Centroid " + str(i) + ": " + str(self.centroids[i]))
            
        for i in range(self.max_iter):
            print("Iteration: " + str(i))
            self.classifications = {}
            
            for i in range(self.k):
                self.classifications[i] = []
                
            for featureset in data:
                distances = [np.linalg.norm(featureset - self.centroids[centroid]) for centroid in self.centroids]
                classification = distances.index(min(distances))
                self.classifications[classification].append(featureset)
                
            prev_centroids = dict(self.centroids)
            
            for classification in self.classifications:
                self.centroids[classification] = np.average(self.classifications[classification], axis = 0)
            
            optimized = True
            
            for c in self.centroids:
                original_centroid = prev_centroids[c]
                current_centroid = self.centroids[c]
                if np.sum((current_centroid - original_centroid)/ original_centroid * 100.0) > self.tol:
                    optimized = False
                
            if optimized:
                break
                
    def predict(self,data):
        distances = [np.linalg.norm(data - self.centroids[centroid]) for centroid in self.centroids]
        classification = distances.index(min(distances))
        return classification

In [6]:
kmeans = KMeans(n_clusters = 7)
groups = kmeans.fit_predict(data_scaled)
Seymour_Bobby['scikit_cluster'] = groups

In [7]:
clf = K_Means(k=7)

In [8]:
clf.fit(data_scaled)

Centroid 0: [1.64050627 0.1241124 ]
Centroid 1: [-1.00757308 -0.17920737]
Centroid 2: [-1.10035478 -0.86082654]
Centroid 3: [-0.81327301 -1.95910211]
Centroid 4: [0.49975243 1.15416185]
Centroid 5: [-1.05216134 -1.47856381]
Centroid 6: [-1.24809709  1.74270468]
Iteration: 0


In [9]:
len(data)

54

In [10]:
len(Seymour_Bobby)

54

In [11]:
data_scaled[0]

array([1.64050627, 0.1241124 ])

In [12]:
for i, bip in data.iterrows():
    print(i)
    Seymour_Bobby.at[i,'custom_cluster'] = clf.predict(data_scaled[i])

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53


In [13]:
Seymour_Bobby['custom_cluster'].value_counts()

0.0    14
4.0    10
6.0     8
1.0     8
3.0     6
2.0     6
5.0     2
Name: custom_cluster, dtype: int64

In [14]:
cluster_0 = Seymour_Bobby.loc[Seymour_Bobby['custom_cluster'] == 0]
cluster_1 = Seymour_Bobby.loc[Seymour_Bobby['custom_cluster'] == 1]
cluster_2 = Seymour_Bobby.loc[Seymour_Bobby['custom_cluster'] == 2]
cluster_3 = Seymour_Bobby.loc[Seymour_Bobby['custom_cluster'] == 3]
cluster_4 = Seymour_Bobby.loc[Seymour_Bobby['custom_cluster'] == 4]
cluster_5 = Seymour_Bobby.loc[Seymour_Bobby['custom_cluster'] == 5]
cluster_6 = Seymour_Bobby.loc[Seymour_Bobby['custom_cluster'] == 6]


custom_clusters = [0,1,2,3,4,5,6]

In [15]:
mean_distances = []
mean_distances.append(cluster_0['distance'].mean())
mean_distances.append(cluster_1['distance'].mean())
mean_distances.append(cluster_2['distance'].mean())
mean_distances.append(cluster_3['distance'].mean())
mean_distances.append(cluster_4['distance'].mean())
mean_distances.append(cluster_5['distance'].mean())
mean_distances.append(cluster_6['distance'].mean())

In [16]:
mean_directions = []
mean_directions.append(cluster_0['direction'].mean())
mean_directions.append(cluster_1['direction'].mean())
mean_directions.append(cluster_2['direction'].mean())
mean_directions.append(cluster_3['direction'].mean())
mean_directions.append(cluster_4['direction'].mean())
mean_directions.append(cluster_5['direction'].mean())
mean_directions.append(cluster_6['direction'].mean())

In [17]:
custom_cluster_means = pd.DataFrame()
custom_cluster_means['custom_cluster'] = custom_clusters
custom_cluster_means['distance'] = mean_distances
custom_cluster_means['direction'] = mean_directions

In [18]:
custom_cluster_means

Unnamed: 0,custom_cluster,distance,direction
0,0,336.47176,7.340606
1,1,71.3912,2.889623
2,2,66.118825,-8.921323
3,3,120.004458,-31.186638
4,4,188.542687,21.355103
5,5,32.167263,-23.984942
6,6,40.846349,33.983252


In [19]:
custom_cluster_means.to_csv('custom_cluster_means.csv')

In [20]:
cluster_0 = Seymour_Bobby.loc[Seymour_Bobby['scikit_cluster'] == 0]
cluster_1 = Seymour_Bobby.loc[Seymour_Bobby['scikit_cluster'] == 1]
cluster_2 = Seymour_Bobby.loc[Seymour_Bobby['scikit_cluster'] == 2]
cluster_3 = Seymour_Bobby.loc[Seymour_Bobby['scikit_cluster'] == 3]
cluster_4 = Seymour_Bobby.loc[Seymour_Bobby['scikit_cluster'] == 4]
cluster_5 = Seymour_Bobby.loc[Seymour_Bobby['scikit_cluster'] == 5]
cluster_6 = Seymour_Bobby.loc[Seymour_Bobby['scikit_cluster'] == 6]


scikit_clusters = ['cluster_0','cluster_1','cluster_2','cluster_3','cluster_4','cluster_5','cluster_6']

In [21]:
mean_distances = []
mean_distances.append(cluster_0['distance'].mean())
mean_distances.append(cluster_1['distance'].mean())
mean_distances.append(cluster_2['distance'].mean())
mean_distances.append(cluster_3['distance'].mean())
mean_distances.append(cluster_4['distance'].mean())
mean_distances.append(cluster_5['distance'].mean())
mean_distances.append(cluster_6['distance'].mean())

In [22]:
mean_directions = []
mean_directions.append(cluster_0['direction'].mean())
mean_directions.append(cluster_1['direction'].mean())
mean_directions.append(cluster_2['direction'].mean())
mean_directions.append(cluster_3['direction'].mean())
mean_directions.append(cluster_4['direction'].mean())
mean_directions.append(cluster_5['direction'].mean())
mean_directions.append(cluster_6['direction'].mean())


In [23]:
scikit_cluster_means = pd.DataFrame()
scikit_cluster_means['scikit_cluster'] = scikit_clusters
scikit_cluster_means['distance'] = mean_distances
scikit_cluster_means['direction'] = mean_directions

In [24]:
scikit_cluster_means

Unnamed: 0,scikit_cluster,distance,direction
0,cluster_0,64.548974,-30.338242
1,cluster_1,167.327592,21.831521
2,cluster_2,365.648845,-5.148668
3,cluster_3,69.131611,-2.172211
4,cluster_4,316.234616,13.521834
5,cluster_5,198.533714,-26.530129
6,cluster_6,40.846349,33.983252


In [25]:
data.describe()

Unnamed: 0,distance,direction
count,54.0,54.0
mean,160.648262,5.975646
std,123.258112,20.453573
min,8.241255,-33.722235
25%,43.54357,-8.075337
50%,113.362427,5.780205
75%,289.91234,20.273462
max,396.720137,41.288599


In [26]:
scikit_cluster_means.to_csv('cluster_means.csv')

In [27]:
Seymour_Bobby.to_csv('custom_clusters.csv')

In [28]:
Seymour_Bobby.head()

Unnamed: 0.1,Unnamed: 0,batter_name,pitcher_handedness,batter_handedness,inning,outs,strikes,velocity,vertical_release_angle,horizontal_release_angle,...,pfxz,vx0,vy0,vz0,ax,ay,play_result,traditional_cluster,scikit_cluster,custom_cluster
0,404,"Kjerstad, Heston",1,1,2.0,1.0,0.0,87.954382,-2.761464,-1.827707,...,11.83155,3.858904,-127.821833,-6.68629,-6.669145,25.611659,0,0.0,4,0.0
1,421,"Kjerstad, Heston",1,1,5.0,1.0,1.0,80.500285,0.302952,-1.683694,...,5.736682,3.323834,-117.159406,-0.416743,-3.361498,19.803671,1,4.0,3,1.0
2,444,"Kjerstad, Heston",0,0,6.0,0.0,2.0,86.031363,-1.403993,3.245062,...,7.101069,-6.891405,-125.161931,-3.754086,7.84718,24.489969,0,6.0,3,2.0
3,750,"Kjerstad, Heston",1,1,6.0,1.0,0.0,81.071524,-0.827871,-5.232066,...,6.681442,10.196046,-117.34994,-2.748185,-14.35788,23.283572,1,5.0,0,3.0
4,863,"Kjerstad, Heston",0,0,4.0,1.0,1.0,73.18936,1.126109,0.061304,...,-5.587086,-0.385421,-106.412296,0.144903,-5.315252,17.900968,1,9.0,1,4.0


In [29]:
Seymour_Bobby.to_csv('clusters.csv')

In [30]:
data

Unnamed: 0,distance,direction
0,360.972942,8.490573
1,37.612001,2.344312
2,26.282289,-11.467544
3,61.338274,-33.722235
4,221.67378,29.362777
5,32.167263,-23.984942
6,8.241255,41.288599
7,83.510432,-6.793961
8,334.577554,-13.157194
9,35.233218,0.24818
