In [1]:
import numpy as np 
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt
import random
from sklearn.cluster import KMeans

In [2]:
# Load the batted-ball table, keeping only rows where every field used below is present.
training_data = pd.read_csv("training_data.csv").dropna(
    subset=['exit_velocity', 'distance', 'hang_time', 'direction', 'play_result']
)

# One sub-frame per batter, keyed by batter name.
batters = training_data.groupby("batter_name", as_index=False)
batter_dict = {name: frame for name, frame in batters}

# NOTE(review): the variable is called Seymour_Bobby but holds the rows for
# 'Martin, Austin' -- confirm which batter is intended.
# Rows with traditional_cluster == 3 are excluded (original note: "Drop all to 1st").
Seymour_Bobby = (
    batter_dict['Martin, Austin']
    .loc[lambda bip: bip['traditional_cluster'] != 3]
    .reset_index(drop=True)
)

# The two clustering features, on the same 0..n-1 index as Seymour_Bobby.
data = Seymour_Bobby[['distance', 'direction']].reset_index(drop=True)

In [3]:
# Standardize both feature columns with sklearn's preprocessing.scale; later cells
# index the result positionally (data_scaled[i]) alongside the rows of `data`.
data_scaled = preprocessing.scale(data)

In [4]:
# Scratch check: prints the array [110, 34]. The same two values are appended by
# hand to the custom cluster mean lists further down (distance 110, direction 34).
print(np.array([110,34]))

[110  34]


In [5]:
class K_Means:
    """Minimal k-means (Lloyd's algorithm) clusterer for NumPy feature arrays.

    Centroids are seeded from the first ``k`` rows of the training data, so the
    result is deterministic for a given row order.

    Parameters
    ----------
    k : int
        Number of clusters.
    tol : float, default 0.0
        Convergence threshold: the summed absolute percent movement of a
        centroid (relative to its previous position) must not exceed this
        value for the centroid to count as settled.
    max_iter : int, default 300
        Upper bound on the number of assignment/update rounds.

    Attributes set by ``fit``
    -------------------------
    centroids : dict[int, np.ndarray]
        Cluster label -> centroid coordinates.
    classifications : dict[int, list]
        Cluster label -> training rows assigned to it in the last round.
    """

    def __init__(self, k,tol=0.0,max_iter = 300):
        self.k = k
        self.tol = tol
        self.max_iter = max_iter

    @staticmethod
    def _percent_shift(previous, current):
        """Return the summed absolute percent movement from ``previous`` to ``current``.

        Coordinates that did not move contribute 0. A coordinate that moved away
        from exactly 0 has no finite percent change, so ``inf`` is returned,
        which is never within tolerance.
        """
        previous = np.atleast_1d(np.asarray(previous, dtype=float))
        current = np.atleast_1d(np.asarray(current, dtype=float))
        shift = np.abs(current - previous)
        scale = np.abs(previous)
        moved = shift > 0
        if not moved.any():
            return 0.0
        if np.any(moved & (scale == 0)):
            return np.inf
        return float(np.sum(shift[moved] / scale[moved] * 100.0))

    def fit(self,data):
        """Cluster ``data`` (one sample per row) and store the resulting centroids.

        Prints the seed centroids and one line per iteration. Stops as soon as
        every centroid has moved by no more than ``tol`` percent, or after
        ``max_iter`` rounds.
        """
        data = np.asarray(data)

        # Seed with the first k rows (copied so the input array is never aliased).
        # TODO (from the original author): consider placing the first centroid
        # by hand, e.g. around 1st base, instead of taking data[0].
        self.centroids = {}
        for i in range(self.k):
            self.centroids[i] = data[i].copy()
            print("Centroid " + str(i) + ": " + str(self.centroids[i]))

        for iteration in range(self.max_iter):
            print("Iteration: " + str(iteration))
            self.classifications = {label: [] for label in range(self.k)}

            # Assignment step: each row joins its nearest centroid.
            for featureset in data:
                self.classifications[self.predict(featureset)].append(featureset)

            prev_centroids = dict(self.centroids)

            # Update step. An empty cluster has no mean, so it keeps its previous
            # centroid instead of turning into NaN.
            for label, members in self.classifications.items():
                if members:
                    self.centroids[label] = np.average(members, axis = 0)

            # Converged only when *every* centroid is within tolerance. Movement is
            # measured in absolute terms so that negative or cancelling shifts are
            # not mistaken for "no movement".
            optimized = all(
                self._percent_shift(prev_centroids[c], self.centroids[c]) <= self.tol
                for c in self.centroids
            )
            if optimized:
                break

    def predict(self,data):
        """Return the label of the centroid nearest (Euclidean) to one sample.

        Ties resolve to the lowest label. ``fit`` must have been called first.
        """
        point = np.asarray(data)
        distances = [np.linalg.norm(point - self.centroids[centroid]) for centroid in self.centroids]
        classification = distances.index(min(distances))
        return classification

In [6]:
# Reference clustering with scikit-learn on the same scaled features.
# random_state pins the centroid seeding so the labels (and every table derived
# from them) are reproducible across re-runs; n_init is set explicitly because
# the library default for it differs between scikit-learn releases.
kmeans = KMeans(n_clusters = 6, n_init = 10, random_state = 0)
groups = kmeans.fit_predict(data_scaled)
Seymour_Bobby['scikit_cluster'] = groups

In [7]:
# Hand-written clusterer with six clusters, matching n_clusters of the scikit-learn model.
clf = K_Means(k=6)

In [8]:
# Fit on the scaled features; fit() prints the seed centroids and one line per iteration.
clf.fit(data_scaled)

Centroid 0: [0.08906273 0.32399033]
Centroid 1: [-1.34854104 -0.11684044]
Centroid 2: [-0.84316482 -1.94939269]
Centroid 3: [1.15293935 2.13197699]
Centroid 4: [-0.91155344 -0.84491642]
Centroid 5: [1.06778475 1.01459266]
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5


In [9]:
# Sanity check: number of rows in the feature frame.
len(data)

78

In [10]:
# Sanity check: must equal len(data), since labels are written back row by row.
len(Seymour_Bobby)

78

In [11]:
# First scaled row; K_Means.fit seeds centroid 0 from this row.
data_scaled[0]

array([0.08906273, 0.32399033])

In [12]:
# Label every ball in play with its nearest custom centroid.
# Iterate over the index directly: the per-row Series that iterrows() built was
# never used. `data` and `Seymour_Bobby` were both reset to a 0..n-1 index and
# `data_scaled` was computed from `data` in row order, so index label i is also
# the row position in data_scaled.
for i in data.index:
    print(i)
    Seymour_Bobby.at[i,'custom_cluster'] = clf.predict(data_scaled[i])

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77


In [13]:
# Number of balls in play assigned to each custom cluster label.
Seymour_Bobby['custom_cluster'].value_counts()

5.0    20
0.0    20
1.0    16
4.0    13
3.0     5
2.0     4
Name: custom_cluster, dtype: int64

In [14]:
# One sub-frame per custom cluster label 0..5.
cluster_0, cluster_1, cluster_2, cluster_3, cluster_4, cluster_5 = (
    Seymour_Bobby[Seymour_Bobby['custom_cluster'] == label] for label in range(6)
)

# Seven labels: 0..5 come from the fit above; label 6 has no sub-frame here and
# gets hand-entered mean values in the following cells.
custom_clusters = list(range(7))

In [15]:
# Mean distance per custom cluster, in label order 0..5.
mean_distances = [
    members['distance'].mean()
    for members in (cluster_0, cluster_1, cluster_2, cluster_3, cluster_4, cluster_5)
]
# Hand-entered distance for the extra label 6.
# NOTE(review): presumably stands in for the balls toward 1st base that were
# filtered out before clustering -- confirm.
mean_distances.append(110)

In [16]:
# Mean direction per custom cluster, in label order 0..5.
mean_directions = [
    members['direction'].mean()
    for members in (cluster_0, cluster_1, cluster_2, cluster_3, cluster_4, cluster_5)
]
# Hand-entered direction for the extra label 6 (pairs with the distance of 110).
mean_directions.append(34)

In [17]:
# Summary table: one row per custom cluster label with its mean distance and direction.
custom_cluster_means = pd.DataFrame(
    {
        'custom_cluster': custom_clusters,
        'distance': mean_distances,
        'direction': mean_directions,
    }
)

In [18]:
# Display the custom cluster summary table.
custom_cluster_means

Unnamed: 0,custom_cluster,distance,direction
0,0,236.993666,-18.14265
1,1,45.727988,-1.298957
2,2,40.916589,-41.752771
3,3,290.703877,25.509993
4,4,38.45,-22.786669
5,5,314.576176,3.967718
6,6,110.0,34.0


In [19]:
# Persist the custom cluster summary.
# NOTE(review): with the default index=True the row index is written as an extra
# unnamed first column; confirm downstream readers expect it before changing.
custom_cluster_means.to_csv('custom_cluster_means.csv')

In [20]:
# Rebind cluster_0..cluster_5, this time split by the scikit-learn labels 0..5.
cluster_0, cluster_1, cluster_2, cluster_3, cluster_4, cluster_5 = (
    Seymour_Bobby[Seymour_Bobby['scikit_cluster'] == label] for label in range(6)
)

# Six labels only: no hand-entered extra cluster for the scikit-learn table.
scikit_clusters = list(range(6))

In [21]:
# Mean distance per scikit-learn cluster, in label order 0..5 (no manual extra entry).
mean_distances = [
    members['distance'].mean()
    for members in (cluster_0, cluster_1, cluster_2, cluster_3, cluster_4, cluster_5)
]

In [22]:
# Mean direction per scikit-learn cluster, in label order 0..5 (no manual extra entry).
mean_directions = [
    members['direction'].mean()
    for members in (cluster_0, cluster_1, cluster_2, cluster_3, cluster_4, cluster_5)
]


In [23]:
# Summary table: one row per scikit-learn cluster label with its mean distance and direction.
scikit_cluster_means = pd.DataFrame(
    {
        'scikit_cluster': scikit_clusters,
        'distance': mean_distances,
        'direction': mean_directions,
    }
)

In [24]:
# Display the scikit-learn cluster summary table.
scikit_cluster_means

Unnamed: 0,scikit_cluster,distance,direction
0,0,81.222079,4.562978
1,1,214.85051,-20.659016
2,2,292.516229,15.817127
3,3,38.829443,-34.606018
4,4,28.187412,-14.733002
5,5,329.382177,-3.055486


In [25]:
# Summary statistics of the unscaled features, for comparison with the cluster means.
data.describe()

Unnamed: 0,distance,direction
count,78.0,78.0
mean,177.949749,-8.204741
std,128.636518,16.794998
min,5.59371,-43.743795
25%,51.407273,-19.128308
50%,181.151851,-10.154456
75%,311.914524,3.980446
max,383.645893,27.37154


In [26]:
# Persist the scikit-learn cluster summary.
# NOTE(review): the default index=True also writes the row index as an unnamed
# first column; confirm downstream readers expect it.
scikit_cluster_means.to_csv('cluster_means.csv')

In [27]:
# Persist the per-ball frame, which at this point carries both the
# scikit_cluster and custom_cluster label columns.
Seymour_Bobby.to_csv('custom_clusters.csv')

In [28]:
# Preview the labelled rows; the two cluster columns appear at the far right.
Seymour_Bobby.head()

Unnamed: 0.1,Unnamed: 0,batter_name,pitcher_handedness,batter_handedness,inning,outs,strikes,velocity,vertical_release_angle,horizontal_release_angle,...,pfxz,vx0,vy0,vz0,ax,ay,play_result,traditional_cluster,scikit_cluster,custom_cluster
0,1749,"Martin, Austin",1,1,7.0,0.0,2.0,86.288934,0.627089,-3.519084,...,3.859273,7.269059,-125.230646,0.384513,-13.01817,28.200498,1,0.0,0,0.0
1,1940,"Martin, Austin",1,1,7.0,1.0,2.0,84.332604,-1.394514,-3.51145,...,5.295199,7.254627,-122.717318,-3.71564,-10.883066,23.958047,0,0.0,4,1.0
2,1945,"Martin, Austin",1,1,9.0,2.0,2.0,90.950819,-1.959237,-0.198725,...,10.697284,0.403453,-132.485504,-4.934471,-2.087009,29.397472,0,5.0,3,2.0
3,1955,"Martin, Austin",1,1,1.0,0.0,1.0,84.450564,-0.108523,-1.67108,...,2.706867,3.713712,-122.831265,-1.346401,2.565913,24.635446,0,0.0,2,3.0
4,1957,"Martin, Austin",1,1,3.0,0.0,0.0,95.58668,-1.987383,-2.600693,...,10.051809,5.971916,-138.899409,-5.248074,-12.803702,35.391505,0,0.0,4,4.0


In [29]:
# NOTE(review): writes the same Seymour_Bobby frame that was already saved as
# 'custom_clusters.csv' -- confirm that both files are needed.
Seymour_Bobby.to_csv('clusters.csv')

In [30]:
# Display the unscaled feature frame (distance, direction).
data

Unnamed: 0,distance,direction
0,189.332791,-2.798317
1,5.593710,-10.154456
2,70.185474,-40.734239
3,325.306079,27.371540
4,61.444774,-22.303853
...,...,...
73,6.000730,-41.798812
74,281.800418,9.373242
75,328.012851,-0.853940
76,5.593710,-10.154456


In [31]:
# Rebind Seymour_Bobby to the batter's full, unfiltered frame. This replaces the
# labelled frame built above, so earlier cells must be re-run before they are
# used with Seymour_Bobby again.
Seymour_Bobby = batter_dict['Martin, Austin']
# Keep ONLY the traditional_cluster == 3 rows, i.e. exactly the rows that the
# earlier `!= 3` filter dropped before clustering.
Seymour_Bobby_1B = Seymour_Bobby[Seymour_Bobby['traditional_cluster'] == 3]

In [32]:
# Persist the traditional_cluster == 3 rows that were excluded from clustering.
Seymour_Bobby_1B.to_csv('Seymour_Bobby_1B.csv')

In [33]:
# Preview the excluded rows.
Seymour_Bobby_1B.head()

Unnamed: 0.1,Unnamed: 0,batter_name,pitcher_handedness,batter_handedness,inning,outs,strikes,velocity,vertical_release_angle,horizontal_release_angle,...,hang_time,pfxx,pfxz,vx0,vy0,vz0,ax,ay,play_result,traditional_cluster
