In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/mlb-pitch-data-20152018/pitches.csv
/kaggle/input/mlb-pitch-data-20152018/atbats.csv
/kaggle/input/mlb-pitch-data-20152018/games.csv
/kaggle/input/mlb-pitch-data-20152018/2019_games.csv
/kaggle/input/mlb-pitch-data-20152018/2019_atbats.csv
/kaggle/input/mlb-pitch-data-20152018/2019_pitches.csv
/kaggle/input/mlb-pitch-data-20152018/player_names.csv
/kaggle/input/mlb-pitch-data-20152018/ejections.csv


In [5]:
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix as cm
import pickle
import time

In [6]:
data_raw = pd.read_csv('/kaggle/input/mlb-pitch-data-20152018/pitches.csv')

In [7]:
# Print out size, shape, and column names
print("size: " + str(data_raw.size))
print("shape: " + str(data_raw.shape))
print("columns: " + str(data_raw.columns))

size: 114686160
shape: (2867154, 40)
columns: Index(['px', 'pz', 'start_speed', 'end_speed', 'spin_rate', 'spin_dir',
       'break_angle', 'break_length', 'break_y', 'ax', 'ay', 'az', 'sz_bot',
       'sz_top', 'type_confidence', 'vx0', 'vy0', 'vz0', 'x', 'x0', 'y', 'y0',
       'z0', 'pfx_x', 'pfx_z', 'nasty', 'zone', 'code', 'type', 'pitch_type',
       'event_num', 'b_score', 'ab_id', 'b_count', 's_count', 'outs',
       'pitch_num', 'on_1b', 'on_2b', 'on_3b'],
      dtype='object')


In [8]:
df = data_raw
# drop everything except swinging strikes and foul balls
df = df[df.code.isin(['S', 'F'])]
# Convert S/F to 1 and 0 to measure ROC AUC
sf = {'S': 1, 'F': 0}
df.code = [sf[item] for item in df.code]
# only 2-strike counts
df = df[df.s_count == 2]
# dropping useless columns and rows with null values
df = df.drop(["zone", "type_confidence", "pitch_type", "ab_id", "event_num"], axis = 1)
df = df.drop(["y0", "type", "b_score", "outs", "pitch_num", "b_count", "s_count", "on_1b", "on_2b", "on_3b"], axis = 1)
df = df.dropna()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [9]:
print(df.shape)
df.columns

(283316, 25)


Index(['px', 'pz', 'start_speed', 'end_speed', 'spin_rate', 'spin_dir',
       'break_angle', 'break_length', 'break_y', 'ax', 'ay', 'az', 'sz_bot',
       'sz_top', 'vx0', 'vy0', 'vz0', 'x', 'x0', 'y', 'z0', 'pfx_x', 'pfx_z',
       'nasty', 'code'],
      dtype='object')

In [10]:
# balance out the cases
strikes = df[df.code == 1]
fouls = df[df.code == 0]
fouls = fouls[0:len(strikes)]
tojoin = [strikes, fouls]
df = pd.concat(tojoin)
print(strikes.shape)
df.shape

(92995, 25)


(185990, 25)

In [11]:
import sklearn.model_selection as tts
features_train, features_test, labels_train, labels_test = tts.train_test_split(df.transpose()[:24].transpose(), df.code, test_size = 0.3, random_state = 666)
# IMPORTANT: DO NOT TOUCH VAL UNTIL VALIDATION PHASE!!
features_test, features_val, labels_test, labels_val = tts.train_test_split(features_test, labels_test, test_size = 0.3, random_state = 420)

In [12]:
# Scaling data helps with MLPClassifier
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(features_train)
features_train_scaled = scaler.transform(features_train)
features_test_scaled = scaler.transform(features_test)
features_val_scaled = scaler.transform(features_val)

## MLP Regressor

In [13]:
st = time.time()
mlp = MLPClassifier(random_state=1, max_iter=2000).fit(features_train_scaled, labels_train)
# fig, axes = plt.subplots(3, 2, figsize=(10,15))
# cv = ShuffleSplit(n_splits=2, test_size=0.3, random_state=1)
# plot_learning_curve(mlp, "MLP Learning curves", features_train_scaled, labels_train, axes=axes[:,0], ylim=(0.7,1.01),cv=cv,n_jobs=4)
print("Fitted in %s seconds" % (time.time() - st))

Fitted in 74.80042219161987 seconds


In [14]:
print(mlp.score(features_val_scaled, labels_val))
print(roc_auc_score(labels_val, mlp.predict_proba(features_val_scaled)[:,1]))
cm(labels_val, mlp.predict(features_val_scaled))

0.7816009557945042
0.8510766346264913


array([[7271, 1143],
       [2513, 5813]])

## Random Forest

In [15]:
st = time.time()
rf = RandomForestClassifier().fit(features_train, labels_train)
print("Fitted in %s seconds" % (time.time() - st))

Fitted in 61.02741241455078 seconds


In [16]:
print(rf.score(features_val, labels_val))
print(roc_auc_score(labels_val, mlp.predict_proba(features_val)[:,1]))
cm(labels_val, rf.predict(features_val))

0.7172043010752688
0.5831669901364877


array([[6411, 2003],
       [2731, 5595]])

## Gradient Boosting

In [17]:
st = time.time()
gb = GradientBoostingClassifier(loss='exponential', n_estimators = 1000).fit(features_train, labels_train)
print("Fitted in %s seconds" % (time.time() - st))

Fitted in 775.5228276252747 seconds


In [20]:
print(gb.score(features_val, labels_val))
print(roc_auc_score(labels_val, mlp.predict_proba(features_val)[:,1]))
cm(labels_val, rf.predict(features_val))

0.7831541218637993
0.5831669901364877


array([[6411, 2003],
       [2731, 5595]])

## Ada Boost 

In [21]:
st = time.time()
ab = AdaBoostClassifier(n_estimators=1000).fit(features_train, labels_train)
print("Fitted in %s seconds" % (time.time() - st))

Fitted in 303.29627203941345 seconds


In [25]:
print(ab.score(features_val, labels_val))
print(roc_auc_score(labels_val, ab.predict_proba(features_val)[:,1]))
cm(labels_val, ab.predict(features_val))

0.7543608124253286
0.8222910799012044


array([[6769, 1645],
       [2467, 5859]])

## Voting Classfier

In [28]:
st = time.time()
vc = VotingClassifier(estimators=[('gbs', gb), ('abs', ab), ('rfs', rf), ('mlps', mlp)], voting = 'soft').fit(features_train, labels_train)
print("Fitted in %s seconds" % (time.time() - st))

Fitted in 1336.859546661377 seconds


In [46]:
print(vc.score(features_val, labels_val))
predictions = vc.predict(features_val)
print("ROC-AUC: ",metrics.roc_auc_score(labels_val, predictions))
cm(labels_val, vc.predict(features_val))

0.7647550776583034
ROC-AUC:  0.7639735279858255


array([[7679,  735],
       [3203, 5123]])

## Naive Bayes

In [29]:
from sklearn import preprocessing
#creating labelEncoder
le = preprocessing.LabelEncoder()
px_encoded = le.fit_transform(df.px)
pz_encoded= le.fit_transform(df.pz)
start_encoded=le.fit_transform(df.start_speed)
end_encoded=le.fit_transform(df.end_speed)
spin_encoded=le.fit_transform(df.spin_rate)
spin_dir_encoded=le.fit_transform(df.spin_dir)
break_encoded=le.fit_transform(df.break_angle)
break_length_encoded=le.fit_transform(df.break_length)
break_y_encoded=le.fit_transform(df.break_y)
ax_encoded=le.fit_transform(df.ax)
ay_encoded=le.fit_transform(df.ay)
az_encoded=le.fit_transform(df.az)
sz_bot_encoded=le.fit_transform(df.sz_bot)
sz_top_encoded=le.fit_transform(df.sz_top)
vx0_encoded=le.fit_transform(df.vx0)
vy0_encoded=le.fit_transform(df.vy0)
vz0_encoded=le.fit_transform(df.vz0)
x_encoded=le.fit_transform(df.x)
x0_encoded=le.fit_transform(df.x0)
y_encoded=le.fit_transform(df.y)
z0_encoded=le.fit_transform(df.z0)
pfx_x_encoded=le.fit_transform(df.pfx_x)
pfx_z_encoded=le.fit_transform(df.pfx_z)
nasty_encoded = le.fit_transform(df.nasty)

features = np.array(list(zip(px_encoded,pz_encoded,start_encoded,end_encoded, spin_encoded,spin_dir_encoded,break_encoded,
              break_length_encoded,break_y_encoded,ax_encoded,ay_encoded,az_encoded,sz_bot_encoded,
              sz_top_encoded,vx0_encoded,vz0_encoded,x_encoded,x0_encoded,y_encoded, z0_encoded,
              pfx_x_encoded,pfx_z_encoded, nasty_encoded)))
code_encoded = le.fit_transform(df.code) #S = 1, F = 0

In [37]:
features_train_2, features_test_2, label_train_2,label_test_2 = tts.train_test_split(features, code_encoded, test_size = 0.1, random_state = 69)
features_test_nb, features_val_nb, labels_test_nb, labels_val_nb = tts.train_test_split(features, code_encoded, test_size = 0.3, random_state = 420)

In [38]:
from sklearn.naive_bayes import GaussianNB

#Create a Gaussian Classifier
model = GaussianNB()

# Train the model using the training sets
model.fit(features_train_2, label_train_2)

GaussianNB()

In [39]:
predictions = model.predict(features_val_nb)

from sklearn import metrics

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(labels_val_nb, predictions))
print("ROC-AUC: ",metrics.roc_auc_score(labels_val_nb, predictions))
cm(labels_val_nb,predictions)

Accuracy: 0.6232593150169364
ROC-AUC:  0.6231502692911256


array([[18303,  9695],
       [11326, 16473]])

## Decision Trees

In [40]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=0) # max_depth = 15
tree = clf.fit(features_train, labels_train)

In [45]:
print(tree.get_depth())
print(tree.score(features_val, labels_val))
predictions = tree.predict(features_val)
print("ROC-AUC: ",metrics.roc_auc_score(labels_val, predictions))
cm(labels_val,predictions)

46
0.6399641577060932
ROC-AUC:  0.6399890662994273


array([[5345, 3069],
       [2958, 5368]])