In [4]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns

In [5]:
data_raw = pd.read_csv('pitches.csv')
# Print out size, shape, and column names
print("size: " + str(data_raw.size))
print("shape: " + str(data_raw.shape))
print("columns: " + str(data_raw.columns))

size: 114686160
shape: (2867154, 40)
columns: Index(['px', 'pz', 'start_speed', 'end_speed', 'spin_rate', 'spin_dir',
       'break_angle', 'break_length', 'break_y', 'ax', 'ay', 'az', 'sz_bot',
       'sz_top', 'type_confidence', 'vx0', 'vy0', 'vz0', 'x', 'x0', 'y', 'y0',
       'z0', 'pfx_x', 'pfx_z', 'nasty', 'zone', 'code', 'type', 'pitch_type',
       'event_num', 'b_score', 'ab_id', 'b_count', 's_count', 'outs',
       'pitch_num', 'on_1b', 'on_2b', 'on_3b'],
      dtype='object')


In [28]:
df = data_raw
# dropping useless columns and rows with null values
df = df.drop(["nasty", "zone", "type_confidence", "pitch_type", "ab_id", "event_num"], axis = 1)
df = df.dropna()
# drop everything except swinging strikes and foul balls
df = df[df.code.isin(['S', 'F'])]
# only 2-strike counts
df = df[df.s_count == 2]
df = df[df.b_count == 3]
print(df.shape)

(55282, 34)


In [34]:
df = df.drop(["y0", "type", "b_score", "outs", "pitch_num", "b_count", "s_count", "on_1b", "on_2b", "on_3b"], axis = 1)

In [35]:
print(df.shape)
df.columns

(55282, 24)


Index(['px', 'pz', 'start_speed', 'end_speed', 'spin_rate', 'spin_dir',
       'break_angle', 'break_length', 'break_y', 'ax', 'ay', 'az', 'sz_bot',
       'sz_top', 'vx0', 'vy0', 'vz0', 'x', 'x0', 'y', 'z0', 'pfx_x', 'pfx_z',
       'code'],
      dtype='object')

In [39]:
# balance out the cases
strikes = df[df.code == 'S']
fouls = df[df.code == 'F']
fouls = fouls[0:len(strikes)]
tojoin = [strikes, fouls]
df = pd.concat(tojoin)
print(strikes.shape)
df.shape

(15199, 24)


(30398, 24)

In [7]:
import sklearn.model_selection as tts
features_train, features_test, labels_train, labels_test = tts.train_test_split(df.transpose()[:23].transpose(), df.code_num, test_size = 0.3, random_state = 69)

In [9]:
print(features_train.shape)
print(features_test.shape)
print(labels_train.shape)
print(labels_test.shape)

(1262415, 23)
(541035, 23)
(1262415,)
(541035,)


In [8]:
# IMPORTANT: DO NOT TOUCH VAL UNTIL VALIDATION PHASE!!
features_test, features_val, labels_test, labels_val = tts.train_test_split(features_test, labels_test, test_size = 0.3, random_state = 420)

In [11]:
df.code_num.shape

(1803450,)

In [9]:
from sklearn.neural_network import MLPRegressor
fear = MLPRegressor(random_state=1, max_iter=100).fit(features_train, labels_train)



In [15]:
fear.score(features_test, labels_test)

-4.716974192786581

In [10]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=45)
rf.fit(features_train, labels_train)

RandomForestRegressor(n_estimators=45)

In [12]:
rf.score(features_test, labels_test)

0.08832690218085981

In [17]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression

In [21]:
degree = 5
polyreg = make_pipeline(PolynomialFeatures(degree), LinearRegression())
polyreg.fit(features_train, labels_train)