In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
import pickle
import time

In [2]:
data_raw = pd.read_csv('pitches.csv')

In [4]:
# Print out size, shape, and column names
print("size: " + str(data_raw.size))
print("shape: " + str(data_raw.shape))
print("columns: " + str(data_raw.columns))

size: 114686160
shape: (2867154, 40)
columns: Index(['px', 'pz', 'start_speed', 'end_speed', 'spin_rate', 'spin_dir',
       'break_angle', 'break_length', 'break_y', 'ax', 'ay', 'az', 'sz_bot',
       'sz_top', 'type_confidence', 'vx0', 'vy0', 'vz0', 'x', 'x0', 'y', 'y0',
       'z0', 'pfx_x', 'pfx_z', 'nasty', 'zone', 'code', 'type', 'pitch_type',
       'event_num', 'b_score', 'ab_id', 'b_count', 's_count', 'outs',
       'pitch_num', 'on_1b', 'on_2b', 'on_3b'],
      dtype='object')


In [4]:
df = data_raw
# drop everything except swinging strikes and foul balls
df = df[df.code.isin(['S', 'F'])]
# only 2-strike counts
df = df[df.s_count == 2]
# dropping useless columns and rows with null values
df = df.drop(["zone", "type_confidence", "pitch_type", "ab_id", "event_num"], axis = 1)
df = df.drop(["y0", "type", "b_score", "outs", "pitch_num", "b_count", "s_count", "on_1b", "on_2b", "on_3b"], axis = 1)
df = df.dropna()

In [19]:
print(df.shape)
df.columns

(283316, 25)


Index(['px', 'pz', 'start_speed', 'end_speed', 'spin_rate', 'spin_dir',
       'break_angle', 'break_length', 'break_y', 'ax', 'ay', 'az', 'sz_bot',
       'sz_top', 'vx0', 'vy0', 'vz0', 'x', 'x0', 'y', 'z0', 'pfx_x', 'pfx_z',
       'nasty', 'code'],
      dtype='object')

In [5]:
# balance out the cases
strikes = df[df.code == 'S']
fouls = df[df.code == 'F']
fouls = fouls[0:len(strikes)]
tojoin = [strikes, fouls]
df = pd.concat(tojoin)
print(strikes.shape)
df.shape

(92995, 25)


(185990, 25)

In [22]:
with open("Models/scaled_models/mlp.pkl", "rb") as mlpr:
    mlp = pickle.load(mlpr)
with open("Models/unscaled_models/ab.pkl", "rb") as abr:
    ab = pickle.load(abr)
with open("Models/unscaled_models/gb.pkl", "rb") as gbr:
    gb = pickle.load(gbr)
with open("Models/unscaled_models/rf.pkl", "rb") as rfr:
    rf = pickle.load(rfr)
with open("Models/scaled_models/ab_scaled.pkl", "rb") as abr:
    ab_scaled = pickle.load(abr)
with open("Models/scaled_models/gb_scaled.pkl", "rb") as gbr:
    gb_scaled = pickle.load(gbr)
with open("Models/scaled_models/rf_scaled.pkl", "rb") as rfr:
    rf_scaled = pickle.load(rfr)
with open("Models/scaled_models/vc.pkl", "rb") as evcr:
    vc = pickle.load(evcr)

In [6]:
import sklearn.model_selection as tts
features_train, features_test, labels_train, labels_test = tts.train_test_split(df.transpose()[:24].transpose(), df.code, test_size = 0.3, random_state = 69)
# IMPORTANT: DO NOT TOUCH VAL UNTIL VALIDATION PHASE!!
features_test, features_val, labels_test, labels_val = tts.train_test_split(features_test, labels_test, test_size = 0.3, random_state = 420)

In [7]:
# Scaling data helps with MLPClassifier
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(features_train)
features_train_scaled = scaler.transform(features_train)
features_test_scaled = scaler.transform(features_test)
features_val_scaled = scaler.transform(features_val)

In [8]:
st = time.time()
mlp = MLPClassifier(random_state=1, max_iter=2000).fit(features_train_scaled, labels_train)
print("Fitted in %s seconds" % (time.time() - st))

Fitted in 64.68092894554138 seconds


In [9]:
mlp.score(features_test_scaled, labels_test)

0.782113321555675

In [10]:
st = time.time()
rf = RandomForestClassifier().fit(features_train, labels_train)
print("Fitted in %s seconds" % (time.time() - st))

Fitted in 51.82077193260193 seconds


In [11]:
rf.score(features_test, labels_test)

0.712548326804414

In [12]:
st = time.time()
rf_scaled = RandomForestClassifier().fit(features_train_scaled, labels_train)
print("Fitted in %s seconds" % (time.time() - st))

Fitted in 53.34668278694153 seconds


In [13]:
rf_scaled.score(features_test_scaled, labels_test)

0.7161840387126508

In [14]:
st = time.time()
gb = GradientBoostingClassifier(loss='exponential', n_estimators = 1000).fit(features_train, labels_train)
print("Fitted in %s seconds" % (time.time() - st))

Fitted in 670.0961346626282 seconds


In [15]:
gb.score(features_test, labels_test)

0.7807307268863456

In [16]:
st = time.time()
gb_scaled = GradientBoostingClassifier(loss='exponential', n_estimators = 1000).fit(features_train_scaled, labels_train)
print("Fitted in %s seconds" % (time.time() - st))

Fitted in 660.1500608921051 seconds


In [17]:
gb_scaled.score(features_test_scaled, labels_test)

0.780705123281358

In [18]:
st = time.time()
ab = AdaBoostClassifier(n_estimators=1000).fit(features_train, labels_train)
print("Fitted in %s seconds" % (time.time() - st))

Fitted in 523.5434520244598 seconds


In [19]:
ab.score(features_test, labels_test)

0.7544358245640986

In [20]:
st = time.time()
ab_scaled = AdaBoostClassifier(n_estimators=1000).fit(features_train_scaled, labels_train)
print("Fitted in %s seconds" % (time.time() - st))

Fitted in 358.23775148391724 seconds


In [21]:
ab_scaled.score(features_test_scaled, labels_test)

0.7544358245640986

In [22]:
st = time.time()
vc = VotingClassifier(estimators=[('gbs', gb_scaled), ('abs', ab_scaled), ('rfs', rf_scaled), ('mlps', mlp)], voting = 'hard').fit(features_train_scaled, labels_train)
print("Fitted in %s seconds" % (time.time() - st))

Fitted in 1057.8590025901794 seconds


In [23]:
vc.score(features_test_scaled, labels_test)

0.7753795734439409

In [24]:
with open("Models/scaled_models/mlp.pkl", "wb") as mlpw:
    pickle.dump(mlp, mlpw)
with open("Models/unscaled_models/ab.pkl", "wb") as abw:
    pickle.dump(ab, abw)
with open("Models/unscaled_models/gb.pkl", "wb") as gbw:
    pickle.dump(gb, gbw)
with open("Models/unscaled_models/rf.pkl", "wb") as rfw:
    pickle.dump(rf, rfw)
with open("Models/scaled_models/ab_scaled.pkl", "wb") as abw:
    pickle.dump(ab_scaled, abw)
with open("Models/scaled_models/gb_scaled.pkl", "wb") as gbw:
    pickle.dump(gb_scaled, gbw)
with open("Models/scaled_models/rf_scaled.pkl", "wb") as rfw:
    pickle.dump(rf_scaled, rfw)
with open("Models/scaled_models/vc.pkl", "wb") as evcw:
    pickle.dump(vc, evcw)