In [134]:
import torch
import tqdm
import copy
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from data import get_data, pitcher_class
from torch.utils.data import DataLoader, TensorDataset

In [135]:
model = nn.Sequential(
    nn.Linear(20, 40),
    nn.Linear(40, 20),
    nn.Linear(20, 10),
    nn.Linear(10, 5),
    nn.Linear(5,1)
)

In [136]:
p_throws_map = {'R': 0, 'L': 1}
pt_data = get_data.get_pitcher_data()
p_dict = pitcher_class.format_pitcher_data(pt_data)
new_ds = pitcher_class.process_batch_avg(p_dict)

This is a large query, it may take a moment to complete


100%|██████████| 108/108 [00:14<00:00,  7.38it/s]


In [137]:
rv_data = get_data.get_runvalue_data()
full_ds = get_data.join_ds(new_ds, rv_data) #join data

In [138]:
# Apply one-hot encoding to pitches, apply binary to L/R
one_hot = pd.get_dummies(full_ds['pitch_name'])
full_ds = full_ds.drop('pitch_name',axis = 1)
full_ds = full_ds.join(one_hot)
full_ds['p_throws'] = [p_throws_map[x] for x in full_ds['p_throws']]

In [139]:
#rearrange cols
full_ds = full_ds[['player_name','p_throws','Splitter', '4-Seamer', 'Curveball', 'Cutter', 'Sinker', 'Changeup', 
                  'Slider', 'Sweeper', 'Slurve', 'Forkball', 'Screwball', 'Knuckleball','release_speed','release_pos_x',
                  'release_pos_z','pfx_x','pfx_z','release_extension','spin_axis','useage %','pitches','whiff_percent','est_woba',
                  'run_value_per_100']]
lg_avs = get_data.league_avgs(full_ds)

In [140]:
new_plus = get_data.generate_plus(full_ds,lg_avs)
full_ds['calc_plus'] = new_plus
full_ds.drop(columns=['whiff_percent','est_woba','run_value_per_100'])
full_ds = full_ds[full_ds['pitches'] >= 20]

In [141]:
init_inp = full_ds[['p_throws','Splitter', '4-Seamer', 'Curveball', 'Cutter', 'Sinker', 'Changeup', 
                  'Slider', 'Sweeper', 'Slurve', 'Forkball', 'Screwball', 'Knuckleball','release_speed','release_pos_x',
                  'release_pos_z','pfx_x','pfx_z','release_extension','spin_axis']]
X = torch.tensor(init_inp.to_numpy())
X = X.type(torch.float32)
init_out = full_ds[['calc_plus']]
Y = torch.tensor(init_out.to_numpy())
Y = Y.type(torch.float32)

In [142]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,train_size=.66, random_state=42)

In [144]:

mse_loss = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=.5)

In [145]:
train_data = TensorDataset(X_train, Y_train)
test_data = TensorDataset(X_test, Y_test)
train_loader = DataLoader(train_data, shuffle=True, batch_size=1000)
test_loader = DataLoader(test_data, batch_size=len(test_data.tensors[0]))

In [154]:
n_epochs = 10000   # number of epochs to run
batch_size = 2000  # size of each batch
batch_start = torch.arange(0, len(X_train), batch_size)
 
# Hold the best model
best_mse = np.inf   # init to infinity
best_weights = None
history = []

for epoch in range(n_epochs):
    model.train()
    with tqdm.tqdm(batch_start, unit="batch", mininterval=0, disable=True) as bar:
        bar.set_description(f"Epoch {epoch}")
        for start in bar:
            # take a batch
            X_batch = X_train[start:start+batch_size]
            y_batch = Y_train[start:start+batch_size]
            # forward pass
            y_pred = model(X_batch)
            loss = mse_loss(y_pred, y_batch)
            # backward pass
            optimizer.zero_grad()
            loss.backward()
            # update weights
            optimizer.step()
            # print progress
            bar.set_postfix(mse=float(loss))
    # evaluate accuracy at end of each epoch
    model.eval()
    y_pred = model(X_test)
    mse = mse_loss(y_pred, Y_test)
    mse = float(mse)
    history.append(mse)
    if mse < best_mse:
        best_mse = mse
        best_weights = copy.deepcopy(model.state_dict())

In [175]:
# init_inp = full_ds[['p_throws','Splitter', '4-Seamer', 'Curveball', 'Cutter', 'Sinker', 'Changeup', 
#                   'Slider', 'Sweeper', 'Slurve', 'Forkball', 'Screwball', 'Knuckleball','release_speed','release_pos_x',
#                   'release_pos_z','pfx_x','pfx_z','release_extension','spin_axis']]
# X = torch.tensor(init_inp.to_numpy())
# X = X.type(torch.float32)

# state_dict = torch.load('model.pt')
# newmodel = model(X)
# newmodel.load_state_dict(state_dict)
# newmodel.eval()
import matplotlib.pyplot as plt
model.load_state_dict(best_weights)
print("MSE: %.2f" % best_mse)
print("RMSE: %.2f" % np.sqrt(best_mse))
model.eval()

ret = pd.DataFrame(columns = ['pitcher_name', 'pitch_type', 'stuff+'])
for index,row in full_ds.iterrows():
    pitcher = row['player_name']
    pitch_dict = {x: row[x] for x in row.index[2:14]}
    pitch_type = None
    for key in pitch_dict:
        if pitch_dict[key] == 1:
            pitch_type = key
            break
    inp = row[1:21]
    pred = model(torch.tensor(inp))[0].item() + 100 
    ret.loc[len(ret)] = [pitcher, pitch_type, pred]
ret.to_csv('init_test.csv')

MSE: 7039.55
RMSE: 83.90
