In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics, cross_validation
import tensorflow as tf
from tensorflow.contrib import learn
from tensorflow.contrib import layers



In [2]:
data = pd.read_csv('data.csv')
data.inning /= 9.0
data.order /= 9.0
data.weekday /= 7.0
data.month /= 12.0
data.balls /= 4.0
data.strikes /= 3.0
sample = data

In [3]:
cat_features = ['pitcher_id', 'batter_id', 'away_team', 'home_team', 
                'year', 'b_stand', 'p_throws', 'inning_half',  
                'batter_team', 'pitcher_team', 'type']
real_features = ['night', 'inning', 'order', 'home', 'weekday', 
                 'month', 'balls', 'strikes', 'sz_top', 'sz_bot']
n_cat = len(cat_features)
n_real = len(real_features)
ptypes = sample.type.unique()
depths = [len(data[col].unique()) for col in cat_features]

In [4]:
N = len(sample)
n_outputs = len(ptypes)
embedding_size = 30
prep = learn.preprocessing.CategoricalProcessor()
cat_data = np.array(list(prep.fit_transform(sample[cat_features])))
n_pitchers = cat_data[:,0].max() + 1
n_batters = cat_data[:,1].max() + 1

real_data = sample[real_features].values
loc_data = sample[['px','pz']].values

In [121]:
def learn_type():
    p_embeddings = tf.Variable(tf.random_uniform([n_pitchers, embedding_size], -1.0, 1.0))
    b_embeddings = tf.Variable(tf.random_uniform([n_batters, embedding_size], -1.0, 1.0))

    cat_batch = tf.placeholder(tf.int32, [None, n_cat-1])
    real_batch = tf.placeholder(tf.float32, [None, n_real])
    result_batch = tf.placeholder(tf.int32, [None])

    pitchers = cat_batch[:,0]
    batters = cat_batch[:,1]

    p_embed = tf.nn.embedding_lookup(p_embeddings, pitchers)
    b_embed = tf.nn.embedding_lookup(b_embeddings, batters)

    inputs = [p_embed, b_embed, real_batch]
    for i in range(2, n_cat-1):
        inputs.append(tf.one_hot(cat_batch[:,i], depth=depths[i],
                      on_value=1.0, off_value=0.0, dtype=tf.float32))
    input_layer = tf.concat(concat_dim=1, values=inputs)

    in_dim = input_layer.get_shape()[1]

    W1 = tf.Variable(tf.zeros([in_dim, 75]))
    b1 = tf.Variable(tf.zeros([75]))

    hidden = tf.nn.relu(tf.matmul(input_layer, W1) + b1)

    W2 = tf.Variable(tf.zeros([75, n_outputs]))
    b2 = tf.Variable(tf.zeros([n_outputs]))

    y = tf.nn.softmax(tf.matmul(hidden, W2) + b2)
    y_ = tf.one_hot(indices=result_batch - 1, depth=n_outputs, 
                    on_value=1.0, off_value=0.0, dtype=tf.float32)

    cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
    train_step = tf.train.GradientDescentOptimizer(0.1).minimize(cross_entropy)

    init = tf.initialize_all_variables()
    sess = tf.InteractiveSession()
    sess.run(init)
    batch_size = 500
    for i in range(5000):
        idx = np.random.randint(0, data.shape[0], batch_size)
        input_data = {cat_batch : cat_data[idx,:-1],
                                real_batch : real_data[idx], 
                                result_batch : cat_data[idx,-1]}
        sess.run(train_step, feed_dict=input_data)
        if i % 500 == 0:
            print(sess.run(cross_entropy, feed_dict=input_data))

In [117]:
def learn_loc():
    n_mixtures = 4
    p_embeddings = tf.Variable(tf.random_uniform([n_pitchers, embedding_size], -1.0, 1.0))
    b_embeddings = tf.Variable(tf.random_uniform([n_batters, embedding_size], -1.0, 1.0))

    cat_batch = tf.placeholder(tf.int32, [None, n_cat])
    real_batch = tf.placeholder(tf.float32, [None, n_real])
    loc_batch = tf.placeholder(tf.float32, [None, 2])

    n_items = tf.shape(cat_batch)[0]

    pitchers = cat_batch[:,0]
    batters = cat_batch[:,1]

    p_embed = tf.nn.embedding_lookup(p_embeddings, pitchers)
    b_embed = tf.nn.embedding_lookup(b_embeddings, batters)

    inputs = [p_embed, b_embed, real_batch]
    for i in range(2, n_cat):
        inputs.append(tf.one_hot(cat_batch[:,i], depth=depths[i],
                      on_value=1.0, off_value=0.0, dtype=tf.float32))
    input_layer = tf.concat(concat_dim=1, values=inputs)

    in_dim = input_layer.get_shape()[1]

    W1 = tf.Variable(tf.zeros([in_dim, 75]))
    b1 = tf.Variable(tf.random_normal([75], 0.0, .01)) + 0.03

    hidden = tf.nn.relu(tf.matmul(input_layer, W1) + b1)

    W2 = tf.Variable(tf.random_normal([75, n_mixtures], 0.0, .01))
    b2 = tf.Variable(tf.random_normal([n_mixtures], 0.0, .01))
    W3 = tf.Variable(tf.random_normal([75, n_mixtures*2], 0.0, .01))
    b3 = tf.Variable(tf.random_normal([n_mixtures*2], 0.0, .01))
    W4 = tf.Variable(tf.random_normal([75, n_mixtures*2], 0.0, .01))
    b4 = tf.Variable(tf.random_normal([n_mixtures*2], 0.0, .01))

    weights = tf.nn.softmax(tf.matmul(hidden, W2) + b2)
    means =  tf.reshape(tf.matmul(hidden, W3) + b3, tf.pack([n_items, n_mixtures, 2]))
    stdevs = tf.reshape(tf.exp(tf.matmul(hidden, W4) + b4), tf.pack([n_items, n_mixtures, 2]))

    def likelihood(i):
        normals = tf.contrib.distributions.MultivariateNormalDiag(means[i], stdevs[i])
        return tf.reduce_sum(weights[i] * normals.pdf(loc_batch[i]))

    likelihoods = tf.map_fn(likelihood, tf.range(0, n_items), dtype=tf.float32)
    loglike = tf.reduce_mean(tf.log(likelihoods))

    train_step = tf.train.AdamOptimizer(0.0005).minimize(-loglike)
    init = tf.initialize_all_variables()
    
    sess = tf.InteractiveSession()
    sess.run(init)
    
    batch_size = 500
    for i in range(1000):
        idx = np.random.randint(0, data.shape[0], batch_size)
        input_data = feed_dict={cat_batch : cat_data[idx],
                                real_batch : real_data[idx], 
                                loc_batch : loc_data[idx]}
        if i % 1 == 0:
            print(sess.run(loglike, feed_dict = input_data)) 
        sess.run(train_step, feed_dict = input_data)

In [None]:
learn_type()

In [118]:
learn_loc()

-5.23907
-5.03791
-5.15264
-5.10832
-5.04511
-4.96719
-4.97253
-4.80116
-4.65361
-4.8225
-4.6065
-4.64121
-4.64014
-4.43933
-4.26636
-4.28168
-4.11485
-4.14881
-4.1263
-4.08372
-3.97546
-3.86109
-3.74915
-3.64913
-3.72216
-3.62633
-3.55088
-3.56211
-3.54661
-3.49627
-3.45378
-3.43194
-3.33364
-3.43077
-3.44046
-3.34775
-3.32243
-3.30961
-3.36588
-3.34463
-3.31281
-3.35469
-3.31403
-3.23277
-3.24329
-3.31123
-3.32127
-3.2536
-3.24097
-3.28541
-3.29158
-3.24402
-3.26904
-3.21895
-3.21912
-3.23158
-3.22876
-3.18937
-3.14893
-3.19317
-3.14052
-3.14486
-3.09174
-3.12899
-3.12544
-3.0885
-3.06068
-3.09494
-3.03889
-3.0687
-2.97176
-3.02867
-2.9992
-2.97577
-3.02054
-2.94489
-2.89979
-2.88469
-2.84248
-2.86562
-2.86496
-2.88103
-2.81906
-2.79203
-2.73022
-2.75945
-2.69328
-2.68007
-2.67807
-2.66063
-2.59185
-2.66243
-2.67212
-2.65078
-2.78911
-2.67577
-2.67409
-2.64795
-2.74988
-2.72463
-2.62984
-2.67303
-2.57196
-2.54297
-2.61875
-2.62585
-2.70295
-2.56444
-2.70178
-2.58247
-2.63766
-2.62987

In [120]:
cat_data[:,-1].min()

1