In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics, cross_validation
import tensorflow as tf
from tensorflow.contrib import learn
from tensorflow.contrib import layers



In [2]:
# Load the pitch table, then divide each count/calendar column by a fixed
# constant (the divisor used for each column is listed next to its name).
data = pd.read_csv('data.csv')
for column, divisor in (('inning', 9.0), ('order', 9.0),
                        ('weekday', 7.0), ('month', 12.0),
                        ('balls', 4.0), ('strikes', 3.0)):
    data[column] = data[column] / divisor

# `sample` is an alias of the full table, not a copy or a subset.
sample = data

In [3]:
# Categorical input columns. The order is significant: the model code below
# reads column 0 as the pitcher, column 1 as the batter, and treats the last
# column ('type') as the label for the pitch-type model.
cat_features = [
    'pitcher_id', 'batter_id', 'away_team', 'home_team',
    'year', 'b_stand', 'p_throws', 'inning_half',
    'batter_team', 'pitcher_team', 'type',
]

# Numeric input columns, fed to the networks as float features.
real_features = [
    'night', 'inning', 'order', 'home', 'weekday',
    'month', 'balls', 'strikes', 'sz_top', 'sz_bot',
]

n_cat, n_real = len(cat_features), len(real_features)

# Distinct pitch types present in the data.
ptypes = sample['type'].unique()

# Number of distinct values per categorical column, aligned with cat_features;
# used as the one-hot width for each column.
depths = [len(data[name].unique()) for name in cat_features]

In [4]:
# Basic sizes used when building the models.
N = len(sample)
n_outputs = len(ptypes)
embedding_size = 30

# Numeric features and the (px, pz) location targets as plain arrays.
real_data = sample[real_features].values
loc_data = sample[['px', 'pz']].values

# Map every categorical column to integer ids; one row per pitch, one column
# per entry of cat_features (same order).
prep = learn.preprocessing.CategoricalProcessor()
cat_data = np.array(list(prep.fit_transform(sample[cat_features])))

# Embedding-table sizes: largest pitcher / batter id plus one, so that every
# id is a valid row index.
n_pitchers, n_batters = cat_data[:, :2].max(axis=0) + 1

In [5]:
def learn_type(n_steps=5000, batch_size=500, report_every=500,
               learning_rate=0.1, hidden_units=75):
    """Train a one-hidden-layer softmax classifier that predicts pitch type.

    Inputs are the pitcher and batter embeddings, the numeric features in
    ``real_data`` and one-hot encodings of the remaining categorical columns
    (every column of ``cat_data`` except the last). The last column of
    ``cat_data`` (the encoded ``'type'``) is the label.

    Reads the notebook globals ``cat_data``, ``real_data``, ``depths``,
    ``n_cat``, ``n_real``, ``n_outputs``, ``n_pitchers``, ``n_batters`` and
    ``embedding_size``.

    Args:
        n_steps: number of mini-batch gradient steps.
        batch_size: rows sampled (with replacement) per step.
        report_every: print the batch cross-entropy every this many steps;
            0 or None disables printing.
        learning_rate: step size for plain gradient descent.
        hidden_units: width of the hidden ReLU layer.

    Returns:
        None. Progress is printed; the trained weights are discarded when the
        session closes.
    """
    n_rows = cat_data.shape[0]

    # Build in a private graph so that calling this function repeatedly does
    # not keep piling ops and variables onto the global default graph.
    graph = tf.Graph()
    with graph.as_default():
        p_embeddings = tf.Variable(
            tf.random_uniform([n_pitchers, embedding_size], -1.0, 1.0))
        b_embeddings = tf.Variable(
            tf.random_uniform([n_batters, embedding_size], -1.0, 1.0))

        # The label column is fed separately, hence n_cat - 1 input columns.
        cat_batch = tf.placeholder(tf.int32, [None, n_cat - 1])
        real_batch = tf.placeholder(tf.float32, [None, n_real])
        result_batch = tf.placeholder(tf.int32, [None])

        p_embed = tf.nn.embedding_lookup(p_embeddings, cat_batch[:, 0])
        b_embed = tf.nn.embedding_lookup(b_embeddings, cat_batch[:, 1])

        inputs = [p_embed, b_embed, real_batch]
        # Columns 0 and 1 are embedded above; the rest are one-hot encoded.
        # NOTE(review): the encoded ids appear to start at 1 (the label
        # column's minimum is 1), so the largest id of a column would fall
        # outside depth=depths[i] and encode as all zeros -- still a distinct
        # code, but confirm this is intended.
        for i in range(2, n_cat - 1):
            inputs.append(tf.one_hot(cat_batch[:, i], depth=depths[i],
                                     on_value=1.0, off_value=0.0,
                                     dtype=tf.float32))
        input_layer = tf.concat(concat_dim=1, values=inputs)
        in_dim = int(input_layer.get_shape()[1])

        # Weights must start non-zero. With W1 = W2 = 0 and zero biases the
        # hidden layer outputs 0, so the gradients for W1, b1 and W2 are all
        # exactly zero and only b2 would ever be trained.
        W1 = tf.Variable(tf.random_normal([in_dim, hidden_units], 0.0, 0.1))
        b1 = tf.Variable(tf.zeros([hidden_units]))
        hidden = tf.nn.relu(tf.matmul(input_layer, W1) + b1)

        W2 = tf.Variable(tf.random_normal([hidden_units, n_outputs], 0.0, 0.1))
        b2 = tf.Variable(tf.zeros([n_outputs]))
        logits = tf.matmul(hidden, W2) + b2

        # Shift labels by one so they index [0, n_outputs).
        y_ = tf.one_hot(indices=result_batch - 1, depth=n_outputs,
                        on_value=1.0, off_value=0.0, dtype=tf.float32)

        # Fused softmax + cross-entropy on the logits: same quantity as
        # -sum(y_ * log(softmax(logits))) but without the log(0) -> NaN hazard.
        cross_entropy = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y_))
        train_step = tf.train.GradientDescentOptimizer(
            learning_rate).minimize(cross_entropy)

        init = tf.initialize_all_variables()

    # Context-managed session: released even if training is interrupted.
    with tf.Session(graph=graph) as sess:
        sess.run(init)
        for step in range(n_steps):
            idx = np.random.randint(0, n_rows, batch_size)
            input_data = {cat_batch: cat_data[idx, :-1],
                          real_batch: real_data[idx],
                          result_batch: cat_data[idx, -1]}
            sess.run(train_step, feed_dict=input_data)
            if report_every and step % report_every == 0:
                print(sess.run(cross_entropy, feed_dict=input_data))

In [6]:
def learn_loc(n_steps=1000, batch_size=500, report_every=1,
              n_mixtures=4, learning_rate=0.0005, hidden_units=75):
    """Train a mixture-density network for pitch location (px, pz).

    A single hidden ReLU layer feeds three linear heads that produce, per
    pitch, the mixture weights (softmax), the 2-D component means and the
    2-D component standard deviations (exp of the head output) of a mixture
    of diagonal Gaussians. Training maximises the mean log-likelihood of
    ``loc_data`` under that mixture with Adam.

    Reads the notebook globals ``cat_data``, ``real_data``, ``loc_data``,
    ``depths``, ``n_cat``, ``n_real``, ``n_pitchers``, ``n_batters`` and
    ``embedding_size``.

    Args:
        n_steps: number of mini-batch Adam steps.
        batch_size: rows sampled (with replacement) per step.
        report_every: print the batch mean log-likelihood every this many
            steps (default 1 = every step, as before); 0 or None disables it.
        n_mixtures: number of Gaussian components.
        learning_rate: Adam step size.
        hidden_units: width of the hidden ReLU layer.

    Returns:
        None. Progress is printed; the trained weights are discarded when the
        session closes.
    """
    n_rows = cat_data.shape[0]

    # Build in a private graph so repeated calls do not accumulate ops on the
    # global default graph.
    graph = tf.Graph()
    with graph.as_default():
        p_embeddings = tf.Variable(
            tf.random_uniform([n_pitchers, embedding_size], -1.0, 1.0))
        b_embeddings = tf.Variable(
            tf.random_uniform([n_batters, embedding_size], -1.0, 1.0))

        cat_batch = tf.placeholder(tf.int32, [None, n_cat])
        real_batch = tf.placeholder(tf.float32, [None, n_real])
        loc_batch = tf.placeholder(tf.float32, [None, 2])

        # Dynamic batch size, needed for the reshapes and the map below.
        n_items = tf.shape(cat_batch)[0]

        p_embed = tf.nn.embedding_lookup(p_embeddings, cat_batch[:, 0])
        b_embed = tf.nn.embedding_lookup(b_embeddings, cat_batch[:, 1])

        inputs = [p_embed, b_embed, real_batch]
        # Columns 0 and 1 are embedded above; every other categorical column
        # (including the pitch type) is one-hot encoded.
        for i in range(2, n_cat):
            inputs.append(tf.one_hot(cat_batch[:, i], depth=depths[i],
                                     on_value=1.0, off_value=0.0,
                                     dtype=tf.float32))
        input_layer = tf.concat(concat_dim=1, values=inputs)
        in_dim = int(input_layer.get_shape()[1])

        # W1 starts at zero, but the randomly initialised, slightly positive
        # b1 and the random output heads keep the hidden units distinct, so
        # W1 still receives non-zero, unit-specific gradients.
        W1 = tf.Variable(tf.zeros([in_dim, hidden_units]))
        b1 = tf.Variable(tf.random_normal([hidden_units], 0.0, .01)) + 0.03
        hidden = tf.nn.relu(tf.matmul(input_layer, W1) + b1)

        # Head 2: mixture weights; head 3: means; head 4: log std-devs.
        W2 = tf.Variable(tf.random_normal([hidden_units, n_mixtures], 0.0, .01))
        b2 = tf.Variable(tf.random_normal([n_mixtures], 0.0, .01))
        W3 = tf.Variable(tf.random_normal([hidden_units, n_mixtures * 2], 0.0, .01))
        b3 = tf.Variable(tf.random_normal([n_mixtures * 2], 0.0, .01))
        W4 = tf.Variable(tf.random_normal([hidden_units, n_mixtures * 2], 0.0, .01))
        b4 = tf.Variable(tf.random_normal([n_mixtures * 2], 0.0, .01))

        mixture_shape = tf.pack([n_items, n_mixtures, 2])
        weights = tf.nn.softmax(tf.matmul(hidden, W2) + b2)
        means = tf.reshape(tf.matmul(hidden, W3) + b3, mixture_shape)
        # exp keeps the standard deviations strictly positive.
        stdevs = tf.reshape(tf.exp(tf.matmul(hidden, W4) + b4), mixture_shape)

        def likelihood(i):
            # Mixture density of pitch i: weighted sum of component pdfs.
            normals = tf.contrib.distributions.MultivariateNormalDiag(
                means[i], stdevs[i])
            return tf.reduce_sum(weights[i] * normals.pdf(loc_batch[i]))

        likelihoods = tf.map_fn(likelihood, tf.range(0, n_items),
                                dtype=tf.float32)
        # Floor the density before the log: a pdf that underflows to 0 would
        # otherwise give log(0) = -inf and NaN gradients for the whole batch.
        loglike = tf.reduce_mean(tf.log(tf.maximum(likelihoods, 1e-30)))

        train_step = tf.train.AdamOptimizer(learning_rate).minimize(-loglike)
        init = tf.initialize_all_variables()

    # Context-managed session: released even if training is interrupted.
    with tf.Session(graph=graph) as sess:
        sess.run(init)
        for step in range(n_steps):
            idx = np.random.randint(0, n_rows, batch_size)
            input_data = {cat_batch: cat_data[idx],
                          real_batch: real_data[idx],
                          loc_batch: loc_data[idx]}
            if report_every and step % report_every == 0:
                # Fetch the objective in the same run as the update: it is the
                # tensor the gradients are derived from, so this reports the
                # pre-update log-likelihood without a second forward pass.
                _, batch_loglike = sess.run([train_step, loglike],
                                            feed_dict=input_data)
                print(batch_loglike)
            else:
                sess.run(train_step, feed_dict=input_data)

In [None]:
# Train the pitch-type classifier; learn_type prints the mini-batch
# cross-entropy periodically while it runs. This cell has no recorded
# execution count or output in the saved notebook.
learn_type()

In [7]:
# Train the pitch-location mixture model; learn_loc prints the mini-batch mean
# log-likelihood as it trains (higher is better). The saved output below ends
# with a KeyboardInterrupt, i.e. the run was stopped by hand.
learn_loc()

-5.27413
-5.39169
-5.30898
-5.33108
-5.26724
-5.197
-5.23343
-5.11935
-5.0366
-4.94741
-5.03017
-4.79619
-4.7635
-4.84038
-4.80169
-4.65245
-4.56762
-4.58284
-4.55882
-4.47677
-4.28633
-4.39069
-4.27964
-4.28338
-4.04111
-4.17508
-4.02758
-3.95397
-3.86843
-3.91959
-3.7589
-3.75697
-3.7368
-3.59165
-3.66602
-3.66856
-3.65108
-3.52799
-3.42759
-3.42755
-3.53687
-3.43937
-3.36024
-3.42647
-3.36767
-3.46017
-3.38282
-3.37456
-3.38673
-3.34057
-3.34022
-3.30281
-3.30966
-3.30184
-3.30683
-3.32054
-3.31661
-3.29127
-3.21946
-3.32833
-3.25991
-3.18458
-3.33239
-3.28766
-3.2085
-3.26279
-3.19365
-3.2248
-3.2434
-3.21665
-3.15583
-3.17663
-3.2054
-3.18445
-3.14716
-3.15606
-3.1704
-3.07541
-3.13651
-3.08632
-3.0482
-3.15193
-3.03296
-2.99706
-3.01844
-2.9888
-2.9597
-2.99504
-2.96842
-2.9246
-2.91749
-2.89767
-2.84875
-2.8547
-2.81563
-2.76879
-2.8489
-2.78997
-2.74561
-2.75972
-2.70958
-2.65353
-2.70319
-2.66975
-2.6998
-2.71439
-2.66561
-2.61665
-2.70766
-2.67288
-2.7082
-2.6835
-2.68394
-2.

KeyboardInterrupt: 

In [120]:
# Smallest encoded id in the last categorical column ('type'), which
# learn_type feeds as the label; the saved result is 1, which is why
# learn_type subtracts 1 before one-hot encoding the labels.
cat_data[:,-1].min()

1