In [1]:
import tensorflow as tf
import keras
from keras.models import Sequential, Input, Model
from keras.layers import Dense, Dropout, Activation, Conv2D, MaxPooling2D, Flatten, BatchNormalization, Embedding, LSTM, Bidirectional, GRU, TimeDistributed, Concatenate
from keras.optimizers import SGD
from keras.utils import np_utils
#from keras.preprocessing.text import one_hot

Using TensorFlow backend.


In [0]:
import csv
import re
import json
import numpy as np

years = list(range(2014, 2018))

team_map = {'ATL' : 'Atlanta Hawks', 'BOS': 'Boston Celtics', 'BRK': 'Brooklyn Nets', 
			'CHO': 'Charlotte Hornets', 'CHI': 'Chicago Bulls', 'CLE': 'Cleveland Cavaliers',
			'DET': 'Detroit Pistons', 'IND': 'Indiana Pacers', 'MIA': 'Miami Heat',
			'MIL': 'Milwaukee Bucks', 'NYK': 'New York Knicks', 'ORL': 'Orlando Magic',
			'PHI': 'Philadelphia 76ers', 'TOR': 'Toronto Raptors', 'WAS': 'Washington Wizards',
			'DAL': 'Dallas Mavericks', 'DEN': 'Denver Nuggets', 'GSW': 'Golden State Warriors',
			'HOU': 'Houston Rockets', 'LAC': 'Los Angeles Clippers', 'LAL': 'Los Angeles Lakers',
			'MEM': 'Memphis Grizzlies', 'MIN': 'Minnesota Timberwolves', 'NOP': 'New Orleans Pelicans',
			'OKC': 'Oklahoma City Thunder', 'PHO': 'Phoenix Suns', 'POR': 'Portland Trail Blazers',
			'SAC': 'Sacramento Kings', 'SAS': 'San Antonio Spurs', 'UTA': 'Utah Jazz',
			'NJN': 'New Jersey Nets', 'SEA': 'Seattle SuperSonics', 'CHA': 'Charlotte Bobcats'}

stat_encoding = {'fg': 1, 'fg_pct': 2, 'fg3': 3, 'fg3_pct': 4, 'ft': 5, 'ft_pct': 6, 'pts': 7, 'orb': 8,
                'trb': 9, 'ast': 10, 'stl': 11, 'blk': 12, 'tov': 13}

def get_tuples():
  game_data = []
  labels = []
  for year in years:
    print(year)
    filename = '{}_labeled_events.csv'.format(year)
    with open(filename) as inFile:
      csv_reader = csv.reader(inFile, delimiter=',')
      next(csv_reader)
      for line in csv_reader:
          event = line[0]
          label = line[1]
          values = re.findall(r"'(.*?)'", event)
          name = values[0]
          value = values[2]
          if name not in team_map and value:
            labels.append(label)
            game_data.append(values)

  return game_data, labels

def one_hot_int(name_num, stat_num, name_enc, stat_enc):
  one_hot = np.zeros(len(name_enc) + len(stat_enc))
  one_hot[name_num] = 1
  one_hot[(len(name_enc) - 1) + stat_num] = 1
  
  return one_hot

def get_encodings(events):
  with open('players.json') as df:
    data = json.load(df)

  one_hot_encodings = []
  values = []
  for event in events:
    name = event[0]
    stat = event[1]
    if name in data.keys() and stat in stat_encoding.keys():
      value = float(event[2])
      name_encode = data[name]
      stat_encode = stat_encoding[stat]

      one_hot = one_hot_int(name_encode, stat_encode, data, stat_encoding)

      one_hot_encodings.append(one_hot)
      values.append(value)

  return one_hot_encodings, values

In [0]:
def make_model(vocab_size, values):
    inputs = Input(shape = (vocab_size,))
    nn = Embedding(vocab_size, 200, input_length = vocab_size)(inputs)
    nn = Flatten()(nn)
    val_input = Input(shape=(1,))
    nn = Concatenate()([nn, val_input])
    nn = Dense(100, activation='relu')(nn)
    nn = Dense(100, activation='relu')(nn)
    y = Dense(1, activation='sigmoid')(nn)
    model = Model(inputs=[inputs, val_input], outputs=y)
    return model

In [4]:
with open('players.json') as df:
  data = json.load(df)

vocab_size = len(data) + len(stat_encoding)

events, labels = get_tuples()
one_hot_encodings, values = get_encodings(events)
values = np.asarray(values)
print(values.shape)
print(vocab_size)
model = make_model(vocab_size, values)
model.compile(optimizer="sgd", loss="binary_crossentropy", metrics=['binary_accuracy'])

2014
2015
2016
2017
(481858,)
1328


In [5]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1328)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1328, 200)    265600      input_1[0][0]                    
__________________________________________________________________________________________________
flatten_1 (Flatten)             (None, 265600)       0           embedding_1[0][0]                
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
concatenat

In [6]:
one_hot_encodings = np.asarray(one_hot_encodings)
labels = np.asarray(labels)
labels = labels.astype(np.int8)
print(one_hot_encodings.shape)
print(labels.shape)

print(np.sum(labels) / labels.shape[0])
print(values[:20])

(481858, 1328)
(481858,)
0.1300715148446223
[ 1.    29.     9.     0.     3.     0.2    5.     2.     2.     1.
  0.333 14.    14.     2.     0.     3.     0.     8.     0.     2.   ]


In [7]:
model.fit([one_hot_encodings, values], labels, batch_size=150, epochs=5, verbose=1, validation_split= 1/6, shuffle=True)

Train on 401548 samples, validate on 80310 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f2a0a392240>

In [0]:
def precision(y_true, y_pred):
  
  tp = 0
  fp = 0
  print(len(y_true))
  for i in range(len(y_true)):
    if y_pred[i] > 0.6:
      y_pred[i] = 1
    else:
      y_pred[i] = 0

    if y_pred[i] == 1:
      
      if y_true[i] == 1:
        
        tp += 1
      
      else:
        
        fp += 1

  print(tp+fp)
  return tp/(tp + fp)

In [0]:
def get_predictions(y_probs, threshold=0.70):
  for i in range(len(y_probs)):
    if y_probs[i] > threshold:
      y_probs[i] = 1
    else:
      y_probs[i] = 0
      
  return y_probs

In [16]:
y_probs = model.predict([one_hot_encodings[:40000], values[:40000]])
y_pred = get_predictions(y_probs)
y_true = labels
precision(y_true[:40000], y_pred)

40000
2561


0.7133932057789926