In [23]:
import numpy as np 
import pandas as pd
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Model, Input, Sequential
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Bidirectional, Conv1D, GRU
from sklearn.model_selection import train_test_split, StratifiedKFold
from keras.metrics import categorical_accuracy
from keras.regularizers import l1_l2, l2
from keras import backend as K
import tensorflow as tf

In [24]:
import csv
import re
import json
import numpy as np

# Seasons to load: 2010 through 2017 inclusive.
years = [season for season in range(2010, 2018)]

# Three-letter team abbreviation -> full franchise name.
# Includes abbreviations of renamed/relocated franchises (NJN, SEA, CHA).
team_map = {
    'ATL': 'Atlanta Hawks',
    'BOS': 'Boston Celtics',
    'BRK': 'Brooklyn Nets',
    'CHO': 'Charlotte Hornets',
    'CHI': 'Chicago Bulls',
    'CLE': 'Cleveland Cavaliers',
    'DET': 'Detroit Pistons',
    'IND': 'Indiana Pacers',
    'MIA': 'Miami Heat',
    'MIL': 'Milwaukee Bucks',
    'NYK': 'New York Knicks',
    'ORL': 'Orlando Magic',
    'PHI': 'Philadelphia 76ers',
    'TOR': 'Toronto Raptors',
    'WAS': 'Washington Wizards',
    'DAL': 'Dallas Mavericks',
    'DEN': 'Denver Nuggets',
    'GSW': 'Golden State Warriors',
    'HOU': 'Houston Rockets',
    'LAC': 'Los Angeles Clippers',
    'LAL': 'Los Angeles Lakers',
    'MEM': 'Memphis Grizzlies',
    'MIN': 'Minnesota Timberwolves',
    'NOP': 'New Orleans Pelicans',
    'OKC': 'Oklahoma City Thunder',
    'PHO': 'Phoenix Suns',
    'POR': 'Portland Trail Blazers',
    'SAC': 'Sacramento Kings',
    'SAS': 'San Antonio Spurs',
    'UTA': 'Utah Jazz',
    'NJN': 'New Jersey Nets',
    'SEA': 'Seattle SuperSonics',
    'CHA': 'Charlotte Bobcats',
}

# Box-score stat name -> 1-based integer code (insertion order is the code order).
stat_encoding = dict(zip(
    ('fg', 'fg_pct', 'fg3', 'fg3_pct', 'ft', 'ft_pct', 'pts',
     'orb', 'trb', 'ast', 'stl', 'blk', 'tov'),
    range(1, 14),
))

In [37]:
# NOTE(review): max_count is not referenced in the visible part of this notebook.
max_count = 10
def get_tuples(year_list=None, data_dir='data'):
    """Load (event tuple, sentence) pairs from the per-year CSV files.

    For every year, reads ``<data_dir>/<year>/<year>_tuple_sentences.csv``,
    skips the header row, and for each data row extracts every
    single-quoted substring of column 0 as the event fields and takes
    column 1 as the sentence.

    Args:
        year_list: iterable of years to load; defaults to the module-level
            ``years`` when omitted.
        data_dir: directory containing one sub-directory per year.

    Returns:
        (game_data, articles): two parallel lists; ``game_data[k]`` is the
        list of quoted fields of row k and ``articles[k]`` is its sentence.
    """
    if year_list is None:
        year_list = years

    game_data = []
    articles = []
    for year in year_list:
        print(year)
        filename = '{}/{}/{}_tuple_sentences.csv'.format(data_dir, year, year)
        # newline='' lets the csv module handle line endings itself, so
        # quoted fields containing newlines are parsed correctly.
        with open(filename, newline='') as inFile:
            csv_reader = csv.reader(inFile, delimiter=',')
            # Skip the header; the default avoids StopIteration on an empty file.
            next(csv_reader, None)
            for line in csv_reader:
                # Blank or truncated rows have no (event, sentence) pair.
                if len(line) < 2:
                    continue
                game_data.append(re.findall(r"'(.*?)'", line[0]))
                articles.append(line[1])

    print(len(game_data))
    return game_data, articles

def one_hot_int(name_num, stat_num, name_enc, stat_enc):
    one_hot = np.zeros(len(name_enc) + len(stat_enc)+1)
    one_hot[name_num] = 1
    one_hot[(len(name_enc) - 1) + stat_num] = 1
  
    return one_hot

def get_encodings(events, vocab_size):
    """One-hot encode the stat field (element 1) of every event.

    Args:
        events: sequence of event field lists; ``event[1]`` is looked up in
            the module-level ``stat_encoding``.
        vocab_size: not used by the body; kept so existing calls still work.

    Returns:
        (encodings, values):
          * ``encodings`` - float array of shape
            ``(len(events), len(stat_encoding))``; row k has a single 1 in
            column ``stat_encoding[stat] - 1``, or is all zeros when the
            stat is not in ``stat_encoding``.
          * ``values`` - list of the 0-based column indices, one per
            *recognised* event only, so it can be shorter than ``encodings``
            if any event carries an unknown stat.
    """
    encodings = np.zeros((len(events), len(stat_encoding)))
    values = []
    print(encodings.shape)

    for row, event in enumerate(events):
        stat = event[1]
        if stat not in stat_encoding:
            continue
        column = stat_encoding[stat] - 1
        values.append(column)
        encodings[row][column] = 1

    return encodings, values

In [38]:
# Number of stat categories; passed to get_encodings below.
vocab_size = len(stat_encoding)

# Load every (event, sentence) pair for all seasons; the two lists are parallel.
events, train_articles = get_tuples()
print(len(events), len(train_articles))
# encodings: one one-hot row per event; values: the 0-based stat index of each
# event whose stat is present in stat_encoding.
encodings, values = get_encodings(events, vocab_size)


2010
2011
2012
2013
2014
2015
2016
2017
101856
101856 101856
(101856, 13)


In [39]:
# Sanity check: show the first one-hot row next to the event it was built from.
print(encodings[0], events[0])

[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.] ['Rose,Derrick', 'pts', '14']


In [40]:
def str_to_tup(string):
    """Parse a string of concatenated tuple literals into lists of fields.

    The string is cut at every ')' and the final piece (whatever follows
    the last ')') is discarded. From each remaining piece, every
    single-quoted substring is collected in order.

    Args:
        string: text such as "('a', 'b', '1')('c', 'd', '2')".

    Returns:
        List with one list of quoted field strings per ')'-terminated piece.
    """
    pieces = string.split(')')
    pieces.pop()  # split() always yields at least one item; drop the tail
    return [re.findall(r"'(.*?)'", piece) for piece in pieces]

# train_articles = []

# for year in years:
#     train_event_sentences = pd.read_csv('data/{}/{}_tuple_sentences.csv'.format(year, year))
#     for index, row in train_event_sentences.iterrows():
#         train_articles.append(row['sentence'])

In [41]:
# Tally how many events fall under each 0-based stat index.
counts = {}
for stat, code in stat_encoding.items():
    counts[code - 1] = 0
for value in values:
    counts[value] = counts[value] + 1

# Report the tallies in stat_encoding order and keep them for ranking.
ordered = []
for key, code in stat_encoding.items():
    tally = counts[code - 1]
    print(key + ': ' + str(tally))
    ordered.append(tally)
# Ascending, so ordered[k] is the (k+1)-th smallest per-stat count.
ordered = sorted(ordered)

fg: 11760
fg_pct: 0
fg3: 19352
fg3_pct: 0
ft: 1400
ft_pct: 0
pts: 46472
orb: 1720
trb: 12232
ast: 6328
stl: 704
blk: 696
tov: 1192


In [42]:
# Down-sample over-represented stats.
# The cap is the 9th-smallest per-stat count (index 8 of the ascending
# `ordered` list). For every stat with more examples than the cap, the
# earliest examples are dropped until exactly `normalize` remain.
normalize = ordered[8]

# The three containers are indexed in lock-step below; fail loudly rather
# than silently pairing sentences with the wrong encodings.
assert len(values) == len(train_articles) == encodings.shape[0], \
    'values, train_articles and encodings must be parallel'

# Single pass to decide which positions to drop. This makes the same
# decisions, in the same order, as deleting one row at a time, but avoids
# copying the whole matrix and shifting both lists once per removed example.
dropped = []
for position, stat_index in enumerate(values):
    if counts[stat_index] > normalize:
        counts[stat_index] -= 1
        dropped.append(position)

dropped_lookup = set(dropped)
encodings = np.delete(encodings, np.asarray(dropped, dtype=int), 0)
# Slice assignment keeps the original list objects, as `del lst[i]` did.
train_articles[:] = [article for position, article in enumerate(train_articles)
                     if position not in dropped_lookup]
values[:] = [stat_index for position, stat_index in enumerate(values)
             if position not in dropped_lookup]

In [43]:
import re
# Every sentence becomes exactly `article_size` tokens:
# up to article_size - 1 words, then '_STOP_', then '_PAD_' filler.
article_size = 25
tokenized_articles = []
count = 0
for article in train_articles:
    # Put whitespace around commas and periods so they become their own tokens.
    article = article.replace(',', ' ,').replace('.', ' . ')
    # Split on single spaces and discard the empty strings left by repeated spaces.
    tokens = [piece for piece in article.split(' ') if len(piece) != 0]
    # Leave one slot free for the stop marker.
    end_point = min(article_size - 1, len(tokens))
    curr_article = tokens[:end_point]
    count += end_point
    curr_article.append('_STOP_')
    count += 2
    curr_article.extend(['_PAD_'] * (article_size - end_point - 1))
    tokenized_articles.append(curr_article)

In [44]:
# Number of sentences left after down-sampling.
print(len(tokenized_articles))
# Array view of the padded token lists; used later to show the reference sentence.
article_array = np.asarray(tokenized_articles)
print(article_array[20])
# The row count should match the number of tokenized sentences printed above.
print(encodings.shape)

37352
['PLAYER_FIRST' 'PLAYER_LAST' 'was' 'held' 'to' 'STAT' 'on' 'free'
 'throws' ',' 'shooting' '0' 'for' '6' 'and' 'missing' 'all' 'five' 'of'
 'his' '3s' '.' '_STOP_' '_PAD_' '_PAD_']
(37352, 13)


In [45]:
# NOTE: vocab_size is re-bound here. Earlier it held len(stat_encoding);
# from this cell on it is the cap on the word vocabulary.
vocab_size = 2500
# The inputs are already token lists, so character filtering is disabled
# (filters='') and anything outside the vocabulary maps to '_RARE_'.
tokenizer_decoder = Tokenizer(char_level = False, filters= '', num_words=vocab_size, oov_token='_RARE_')
tokenizer_decoder.fit_on_texts(tokenized_articles)
#num_words = len(tokenizer_decoder.word_index) + 1
# Keep only entries whose index is <= vocab_size (presumably the most frequent
# tokens, since Keras assigns lower indices to more frequent words - confirm
# for the installed Keras version).
tokenizer_decoder.word_index = {e:i for e,i in tokenizer_decoder.word_index.items() if i <= vocab_size}
# Give the out-of-vocabulary token its own index just past the kept range.
tokenizer_decoder.word_index[tokenizer_decoder.oov_token] = vocab_size + 1
# y: one list of integer token ids per sentence.
y = tokenizer_decoder.texts_to_sequences(tokenized_articles)
# Size of the output layer. It must exceed the largest id (vocab_size + 1);
# the "+ 2" presumably covers index 0 plus the relocated OOV index - TODO confirm.
num_words = len(tokenizer_decoder.word_index) + 2
print(num_words)
print(np.asarray(y).shape)

2502
(37352, 25)


In [46]:
# One-hot encode the target token ids:
# y_final[s, t, k] == 1.0 exactly when token t of sentence s has id k.
# NOTE(review): this dense array has len(y) * article_size * num_words
# entries; integer targets with a sparse categorical loss would avoid
# materialising it - consider if memory becomes a problem.
y_final = np.zeros((len(y), article_size, num_words))
print(y_final.shape)
y = np.asarray(y)
print(y.shape)
print(len(y))
print(len(y[0]))
# One vectorised assignment instead of a Python loop over every
# (sentence, position) pair. The index arrays broadcast to y's shape, so
# each (s, t) cell selects column y[s, t].
sentence_idx = np.arange(y.shape[0])[:, np.newaxis]
position_idx = np.arange(y.shape[1])[np.newaxis, :]
y_final[sentence_idx, position_idx, y] = 1.0

print(y_final.shape)

(37352, 25, 2502)
(37352, 25)
37352
25
(37352, 25, 2502)


In [47]:
# Two stacked GRUs followed by a per-timestep softmax over the vocabulary.
# Input: one stat one-hot vector per timestep; output: a distribution over
# num_words token ids at each of the article_size positions.
model = Sequential()

# Derive the input shape from the data instead of hard-coding (25, 13), so
# changing article_size or the stat set cannot silently desynchronise the model.
n_timesteps = article_size
n_features = encodings.shape[1]

model.add(GRU(100, input_shape=(n_timesteps, n_features), return_sequences=True))
model.add(GRU(100, return_sequences=True))
model.add(TimeDistributed(Dense(num_words, activation='softmax')))

model.summary()
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_5 (GRU)                  (None, 25, 100)           34200     
_________________________________________________________________
gru_6 (GRU)                  (None, 25, 100)           60300     
_________________________________________________________________
time_distributed_3 (TimeDist (None, 25, 2502)          252702    
Total params: 347,202
Trainable params: 347,202
Non-trainable params: 0
_________________________________________________________________


In [48]:
# Model input: repeat each event's one-hot stat vector at every timestep,
# giving shape (n_samples, article_size, n_stats). This replaces a reshape
# whose result was immediately overwritten and a Python list-of-lists build
# that hard-coded the sequence length.
train = np.repeat(encodings[:, np.newaxis, :], article_size, axis=1)

# The last 10% of the samples are held out for validation.
model.fit(train, y_final,
          batch_size=128,
          epochs=5,
          validation_split=0.1)

Train on 33616 samples, validate on 3736 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1a35c3d470>

In [53]:
# for i in train:
#     if i[0][5] == 1:
#         event_to_predict = i.reshape((1, 25, 13))
#         break

# Index of the training example to decode; used for both the model input
# and the reference sentence so the two cannot drift apart.
sample_idx = 25

# Add a leading batch axis of size 1 without hard-coding the other dimensions.
event_to_predict = train[sample_idx][np.newaxis]
prediction = model.predict(event_to_predict)
print(event_to_predict)
# Greedy decoding: most probable token id at every timestep.
pred_words = [np.argmax(i) for i in prediction[0]]

index_word = {v: k for k, v in tokenizer_decoder.word_index.items()}
# The softmax has num_words outputs but word_index has fewer entries, so an
# argmax can land on an id with no word; map those to the OOV token instead
# of raising KeyError.
article_pred = [index_word.get(i, tokenizer_decoder.oov_token) for i in pred_words]
print(article_pred)
print(article_array[sample_idx])

[[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0.