In [None]:
import numpy as np 
import pandas as pd
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Model, Input, Sequential
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Bidirectional, Conv1D, GRU
from sklearn.model_selection import train_test_split, StratifiedKFold
from keras.metrics import categorical_accuracy
from keras.regularizers import l1_l2, l2
from keras import backend as K
import tensorflow as tf

In [None]:
import csv
import re
import json
import numpy as np

# Seasons whose CSV files are loaded: 2014 through 2017 (range end is exclusive).
years = list(range(2014, 2018))

# Team abbreviation -> full team name.
# NOTE(review): not referenced by any of the visible cells.
team_map = {'ATL' : 'Atlanta Hawks', 'BOS': 'Boston Celtics', 'BRK': 'Brooklyn Nets', 
			'CHO': 'Charlotte Hornets', 'CHI': 'Chicago Bulls', 'CLE': 'Cleveland Cavaliers',
			'DET': 'Detroit Pistons', 'IND': 'Indiana Pacers', 'MIA': 'Miami Heat',
			'MIL': 'Milwaukee Bucks', 'NYK': 'New York Knicks', 'ORL': 'Orlando Magic',
			'PHI': 'Philadelphia 76ers', 'TOR': 'Toronto Raptors', 'WAS': 'Washington Wizards',
			'DAL': 'Dallas Mavericks', 'DEN': 'Denver Nuggets', 'GSW': 'Golden State Warriors',
			'HOU': 'Houston Rockets', 'LAC': 'Los Angeles Clippers', 'LAL': 'Los Angeles Lakers',
			'MEM': 'Memphis Grizzlies', 'MIN': 'Minnesota Timberwolves', 'NOP': 'New Orleans Pelicans',
			'OKC': 'Oklahoma City Thunder', 'PHO': 'Phoenix Suns', 'POR': 'Portland Trail Blazers',
			'SAC': 'Sacramento Kings', 'SAS': 'San Antonio Spurs', 'UTA': 'Utah Jazz',
			'NJN': 'New Jersey Nets', 'SEA': 'Seattle SuperSonics', 'CHA': 'Charlotte Bobcats'}

# Stat name -> 1-based integer code. Code that consumes this mapping subtracts 1
# to obtain a 0-based column index, so the 13 stats occupy columns 0..12.
stat_encoding = {'fg': 1, 'fg_pct': 2, 'fg3': 3, 'fg3_pct': 4, 'ft': 5, 'ft_pct': 6, 'pts': 7, 'orb': 8,
                'trb': 9, 'ast': 10, 'stl': 11, 'blk': 12, 'tov': 13}

In [None]:
max_count = 10  # NOTE(review): not referenced by any of the visible cells


def get_tuples():
    """Load the (event, sentence) pairs for every season listed in ``years``.

    For each year the file ``data/<year>/<year>_tuple_sentences.csv`` is read.
    The first row of each file is skipped (presumably a header row). For every
    remaining row, column 0 is scanned for single-quoted fields and column 1
    is kept verbatim.

    Prints each year as it is processed and, at the end, the total number of
    rows loaded.

    Returns:
        tuple: ``(game_data, articles)``, two lists of equal length where
        ``game_data[k]`` is the list of quoted strings found in column 0 of
        row ``k`` and ``articles[k]`` is that row's column 1.
    """
    game_data = []
    articles = []
    for year in years:
        print(year)
        filename = 'data/{}/{}_tuple_sentences.csv'.format(year, year)
        # newline='' is what the csv module asks for: it lets the reader do its
        # own newline handling so quoted fields with embedded line breaks
        # survive intact.
        with open(filename, newline='') as inFile:
            csv_reader = csv.reader(inFile, delimiter=',')
            # Skip the first row; the default keeps an empty file from raising
            # StopIteration.
            next(csv_reader, None)
            for line in csv_reader:
                game_data.append(re.findall(r"'(.*?)'", line[0]))
                articles.append(line[1])

    print(len(game_data))
    return game_data, articles

def one_hot_int(name_num, stat_num, name_enc, stat_enc):
    """Build a two-hot vector marking one name slot and one stat slot.

    The vector has ``len(name_enc) + len(stat_enc) + 1`` float entries, all
    zero except position ``name_num`` and position
    ``len(name_enc) - 1 + stat_num``.

    Args:
        name_num: index to set for the name.
        stat_num: stat number; it is offset by ``len(name_enc) - 1``.
        name_enc: sized collection; only its length is used.
        stat_enc: sized collection; only its length is used.

    Returns:
        numpy.ndarray: the 1-D vector described above.
    """
    n_names = len(name_enc)
    vector = np.zeros(n_names + len(stat_enc) + 1)
    for hot_index in (name_num, n_names - 1 + stat_num):
        vector[hot_index] = 1
    return vector

def get_encodings(events, vocab_size):
    """One-hot encode the stat (element 1) of every event.

    Args:
        events: sequence of events; ``event[1]`` is looked up in the
            module-level ``stat_encoding``.
        vocab_size: accepted but not used by this function.

    Returns:
        tuple: ``(encodings, values)``.

        * ``encodings`` is a float array of shape
          ``(len(events), len(stat_encoding))``; row ``k`` has a 1 in the
          column for event ``k``'s stat (its code minus 1).
        * ``values`` lists those 0-based columns in event order.

        An event whose stat is not a key of ``stat_encoding`` keeps an
        all-zero row and contributes nothing to ``values``, so ``values``
        can be shorter than ``encodings`` in that case.

    Also prints the shape of ``encodings`` before filling it.
    """
    encodings = np.zeros((len(events), len(stat_encoding)))
    values = []
    print(encodings.shape)
    for row, event in enumerate(events):
        stat = event[1]
        if stat not in stat_encoding:
            continue
        column = stat_encoding[stat] - 1
        values.append(column)
        encodings[row][column] = 1

    return encodings, values

In [4]:
# Number of stat categories. Note that `vocab_size` is rebound to the word
# vocabulary size (2500) in the Keras tokenizer cell further down.
vocab_size = len(stat_encoding)

# Load all (event, sentence) pairs, then one-hot encode each event's stat.
# `values` holds the 0-based stat column per event.
events, train_articles = get_tuples()
print(len(events), len(train_articles))
encodings, values = get_encodings(events, vocab_size)


2014
2015
2016
2017
51021
51021 51021
(51021, 13)


In [5]:
def str_to_tup(string):
    """Parse a stringified sequence of tuples into lists of quoted fields.

    The text is cut at every ``)``; the piece after the last ``)`` is
    discarded. From each remaining piece all single-quoted substrings are
    collected.

    Args:
        string: text such as ``"('a', 'b'), ('c', 'd')"``.

    Returns:
        list: one list of strings per ``)``-terminated piece, in order.
    """
    pieces = string.split(')')[:-1]
    return [re.findall(r"'(.*?)'", piece) for piece in pieces]

# train_articles = []

# for year in years:
#     train_event_sentences = pd.read_csv('data/{}/{}_tuple_sentences.csv'.format(year, year))
#     for index, row in train_event_sentences.iterrows():
#         train_articles.append(row['sentence'])

In [8]:
# Tally how many samples carry each stat label. Keys of `counts` are the
# 0-based stat columns (stat code - 1), matching the entries of `values`.
counts = {code - 1: 0 for code in stat_encoding.values()}
for value in values:
    counts[value] += 1

# Print the per-stat totals in stat_encoding order and collect them into
# `ordered`, which ends up sorted ascending.
ordered = []
for key, code in stat_encoding.items():
    total = counts[code - 1]
    print(key + ': ' + str(total))
    ordered.append(total)
ordered.sort()

fg: 3095
fg_pct: 0
fg3: 3095
fg3_pct: 0
ft: 775
ft_pct: 0
pts: 3095
orb: 891
trb: 3095
ast: 3095
stl: 341
blk: 340
tov: 629


In [7]:
# Down-sample over-represented stats so that no stat has more than `normalize`
# samples. `ordered` holds the 13 per-stat counts sorted ascending, so index 8
# selects the 9th-smallest count as the cap.
normalize = ordered[8]

# Rows are dropped by position from three parallel containers; if they are not
# the same length the wrong sentences would be paired with the wrong events.
assert len(values) == len(encodings) == len(train_articles), (
    'values, encodings and train_articles must be aligned row for row')

# Single pass: walking front to back, drop a sample while its stat is still
# above the cap, decrementing the running count each time. This keeps exactly
# the same rows as deleting one element at a time, without copying the whole
# array on every deletion.
keep_mask = np.ones(len(values), dtype=bool)
for idx, value in enumerate(values):
    if counts[value] > normalize:
        keep_mask[idx] = False
        counts[value] -= 1

encodings = encodings[keep_mask]
# Slice-assign so the existing list objects are updated in place.
train_articles[:] = [article for article, keep in zip(train_articles, keep_mask) if keep]
values[:] = [value for value, keep in zip(values, keep_mask) if keep]
# Note: `events` is not filtered here, so it no longer lines up with the
# three containers above after this cell runs.

In [None]:
# Recount the samples per stat label. Keys of `counts` are the 0-based stat
# columns (stat code - 1), matching the entries of `values`.
counts = {code - 1: 0 for code in stat_encoding.values()}
for value in values:
    counts[value] += 1

# Print the per-stat totals in stat_encoding order; `ordered` collects the
# same totals and is sorted ascending at the end.
ordered = []
for key, code in stat_encoding.items():
    total = counts[code - 1]
    print(key + ': ' + str(total))
    ordered.append(total)
ordered.sort()

In [9]:
import re
# Maximum number of real tokens kept per sentence (longer ones are truncated).
article_size = 25
tokenized_articles = []
# Running total of (kept tokens + 1 for '_STOP_') over all sentences; it is
# used later to size the training matrices.
count = 0
for article in train_articles:
    # Detach punctuation so it becomes its own token: a space is inserted
    # before each comma and on both sides of each period.
    article = article.replace(',', ' ,').replace('.', ' . ')
    # Splitting on a single space yields empty strings for runs of spaces;
    # filter them out in one pass.
    tokens = [token for token in article.split(' ') if token]
    kept = tokens[:article_size]
    # Two pads + start marker give every real token three predecessors.
    tokenized_articles.append(['_PAD_', '_PAD_', '_START_'] + kept + ['_STOP_'])
    count += len(kept) + 1

In [10]:
print(len(tokenized_articles))
# The token lists have different lengths, so request an object array
# explicitly: NumPy >= 1.24 raises ValueError when asked to build an array
# from ragged nested sequences without dtype=object.
article_array = np.asarray(tokenized_articles, dtype=object)
print(article_array[20])
print(encodings.shape)

18451
['_PAD_', '_PAD_', '_START_', 'PLAYER_LAST', 'committed', 'STAT', 'turnovers', 'and', 'fouled', 'out', 'late', 'in', 'the', 'game', '.', '_STOP_']
(18451, 13)


In [11]:
# Word-vocabulary size. This rebinds `vocab_size`, which earlier held the
# number of stat categories.
vocab_size = 2500
tokenizer_decoder = Tokenizer(char_level = False, filters= '', num_words=vocab_size, oov_token='_RARE_')
tokenizer_decoder.fit_on_texts(tokenized_articles)
#num_words = len(tokenizer_decoder.word_index) + 1
# Keep only entries whose index is <= vocab_size, then move the OOV token to
# index vocab_size + 1.
tokenizer_decoder.word_index = {e:i for e,i in tokenizer_decoder.word_index.items() if i <= vocab_size}
tokenizer_decoder.word_index[tokenizer_decoder.oov_token] = vocab_size + 1
# One list of integer token ids per tokenised sentence.
y = tokenizer_decoder.texts_to_sequences(tokenized_articles)
num_words = len(tokenizer_decoder.word_index)
print(num_words)
# The id lists are ragged (one length per sentence); dtype=object is required
# for NumPy >= 1.24, which otherwise raises ValueError here.
print(np.asarray(y, dtype=object).shape)

2500
(18451,)


In [12]:
# Build the training matrices for next-word prediction.
#
# Each row of X_final is laid out as
#   [ stat one-hot | previous word | word before that | word three back ]
# with each word block num_words wide, and the matching row of y_final is the
# one-hot of the word to predict. Token ids start at 2 in this tokenizer's
# word_index (see the word_index dump), hence the "- 2" shift to a 0-based slot.
n_stats = encodings.shape[1]  # width of the stat one-hot block (13 here)
n_articles = len(encodings)

# float32 instead of NumPy's default float64: the matrices only hold 0/1 and
# at the recorded shapes (342458 x 7513 and 342458 x 2500) this halves a
# ~27 GB allocation.
X_final = np.zeros((count, num_words * 3 + n_stats), dtype=np.float32)
y_final = np.zeros((count, num_words), dtype=np.float32)
print(X_final.shape)

ind = 0  # next free row in X_final / y_final
for i in range(n_articles):
    article_tokens = tokenized_articles[i]
    article_ids = y[i]
    j = 0
    # Slide a 3-token window over the sentence; stop once the most recent
    # context token would be '_STOP_'.
    while article_tokens[j + 2] != '_STOP_':
        row = X_final[ind]
        row[:n_stats] = encodings[i]
        row[n_stats + article_ids[j + 2] - 2] = 1
        row[n_stats + num_words + article_ids[j + 1] - 2] = 1
        row[n_stats + num_words * 2 + article_ids[j] - 2] = 1
        y_final[ind][article_ids[j + 3] - 2] = 1

        j += 1
        ind += 1

    done = i + 1
    if done % 1000 == 0:
        print(done / n_articles * 100, '%')

# Every preallocated row must have been filled; otherwise trailing all-zero
# rows would silently be used as training samples.
assert ind == count, 'filled {} rows but allocated {}'.format(ind, count)
X_final.shape

(342458, 7513)
5.419760446588261 %
10.839520893176521 %
16.259281339764783 %
21.679041786353043 %
27.098802232941306 %
32.518562679529566 %
37.93832312611782 %
43.358083572706086 %
48.77784401929435 %
54.19760446588261 %
59.61736491247087 %
65.03712535905913 %
70.4568858056474 %
75.87664625223564 %
81.29640669882392 %
86.71616714541217 %
92.13592759200043 %
97.5556880385887 %


(342458, 7513)

In [13]:
# Display the target matrix shape: (count rows, num_words columns).
y_final.shape

(342458, 2500)

In [14]:
# Feed-forward next-word classifier: the input is the stat one-hot (13 wide)
# concatenated with three word one-hots (num_words wide each); the output is
# a softmax over the num_words vocabulary slots.
model = Sequential([
    Dense(1000, input_shape=(num_words * 3 + 13,), activation='relu'),
    Dense(num_words, activation='softmax'),
])
model.summary()
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 1000)              7514000   
_________________________________________________________________
dense_2 (Dense)              (None, 2500)              2502500   
Total params: 10,016,500
Trainable params: 10,016,500
Non-trainable params: 0
_________________________________________________________________


In [15]:
# train = encodings.reshape((encodings.shape[0], 13, 1))
# train = np.asarray([[i] * article_size for i in encodings])

# Train the next-word classifier on the one-hot context rows for 5 epochs.
# validation_split=0.1 holds out 10% of the rows for validation (34246 of
# 342458 in the recorded run).
# NOTE(review): no random seed is set in the visible cells, so results are
# presumably not reproducible from run to run — confirm.
model.fit(X_final, y_final,
          batch_size=256,
          epochs=5,
          validation_split=0.1)

Train on 308212 samples, validate on 34246 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1ebedcce860>

In [36]:
# Greedy generation: for every stat category, start from the context
# (_pad_, _pad_, _start_) and repeatedly feed the model its own most likely
# next word until it emits '_stop_' or the length cap is reached.
index_category = {v: k for k, v in stat_encoding.items()}
# Inverse vocabulary lookup; it does not change between stats, so build it once.
index_word = {v: k for k, v in tokenizer_decoder.word_index.items()}

n_stats = len(stat_encoding)            # width of the stat one-hot block
input_width = n_stats + num_words * 3   # must match the model's input size
max_words = 30                          # hard cap on generated sentence length

# Iterate over all stat categories (the loop previously stopped at 12 and
# never generated a sentence for the last one).
for i in range(n_stats):
    print(index_category[i + 1])
    event = np.zeros((input_width, 1))
    event[i] = 1
    prev3 = tokenizer_decoder.word_index['_pad_']
    prev2 = tokenizer_decoder.word_index['_pad_']
    prev1 = tokenizer_decoder.word_index['_start_']
    sentence = []
    for j in range(max_words):
        # Same layout as the training rows: most recent word first; token ids
        # are shifted by 2 to get a 0-based slot.
        event[n_stats + prev1 - 2] = 1
        event[n_stats + num_words + prev2 - 2] = 1
        event[n_stats + num_words * 2 + prev3 - 2] = 1

        prediction = model.predict(event.reshape(1, input_width))
        curr = np.argmax(prediction) + 2
        word = index_word[curr]
        if word == '_stop_':
            break
        sentence.append(word)

        # Clear the context bits before shifting the window by one word.
        event[n_stats + prev1 - 2] = 0
        event[n_stats + num_words + prev2 - 2] = 0
        event[n_stats + num_words * 2 + prev3 - 2] = 0

        prev3 = prev2
        prev2 = prev1
        prev1 = curr
    print(sentence)


fg
['player_last', 'had', 'stat', 'rebounds', '.']
fg_pct
['player_first', 'player_last', 'had', 'stat', 'rebounds', 'and', 'stat', 'assists', '.']
fg3
['player_last', 'made', 'stat', '3', 'pointers', '.']
fg3_pct
['player_last', 'had', 'stat', 'rebounds', 'and', 'stat', 'assists', '.']
ft
['player_last', 'made', 'stat', 'free', 'throws', 'with', '_RARE_', 'left', '.']
ft_pct
['player_last', 'had', 'stat', 'rebounds', '.']
pts
['player_first', 'player_last', 'had', 'stat', 'points', 'and', '_stat_', 'rebounds', 'for', 'the', 'heat', ',', 'who', 'have', 'lost', 'seven', 'of', '10', '.']
orb
['player_first', 'player_last', 'had', '_stat_', 'rebounds', 'for', 'the', 'pistons', ',', 'who', 'have', 'lost', 'seven', 'of', '10', '.']
trb
['player_first', 'player_last', 'had', 'stat', 'rebounds', '.']
ast
['player_first', 'player_last', 'had', 'stat', 'assists', '.']
stl
['player_last', 'had', 'stat', 'steals', ',', 'and', 'player_first', 'player_last', 'had', 'stat', 'steals', ',', 'and', 'pl

In [16]:
# Display the token -> index mapping held by the tokenizer (after the
# vocabulary trimming and OOV re-indexing done when it was fitted).
tokenizer_decoder.word_index

{'_RARE_': 2501,
 '_pad_': 2,
 '_start_': 3,
 '_stop_': 4,
 '.': 5,
 'stat': 6,
 'the': 7,
 'player_last': 8,
 'and': 9,
 ',': 10,
 'had': 11,
 'player_first': 12,
 'rebounds': 13,
 'for': 14,
 'assists': 15,
 'with': 16,
 'of': 17,
 'in': 18,
 'a': 19,
 '_stat_': 20,
 'points': 21,
 'to': 22,
 'scored': 23,
 'on': 24,
 'added': 25,
 'from': 26,
 'his': 27,
 'finished': 28,
 '3': 29,
 'who': 30,
 'was': 31,
 'game': 32,
 'season': 33,
 'made': 34,
 'point': 35,
 '10': 36,
 'free': 37,
 'eight': 38,
 'high': 39,
 'first': 40,
 'seven': 41,
 'nine': 42,
 'six': 43,
 'but': 44,
 'five': 45,
 'pointers': 46,
 'led': 47,
 'while': 48,
 'turnovers': 49,
 '11': 50,
 'four': 51,
 '12': 52,
 'throws': 53,
 'minutes': 54,
 'shooting': 55,
 'at': 56,
 'games': 57,
 'three': 58,
 'after': 59,
 'double': 60,
 'quarter': 61,
 'two': 62,
 '15': 63,
 'by': 64,
 'career': 65,
 '14': 66,
 'night': 67,
 'field': 68,
 'shot': 69,
 'straight': 70,
 '13': 71,
 'also': 72,
 'lead': 73,
 '18': 74,
 'as': 75,
