In [1]:
import numpy as np 
import pandas as pd
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Bidirectional, Conv1D, GRU
from sklearn.model_selection import train_test_split, StratifiedKFold
from keras.metrics import categorical_accuracy
from keras.regularizers import l1_l2, l2
from keras import backend as K
import tensorflow as tf

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [37]:
# configure
num_encoder_tokens = 25
num_decoder_tokens = 1000
latent_dim = 256

# Define an input sequence and process it.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)

encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(None, num_decoder_tokens))

decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_13 (InputLayer)           (None, None, 25)     0                                            
__________________________________________________________________________________________________
input_14 (InputLayer)           (None, None, 1000)   0                                            
__________________________________________________________________________________________________
lstm_13 (LSTM)                  [(None, 256), (None, 288768      input_13[0][0]                   
__________________________________________________________________________________________________
lstm_14 (LSTM)                  [(None, None, 256),  1287168     input_14[0][0]                   
                                                                 lstm_13[0][1]                    
          

In [None]:
import csv
import re
import json
import numpy as np

years = list(range(2014, 2018))

team_map = {'ATL' : 'Atlanta Hawks', 'BOS': 'Boston Celtics', 'BRK': 'Brooklyn Nets', 
			'CHO': 'Charlotte Hornets', 'CHI': 'Chicago Bulls', 'CLE': 'Cleveland Cavaliers',
			'DET': 'Detroit Pistons', 'IND': 'Indiana Pacers', 'MIA': 'Miami Heat',
			'MIL': 'Milwaukee Bucks', 'NYK': 'New York Knicks', 'ORL': 'Orlando Magic',
			'PHI': 'Philadelphia 76ers', 'TOR': 'Toronto Raptors', 'WAS': 'Washington Wizards',
			'DAL': 'Dallas Mavericks', 'DEN': 'Denver Nuggets', 'GSW': 'Golden State Warriors',
			'HOU': 'Houston Rockets', 'LAC': 'Los Angeles Clippers', 'LAL': 'Los Angeles Lakers',
			'MEM': 'Memphis Grizzlies', 'MIN': 'Minnesota Timberwolves', 'NOP': 'New Orleans Pelicans',
			'OKC': 'Oklahoma City Thunder', 'PHO': 'Phoenix Suns', 'POR': 'Portland Trail Blazers',
			'SAC': 'Sacramento Kings', 'SAS': 'San Antonio Spurs', 'UTA': 'Utah Jazz',
			'NJN': 'New Jersey Nets', 'SEA': 'Seattle SuperSonics', 'CHA': 'Charlotte Bobcats'}

stat_encoding = {'fg': 1, 'fg_pct': 2, 'fg3': 3, 'fg3_pct': 4, 'ft': 5, 'ft_pct': 6, 'pts': 7, 'orb': 8,
                'trb': 9, 'ast': 10, 'stl': 11, 'blk': 12, 'tov': 13}

In [140]:
np.asarray(encodings).shape

(1230, 34, 1329)

In [137]:
def get_tuples():
    game_data = []
    for year in years:
        print(year)
        filename = '{}_labeled_events_sentences.csv'.format(year)
        with open(filename) as inFile:
            csv_reader = csv.reader(inFile, delimiter=',')
            next(csv_reader)
            for line in csv_reader:
                event = line[2]
                values = re.findall(r"'(.*?)'", event)
                game_data.append(values)

    return game_data

def one_hot_int(name_num, stat_num, name_enc, stat_enc):
    one_hot = np.zeros(len(name_enc) + len(stat_enc)+1)
    one_hot[name_num] = 1
    one_hot[(len(name_enc) - 1) + stat_num] = 1
  
    return one_hot

def get_encodings(events):
    with open('players.json') as df:
        data = json.load(df)

    encodings = []
    max_count = 0
    for event in events:
        encoding = []
        if len(event)/3 > max_count:
            max_count = len(event)/3
        for i in range(0,len(event),3):
            name = event[i]
            stat = event[i+1]
            if name in data.keys() and stat in stat_encoding.keys():
                value = float(event[i+2])
                name_encode = data[name]
                stat_encode = stat_encoding[stat]

                one_hot = one_hot_int(name_encode, stat_encode, data, stat_encoding)
                one_hot[len(one_hot) - 1] = value
                encoding.append(one_hot)
        encodings.append(encoding)

    return encodings, max_count

In [138]:
with open('players.json') as df:
    data = json.load(df)

vocab_size = len(data) + len(stat_encoding)

events = get_tuples()
encodings, max_count = get_encodings(events)
print(np.asarray(encodings).shape)
print(vocab_size)
print(max_count)

2017
(1230,)
1328
34.0


In [139]:
for encoding in encodings:
    j = len(encoding)
    while j < max_count:
        encoding.append(np.zeros(len(data) + len(stat_encoding) + 1))
        j += 1

In [67]:
import preprocess

train_event_sentences = pd.read_csv('data/2017/2017_labeled_events_sentences.csv')
train_articles = []
train_events = []
for index, row in train_event_sentences.iterrows():
    train_events.append(preprocess.str_to_tup(row['event']))
    train_articles.append('START ' + row['sentence'])
    

In [102]:
# tokenizer_decoder = Tokenizer(char_level = False, num_words=5000)
# tokenizer_decoder.fit_on_texts(train_articles)

# train_output_data = tokenizer_decoder.texts_to_sequences(train_articles)
# print(len(train_articles[0]))
# print(len(train_output_data[0]))
len(encodings)

902

In [141]:
import re
article_inp = []
article_out = []
encoding_input_data = []
j = 0
for article in train_articles:
    article = article.replace(',', ' ,').replace('.', ' . ')
    tokens = article.split(' ')
    i = 0
    while i<len(tokens):
        if len(tokens[i]) == 0:
            del tokens[i]
        else:
            i += 1
        
    for i in range(len(tokens) - 1):
        inp = tokens[i]
        out = tokens[i + 1]
        article_inp.append(inp)
        article_out.append(out)
        encoding_input_data.append(encodings[j])
    j+=1
print(article_inp[7:13])
print(article_out[7:13])

['points', 'and', '10', 'rebounds', '.', 'Hayward']
['and', '10', 'rebounds', '.', 'Hayward', ',']


In [142]:
tokenizer_decoder = Tokenizer(char_level = False, filters= '', num_words=5000, oov_token='_RARE_')
tokenizer_decoder.fit_on_texts(train_articles)
decoder_input_data = tokenizer_decoder.texts_to_sequences(article_inp)
decoder_output_data = tokenizer_decoder.texts_to_sequences(article_out)

In [143]:
print(np.asarray(encoding_input_data).shape)

MemoryError: 

In [74]:
decoder_input_data[:10]

[[12],
 [12, 194],
 [12, 194, 186],
 [12, 194, 186, 671],
 [12, 194, 186, 671, 465],
 [12, 194, 186, 671, 465, 7],
 [12, 194, 186, 671, 465, 7, 33],
 [12, 194, 186, 671, 465, 7, 33, 3],
 [12, 194, 186, 671, 465, 7, 33, 3, 2],
 [12, 194, 186, 671, 465, 7, 33, 3, 2, 18]]

In [51]:
one_hot_encodings

[array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 1., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 

In [96]:
num_encoder_tokens = 34
num_decoder_tokens = len(tokenizer_decoder.word_index) + 1
latent_dim = 256

# Define an input sequence and process it.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)

encoder_states = [state_h, state_c]
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
x = Embedding(num_decoder_tokens, latent_dim)(decoder_inputs)
x = LSTM(latent_dim, return_sequences=True)(x, initial_state=encoder_states)
decoder_outputs = Dense(num_decoder_tokens, activation='softmax')(x)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile & run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
# Note that `decoder_target_data` needs to be one-hot encoded,
# rather than sequences of integers like `decoder_input_data`!
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None, 25)     0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 256)    3391744     input_3[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 256), (None, 288768      input_2[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LS

In [None]:
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)