In [4]:
import numpy as np 
import pandas as pd
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Model, Input, Sequential
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Bidirectional, Conv1D, GRU
from sklearn.model_selection import train_test_split, StratifiedKFold
from keras.metrics import categorical_accuracy
from keras.regularizers import l1_l2, l2
from keras import backend as K
import tensorflow as tf

In [5]:
import csv
import re
import json
import numpy as np

years = list(range(2017, 2018))

team_map = {'ATL' : 'Atlanta Hawks', 'BOS': 'Boston Celtics', 'BRK': 'Brooklyn Nets', 
			'CHO': 'Charlotte Hornets', 'CHI': 'Chicago Bulls', 'CLE': 'Cleveland Cavaliers',
			'DET': 'Detroit Pistons', 'IND': 'Indiana Pacers', 'MIA': 'Miami Heat',
			'MIL': 'Milwaukee Bucks', 'NYK': 'New York Knicks', 'ORL': 'Orlando Magic',
			'PHI': 'Philadelphia 76ers', 'TOR': 'Toronto Raptors', 'WAS': 'Washington Wizards',
			'DAL': 'Dallas Mavericks', 'DEN': 'Denver Nuggets', 'GSW': 'Golden State Warriors',
			'HOU': 'Houston Rockets', 'LAC': 'Los Angeles Clippers', 'LAL': 'Los Angeles Lakers',
			'MEM': 'Memphis Grizzlies', 'MIN': 'Minnesota Timberwolves', 'NOP': 'New Orleans Pelicans',
			'OKC': 'Oklahoma City Thunder', 'PHO': 'Phoenix Suns', 'POR': 'Portland Trail Blazers',
			'SAC': 'Sacramento Kings', 'SAS': 'San Antonio Spurs', 'UTA': 'Utah Jazz',
			'NJN': 'New Jersey Nets', 'SEA': 'Seattle SuperSonics', 'CHA': 'Charlotte Bobcats'}

stat_encoding = {'fg': 1, 'fg_pct': 2, 'fg3': 3, 'fg3_pct': 4, 'ft': 5, 'ft_pct': 6, 'pts': 7, 'orb': 8,
                'trb': 9, 'ast': 10, 'stl': 11, 'blk': 12, 'tov': 13}

In [9]:
max_count = 10
def get_tuples():
    game_data = []
    for year in years:
        print(year)
        filename = '{}_tuple_sentences.csv'.format(year)
        with open(filename) as inFile:
            csv_reader = csv.reader(inFile, delimiter=',')
            next(csv_reader)
            for line in csv_reader:
                event = line[0]
                values = re.findall(r"'(.*?)'", event)
                game_data.append(values)

    print(len(game_data))
    return game_data

def one_hot_int(name_num, stat_num, name_enc, stat_enc):
    one_hot = np.zeros(len(name_enc) + len(stat_enc)+1)
    one_hot[name_num] = 1
    one_hot[(len(name_enc) - 1) + stat_num] = 1
  
    return one_hot

def get_encodings(events, vocab_size):

    encodings = np.zeros((len(events),len(stat_encoding)))
    print(encodings.shape)
    for i in range(len(events)):
        event = events[i]
      
        stat = event[1]
        if stat in stat_encoding.keys():
            stat_encode = stat_encoding[stat]

            encodings[i][stat_encode - 1] = 1


    return encodings

In [11]:
num_articles = 20000
# with open('players_2017.json') as df:
#     data = json.load(df)

vocab_size = len(stat_encoding)

events = get_tuples()
encodings = get_encodings(events[:num_articles], vocab_size)
print(vocab_size)
print(max_count)


2017
21354
(20000, 13)
13
10


In [13]:
print(encodings[0], events[0])

[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.] ['Irving,Kyrie', 'pts', '22']


In [14]:
def str_to_tup(string):
	game_data = []
	tokens = string.split(')')[:-1]
	for token in tokens:
		values = re.findall(r"'(.*?)'", token)
		game_data.append(values)

	return game_data

train_event_sentences = pd.read_csv('2017_tuple_sentences.csv')
train_articles = []
train_events = []
train_event_sentences_sample = train_event_sentences[:num_articles]
for index, row in train_event_sentences_sample.iterrows():
    train_events.append(str_to_tup(row['event']))
    train_articles.append(row['sentence'])

    

In [17]:
import re
article_size = 25
tokenized_articles = [] 
for article in train_articles:
    article = article.replace(',', ' ,').replace('.', ' . ')
    tokens = article.split(' ')
    i = 0
    while i<len(tokens):
        if len(tokens[i]) == 0:
            del tokens[i]
        else:
            i += 1
    end_point = min(article_size - 1, len(tokens))
    curr_article = []
    for i in range(end_point):
        curr_article.append(tokens[i])
    curr_article.append('_STOP_')
    for i in range(end_point + 1, article_size):
        curr_article.append('_PAD_')
    tokenized_articles.append(curr_article)
print(tokenized_articles[5])

['NEW', 'GUYS', 'In', 'his', 'Cleveland', 'debut', ',', 'Dwyane', 'Wade', 'scored', 'eight', 'points', 'on', '3', 'of', '10', 'shooting', 'in', '28', 'minutes', '.', '_STOP_', '_PAD_', '_PAD_', '_PAD_']


In [18]:
tokenizer_decoder = Tokenizer(char_level = False, filters= '', num_words=5000, oov_token='_RARE_')
tokenizer_decoder.fit_on_texts(tokenized_articles)
num_words = len(tokenizer_decoder.word_index) + 1
y = tokenizer_decoder.texts_to_sequences(tokenized_articles)
y_final = np.zeros((len(y), article_size, num_words))
print(y_final.shape)
for i in range(len(y)):

    for j in range(len(y[i])):
        
        y_final[i][j][y[i][j]] = 1.0


(20000, 25, 5214)


In [19]:
X = np.asarray(encodings)
X.shape

(20000, 13)

In [10]:
X = np.asarray(encodings)
X_final = np.zeros((num_articles, article_size, vocab_size * max_count))
print(X_final.shape)
for i in range(num_articles):
    for j in range(article_size):
        X_final[i][j] = X[i]


(1000, 100, 5770)


In [0]:
X[0] == X_final[0][5]

array([ True,  True,  True, ...,  True,  True,  True])

In [11]:
model = Sequential()
model.add(LSTM(500, input_shape=(100, 5770), return_sequences=True))
model.add(LSTM(500, return_sequences=True))
model.add(Dense(4132, activation="softmax"))
# Compile & run training
# Note that `decoder_target_data` needs to be one-hot encoded,
# rather than sequences of integers like `decoder_input_data`!
model.summary()
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 100, 500)          12542000  
_________________________________________________________________
lstm_2 (LSTM)                (None, 100, 500)          2002000   
_________________________________________________________________
dense_1 (Dense)              (None, 100, 4132)         2070132   
Total params: 16,614,132
Trainable params: 16,614,132
Non-trainable params: 0
_________________________________________________________________


In [13]:
model.fit(X_final, y_final,
          batch_size=50,
          epochs=10,
          validation_split=0.1)

Train on 900 samples, validate on 100 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f9b97956470>

In [22]:
prediction = model.predict(X_final[1].reshape((1, 100, 5770)))
pred_words = [np.argmax(i) for i in prediction[0]]

index_word = {v: k for k, v in tokenizer_decoder.word_index.items()} 
article_pred = [index_word[i] for i in pred_words]
print(article_pred)

['the', 'had', 'scored', 'points', 'points', 'points', 'points', 'points', 'and', 'and', 'and', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the']
