In [81]:
import numpy as np 
import pandas as pd
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Model, Input, Sequential
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Bidirectional, Conv1D, GRU
from sklearn.model_selection import train_test_split, StratifiedKFold
from keras.metrics import categorical_accuracy
from keras.regularizers import l1_l2, l2
from keras import backend as K
import tensorflow as tf

In [106]:
import csv
import re
import json
import numpy as np

years = list(range(2016, 2018))

team_map = {'ATL' : 'Atlanta Hawks', 'BOS': 'Boston Celtics', 'BRK': 'Brooklyn Nets', 
			'CHO': 'Charlotte Hornets', 'CHI': 'Chicago Bulls', 'CLE': 'Cleveland Cavaliers',
			'DET': 'Detroit Pistons', 'IND': 'Indiana Pacers', 'MIA': 'Miami Heat',
			'MIL': 'Milwaukee Bucks', 'NYK': 'New York Knicks', 'ORL': 'Orlando Magic',
			'PHI': 'Philadelphia 76ers', 'TOR': 'Toronto Raptors', 'WAS': 'Washington Wizards',
			'DAL': 'Dallas Mavericks', 'DEN': 'Denver Nuggets', 'GSW': 'Golden State Warriors',
			'HOU': 'Houston Rockets', 'LAC': 'Los Angeles Clippers', 'LAL': 'Los Angeles Lakers',
			'MEM': 'Memphis Grizzlies', 'MIN': 'Minnesota Timberwolves', 'NOP': 'New Orleans Pelicans',
			'OKC': 'Oklahoma City Thunder', 'PHO': 'Phoenix Suns', 'POR': 'Portland Trail Blazers',
			'SAC': 'Sacramento Kings', 'SAS': 'San Antonio Spurs', 'UTA': 'Utah Jazz',
			'NJN': 'New Jersey Nets', 'SEA': 'Seattle SuperSonics', 'CHA': 'Charlotte Bobcats'}

stat_encoding = {'fg': 1, 'fg_pct': 2, 'fg3': 3, 'fg3_pct': 4, 'ft': 5, 'ft_pct': 6, 'pts': 7, 'orb': 8,
                'trb': 9, 'ast': 10, 'stl': 11, 'blk': 12, 'tov': 13}

In [107]:
max_count = 10
def get_tuples():
    game_data = []
    for year in years:
        print(year)
        filename = 'data/{}/{}_tuple_sentences.csv'.format(year, year)
        with open(filename) as inFile:
            csv_reader = csv.reader(inFile, delimiter=',')
            next(csv_reader)
            for line in csv_reader:
                event = line[0]
                values = re.findall(r"'(.*?)'", event)
                game_data.append(values)

    print(len(game_data))
    return game_data

def one_hot_int(name_num, stat_num, name_enc, stat_enc):
    one_hot = np.zeros(len(name_enc) + len(stat_enc)+1)
    one_hot[name_num] = 1
    one_hot[(len(name_enc) - 1) + stat_num] = 1
  
    return one_hot

def get_encodings(events, vocab_size):

    encodings = np.zeros((len(events),len(stat_encoding)))
    values = []
    print(encodings.shape)
    for i in range(len(events)):
        event = events[i]        
        stat = event[1]
        if stat in stat_encoding.keys():
            stat_encode = stat_encoding[stat]
            values.append(stat_encode - 1)
            encodings[i][stat_encode - 1] = 1


    return encodings, values

In [121]:
num_articles = 40000
# with open('players_2017.json') as df:
#     data = json.load(df)

vocab_size = len(stat_encoding)

events = get_tuples()
encodings, values = get_encodings(events[:num_articles], vocab_size)
print(vocab_size)
print(max_count)


2016
2017
42710
(40000, 13)
13
10


In [110]:
print(encodings[0], events[0])

[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.] ['Love,Kevin', 'pts', '15']


In [122]:
def str_to_tup(string):
	game_data = []
	tokens = string.split(')')[:-1]
	for token in tokens:
		values = re.findall(r"'(.*?)'", token)
		game_data.append(values)

	return game_data

train_events = []

for year in years:
    train_event_sentences = pd.read_csv('data/{}/{}_tuple_sentences.csv'.format(year, year))
    train_event_sentences_sample = train_event_sentences[:num_articles]
    for index, row in train_event_sentences_sample.iterrows():
        train_events.append(str_to_tup(row['event']))
        train_articles.append(row['sentence'])

    

In [123]:
counts = dict()
for stat in stat_encoding.keys():
    counts[stat_encoding[stat] - 1] = 0
for value in values:
    counts[value] += 1
ordered = []
for key in stat_encoding.keys():
    print(key + ': ' + str(counts[stat_encoding[key] - 1]))
    ordered.append(counts[stat_encoding[key] - 1])
ordered.sort()    

fg: 3899
fg_pct: 0
fg3: 5806
fg3_pct: 0
ft: 4309
ft_pct: 0
pts: 19251
orb: 718
trb: 3952
ast: 1674
stl: 80
blk: 64
tov: 247


In [124]:
normalize = 1000 * len(years)
i = 0
while i < len(values):
    
    if counts[values[i]] > normalize:
        encodings = np.delete(encodings, i, 0)
        del train_articles[i]
        counts[values[i]] -= 1
        del values[i]
    else:
        i += 1

In [65]:
import re
article_size = 20
tokenized_articles = []
count = 0
for article in train_articles:
    article = article.replace(',', ' ,').replace('.', ' . ')
    tokens = article.split(' ')
    i = 0
    while i<len(tokens):
        if len(tokens[i]) == 0:
            del tokens[i]
        else:
            i += 1
#     end_point = min(article_size - 1, len(tokens))
    curr_article = []
    curr_article.append('_START_')
    for i in range(len(tokens)):
        count += 1
        curr_article.append(tokens[i])
    curr_article.append('_STOP_')
    count += 2
#     for i in range(end_point + 1, article_size):
#         curr_article.append('_PAD_')
    tokenized_articles.append(curr_article)
print(tokenized_articles[5])

['_START_', 'Love', 'had', '15', 'and', 'PLAYER_FIRST', 'PLAYER_LAST', 'STAT', 'in', 'his', 'debut', 'for', 'the', 'Cavs', ',', 'who', 'are', 'trying', 'to', 'blend', 'new', 'faces', 'and', 'big', 'egos', 'quickly', '.', '_STOP_']


In [66]:
tokenizer_decoder = Tokenizer(char_level = False, filters= '', num_words=1000, oov_token='_RARE_')
tokenizer_decoder.fit_on_texts(tokenized_articles)
num_words = len(tokenizer_decoder.word_index) + 1
y = tokenizer_decoder.texts_to_sequences(tokenized_articles)
# y_final = np.zeros((len(y), article_size, num_words))
# print(y_final.shape)
# for i in range(len(y)):

#     for j in range(len(y[i])):
        
#         y_final[i][j][y[i][j]] = 1.0
count

571361

In [67]:
X_final = np.zeros((count, len(stat_encoding) + 3 * tokenizer_decoder.num_words))
y_final = np.zeros((count, tokenizer_decoder.num_words))
for i in range(len(y)):
    curr = y[i]
    for j in range(len(curr) - 3):
        
        prev2_word =  curr[j]
        prev1_word =  curr[j + 1]
        prev_word =  curr[j + 2]
        curr_word = curr[j + 3]
        X_final[i][:len(stat_encoding)] = X[i]
        X_final[len(stat_encoding) + prev2_word] = 1
        X_final[len(stat_encoding) + tokenizer_decoder.num_words + prev1_word] = 1
        X_final[len(stat_encoding) + 2 * tokenizer_decoder.num_words + prev_word] = 1
        y_final[i][curr_word] = 1
        


In [68]:
X_final.shape

(571361, 3013)

In [69]:
y_final.shape

(571361, 1000)

In [11]:
# X = np.asarray(encodings)
# X_final = np.zeros((num_articles, article_size, vocab_size * max_count))
# print(X_final.shape)
# for i in range(num_articles):
#     for j in range(article_size):
#         X_final[i][j] = X[i]


In [78]:
model = Sequential()
model.add(Dense(1500, input_shape=(3013,), activation='relu'))
#model.add(LSTM(1000, return_sequences=True))
model.add(Dense(1000, activation="softmax"))
# Compile & run training
# Note that `decoder_target_data` needs to be one-hot encoded,
# rather than sequences of integers like `decoder_input_data`!
model.summary()
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_23 (Dense)             (None, 1500)              4521000   
_________________________________________________________________
dense_24 (Dense)             (None, 1000)              1501000   
Total params: 6,022,000
Trainable params: 6,022,000
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.fit(X_final, y_final,
          batch_size=128,
          epochs=10,
          validation_split=0.1)

Train on 514224 samples, validate on 57137 samples
Epoch 1/10
Epoch 2/10

In [None]:
prediction = model.predict(X_final[1].reshape((1, 100, 5770)))
pred_words = [np.argmax(i) for i in prediction[0]]

index_word = {v: k for k, v in tokenizer_decoder.word_index.items()} 
article_pred = [index_word[i] for i in pred_words]
print(article_pred)