In [20]:
import numpy as np 
import pandas as pd
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Model, Input, Sequential
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Bidirectional, Conv1D, GRU
from sklearn.model_selection import train_test_split, StratifiedKFold
from keras.metrics import categorical_accuracy
from keras.regularizers import l1_l2, l2
from keras import backend as K
import tensorflow as tf

In [29]:
import csv
import re
import json
import numpy as np

years = list(range(2017, 2018))

team_map = {'ATL' : 'Atlanta Hawks', 'BOS': 'Boston Celtics', 'BRK': 'Brooklyn Nets', 
			'CHO': 'Charlotte Hornets', 'CHI': 'Chicago Bulls', 'CLE': 'Cleveland Cavaliers',
			'DET': 'Detroit Pistons', 'IND': 'Indiana Pacers', 'MIA': 'Miami Heat',
			'MIL': 'Milwaukee Bucks', 'NYK': 'New York Knicks', 'ORL': 'Orlando Magic',
			'PHI': 'Philadelphia 76ers', 'TOR': 'Toronto Raptors', 'WAS': 'Washington Wizards',
			'DAL': 'Dallas Mavericks', 'DEN': 'Denver Nuggets', 'GSW': 'Golden State Warriors',
			'HOU': 'Houston Rockets', 'LAC': 'Los Angeles Clippers', 'LAL': 'Los Angeles Lakers',
			'MEM': 'Memphis Grizzlies', 'MIN': 'Minnesota Timberwolves', 'NOP': 'New Orleans Pelicans',
			'OKC': 'Oklahoma City Thunder', 'PHO': 'Phoenix Suns', 'POR': 'Portland Trail Blazers',
			'SAC': 'Sacramento Kings', 'SAS': 'San Antonio Spurs', 'UTA': 'Utah Jazz',
			'NJN': 'New Jersey Nets', 'SEA': 'Seattle SuperSonics', 'CHA': 'Charlotte Bobcats'}

stat_encoding = {'fg': 1, 'fg_pct': 2, 'fg3': 3, 'fg3_pct': 4, 'ft': 5, 'ft_pct': 6, 'pts': 7, 'orb': 8,
                'trb': 9, 'ast': 10, 'stl': 11, 'blk': 12, 'tov': 13}

In [38]:
max_count = 10
def get_tuples():
    game_data = []
    for year in years:
        print(year)
        filename = 'data/{}/{}_tuple_sentences.csv'.format(year, year)
        with open(filename) as inFile:
            csv_reader = csv.reader(inFile, delimiter=',')
            next(csv_reader)
            for line in csv_reader:
                event = line[0]
                values = re.findall(r"'(.*?)'", event)
                game_data.append(values)

    print(len(game_data))
    return game_data

def one_hot_int(name_num, stat_num, name_enc, stat_enc):
    one_hot = np.zeros(len(name_enc) + len(stat_enc)+1)
    one_hot[name_num] = 1
    one_hot[(len(name_enc) - 1) + stat_num] = 1
  
    return one_hot

def get_encodings(events, vocab_size):

    encodings = np.zeros((len(events),len(stat_encoding)))
    print(encodings.shape)
    for i in range(len(events)):
        event = events[i]        

        stat = event[1]
        if stat in stat_encoding.keys():
            stat_encode = stat_encoding[stat]

            encodings[i][stat_encode - 1] = 1


    return encodings

In [39]:
num_articles = 20000
# with open('players_2017.json') as df:
#     data = json.load(df)

vocab_size = len(stat_encoding)

events = get_tuples()
encodings = get_encodings(events[:num_articles], vocab_size)
print(vocab_size)
print(max_count)


2017
21355
(20000, 13)
13
10


In [40]:
print(encodings[0], events[0])

[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.] ['Love,Kevin', 'pts', '15']


In [43]:
year = 2017
def str_to_tup(string):
	game_data = []
	tokens = string.split(')')[:-1]
	for token in tokens:
		values = re.findall(r"'(.*?)'", token)
		game_data.append(values)

	return game_data

train_event_sentences = pd.read_csv('data/{}/{}_tuple_sentences.csv'.format(year, year))
train_articles = []
train_events = []
train_event_sentences_sample = train_event_sentences[:num_articles]
for index, row in train_event_sentences_sample.iterrows():
    train_events.append(str_to_tup(row['event']))
    train_articles.append(row['sentence'])

    

In [44]:
import re
article_size = 25
tokenized_articles = [] 
for article in train_articles:
    article = article.replace(',', ' ,').replace('.', ' . ')
    tokens = article.split(' ')
    i = 0
    while i<len(tokens):
        if len(tokens[i]) == 0:
            del tokens[i]
        else:
            i += 1
    end_point = min(article_size - 1, len(tokens))
    curr_article = []
    for i in range(end_point):
        curr_article.append(tokens[i])
    curr_article.append('_STOP_')
    for i in range(end_point + 1, article_size):
        curr_article.append('_PAD_')
    tokenized_articles.append(curr_article)
print(tokenized_articles[5])

['Love', 'had', '15', 'and', 'PLAYER_FIRST', 'PLAYER_LAST', 'STAT', 'in', 'his', 'debut', 'for', 'the', 'Cavs', ',', 'who', 'are', 'trying', 'to', 'blend', 'new', 'faces', 'and', 'big', 'egos', '_STOP_']


In [45]:
tokenizer_decoder = Tokenizer(char_level = False, filters= '', num_words=5000, oov_token='_RARE_')
tokenizer_decoder.fit_on_texts(tokenized_articles)
num_words = len(tokenizer_decoder.word_index) + 1
y = tokenizer_decoder.texts_to_sequences(tokenized_articles)
y_final = np.zeros((len(y), article_size, num_words))
print(y_final.shape)
for i in range(len(y)):

    for j in range(len(y[i])):
        
        y_final[i][j][y[i][j]] = 1.0


(20000, 25, 5259)


In [46]:
X = np.asarray(encodings)
X.shape

(20000, 13)

In [48]:
tmp = y_final.reshape(20000, -1)
tmp.shape

(20000, 131475)

In [10]:
# X = np.asarray(encodings)
# X_final = np.zeros((num_articles, article_size, vocab_size * max_count))
# print(X_final.shape)
# for i in range(num_articles):
#     for j in range(article_size):
#         X_final[i][j] = X[i]


(1000, 100, 5770)


In [52]:
model = Sequential()
model.add(Dense(10000, input_shape=(13,), activation='relu'))
#model.add(LSTM(1000, return_sequences=True))
model.add(Dense(131475, activation="softmax"))
# Compile & run training
# Note that `decoder_target_data` needs to be one-hot encoded,
# rather than sequences of integers like `decoder_input_data`!
model.summary()
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 10000)             140000    
_________________________________________________________________
dense_2 (Dense)              (None, 131475)            1314881475
Total params: 1,315,021,475
Trainable params: 1,315,021,475
Non-trainable params: 0
_________________________________________________________________


In [54]:
model.fit(X, tmp,
          batch_size=1000,
          epochs=10,
          validation_split=0.1)

Train on 18000 samples, validate on 2000 samples
Epoch 1/10


ResourceExhaustedError: OOM when allocating tensor with shape[1000,131475] and type bool on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
	 [[Node: training/RMSprop/gradients/loss/dense_2_loss/clip_by_value/Minimum_grad/LessEqual = LessEqual[T=DT_FLOAT, _class=["loc:@loss/dense_2_loss/clip_by_value/Minimum"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](loss/dense_2_loss/truediv, loss/dense_2_loss/sub)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


Caused by op 'training/RMSprop/gradients/loss/dense_2_loss/clip_by_value/Minimum_grad/LessEqual', defined at:
  File "C:\Users\anshk\Anaconda3\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\Users\anshk\Anaconda3\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\Users\anshk\Anaconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Users\anshk\Anaconda3\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "C:\Users\anshk\Anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 486, in start
    self.io_loop.start()
  File "C:\Users\anshk\Anaconda3\lib\site-packages\tornado\platform\asyncio.py", line 127, in start
    self.asyncio_loop.run_forever()
  File "C:\Users\anshk\Anaconda3\lib\asyncio\base_events.py", line 422, in run_forever
    self._run_once()
  File "C:\Users\anshk\Anaconda3\lib\asyncio\base_events.py", line 1432, in _run_once
    handle._run()
  File "C:\Users\anshk\Anaconda3\lib\asyncio\events.py", line 145, in _run
    self._callback(*self._args)
  File "C:\Users\anshk\Anaconda3\lib\site-packages\tornado\platform\asyncio.py", line 117, in _handle_events
    handler_func(fileobj, events)
  File "C:\Users\anshk\Anaconda3\lib\site-packages\tornado\stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\Users\anshk\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 450, in _handle_events
    self._handle_recv()
  File "C:\Users\anshk\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "C:\Users\anshk\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "C:\Users\anshk\Anaconda3\lib\site-packages\tornado\stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\Users\anshk\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "C:\Users\anshk\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "C:\Users\anshk\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "C:\Users\anshk\Anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\Users\anshk\Anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "C:\Users\anshk\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2662, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "C:\Users\anshk\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2785, in _run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "C:\Users\anshk\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2909, in run_ast_nodes
    if self.run_code(code, result):
  File "C:\Users\anshk\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-54-0eae0ba0eba7>", line 4, in <module>
    validation_split=0.1)
  File "C:\Users\anshk\Anaconda3\lib\site-packages\keras\engine\training.py", line 1010, in fit
    self._make_train_function()
  File "C:\Users\anshk\Anaconda3\lib\site-packages\keras\engine\training.py", line 509, in _make_train_function
    loss=self.total_loss)
  File "C:\Users\anshk\Anaconda3\lib\site-packages\keras\legacy\interfaces.py", line 91, in wrapper
    return func(*args, **kwargs)
  File "C:\Users\anshk\Anaconda3\lib\site-packages\keras\optimizers.py", line 256, in get_updates
    grads = self.get_gradients(loss, params)
  File "C:\Users\anshk\Anaconda3\lib\site-packages\keras\optimizers.py", line 89, in get_gradients
    grads = K.gradients(loss, params)
  File "C:\Users\anshk\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py", line 2757, in gradients
    return tf.gradients(loss, variables, colocate_gradients_with_ops=True)
  File "C:\Users\anshk\AppData\Roaming\Python\Python36\site-packages\tensorflow\python\ops\gradients_impl.py", line 609, in gradients
    grad_scope, op, func_call, lambda: grad_fn(op, *out_grads))
  File "C:\Users\anshk\AppData\Roaming\Python\Python36\site-packages\tensorflow\python\ops\gradients_impl.py", line 375, in _MaybeCompile
    return grad_fn()  # Exit early
  File "C:\Users\anshk\AppData\Roaming\Python\Python36\site-packages\tensorflow\python\ops\gradients_impl.py", line 609, in <lambda>
    grad_scope, op, func_call, lambda: grad_fn(op, *out_grads))
  File "C:\Users\anshk\AppData\Roaming\Python\Python36\site-packages\tensorflow\python\ops\math_grad.py", line 910, in _MinimumGrad
    return _MaximumMinimumGrad(op, grad, math_ops.less_equal)
  File "C:\Users\anshk\AppData\Roaming\Python\Python36\site-packages\tensorflow\python\ops\math_grad.py", line 892, in _MaximumMinimumGrad
    xmask = selector_op(x, y)
  File "C:\Users\anshk\AppData\Roaming\Python\Python36\site-packages\tensorflow\python\ops\gen_math_ops.py", line 2492, in less_equal
    "LessEqual", x=x, y=y, name=name)
  File "C:\Users\anshk\AppData\Roaming\Python\Python36\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "C:\Users\anshk\AppData\Roaming\Python\Python36\site-packages\tensorflow\python\framework\ops.py", line 3160, in create_op
    op_def=op_def)
  File "C:\Users\anshk\AppData\Roaming\Python\Python36\site-packages\tensorflow\python\framework\ops.py", line 1625, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

...which was originally created as op 'loss/dense_2_loss/clip_by_value/Minimum', defined at:
  File "C:\Users\anshk\Anaconda3\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
[elided 22 identical lines from previous traceback]
  File "C:\Users\anshk\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-52-14b5ac1748d9>", line 9, in <module>
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
  File "C:\Users\anshk\Anaconda3\lib\site-packages\keras\engine\training.py", line 342, in compile
    sample_weight, mask)
  File "C:\Users\anshk\Anaconda3\lib\site-packages\keras\engine\training_utils.py", line 404, in weighted
    score_array = fn(y_true, y_pred)
  File "C:\Users\anshk\Anaconda3\lib\site-packages\keras\losses.py", line 69, in categorical_crossentropy
    return K.categorical_crossentropy(y_true, y_pred)
  File "C:\Users\anshk\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py", line 3294, in categorical_crossentropy
    output = tf.clip_by_value(output, _epsilon, 1. - _epsilon)
  File "C:\Users\anshk\AppData\Roaming\Python\Python36\site-packages\tensorflow\python\ops\clip_ops.py", line 62, in clip_by_value
    t_min = math_ops.minimum(t, clip_value_max)
  File "C:\Users\anshk\AppData\Roaming\Python\Python36\site-packages\tensorflow\python\ops\gen_math_ops.py", line 3023, in minimum
    "Minimum", x=x, y=y, name=name)
  File "C:\Users\anshk\AppData\Roaming\Python\Python36\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "C:\Users\anshk\AppData\Roaming\Python\Python36\site-packages\tensorflow\python\framework\ops.py", line 3160, in create_op
    op_def=op_def)
  File "C:\Users\anshk\AppData\Roaming\Python\Python36\site-packages\tensorflow\python\framework\ops.py", line 1625, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[1000,131475] and type bool on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
	 [[Node: training/RMSprop/gradients/loss/dense_2_loss/clip_by_value/Minimum_grad/LessEqual = LessEqual[T=DT_FLOAT, _class=["loc:@loss/dense_2_loss/clip_by_value/Minimum"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](loss/dense_2_loss/truediv, loss/dense_2_loss/sub)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.



In [22]:
prediction = model.predict(X_final[1].reshape((1, 100, 5770)))
pred_words = [np.argmax(i) for i in prediction[0]]

index_word = {v: k for k, v in tokenizer_decoder.word_index.items()} 
article_pred = [index_word[i] for i in pred_words]
print(article_pred)

['the', 'had', 'scored', 'points', 'points', 'points', 'points', 'points', 'and', 'and', 'and', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the']
