In [1]:
import numpy as np 
import pandas as pd
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Bidirectional, Conv1D, GRU
from sklearn.model_selection import train_test_split, StratifiedKFold
from keras.metrics import categorical_accuracy
from keras.regularizers import l1_l2, l2
from keras import backend as K
import tensorflow as tf
import nltk

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
train_bs = pd.read_csv('data/2017/2017_tuple_space.csv')
train_art = pd.read_csv('data/2017/2017_article_box.csv')
# test_df = pd.read_csv('test.csv')

maxlen_seq_bs = 333
maxlen_seq_art = 1500

# Loading and converting the inputs to trigrams
train_input_seqs = train_bs['tuples'].values.T
train_target_seqs = train_art['article'].values.T

# # Same for test
# test_input_seqs = test_df['input'].values.T
# test_input_grams = seq2ngrams(test_input_seqs)

In [4]:
# Initializing and defining the tokenizer encoders and decoders based on the train set
tokenizer_encoder = Tokenizer(char_level=False)
tokenizer_encoder.fit_on_texts(train_input_seqs)
tokenizer_decoder = Tokenizer(char_level = False, num_words=5000)
tokenizer_decoder.fit_on_texts(train_target_seqs)

# Using the tokenizer to encode and decode the sequences for use in training
# Inputs
train_input_data = tokenizer_encoder.texts_to_sequences(train_input_seqs)
train_input_data = sequence.pad_sequences(train_input_data, maxlen = maxlen_seq_bs, padding = 'post')

# Targets
train_target_data = tokenizer_decoder.texts_to_sequences(train_target_seqs)
train_target_data = sequence.pad_sequences(train_target_data, maxlen = maxlen_seq_art, padding = 'post')
train_target_data = to_categorical(train_target_data)

# Use the same tokenizer defined on train for tokenization of test
# test_input_data = tokenizer_encoder.texts_to_sequences(test_input_grams)
# test_input_data = sequence.pad_sequences(test_input_data, maxlen = maxlen_seq, padding = 'post')

# Computing the number of words and number of tags to be passed as parameters to the keras model
n_words = len(tokenizer_encoder.word_index) + 1
n_tags = len(tokenizer_decoder.word_index) + 1

input = Input(shape = (maxlen_seq_bs,))

In [6]:
tokenizer_decoder.word_index

{'the': 1,
 'a': 2,
 'and': 3,
 'to': 4,
 'in': 5,
 'of': 6,
 'on': 7,
 'with': 8,
 'points': 9,
 'for': 10,
 'said': 11,
 'was': 12,
 'game': 13,
 'his': 14,
 'had': 15,
 'he': 16,
 'it': 17,
 'we': 18,
 'that': 19,
 'at': 20,
 'i': 21,
 'first': 22,
 '3': 23,
 'but': 24,
 'from': 25,
 'quarter': 26,
 'season': 27,
 'after': 28,
 'scored': 29,
 'up': 30,
 'just': 31,
 'night': 32,
 'as': 33,
 'games': 34,
 'by': 35,
 'out': 36,
 'have': 37,
 'they': 38,
 'second': 39,
 'left': 40,
 'their': 41,
 'is': 42,
 'rebounds': 43,
 'who': 44,
 'this': 45,
 'two': 46,
 'back': 47,
 'lead': 48,
 '10': 49,
 'point': 50,
 'play': 51,
 'coach': 52,
 'three': 53,
 'got': 54,
 'when': 55,
 'were': 56,
 'made': 57,
 'more': 58,
 'all': 59,
 'team': 60,
 'third': 61,
 'fourth': 62,
 'half': 63,
 'an': 64,
 'one': 65,
 'be': 66,
 'time': 67,
 'four': 68,
 'straight': 69,
 'has': 70,
 'not': 71,
 '11': 72,
 'win': 73,
 "it's": 74,
 'nba': 75,
 'against': 76,
 'last': 77,
 'get': 78,
 'minutes': 79,
 'fiv

In [2]:
train_articles = pd.read_csv('data/2017/2017_articles.csv')
box_scores = pd.read_csv('data/2017/2017_article_box.csv')
event_space = pd.read_csv('data/2017/2017_tuple_space.csv')

team_map = {'ATL' : 'Atlanta Hawks', 'BOS': 'Boston Celtics', 'BRK': 'Brooklyn Nets', 
			'CHO': 'Charlotte Hornets', 'CHI': 'Chicago Bulls', 'CLE': 'Cleveland Cavaliers',
			'DET': 'Detroit Pistons', 'IND': 'Indiana Pacers', 'MIA': 'Miami Heat',
			'MIL': 'Milwaukee Bucks', 'NYK': 'New York Knicks', 'ORL': 'Orlando Magic',
			'PHI': 'Philadelphia 76ers', 'TOR': 'Toronto Raptors', 'WAS': 'Washington Wizards',
			'DAL': 'Dallas Mavericks', 'DEN': 'Denver Nuggets', 'GSW': 'Golden State Warriors',
			'HOU': 'Houston Rockets', 'LAC': 'Los Angeles Clippers', 'LAL': 'Los Angeles Lakers',
			'MEM': 'Memphis Grizzlies', 'MIN': 'Minnesota Timberwolves', 'NOP': 'New Orleans Pelicans',
			'OKC': 'Oklahoma City Thunder', 'PHO': 'Phoenix Suns', 'POR': 'Portland Trail Blazers',
			'SAC': 'Sacramento Kings', 'SAS': 'San Antonio Spurs', 'UTA': 'Utah Jazz',
			'NJN': 'New Jersey Nets', 'SEA': 'Seattle SuperSonics', 'CHA': 'Charlotte Bobcats'}

inv_team_map = {v: k for k, v in team_map.items()}

In [51]:
import re, csv, operator
from word2number import w2n
from preprocess import str_to_tup, get_tuples

articles = train_articles['article']

#year = '2017'
years = []
for i in range(2017, 2018):
    years.append(str(i))
    

for year in years:
    file_write = 'data/{}/{}_labeled_events.csv'.format(year, year)
    outFile = open(file_write, 'w')
    label_writer = csv.writer(outFile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    label_writer.writerow(['event', 'label'])
    for idx in range(len(articles)):
        if idx % 100 == 0:
            print(year, idx)
        
        article = articles[idx]
        article = article.replace('-', ' ')
        capital_words = re.findall('([A-Z][a-z]+)', article)

        data = box_scores['box_score'][idx]
        bs_data = str_to_tup(data)
        names = set()
        for record in bs_data:
            if len(record) == 15:
                names.add(record[0])

        team_map_values = [v for k, v in team_map.items()]

        names_present = set()
        for word in capital_words:
            for full_name in names:
                player = full_name.split(',')
                if word in player:
                    if len(player) > 1:
                        names_present.add((full_name, word))
            for team_name in team_map_values:
                team_name_entities = team_name.split(" ")
                if word in team_name_entities:
                    names_present.add((inv_team_map[team_name], word))
                                
        sentences = set()
        for name in names_present:
            for match in re.finditer(name[1], article):
                start = match.start()
                sent_end = start
                while True:
                    sent_end += 1
                    if article[sent_end:sent_end+2] == '. ' or sent_end > len(article) - 1:
                        break

                sent_begin = start
                while True:
                    sent_begin -= 1
                    if article[sent_begin] == '.' or sent_begin == 0:
                        break

                sentences.add((name[0], article[sent_begin+2:sent_end+1]))
                
        all_events = []
        for (name, s) in sentences:
            sent_words = s.split()
            for word in sent_words:
                if word.isdigit():
                    all_events.append((name, int(word), s))
                else:
                    try:
                        number = w2n.word_to_num(word)
                        split = s.split(" ")
                        i = split.index(word)
                        if "scored" in split or "points" in split:
                            all_events.append((name, number, s))
                    except:
                        continue

        game_event_space = str_to_tup(event_space['tuples'][idx])
        game_event_space = [tuple(i) for i in game_event_space]

        for record in game_event_space:
            for event in all_events:
                found_event = False
                if record[0] == event[0] and str(event[1]) == record[2]:
                    found_event = True
                    should_write = parse_event_by_sentence(record[0], record[1], record[2], event[2])
                    if should_write:
                        label_writer.writerow((record, 1))
            if not found_event:
                random = int(3 * np.random.uniform(0.0, 1.0, 1))

                if random == 0:
                    label_writer.writerow((record, 0))

    outFile.close()

2017 0
2017 100
2017 200
2017 300
2017 400
2017 500
2017 600
2017 700
2017 800
2017 900
2017 1000
2017 1100
2017 1200


In [3]:
def parse_event_by_sentence(name, cat, value, sent):
    if cat == 'ast' and 'assists' not in sent:
        return False
    if cat == 'stl' and 'steals' not in sent:
        return False
    if (cat == 'orb' or cat == 'trb') and ('rebounds' not in sent and 'boards' not in sent):
        return False
    if cat == 'tov' and 'turnovers' not in sent:
        return False
    if cat == 'blk' and 'blocks' not in sent:
        return False
    return True

In [9]:
import re, csv, operator
from word2number import w2n
from preprocess import str_to_tup, get_tuples

articles = train_articles['article']

#year = '2017'
years = []
for i in range(2017, 2018):
    years.append(str(i))
    

for year in years:
    file_write = 'data/{}/{}_tuple_sentences.csv'.format(year, year)
    outFile = open(file_write, 'w', newline='')
    label_writer = csv.writer(outFile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    label_writer.writerow(['event', 'sentence'])
    for idx in range(len(articles)):
        if idx % 100 == 0:
            print(year, idx)
        
        article = articles[idx]
        article = article.replace('-', ' ')
        capital_words = re.findall('([A-Z][a-z]+)', article)

        data = box_scores['box_score'][idx]
        bs_data = str_to_tup(data)
        names = set()
        for record in bs_data:
            if len(record) == 15:
                names.add(record[0])

        team_map_values = [v for k, v in team_map.items()]

        names_present = set()
        for word in capital_words:
            for full_name in names:
                player = full_name.split(',')
                if word in player:
                    if len(player) > 1:
                        names_present.add((full_name, word))
            for team_name in team_map_values:
                team_name_entities = team_name.split(" ")
                if word in team_name_entities:
                    names_present.add((inv_team_map[team_name], word))
                                
        sentences = set()
        for name in names_present:
            for match in re.finditer(name[1], article):
                start = match.start()
                sent_end = start
                while True:
                    sent_end += 1
                    if article[sent_end:sent_end+2] == '. ' or sent_end > len(article) - 1:
                        break

                sent_begin = start
                while True:
                    sent_begin -= 1
                    if article[sent_begin] == '.' or sent_begin == 0:
                        break

                sentences.add((name[0], article[sent_begin+2:sent_end+1]))
                
        all_events = []
        for (name, s) in sentences:
            sent_words = s.split()
            for word in sent_words:
                if word.isdigit():
                    all_events.append((name, int(word), s))
                else:
                    try:
                        number = w2n.word_to_num(word)
                        split = s.split(" ")
                        i = split.index(word)
                        if "scored" in split or "points" in split:
                            all_events.append((name, number, s))
                    except:
                        continue

        game_event_space = str_to_tup(event_space['tuples'][idx])
        game_event_space = [tuple(i) for i in game_event_space]
        
        home_team = event_space['home_team'][idx]
        away_team = event_space['away_team'][idx]

        all_records = set()
        all_sent = set()
        for event in all_events:
            for record in game_event_space:
                if record[0] == event[0] and str(event[1]) == record[2]:
                    should_write = parse_event_by_sentence(record[0], record[1], record[2], event[2])
                    if should_write:
                        try:
                            label_writer.writerow((record, event[2]))
                        except UnicodeEncodeError:
                            print(record, event[2])
        #label_writer.writerow((home_team, away_team, all_records, ''.join(all_sent)))
    outFile.close()

2017 0
2017 100
2017 200
2017 300
2017 400
2017 500
2017 600
2017 700
2017 800
('Davis,Anthony', 'fg3', '3') W YORK    ��� Anthony Davis knows how much the Brooklyn Nets depend on 3 pointers.
2017 900
2017 1000
2017 1100
2017 1200
