# Sports Features

This notebook reads home game information for the Red Sox, Celtics, and Bruins and engineers features from it.

## Setup

In [1]:
# Libraries.
import datetime as dt
import numpy as np
import pandas as pd

In [2]:
# Library settings.

# Pandas: cap on how many columns are shown when a frame is displayed.
MAX_DISPLAY_COLUMNS = 1000
pd.set_option('display.max_columns', MAX_DISPLAY_COLUMNS)

In [3]:
# Other miscellanea.

# Team abbreviations; the feature cells use them as column-name prefixes.
teams = [
    'bruins',
    'celtics',
    'sox',
]

# Approximate game lengths per team, from a cursory web search of typical
# MLB/NBA/NHL lengths (presumably in hours -- TODO confirm).
game_lengths = {'bruins': 2.3333, 'celtics': 2.25, 'sox': 3 + 8.0 / 60.0}

# Weekday abbreviations, in order starting from Monday.
weekday_abbreviations = 'mon,tue,wed,thu,fri,sat,sun'.split(',')

# Games starting before this time of day count as "early"; the rest as "late".
late_game_start = dt.time(hour=18)

## Data

In [24]:
# Read Bruins home games.
# `pd.DataFrame.from_csv` was removed in pandas 1.0. `pd.read_csv` with
# `index_col=0, parse_dates=True` reproduces its defaults, and `reset_index(drop=False)`
# then turns that parsed index back into an ordinary column.
bruins = pd.read_csv("../../../data/bruins/home.csv", index_col=0, parse_dates=True).reset_index(drop=False)

# Drop home column.
bruins = bruins.drop(['Home'], axis=1)

# Rename other columns to make joining easy (every team-specific column gets a `bruins_` prefix).
bruins = bruins.rename(columns={'Datetime':'bruins_datetime', 'Opponent':'bruins_opponent', 'Playoff': 'bruins_playoff'})

# Get standalone game date, time.
# `date` is the shared join key across teams; the time stays team-specific.
bruins['date'] = pd.DatetimeIndex(bruins.bruins_datetime).date
bruins['bruins_time'] = pd.DatetimeIndex(bruins.bruins_datetime).time

# Drop datetime now that it has been split into date and time.
bruins = bruins.drop(['bruins_datetime'], axis=1)

In [None]:
# Preview the first rows of the cleaned Bruins frame.
bruins.head()

In [26]:
# Read Celtics home games.
# `pd.DataFrame.from_csv` was removed in pandas 1.0. `pd.read_csv` with
# `index_col=0, parse_dates=True` reproduces its defaults, and `reset_index(drop=False)`
# then turns that parsed index back into an ordinary column.
celtics = pd.read_csv("../../../data/celtics/home.csv", index_col=0, parse_dates=True).reset_index(drop=False)

# Drop home column. (The playoff column is kept; it is renamed below.)
celtics = celtics.drop(['Home'], axis=1)

# Rename other columns to make joining easy (every team-specific column gets a `celtics_` prefix).
celtics = celtics.rename(columns={'Datetime':'celtics_datetime', 'Opponent':'celtics_opponent', 'Playoff': 'celtics_playoff'})

# Get standalone game date, time.
# `date` is the shared join key across teams; the time stays team-specific.
celtics['date'] = pd.DatetimeIndex(celtics.celtics_datetime).date
celtics['celtics_time'] = pd.DatetimeIndex(celtics.celtics_datetime).time

# Drop datetime now that it has been split into date and time.
celtics = celtics.drop(['celtics_datetime'], axis=1)

In [None]:
# Preview the first rows of the cleaned Celtics frame.
celtics.head()

In [32]:
# Read Red Sox home games.
# `pd.DataFrame.from_csv` was removed in pandas 1.0; `pd.read_csv` with
# `index_col=0, parse_dates=True` reproduces its defaults. The index is discarded here.
sox = pd.read_csv("../../../data/sox_master.csv", index_col=0, parse_dates=True).reset_index(drop=True)

# Rename other columns to make joining easy.
sox = sox.rename(columns={'START_DATE':'date', 'START_TIME':'sox_time', 'OPPONENT':'sox_opponent'})

# Drop 0s from start date by keeping only the first digits-digits-digits match.
# `expand=False` keeps the result a Series (newer pandas returns a DataFrame by default),
# and the raw string avoids invalid-escape warnings for `\d`.
sox['date'] = sox['date'].str.extract(r'(\d+-\d+-\d+)', expand=False)

# Column for start datetime.
sox['sox_datetime'] = pd.to_datetime(sox['date'] + ' ' + sox['sox_time'])

# Add a "rounded" game time to make grouping easier: minutes 00-25 become :00 and
# minutes 30-55 become :30. Only minute values ending in 0 or 5 match these patterns;
# any other minute value is left untouched.
# `regex=True` is explicit because pandas >= 2.0 treats the pattern as a literal by default.
sox['sox_time_rounded'] = (
    sox['sox_time']
    .str.replace(r':[012][05] ', ':00 ', regex=True)
    .str.replace(r':[345][05] ', ':30 ', regex=True)
)
sox['sox_datetime_rounded'] = pd.to_datetime(sox['date'] + ' ' + sox['sox_time_rounded'])

# Turn start date into a date (the shared join key across teams).
sox['date'] = pd.DatetimeIndex(sox['date']).date

# Convert time fields to times.
sox['sox_time'] = pd.DatetimeIndex(sox['sox_datetime']).time
sox['sox_time_rounded'] = pd.DatetimeIndex(sox['sox_datetime_rounded']).time

# Drop the helper datetime columns now that date and time are split out.
sox = sox.drop(['sox_datetime','sox_datetime_rounded'], axis=1)

In [None]:
# Preview the first rows of the cleaned Red Sox frame.
sox.head()

## Merge

In [35]:
# Get game dates: the sorted set of dates on which any of the three teams has a home game.
# `Series.append` was removed in pandas 2.0; `pd.concat` is its replacement.
game_dates = sorted(set(pd.concat([bruins.date, celtics.date, sox.date])))

# Start DF with one row per game date.
games = pd.DataFrame({'date': game_dates})

# Basic weekday column (0 = Monday ... 6 = Sunday).
games['weekday'] = pd.DatetimeIndex(games.date).weekday

# Merge to each team.
# Left joins keep every game date; a team's columns are null on dates it has no home game.
games = (
    games.merge(bruins,  on='date', how='left')
         .merge(celtics, on='date', how='left')
         .merge(sox,     on='date', how='left')
)

# Pre-feature-engineering column count.
# Columns positioned after this count are the engineered features added by later cells.
pre_feature_engineering_d = games.shape[1]

In [None]:
# Preview the first rows of the merged frame before feature engineering.
games.head()

## Features

Derive binary features here. Keep as booleans for now; will turn into 0/1 ints later.

In [37]:
# Simple team game features.
# A team is flagged as playing on a date when its opponent column is non-null.
for team in teams:
    opponent_col = team + '_opponent'
    games[team + '_game'] = games[opponent_col].notnull()

In [38]:
# Day of week features.
# These include one-hot weekday columns and special groupings: M-Th, M-F, F-Sa.
# Every feature is ANDed with the team's `_game` flag, so it is False on dates without a game.
for team in teams:
    is_game_day = games[team+'_game']
    # `enumerate` replaces `xrange(7)`, which does not exist on Python 3;
    # the position in `weekday_abbreviations` is the weekday number (0 = Monday).
    for d, abbreviation in enumerate(weekday_abbreviations):
        games[team+'_day_'+abbreviation] = is_game_day & (games.weekday == d)
    games[team+'_day_mon_thu'] = is_game_day & (games.weekday <= 3)
    games[team+'_day_mon_fri'] = is_game_day & (games.weekday <= 4)
    games[team+'_day_fri_sat'] = is_game_day & (games.weekday.isin([4,5]))

In [39]:
# Early/late features.
# "Early" means the start time is strictly before `late_game_start`;
# "late" means it is at or after that cutoff.
for team in teams:
    start_times = games[team + '_time']
    games[team + '_time_early'] = start_times < late_game_start
    games[team + '_time_late'] = start_times >= late_game_start

In [40]:
# Game day-time intersection features.

# Suffixes for game day group features.
game_day_group_suffixes  = ['fri', 'sat', 'sun', 'mon_thu', 'mon_fri', 'fri_sat']
game_time_group_suffixes = ['early','late']

# Container for useful game feature columns.
game_cols = []

# One set per team: every (day group, time group) pair becomes one boolean column
# named `<team>_<day>_<time>`, true when both the day feature and the time feature are true.
for team in teams:
    for day_suffix in game_day_group_suffixes:
        for time_suffix in game_time_group_suffixes:
            # Calculate each column.
            col_name = team+'_'+day_suffix+'_'+time_suffix
            games[col_name] = games[team+'_day_'+day_suffix] & games[team+'_time_'+time_suffix]
            # Save these columns too. We're interested in using them as final features.
            game_cols.append(col_name)

In [41]:
# Prune game columns to remove overlaps.
# NOTE: these are substring tests, so names ending in '_mon_fri_early' / '_mon_fri_late'
# are dropped along with the plain '_fri_early' / '_fri_late' columns.
overlap_markers = ('_fri_early', '_fri_late')
game_cols = [col for col in game_cols if not any(marker in col for marker in overlap_markers)]

All features are derived at this point. Convert the boolean feature columns to 0/1 integers.

In [42]:
# Cast every column positioned after the pre-feature-engineering columns to integers.
# `np.int` was removed in NumPy 1.24; it was only an alias for the builtin `int`.
for col in games.columns[pre_feature_engineering_d:]:
    games[col] = games[col].astype(int)

In [None]:
# Preview the final feature table; show only the first rows rather than the whole frame.
games.head()

## Export

In [46]:
# Write the game-day feature table to CSV, omitting the DataFrame index.
games.to_csv('../../../data/game_days.csv', index=False)

## Exploration

Verify viability of specific features.

In [20]:
# TODO