In [17]:
from datetime import datetime, timedelta
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from score_drives import get_drive_stats

In [2]:
# Load the drive data through the project helper. The two positional
# arguments are presumably the first and last season to include (2009-2019)
# -- confirm against score_drives.get_drive_stats. Every optional flag is
# passed as False.
# NOTE(review): data_path is an absolute path tied to this container layout.
df = get_drive_stats(2009, 2019, exclude_playoffs=False,
                     exclude_blowouts=False, dpa_scores=False,
                     opponent_adjustment=False,
                     data_path='/home/jovyan/work/data')

2010122604.json '2010122604'
min() arg is an empty sequence dict_keys([])


In [3]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,away_final_score,away_team,defensive_team,drive_time,end_quarter,end_time,first_play_desc,game_id,home_final_score,home_score_diff_last_quarter,...,offensive_win,defensive_win,tie,1_quarter_time,2_quarter_time,3_quarter_time,4_quarter_time,5_quarter_time,offense_home,defense_home
1552,10,TEN,TEN,1.733333,1,13.266667,R.Bironas kicks 67 yards from TEN 30 to PIT 3....,2009091000,13,0,...,1,0,0,1.733333,0.0,0.0,0.0,0.0,True,False
1553,10,TEN,PIT,1.866667,1,11.4,(13:16) C.Johnson up the middle to TEN 2 for n...,2009091000,13,0,...,0,1,0,1.866667,0.0,0.0,0.0,0.0,False,True
1554,10,TEN,TEN,3.066667,1,8.333333,(11:24) B.Roethlisberger pass short right to M...,2009091000,13,0,...,1,0,0,3.066667,0.0,0.0,0.0,0.0,True,False
1555,10,TEN,PIT,1.6,1,6.733333,(8:20) C.Johnson right end to TEN 43 for 32 ya...,2009091000,13,0,...,0,1,0,1.6,0.0,0.0,0.0,0.0,False,True
1556,10,TEN,TEN,1.916667,1,4.816667,(6:44) R.Mendenhall up the middle to PIT 28 fo...,2009091000,13,0,...,1,0,0,1.916667,0.0,0.0,0.0,0.0,True,False


In [4]:
# List the column names available in the loaded frame.
df.columns

Index(['away_final_score', 'away_team', 'defensive_team', 'drive_time',
       'end_quarter', 'end_time', 'first_play_desc', 'game_id',
       'home_final_score', 'home_score_diff_last_quarter', 'home_team',
       'last_play_desc', 'n_plays', 'offensive_team', 'penalty_yards',
       'result', 'start_quarter', 'start_time', 'start_yard_line',
       'yards_gained', 'drive_id', 'season', 'game_date', 'day_of_week',
       'unique_game_flag', 'game_in_season', 'is_playoffs', 'expected_points',
       'offensive_points', 'is_touchdown', 'is_field_goal', 'is_score',
       'is_interception', 'is_fumble', 'dst_points', 'home_points',
       'away_points', 'offensive_team_score_end', 'defensive_team_score_end',
       'offensive_team_score_start', 'defensive_team_score_start',
       'home_score_end', 'away_score_end', 'home_score_start',
       'away_score_start', 'offensive_final_score', 'defensive_final_score',
       'offensive_win', 'defensive_win', 'tie', '1_quarter_time',
       '2_q

In [18]:
# Show the game_id of the first row of the 2009 and 2010 seasons; the id
# printed for each season is the one stored on that season's first row.
for season in (2009, 2010):
    season_game_ids = df.loc[df['season'] == season, 'game_id']
    print(season_game_ids.values[0])

2009091000
2010090900


In [27]:
# Sanity-check the weekday arithmetic: print the weekday index of
# 2009-09-10 and of the date four days later.
week_start = datetime.strptime('20090910', '%Y%m%d')
week_end = week_start + timedelta(days=4)
for boundary in (week_start, week_end):
    print(boundary.weekday())

3
0


In [98]:
def training_features_pipeline(df, games_back=256):
    """Print the slate and the head of the drive window for the first week.

    Work in progress: the loop is entered only while the week's end date is
    not in the future, and it stops unconditionally after one pass, so only
    the week starting 2010-09-09 is processed. Nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame passed through to ``get_train_slate`` and ``get_drive_data``.
    games_back : int, default 256
        Look-back size forwarded to ``get_drive_data``.
    """
    # Thursday of the first training week, and the Monday that closes it.
    first_thursday = datetime.strptime('20100909', '%Y%m%d')
    first_monday = first_thursday + timedelta(days=4)
    # Season of the first training week.
    season = 2010

    week_start, week_end = first_thursday, first_monday
    while datetime.now() >= week_end:
        week_slate = get_train_slate(df, week_start, week_end)
        drive_window = get_drive_data(df, season, week_start, games_back)
        print(week_slate)
        print(drive_window.head())
        # Only the first week is handled for now.
        break
        
        
def get_train_slate(df, week_start, week_end):
    """Collect the distinct games played between two dates (both inclusive).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``game_date`` plus the five game-level columns below.
    week_start, week_end : datetime
        Inclusive bounds compared against ``df['game_date']``.

    Returns
    -------
    set of tuple
        One ``(game_id, home_team, away_team, home_final_score,
        away_final_score)`` tuple per distinct combination; repeated rows for
        the same game collapse into a single entry.
    """
    in_week = df['game_date'].between(week_start, week_end)
    game_fields = df.loc[in_week, [
        'game_id', 'home_team', 'away_team', 'home_final_score',
        'away_final_score'
    ]]
    return {tuple(record.tolist()) for record in game_fields.values}
    
    
def get_drive_data(df, current_season, week_start, games_back):
    """Select the rows that form the look-back window before ``week_start``.

    The window is the union of
      * current-season rows whose ``game_in_season`` lies between the start
        and end indices returned by ``get_start_end_this_season``, and
      * previous-season rows whose ``game_in_season`` is at or after the
        index returned by ``get_start_last_season``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``season`` and ``game_in_season`` (plus whatever the two
        helper functions read).
    current_season : int
        Season that ``week_start`` belongs to.
    week_start : datetime
        First day of the week being predicted; forwarded to the helper.
    games_back : int
        Look-back size forwarded to the helpers.

    Returns
    -------
    pandas.DataFrame
        A copy of the selected rows.
    """
    start_this_season, end_this_season = get_start_end_this_season(
        df, current_season, week_start, games_back
    )
    start_last_season = get_start_last_season(
        df, current_season, games_back, end_this_season
    )
    # The lower bound is the start index and the upper bound the end index.
    # The earlier version compared them the other way round, which made the
    # current-season selection empty for every input.
    this_season_mask = (
        (df['season'] == current_season) &
        (df['game_in_season'] >= start_this_season) &
        (df['game_in_season'] <= end_this_season)
    )
    last_season_mask = (
        (df['season'] == current_season - 1) &
        (df['game_in_season'] >= start_last_season)
    )
    return df.loc[this_season_mask | last_season_mask].copy()
    
        
def get_start_end_this_season(df, current_season, week_start, games_back):
    """Return the ``game_in_season`` bounds of the current-season window.

    Counts the distinct ``game_id`` values of ``current_season`` dated
    strictly before ``week_start`` (call it ``n``) and returns
    ``(start, end)`` with ``end == n``.

    * If the season already holds at least ``games_back`` games, ``start`` is
      ``n - games_back + 1`` so the inclusive range ``[start, end]`` spans
      exactly ``games_back`` games.
    * Otherwise ``start`` is 0, i.e. every game played so far is kept.

    This presumes ``game_in_season`` numbers a season's games 1..N in date
    order -- confirm against the code that builds that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``season``, ``game_date`` and ``game_id``.
    current_season : int
    week_start : datetime
    games_back : int

    Returns
    -------
    tuple of (int, int)
    """
    played_before = (
        (df['season'] == current_season) & (df['game_date'] < week_start)
    )
    n_this_season = df.loc[played_before, 'game_id'].nunique()
    print(n_this_season)
    if n_this_season >= games_back:
        # The earlier version returned games_back - n_this_season, which is
        # never positive and so could not bound the window to games_back.
        return n_this_season - games_back + 1, n_this_season
    return 0, n_this_season


def get_start_last_season(df, current_season, games_back, n_this_season):
    """Return the first previous-season ``game_in_season`` to include.

    When the current season alone covers ``games_back`` games, 268 is
    returned. Otherwise the function locates the previous-season game with
    index ``267 - (games_back - n_this_season + 11) + 1``, steps back to the
    start of that game's week via ``get_week_start`` and returns the smallest
    previous-season ``game_in_season`` dated on or after that day.

    NOTE(review): 267 and 11 look like the number of games in a full season
    and the number of playoff games, and 268 like "past the last game" --
    confirm and promote to named constants.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``season``, ``game_in_season`` and ``game_date``.
    current_season : int
    games_back : int
    n_this_season : int
        Number of current-season games already in the window.
    """
    if n_this_season >= games_back:
        return 268

    games_needed = games_back - n_this_season + 11
    target_game = 267 - games_needed + 1

    prior_season = df.loc[df['season'] == current_season - 1]
    is_target_game = prior_season['game_in_season'] == target_game
    print(target_game)

    target_date = prior_season.loc[is_target_game, 'game_date'].min()
    print(target_date)

    # Widen the cut-off to the whole week that contains the target game.
    first_day_of_week = get_week_start(target_date)
    on_or_after = prior_season['game_date'] >= first_day_of_week
    return prior_season.loc[on_or_after, 'game_in_season'].min()
    
    
def get_week_start(game_date):
    """Return the Thursday on or before ``game_date``.

    ``weekday()`` numbers Monday as 0, so Thursday is 3. The weekday index is
    printed as a side effect.

    Parameters
    ----------
    game_date : datetime
        Any object with ``weekday()`` that supports subtracting a timedelta.
    """
    weekday = game_date.weekday()
    print(weekday)
    # Days elapsed since the most recent Thursday, wrapping Mon-Wed around.
    days_since_thursday = (weekday - 3) % 7
    return game_date - timedelta(days=days_since_thursday)

In [99]:
# Run the pipeline on the loaded frame with a 256-game look-back.
training_features_pipeline(df, games_back=256)

0
1
2009-09-10 00:00:00
3
{('2010091209', 'PIT', 'ATL', 15, 9), ('2010091203', 'HOU', 'IND', 34, 24), ('2010091200', 'TB', 'CLE', 17, 14), ('2010090900', 'NO', 'MIN', 14, 9), ('2010091201', 'BUF', 'MIA', 10, 15), ('2010091207', 'CHI', 'DET', 19, 14), ('2010091208', 'LA', 'ARI', 13, 17), ('2010091212', 'WAS', 'DAL', 13, 7), ('2010091210', 'PHI', 'GB', 20, 27), ('2010091205', 'TEN', 'OAK', 38, 13), ('2010091206', 'NYG', 'CAR', 31, 18), ('2010091300', 'NYJ', 'BAL', 9, 10), ('2010091204', 'JAX', 'DEN', 24, 17), ('2010091301', 'KC', 'LAC', 21, 14), ('2010091211', 'SEA', 'SF', 31, 6), ('2010091202', 'NE', 'CIN', 38, 24)}
      away_final_score away_team defensive_team  drive_time  end_quarter  \
1552                10       TEN            TEN    1.733333            1   
1553                10       TEN            PIT    1.866667            1   
1554                10       TEN            TEN    3.066667            1   
1555                10       TEN            PIT    1.600000            1 

In [64]:
# Smallest game_in_season value present in the frame.
df['game_in_season'].min()

1