In [1]:
import numpy as np
import pandas as pd

from scipy.stats import beta
import seaborn as sns
import matplotlib.pyplot as plt
import gc
from tqdm import tqdm


# Directories, relative to the notebook's working directory.
path = './data/'
output = './output/'
figs = './figs/'

# Input file names inside `path`.
player_ext = 'Players_2019.csv'
event_ext = 'Events_2019.csv'
teams_ext = 'Teams.csv'

# Load the player table and the event table.
players = pd.read_csv(f'{path}{player_ext}')
df = pd.read_csv(f'{path}{event_ext}')



In [2]:
def get_time_remain(events):
    """Attach a game key, an overtime index and a period clock to the events.

    Three columns are written onto ``events`` itself, and the same frame is
    returned:

    * ``GameID``     - Season, DayNum, WTeamID and LTeamID converted to
      strings and concatenated without a separator.
    * ``OT``         - 0 while ElapsedSeconds is at most 2400; beyond that,
      the number of 300-second blocks past 2400, rounded up.
    * ``TimeRemain`` - seconds left until ``2400 + 300 * OT``.
    """
    regulation_secs = 2400
    overtime_secs = 300

    # Build the game key by string-concatenating the identifying columns.
    key_columns = ['Season', 'DayNum', 'WTeamID', 'LTeamID']
    game_key = events[key_columns[0]].astype(str)
    for column in key_columns[1:]:
        game_key = game_key + events[column].astype(str)
    events['GameID'] = game_key

    # Negative values mean the event happened before the 2400-second mark.
    blocks_past_regulation = (events['ElapsedSeconds'] - regulation_secs) / overtime_secs
    events['OT'] = np.where(
        blocks_past_regulation < 0, 0, np.ceil(blocks_past_regulation)
    ).astype(int)

    period_end = regulation_secs + (events['OT'] * overtime_secs)
    events['TimeRemain'] = period_end - events['ElapsedSeconds']

    return events


# Add the GameID, OT and TimeRemain columns to the event frame.
df = get_time_remain(df)


In [4]:

def add_score_diff(events):
    """Add score-margin columns to ``events`` in place and return the frame.

    * ``WDiff`` - WPoints minus LPoints.
    * ``EDiff`` - the same margin seen from the event's team: equal to
      ``WDiff`` when EventTeamID matches WTeamID, otherwise its negation.
    """
    winner_margin = events['WPoints'] - events['LPoints']
    events['WDiff'] = winner_margin

    # Flip the sign for every event not credited to the team in WTeamID.
    not_winner_event = events['EventTeamID'] != events['WTeamID']
    events.loc[:, 'EDiff'] = np.where(not_winner_event, -1 * winner_margin, winner_margin)

    return events

# Add the WDiff and EDiff score-margin columns.
df = add_score_diff(df)


In [6]:
# List the distinct event types present in the event frame.
print(df.EventType.unique())


['assist' 'made2_jump' 'turnover' 'made3_jump' 'foul_pers' 'miss2_jump'
 'reb_def' 'miss3_jump' 'reb_off' 'miss1_free' 'sub_in' 'made1_free'
 'sub_out' 'steal' 'block' 'made2_dunk' 'timeout_tv' 'made2_lay'
 'miss2_lay' 'reb_dead' 'timeout' 'miss2_dunk' 'made2_tip' 'miss2_tip'
 'foul_tech']


In [8]:
# Preview the first rows of the event frame, including the columns added above.
df.head()

Unnamed: 0,EventID,Season,DayNum,WTeamID,LTeamID,WPoints,LPoints,ElapsedSeconds,EventTeamID,EventPlayerID,EventType,GameID,OT,TimeRemain,WDiff,EDiff
0,30099809,2019,1,1104,1380,0,0,18,1380,657441,assist,2019111041380,0,2382,0,0
1,30099810,2019,1,1104,1380,0,2,18,1380,657447,made2_jump,2019111041380,0,2382,-2,2
2,30099811,2019,1,1104,1380,2,2,42,1104,653561,made2_jump,2019111041380,0,2358,0,0
3,30099812,2019,1,1104,1380,0,0,58,1380,657437,turnover,2019111041380,0,2342,0,0
4,30099813,2019,1,1104,1380,0,0,63,1104,653565,assist,2019111041380,0,2337,0,0


In [32]:
# identify pairs and triplets of free throws
# they don't necessarily have consecutive event ids

def get_pairs(events):
    """Flag the free throw that closes each set of free throws.

    Free throws (made or missed) are grouped by GameID, ElapsedSeconds and
    EventTeamID, since the throws of one set do not necessarily have
    consecutive EventIDs. Within each group the rows are ordered by
    TotalPoints, and then:

    * the last row of every group is flagged;
    * for groups of four or more free throws, the second-to-last row is
      flagged as well.

    Parameters
    ----------
    events : pandas.DataFrame
        Must contain EventID, GameID, ElapsedSeconds, EventTeamID, EventType,
        WPoints and LPoints. ``TotalPoints`` and ``IsFT`` are added to this
        frame in place.

    Returns
    -------
    pandas.DataFrame
        A new, merged frame with every input row plus ``FTPossEnd``
        (1.0 on flagged free throws, 0.0 elsewhere).

    Notes
    -----
    The function can be re-run on its own output: an existing ``FTPossEnd``
    column is replaced instead of colliding in the merge (which previously
    produced ``FTPossEnd_x``/``FTPossEnd_y`` and a KeyError).
    """
    group_keys = ['GameID', 'ElapsedSeconds', 'EventTeamID']

    events['TotalPoints'] = events['WPoints'] + events['LPoints']
    fts = ['made1_free', 'miss1_free']
    # Made and missed free throws are treated the same from here on.
    events['IsFT'] = np.where(events['EventType'].isin(fts), 'free_throw', events['EventType'])

    # Count free throws per (game, timestamp, team); keep sets of two or more.
    egb = events.groupby(group_keys + ['IsFT'])['EventID'].count().reset_index()
    # .copy() so the assignments below write to an independent frame.
    egb = egb.loc[(egb['IsFT'] == 'free_throw') & (egb['EventID'] > 1)].copy()
    egb['PairOrTrip'] = 1
    egb.loc[egb['EventID'] > 3, 'PairOrTrip'] = 2  # four or more throws at one timestamp
    egb = egb[group_keys + ['PairOrTrip']]

    # Separate free-throw frame so the EventID of each throw is kept.
    ft_df = events.loc[events['IsFT'] == 'free_throw']
    ft_df = ft_df[['EventID'] + group_keys + ['TotalPoints']]
    ft_df = ft_df.sort_values(by=['GameID', 'ElapsedSeconds', 'TotalPoints'])
    ft_df = pd.merge(ft_df, egb, how='left', on=group_keys)

    # Sets of four or more free throws get a second flagged row.
    doubles = ft_df.loc[ft_df['PairOrTrip'] > 1]

    last_ft = ft_df.groupby(group_keys)['EventID'].last().reset_index()
    second_last_ft = doubles.groupby(group_keys)['EventID'].nth(-2).reset_index()

    last_ft = last_ft[['EventID']]
    second_last_ft = second_last_ft[['EventID']]

    print(len(last_ft))
    enders = pd.concat([last_ft, second_last_ft], axis=0)
    print(len(enders))

    enders['FTPossEnd'] = 1

    # Drop a stale flag column first so re-running does not create
    # FTPossEnd_x / FTPossEnd_y suffix columns.
    events = pd.merge(
        events.drop(columns=['FTPossEnd'], errors='ignore'),
        enders,
        how='left',
        on='EventID',
    )

    events['FTPossEnd'] = events['FTPossEnd'].fillna(0)
    return events
# Note: get_pairs treats made and missed free throws the same (both are labelled 'free_throw' in IsFT).

# Flag the free throws that end a set (adds TotalPoints, IsFT and FTPossEnd).
df = get_pairs(df)

# Preview the first free-throw rows to check the FTPossEnd flag.
df.loc[df['IsFT']=='free_throw'].head(5)



117145
117425


Unnamed: 0,EventID,Season,DayNum,WTeamID,LTeamID,WPoints,LPoints,ElapsedSeconds,EventTeamID,EventPlayerID,EventType,GameID,OT,TimeRemain,WDiff,EDiff,IsFT,TotalPoints,FTPossEnd
18,30099827,2019,1,1104,1380,0,0,177,1104,653562,miss1_free,2019111041380,0,2223,0,0,free_throw,0,
22,30099831,2019,1,1104,1380,8,4,177,1104,653562,made1_free,2019111041380,0,2223,4,4,free_throw,12,1.0
103,30099912,2019,1,1104,1380,0,0,607,1380,657437,miss1_free,2019111041380,0,1793,0,0,free_throw,0,
104,30099913,2019,1,1104,1380,0,0,607,1380,657437,miss1_free,2019111041380,0,1793,0,0,free_throw,0,
105,30099914,2019,1,1104,1380,0,0,607,1380,657437,miss1_free,2019111041380,0,1793,0,0,free_throw,0,1.0
113,30099922,2019,1,1104,1380,0,0,673,1104,653561,miss1_free,2019111041380,0,1727,0,0,free_throw,0,
118,30099927,2019,1,1104,1380,21,18,673,1104,653561,made1_free,2019111041380,0,1727,3,3,free_throw,39,1.0
126,30099935,2019,1,1104,1380,0,0,703,1104,653561,miss1_free,2019111041380,0,1697,0,0,free_throw,0,
136,30099945,2019,1,1104,1380,24,18,703,1104,653561,made1_free,2019111041380,0,1697,6,6,free_throw,42,1.0
146,30099955,2019,1,1104,1380,25,18,730,1104,653565,made1_free,2019111041380,0,1670,7,7,free_throw,43,


In [7]:
# possessions either end on a field goal attempt, a free throw attempt, an offensive rebound, or a turnover
# so let's make a column denoting all of these

# Event types that count as field goal attempts: makes first, then misses.
fgas = [
    'made2_jump', 'made3_jump', 'made2_dunk', 'made2_tip', 'made2_lay',
    'miss2_jump', 'miss3_jump', 'miss2_dunk', 'miss2_tip', 'miss2_lay',
]
print(len(fgas))

# identify pairs and triplets of FTs by the same team - mark end of poss


def add_poss_markers(events):
    """Initialise the possession marker columns on ``events``.

    Adds ``PossBegin`` and ``PossEnd`` to the frame in place, both set to 0
    for every row, and returns the same frame. No event is actually marked
    yet.
    """
    # TODO: an earlier draft set a 'PossEnder' column to 1 for events whose
    # EventType is in `fgas`; that logic is not implemented here.
    for marker in ('PossBegin', 'PossEnd'):
        events[marker] = 0

    return events

10
