In [1]:
import gc
import os

import numpy as np
import pandas as pd
gc.collect()



# Input location: the directory prefix and the individual file names are kept
# as separate variables so the data folder can be changed in one place.
path = './data/'
player_ext = 'Players_2019.csv'
event_ext = 'Events_2019.csv'

# Load the event log into a DataFrame.
events = pd.read_csv(f'{path}{event_ext}')

In [2]:
# Peek at the first five rows to check that the file parsed as expected.
events.head(n=5)

Unnamed: 0,EventID,Season,DayNum,WTeamID,LTeamID,WPoints,LPoints,ElapsedSeconds,EventTeamID,EventPlayerID,EventType
0,30099809,2019,1,1104,1380,0,0,18,1380,657441,assist
1,30099810,2019,1,1104,1380,0,2,18,1380,657447,made2_jump
2,30099811,2019,1,1104,1380,2,2,42,1104,653561,made2_jump
3,30099812,2019,1,1104,1380,0,0,58,1380,657437,turnover
4,30099813,2019,1,1104,1380,0,0,63,1104,653565,assist


In [3]:
# Number of rows (one per logged event).
events.shape[0]

2543316

In [4]:
# Column names available in the raw event log.
events.columns.tolist()

['EventID',
 'Season',
 'DayNum',
 'WTeamID',
 'LTeamID',
 'WPoints',
 'LPoints',
 'ElapsedSeconds',
 'EventTeamID',
 'EventPlayerID',
 'EventType']

### Validation

In [5]:
# Missing-value count per column; every column reports zero, so no
# imputation or row dropping is needed before feature engineering.
events.isnull().sum()

EventID           0
Season            0
DayNum            0
WTeamID           0
LTeamID           0
WPoints           0
LPoints           0
ElapsedSeconds    0
EventTeamID       0
EventPlayerID     0
EventType         0
dtype: int64

### Feature Engineering

To add:
- game id
- current OT

Need: 
- possession time
- time remaining in game (including factoring OT)
- change of possession indicator

In [6]:
# load compact results
# cr_path = './data/RegularSeasonCompactResults.csv'
# cr = pd.read_csv(cr_path)

# cr_path = './data/Prelim2019_RegularSeasonCompactResults.csv'
# cr = pd.read_csv(cr_path)



In [7]:
# GameID: string concatenation of Season, DayNum, WTeamID and LTeamID
# (no separator), e.g. 2019 + 1 + 1104 + 1380 -> '2019111041380'.
game_id = events['Season'].astype(str)
for key_part in ('DayNum', 'WTeamID', 'LTeamID'):
    game_id = game_id + events[key_part].astype(str)
events['GameID'] = game_id

# OT: index of the overtime period an event falls in (0 = regulation).
# Regulation is 2400 s and each overtime is 300 s, so the number of overtime
# periods started is the ceiling of the time past regulation divided by 300;
# anything at or before the 2400 s mark is clamped to 0.
periods_past_regulation = (events['ElapsedSeconds'] - 2400) / 300
events['OT'] = np.where(
    periods_past_regulation < 0,
    0,
    np.ceil(periods_past_regulation),
).astype(int)


In [8]:
# Seconds left until the end of the period the event belongs to: regulation
# (2400 s) extended by 300 s for every overtime reached so far, minus the
# elapsed game clock.
period_end_seconds = 2400 + events['OT'] * 300
events['TimeRemain'] = period_end_seconds - events['ElapsedSeconds']


In [9]:
# How often each event type occurs in the log.
events.groupby('EventType')['EventID'].count()


EventType
assist        143827
block          35855
foul_pers     195974
foul_tech        727
made1_free    146907
made2_dunk     19173
made2_jump     58823
made2_lay     112392
made2_tip       2643
made3_jump     83688
miss1_free     61075
miss2_dunk      2517
miss2_jump    102592
miss2_lay      86044
miss2_tip       1144
miss3_jump    159586
reb_dead       30928
reb_def       273589
reb_off       108474
steal          67640
sub_in        327515
sub_out       326809
timeout        29704
timeout_tv     24098
turnover      141592
Name: EventID, dtype: int64

In [10]:
# Pull out a single game so its event sequence can be inspected by hand.
sample = events.loc[events['GameID'] == '201913214631217']

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists first (no-op when it is already there). Without
# this the cell fails on a fresh checkout that has no ./output directory.
os.makedirs('./output', exist_ok=True)

# NOTE: the same path is written again later in the notebook for a different
# game, so this file only reflects this game until that cell runs.
sample.to_csv('./output/sample_game.csv', index=False)



Calculating change of possession is complicated: we need to know how each possession ends (for example, did the team make one free throw or two?).

There are some event types that may or may not end the possession, like a made free throw.

I plan to resolve some of these in different ways.
If a team misses its last free throw, then there should be a rebound event coupled with it.
Therefore, if there is no rebound event coupled with it, the team must have made its last free throw (except on an air ball, but that's an edge case I'll ignore).

In [11]:
# Every distinct event type present in the log (reference list).
all_events = events['EventType'].unique().tolist()

# Event types credited to the team that has the ball when the event happens.
off_events = [
    'assist', 'made2_jump', 'turnover', 'made3_jump', 'miss2_jump', 'miss3_jump',
    'reb_off', 'miss1_free', 'made1_free', 'made2_dunk', 'made2_lay', 'miss2_lay',
    'reb_dead', 'miss2_dunk', 'made2_tip', 'miss2_tip',
]

# Event types credited to the defending team, i.e. the opponent has the ball.
def_events = ['block', 'steal', 'reb_def']

# Event types that say nothing about who has the ball.
neu_events = ['timeout_tv', 'foul_tech', 'foul_pers', 'timeout', 'sub_out', 'sub_in']

# OtherTeam: the opponent of the team the event is recorded for.
event_by_winner = events['EventTeamID'].eq(events['WTeamID'])
events['OtherTeam'] = np.where(event_by_winner, events['LTeamID'], events['WTeamID'])

# PossTeam: team in possession at the time of the event.
#   defensive event -> the opponent of the event team
#   offensive event -> the event team itself
#   anything else   -> 0 (possession unknown)
# The defensive condition is listed first so that it takes precedence.
is_def_event = events['EventType'].isin(def_events)
is_off_event = events['EventType'].isin(off_events)
events['PossTeam'] = np.select(
    [is_def_event, is_off_event],
    [events['OtherTeam'], events['EventTeamID']],
    default=0,
)


# TODO: label whether each event is a change of possession. Draft kept for reference:
# events['CofP'] = np.where(events['EventType'].isin('cop'))

# Open questions:
#   - blocks might or might not lead to a change of possession
#   - made free throws may or may not lead to a change of possession

In [13]:
# Re-inspect the first five rows, now including the engineered columns
# (GameID, OT, TimeRemain, OtherTeam, PossTeam).
events.head(n=5)

Unnamed: 0,EventID,Season,DayNum,WTeamID,LTeamID,WPoints,LPoints,ElapsedSeconds,EventTeamID,EventPlayerID,EventType,GameID,OT,TimeRemain,OtherTeam,PossTeam
0,30099809,2019,1,1104,1380,0,0,18,1380,657441,assist,2019111041380,0,2382,1104,1380
1,30099810,2019,1,1104,1380,0,2,18,1380,657447,made2_jump,2019111041380,0,2382,1104,1380
2,30099811,2019,1,1104,1380,2,2,42,1104,653561,made2_jump,2019111041380,0,2358,1380,1104
3,30099812,2019,1,1104,1380,0,0,58,1380,657437,turnover,2019111041380,0,2342,1104,1380
4,30099813,2019,1,1104,1380,0,0,63,1104,653565,assist,2019111041380,0,2337,1380,1104


In [14]:
# Export one game, including the engineered columns, for manual inspection.
sample = events.loc[events['GameID'] == '2019111041380']

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists first (no-op when it is already there). Without
# this the cell fails on a fresh checkout that has no ./output directory.
os.makedirs('./output', exist_ok=True)

# NOTE: this overwrites the sample_game.csv written by the earlier export cell.
sample.to_csv('./output/sample_game.csv', index=False)

