In [4]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Dropout, Flatten, Dense
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

np.set_printoptions(threshold=np.inf)

In [8]:
raw_data = pd.read_csv('../data/train.csv')
print(raw_data)

            GameId          PlayId  Team      X      Y     S     A   Dis  \
0       2017090700  20170907000118  away  73.91  34.84  1.69  1.13  0.40   
1       2017090700  20170907000118  away  74.67  32.64  0.42  1.35  0.01   
2       2017090700  20170907000118  away  74.00  33.20  1.22  0.59  0.31   
3       2017090700  20170907000118  away  71.46  27.70  0.42  0.54  0.02   
4       2017090700  20170907000118  away  69.32  35.42  1.82  2.43  0.16   
5       2017090700  20170907000118  away  75.06  24.00  1.01  0.32  0.18   
6       2017090700  20170907000118  away  74.11  16.64  1.11  0.83  0.02   
7       2017090700  20170907000118  away  73.37  18.73  1.24  0.74  0.13   
8       2017090700  20170907000118  away  56.63  26.90  0.26  1.86  0.28   
9       2017090700  20170907000118  away  73.35  38.83  4.55  0.76  0.51   
10      2017090700  20170907000118  away  74.15  28.90  0.72  0.73  0.01   
11      2017090700  20170907000118  home  75.82  17.56  2.30  1.39  0.55   
12      2017

In [9]:
print(np.unique(raw_data['PlayId'].values).size)
print(raw_data.columns)

23171
Index(['GameId', 'PlayId', 'Team', 'X', 'Y', 'S', 'A', 'Dis', 'Orientation',
       'Dir', 'NflId', 'DisplayName', 'JerseyNumber', 'Season', 'YardLine',
       'Quarter', 'GameClock', 'PossessionTeam', 'Down', 'Distance',
       'FieldPosition', 'HomeScoreBeforePlay', 'VisitorScoreBeforePlay',
       'NflIdRusher', 'OffenseFormation', 'OffensePersonnel',
       'DefendersInTheBox', 'DefensePersonnel', 'PlayDirection', 'TimeHandoff',
       'TimeSnap', 'Yards', 'PlayerHeight', 'PlayerWeight', 'PlayerBirthDate',
       'PlayerCollegeName', 'Position', 'HomeTeamAbbr', 'VisitorTeamAbbr',
       'Week', 'Stadium', 'Location', 'StadiumType', 'Turf', 'GameWeather',
       'Temperature', 'Humidity', 'WindSpeed', 'WindDirection'],
      dtype='object')


In [12]:
raw_data['Temperature'] = raw_data['Temperature'].fillna(raw_data['Temperature'].mean())
raw_data['Humidity'] = raw_data['Humidity'].fillna(raw_data['Humidity'].mean())
print(raw_data['Temperature'])
print(raw_data['Humidity'])

0         63.0
1         63.0
2         63.0
3         63.0
4         63.0
5         63.0
6         63.0
7         63.0
8         63.0
9         63.0
10        63.0
11        63.0
12        63.0
13        63.0
14        63.0
15        63.0
16        63.0
17        63.0
18        63.0
19        63.0
20        63.0
21        63.0
22        63.0
23        63.0
24        63.0
25        63.0
26        63.0
27        63.0
28        63.0
29        63.0
          ... 
509732    45.0
509733    45.0
509734    45.0
509735    45.0
509736    45.0
509737    45.0
509738    45.0
509739    45.0
509740    45.0
509741    45.0
509742    45.0
509743    45.0
509744    45.0
509745    45.0
509746    45.0
509747    45.0
509748    45.0
509749    45.0
509750    45.0
509751    45.0
509752    45.0
509753    45.0
509754    45.0
509755    45.0
509756    45.0
509757    45.0
509758    45.0
509759    45.0
509760    45.0
509761    45.0
Name: Temperature, Length: 509762, dtype: float64
0         77.0
1         77.0
2    

In [15]:
def get_time(quarter, clock):
    split_time = clock.split(':')
    return (quarter-1)*15 + int(split_time[0]) + int(split_time[1])/60

def get_distance_to_touchdown(yard_line, possession_team, field_position):
    if possession_team != field_position:
        return yard_line
    else:
        return 100 - yard_line

def get_time_since_snap(time_handoff, time_snap):
    split_handoff = time_handoff.split(':')
    handoff_sec = int(split_handoff[1])*60 + int(split_handoff[2].split('.')[0])
    split_snap = time_handoff.split(':')
    snap_sec = int(split_snap[1])*60 + int(split_snap[2].split('.')[0])
    return float(handoff_sec) - float(snap_sec)

def get_height(player_height):
    split_height = player_height.split('-')
    return int(split_height[0])*12 + int(split_height[1])

def get_age(player_birth_date):
    return 2019 - int(player_birth_date.split('/')[2])

def encode_personnel(personnel):
    PERSONNELS = ['DB', 'DL', 'LB', 'OL', 'QB', 'RB', 'TE', 'WR']
    encoded_personnel = [0]*len(PERSONNELS)
    personnel = personnel.replace(' ','')
    for i in range(0,len(personnel),4):
        encoded_personnel[PERSONNELS.index(personnel[i+1:i+3])] += int(personnel[i])
    return encoded_personnel

def get_offense_features(formation, personnel):
    FORMATIONS = ['SHOTGUN','SINGLEBACK','JUMBO','PISTOL','I_FORM','ACE','WILDCAT','EMPTY']
    one_hot_formation = [int(f == formation) for f in FORMATIONS]
    return one_hot_formation + encode_personnel(personnel)

def get_defense_features(in_the_box, personnel):
    return [in_the_box] + encode_personnel(personnel)

In [16]:
nn_input = []
nn_target = []
for _,play in raw_data.groupby(['PlayId']):
    state_features = []
    state_features.append(get_distance_to_touchdown(play['YardLine'].iloc[0], play['PossessionTeam'].iloc[0], play['FieldPosition'].iloc[0]))
    state_features.append(get_time(play['Quarter'].iloc[0],play['GameClock'].iloc[0]))
    state_features.append(play['Down'].iloc[0])
    state_features.append(play['Distance'].iloc[0])
    state_features.append(get_time_since_snap(play['TimeHandoff'].iloc[0], play['TimeSnap'].iloc[0]))
    state_features.append(play['Temperature'].iloc[0])
    state_features.append(play['Humidity'].iloc[0])
    offense_features = get_offense_features(play['OffenseFormation'].iloc[0], play['OffensePersonnel'].iloc[0])
    defense_features = get_defense_features(play['DefendersInTheBox'].iloc[0], play['DefensePersonnel'].iloc[0])
    for t,team in play.groupby(['Team']):
        team_features = []
        team_features.append(np.mean(team['X']))
        team_features.append(np.mean(team['Y']))
        team_features.append(np.mean(team['S']))
        team_features.append(np.mean(team['A']))
        team_features.append(np.mean(team['Dis']))
        team_features.append(np.mean(team['Orientation']))
        team_features.append(np.mean(team['Dir']))
        team_features.append(np.mean(team['PlayerHeight'].apply(lambda x: get_height(x))))
        team_features.append(np.mean(team['PlayerWeight']))
        team_features.append(np.mean(team['PlayerBirthDate'].apply(lambda x: get_age(x))))
        if t == 'home':
            team_features.append(team['HomeScoreBeforePlay'].iloc[0])
            if team['PossessionTeam'].iloc[0] == team['HomeTeamAbbr'].iloc[0]:
                offense_features = offense_features + team_features
            else:
                defense_features = defense_features + team_features
        elif t == 'away':
            team_features.append(team['VisitorScoreBeforePlay'].iloc[0])
            if team['PossessionTeam'].iloc[0] == team['VisitorTeamAbbr'].iloc[0]:
                offense_features = offense_features + team_features
            else:
                defense_features = defense_features + team_features
    if np.amax(np.isnan(state_features + offense_features + defense_features)) == 0:
        nn_input.append(state_features + offense_features + defense_features)
        nn_target.append(play['Yards'].iloc[0])

In [19]:
#print(nn_input)
#print(nn_target)

In [20]:
nn_input = np.stack(nn_input)
nn_target = np.array(nn_target)

In [21]:
input_trn, input_val, target_trn, target_val = train_test_split(nn_input, nn_target, train_size=0.75)
input_ss = StandardScaler()
input_trn = input_ss.fit_transform(input_trn)
input_val = input_ss.transform(input_val)
target_ss = StandardScaler()
target_trn = target_ss.fit_transform(target_trn.reshape((-1,1)))
target_val = target_ss.transform(target_val.reshape((-1,1)))

model = Sequential()
model.add(Dense(units=64, activation='sigmoid', input_shape=(input_trn.shape[1:])))
model.add(Dense(units=1,activation='linear'))
model.compile(optimizer='adam',loss='mse',metrics=['accuracy'])
callbacks = [EarlyStopping(monitor='val_loss',patience=5,restore_best_weights=False)]







In [23]:
model.fit(x=input_trn, y=target_trn, epochs=1000, verbose=0, callbacks=callbacks, validation_data=(input_val,target_val))

<keras.callbacks.History at 0x2072841af60>