In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import datetime
from kaggle.competitions import nflrush
import tqdm
import re
from string import punctuation
import lightgbm as lgb
from sklearn.model_selection import train_test_split

# Global plotting defaults: seaborn's 'darkgrid' style and a 15x10 default
# figure size for every matplotlib figure created afterwards.
sns.set_style('darkgrid')
mpl.rcParams['figure.figsize'] = [15,10]

# Utils

In [3]:
import pandas as pd
import numpy as np
from string import punctuation
import datetime
import re

def crps(y_true, y_pred):
    """Return the per-row mean squared difference between two 2-D arrays.

    Args:
        y_true: array of reference values, one row per sample.
        y_pred: array of predicted values with the same shape as ``y_true``.

    Returns:
        1-D array holding, for each row, the mean over axis 1 of
        ``(y_true - y_pred) ** 2``.
    """
    delta = y_true - y_pred
    return np.mean(delta * delta, axis=1)

def yard_to_cdf(yard):
    """Turn yard values into step-function CDF rows of length 199.

    Column ``k`` of a row stands for the yard value ``k - 99``; the row is 0
    before the (rounded) yard and 1 from it onwards.

    Args:
        yard: 1-D array-like of yard values (floats are rounded to the
            nearest integer with ``np.round``).

    Returns:
        Float array of shape ``(len(yard), 199)``.
    """
    yard = np.round(np.asarray(yard)).astype(int)
    # Clip so out-of-range values still give a valid CDF: without this a yard
    # above 99 produced an all-zero row and a yard below -99 wrapped around
    # (negative index) and marked only the last bin.
    indices = np.clip(yard + 99, 0, 198)
    # Broadcasting comparison: each row is 1 from its own start index onwards.
    cdfs = (np.arange(199) >= indices[:, None]).astype(float)
    return cdfs

def cdf_to_yard(cdf):
    """Recover the yard number from each CDF row.

    Args:
        cdf: 2-D array whose rows are CDFs over 199 bins.

    Returns:
        1-D integer array: position of the first entry equal to 1 in every
        row, shifted by -99 so that column 99 corresponds to 0 yards.
    """
    first_one = np.argmax(cdf == 1, axis=1)
    return first_one - 99

def clean_StadiumType(txt):
    """Normalise a free-text stadium type description.

    The text is lower-cased, stripped of punctuation, has runs of spaces
    collapsed, and then a fixed list of misspellings/variants is rewritten
    (applied in order, each on the result of the previous one).

    Args:
        txt: raw stadium type string, or a missing value.

    Returns:
        The cleaned string, or ``np.nan`` when ``txt`` is missing.
    """
    if pd.isna(txt):
        return np.nan

    lowered = txt.lower()
    without_punct = ''.join(ch for ch in lowered if ch not in punctuation)
    cleaned = re.sub(' +', ' ', without_punct).strip()

    # Order matters: later patterns see the output of earlier replacements.
    corrections = (
        ('outside', 'outdoor'),
        ('outdor', 'outdoor'),
        ('outddors', 'outdoor'),
        ('outdoors', 'outdoor'),
        ('oudoor', 'outdoor'),
        ('indoors', 'indoor'),
        ('ourdoor', 'outdoor'),
        ('retractable', 'rtr.'),
    )
    for variant, canonical in corrections:
        cleaned = cleaned.replace(variant, canonical)
    return cleaned

def transform_StadiumType(txt):
    """Map a cleaned stadium type string to a numeric flag.

    Args:
        txt: stadium type text (as produced by ``clean_StadiumType``) or a
            missing value.

    Returns:
        1 when the text mentions 'outdoor' or 'open', 0 when it mentions
        'indoor' or 'closed' (checked in that order), ``np.nan`` otherwise or
        when ``txt`` is missing.
    """
    if pd.isna(txt):
        return np.nan
    if any(keyword in txt for keyword in ('outdoor', 'open')):
        return 1
    if any(keyword in txt for keyword in ('indoor', 'closed')):
        return 0
    return np.nan

def str_to_seconds(txt):
    """Convert a colon-separated clock string to a single number.

    The first field is weighted by 60, the second by 1 and the third by 1/60,
    i.e. ``"a:b:c" -> a*60 + b + c/60``.
    NOTE(review): presumably the input is "MM:SS:00" so the result is in
    seconds; confirm against the data.

    Args:
        txt: string with at least three ':'-separated integer fields.

    Returns:
        The weighted sum as a float.
    """
    fields = txt.split(':')
    first, second, third = (int(fields[position]) for position in range(3))
    return first * 60 + second + third / 60

def str_to_float(txt):
    """Best-effort conversion of a value to ``float``.

    Args:
        txt: any value; typically a string holding a number.

    Returns:
        ``float(txt)`` when the conversion succeeds, otherwise ``np.nan``.
    """
    try:
        return float(txt)
    except (TypeError, ValueError, OverflowError):
        # np.nan (lower case) is the supported spelling; the np.NaN alias was
        # removed in NumPy 2.0 and would raise AttributeError here.
        return np.nan

def map_weather(txt):
    """Score a (cleaned, lower-case) weather description.

    The first matching keyword group decides the base score; the score is
    halved when the text also contains 'partly'.

    Args:
        txt: weather description string, or a missing value.

    Returns:
        0 for missing or unrecognised text, otherwise the base score
        (3 indoor/climate controlled, 2 sun, 1 clear, -1 cloudy, -2 rain,
        -3 snow) multiplied by 0.5 if 'partly' is present.
    """
    if pd.isna(txt):
        return 0

    scale = 0.5 if 'partly' in txt else 1

    # Checked top to bottom; the first group with a keyword in the text wins.
    scored_keywords = (
        (('climate controlled', 'indoor'), 3),
        (('sunny', 'sun'), 2),
        (('clear',), 1),
        (('cloudy',), -1),
        (('rain', 'rainy'), -2),
        (('snow',), -3),
    )
    for keywords, score in scored_keywords:
        if any(keyword in txt for keyword in keywords):
            return score * scale
    return 0

def new_orientation(angle, play_direction):
    """Mirror an angle when the play direction flag equals 0.

    Args:
        angle: angle in degrees.
        play_direction: flag; a value equal to 0 (e.g. ``False``) triggers
            the mirroring.

    Returns:
        ``angle`` unchanged when ``play_direction`` is not 0; otherwise
        ``360.0 - angle``, with a result of exactly 360.0 reported as 0.0.
    """
    if play_direction != 0:
        return angle
    mirrored = 360.0 - angle
    return 0.0 if mirrored == 360.0 else mirrored

def _parse_wind_speed(value):
    """Convert one raw ``WindSpeed`` entry to a float.

    Handles plain numbers ("8", 8.0), values with a unit ("8 mph"), ranges
    ("10-20" -> midpoint) and "<a> gusts up to <b>" (-> mean of a and b).
    Missing or unparseable entries become NaN.
    """
    if pd.isna(value):
        return np.nan
    # str() so that numeric entries (column not read as text) are accepted too.
    text = str(value).lower().replace('mph', '').strip()
    try:
        if '-' in text:
            low, high = text.split('-')[:2]
            return (int(low) + int(high)) / 2
        if 'gusts up to' in text:
            words = text.split()
            return (int(words[0]) + int(words[-1])) / 2
        return float(text)
    except (TypeError, ValueError):
        return np.nan


def preprocess_features(df):
    """Clean the raw play/player frame and add engineered feature columns.

    Args:
        df: DataFrame shaped like the competition training data (one row per
            player per play). It is NOT modified; a copy is processed.

    Returns:
        A new DataFrame with cleaned and derived columns. When a 'Yards'
        column is present (training data), rows whose 'Yards' value is
        inconsistent with the computed 'YardsLeft' are removed.
    """
    # Work on a copy so the caller's frame is left untouched (the first
    # assignments below would otherwise write into it).
    df = df.copy()

    # Feature engineering
    df['DefendersInTheBox_vs_Distance'] = (df['DefendersInTheBox'] / df['Distance'])
    df['StadiumType'] = df['StadiumType'].apply(clean_StadiumType)
    df['StadiumTypeShort'] = df['StadiumType'].apply(transform_StadiumType)
    df = df.drop(['StadiumType'], axis=1)

    Turf = {'Field Turf':'Artificial', 'A-Turf Titan':'Artificial', 'Grass':'Natural', 'UBU Sports Speed S5-M':'Artificial',
        'Artificial':'Artificial', 'DD GrassMaster':'Artificial', 'Natural Grass':'Natural',
        'UBU Speed Series-S5-M':'Artificial', 'FieldTurf':'Artificial', 'FieldTurf 360':'Artificial', 'Natural grass':'Natural', 'grass':'Natural',
        'Natural':'Natural', 'Artifical':'Artificial', 'FieldTurf360':'Artificial', 'Naturall Grass':'Natural', 'Field turf':'Artificial',
        'SISGrass':'Artificial', 'Twenty-Four/Seven Turf':'Artificial', 'natural grass':'Natural'}

    turf_type = df['Turf'].map(Turf)
    df['TurfIsNatural'] = (turf_type == 'Natural')
    df = df.drop(['Turf'], axis=1)

    # Bring the home/visitor abbreviations in line with the spelling used by
    # PossessionTeam/FieldPosition. Unknown abbreviations are kept as they
    # are: the previous lookup was built from the PossessionTeam values of
    # the current frame, so on a frame holding a single play every other team
    # was mapped to NaN, which in turn broke 'HomeField' and 'YardsLeft'.
    map_abbr = {'ARI': 'ARZ', 'BAL': 'BLT', 'CLE': 'CLV', 'HOU': 'HST'}
    for team_col in ('HomeTeamAbbr', 'VisitorTeamAbbr'):
        df[team_col] = df[team_col].map(lambda abbr: map_abbr.get(abbr, abbr))

    df['HomePossesion'] = (df['PossessionTeam'] == df['HomeTeamAbbr'])

    df['Field_eq_Possession'] = df['FieldPosition'] == df['PossessionTeam']
    df['HomeField'] = df['FieldPosition'] == df['HomeTeamAbbr']

    # NOTE(review): get_dummies only creates columns for the formations
    # present in *this* frame, so a frame with fewer distinct formations than
    # the training data yields fewer columns. Consider fixing the category
    # list if train and test frames must share one layout.
    df = pd.concat([df.drop(['OffenseFormation'], axis=1), pd.get_dummies(df['OffenseFormation'], prefix='Formation')], axis=1)

    df['GameClock'] = df['GameClock'].apply(str_to_seconds)

    # Height arrives as "feet-inches"; convert to inches.
    df['PlayerHeight'] = df['PlayerHeight'].apply(lambda x: 12*int(x.split('-')[0])+int(x.split('-')[1]))
    df['PlayerBMI'] = 703*(df['PlayerWeight']/(df['PlayerHeight'])**2)

    df['TimeHandoff'] = pd.to_datetime(df['TimeHandoff'], utc=True)
    df['TimeSnap'] = pd.to_datetime(df['TimeSnap'], utc=True)
    df['TimeDelta'] = (df['TimeHandoff'] - df['TimeSnap']).dt.total_seconds()
    df['PlayerBirthDate'] = pd.to_datetime(df['PlayerBirthDate'], format="%m/%d/%Y", utc=True)

    seconds_in_year = 60*60*24*365.25
    df['PlayerAge'] = (df['TimeHandoff'] - df['PlayerBirthDate']).dt.total_seconds() / seconds_in_year
    df = df.drop(['TimeHandoff', 'TimeSnap', 'PlayerBirthDate'], axis=1)

    # Parse the free-text wind speed, then carry the last known value forward
    # (ffill() replaces the deprecated fillna(method='pad')).
    df['WindSpeed'] = df['WindSpeed'].apply(_parse_wind_speed).ffill()

    df = df.drop(['WindDirection'], axis=1)
    df['PlayDirection'] = df['PlayDirection'].str.strip() == 'right'
    df['Team'] = df['Team'].str.strip() == 'home'

    df['GameWeather'] = df['GameWeather'].str.lower()
    indoor = "indoor"
    df['GameWeather'] = df['GameWeather'].apply(lambda x: indoor if not pd.isna(x) and indoor in x else x)
    df['GameWeather'] = df['GameWeather'].apply(lambda x: x.replace('coudy', 'cloudy').replace('clouidy', 'cloudy').replace('party', 'partly') if not pd.isna(x) else x)
    df['GameWeather'] = df['GameWeather'].apply(lambda x: x.replace('clear and sunny', 'sunny and clear') if not pd.isna(x) else x)
    df['GameWeather'] = df['GameWeather'].apply(lambda x: x.replace('skies', '').replace("mostly", "").strip() if not pd.isna(x) else x)
    df['GameWeather'] = df['GameWeather'].apply(map_weather)

    df['IsRusher'] = df['NflId'] == df['NflIdRusher']
    df = df.drop(['NflId', 'NflIdRusher'], axis=1)

    # Mirror positions and angles of plays whose PlayDirection flag is False
    # so every play is expressed in the same direction. Column-wise
    # operations give the same values as a row-by-row apply, much faster.
    moving_right = df['PlayDirection']
    df['X'] = df['X'].where(moving_right, 120 - df['X'])
    for angle_col in ('Orientation', 'Dir'):
        df[angle_col] = [new_orientation(angle, direction)
                         for angle, direction in zip(df[angle_col], moving_right)]

    yards_left = (100 - df['YardLine']).where(df['HomeField'], df['YardLine'])
    df['YardsLeft'] = yards_left.where(moving_right, 100 - yards_left)
    if 'Yards' in df.columns:
        # Dropping outliers in training
        outliers = (df['YardsLeft'] < df['Yards']) | (df['YardsLeft'] - 100 > df['Yards'])
        df = df.drop(df.index[outliers])
    return df

def sort_df(df):
    """Return the rows in a canonical order with a fresh 0..n-1 index.

    Rows are ordered by 'PlayId', then 'Team', then 'IsRusher', then
    'JerseyNumber' (all ascending). The input frame is not modified.
    """
    sort_keys = ['PlayId', 'Team', 'IsRusher', 'JerseyNumber']
    ordered = df.sort_values(by=sort_keys)
    return ordered.reset_index(drop=True)

def make_x(df):
    """Build the feature matrix from a preprocessed frame.

    The frame is sorted with ``sort_df``, identifier columns are dropped,
    missing values become -999 and object-dtype columns are removed. Every
    consecutive group of 22 rows is then flattened into one output row:
    the per-player columns of all 22 rows, followed by the remaining columns
    taken from the first row of the group ('Yards' is never used as a feature).

    Args:
        df: DataFrame as returned by ``preprocess_features``; its row count
            must be a multiple of 22.

    Returns:
        2-D float array with one row per group of 22 input rows.
    """
    df = sort_df(df)
    df = df.drop(['GameId', 'PlayId', 'IsRusher', 'Team'], axis=1)

    # Fill nan
    df = df.fillna(-999)

    # Text features are not used by the model.
    text_cols = [col for col in df.columns if df[col].dtype == 'object']
    df = df.drop(text_cols, axis=1)

    # Player features: one value per player row.
    cols_player = [
        'X', 'Y', 'S', 'A', 'Dis',
        'Orientation', 'Dir',
        'JerseyNumber',
        'PlayerHeight', 'PlayerWeight', 'PlayerBMI', 'PlayerAge',
    ]
    player_frame = df[cols_player]
    X = player_frame.to_numpy().reshape(-1, len(cols_player) * 22)
    assert player_frame.shape[0] == X.shape[0] * 22
    assert player_frame.shape[1] == X.shape[1] / 22

    # Play features: identical across a group, so read every 22nd row.
    excluded = cols_player + (['Yards'] if 'Yards' in df.columns else [])
    cols_play = [col for col in df.columns if col not in excluded]
    play_block = np.zeros(shape=(X.shape[0], len(cols_play)))
    for position, col in enumerate(cols_play):
        play_block[:, position] = df[col].iloc[::22].to_numpy()

    assert X.shape[0] == play_block.shape[0]
    return np.concatenate([X, play_block], axis=1)

def make_y(X, df):
    """Build the step-function CDF targets matching the rows of ``X``.

    Args:
        X: feature matrix from ``make_x``; only its row count is used.
        df: preprocessed frame with a 'Yards' column and 22 rows per play.

    Returns:
        Float array of shape ``(X.shape[0], 199)``. Row ``i`` is 0 before
        column ``yards + 99`` and 1 from that column onwards.
    """
    # make_x orders plays by PlayId before flattening them; use the same play
    # order here so that target row i belongs to feature row i even when the
    # incoming frame is not already sorted. 'Yards' is constant within a play,
    # so the order of rows inside a play does not matter.
    if 'PlayId' in df.columns:
        df = df.sort_values('PlayId', kind='mergesort')
    yards_per_play = df['Yards'].to_numpy()[::22]

    y = np.zeros(shape=(X.shape[0], 199))
    for i, yard in enumerate(yards_per_play):
        y[i, yard + 99:] = 1
    return y



In [4]:
# Load the training data. WindSpeed is forced to object dtype so that
# preprocess_features can apply its string clean-up (.lower(), .split(), ...)
# to the non-missing entries.
df_train = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2020/train.csv', dtype={'WindSpeed': 'object'})

In [5]:
# Clean the raw frame and add the engineered feature columns.
df_train_preprocessed = preprocess_features(df_train)

In [6]:
# Flatten every group of 22 player rows into one feature row, and build the
# 199-bin step-function CDF target for each of those rows.
X_train = make_x(df_train_preprocessed)
y_train = make_y(X_train, df_train_preprocessed)

In [7]:
# Sanity check: both arrays must have the same number of rows
# (one per group of 22 player rows); y has 199 CDF bins.
X_train.shape, y_train.shape

((22946, 295), (22946, 199))

In [8]:
# Turn each step-function yard CDF back into its yard number, an integer in
# the inclusive range [-99, 99] (CDF column index minus 99).
y_train_yard_num = cdf_to_yard(y_train)
# Spot check: the first play is expected to decode to 8 yards.
assert y_train_yard_num[0] == 8
print(y_train_yard_num)

[ 8  3  5 ...  4  2 11]


# Train

In [9]:
# Hold out 20% of the rows for validation. A fixed random_state makes the
# split (and therefore the reported validation scores) reproducible.
# NOTE: this rebinds X_train to the training part only, so re-running this
# cell without first re-running the make_x cell passes an X_train that no
# longer has the same length as y_train_yard_num.
X_train, X_val, y_train_num, y_val_num = train_test_split(X_train, y_train_yard_num,
                                                          test_size=0.2, random_state=42)

In [10]:
# Wrap the two splits as LightGBM datasets; the target is the integer yard
# count and categorical features are left to LightGBM's 'auto' detection.
train_data = lgb.Dataset(data=X_train, label=y_train_num, categorical_feature='auto')
val_data = lgb.Dataset(data=X_val, label=y_val_num, categorical_feature='auto')

In [11]:
def lgb_cprs_eval(preds, train_data):
    """Custom LightGBM evaluation function reporting 'mean crps'.

    Predicted and true yard values are both converted to step-function CDFs
    with ``yard_to_cdf``; the score is the mean over all rows of the per-row
    mean squared difference computed by ``crps``.

    Args:
        preds: predicted yard values for the evaluated dataset.
        train_data: dataset being evaluated; its ``label`` attribute supplies
            the true yard values.

    Returns:
        Tuple ``(eval_name, eval_result, is_higher_better)`` with
        ``is_higher_better`` set to False (lower is better).
    """
    labels = train_data.label
    predicted_cdfs = yard_to_cdf(preds)
    actual_cdfs = yard_to_cdf(labels)
    per_row_error = crps(actual_cdfs, predicted_cdfs)
    return 'mean crps', np.mean(per_row_error), False

In [13]:
# Gradient-boosted trees regressing the yard count with an L1 objective.
# The built-in 'mae' metric and the custom 'mean crps' metric (feval) are both
# reported on the validation set.
lgb_params = {
    # NOTE(review): 'task' and 'metric_freq' look like CLI-oriented options;
    # check the LightGBM parameter docs to see whether the Python API uses them.
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression_l1',
    'metric': {'mae'},
    'metric_freq': 5,
    # Stop when the validation scores have not improved for 100 rounds.
    'early_stopping_round': 100,
    'max_bin': 255,
#     'max_depth': 4,
    'learning_rate': 0.01,
#     'num_leaves': 31,
    'tree_learner': 'serial',
#     'bagging_fraction': 0.5,
#     'bagging_freq': 5,
    'verbose': 0,
    # No L1/L2 regularisation on the leaf weights.
    'reg_alpha': 0.0,
    'reg_lambda': 0.0
}
# Upper bound on boosting rounds; early stopping may end training sooner.
# NOTE(review): the saved log below ends at iteration 300, which suggests it
# was produced with a different num_iterations; re-run to refresh the output.
num_iterations = 1000
bst = lgb.train(lgb_params, train_data, num_iterations, valid_sets=[val_data], feval=lgb_cprs_eval)



[1]	valid_0's l1: 3.4434	valid_0's mean crps: 0.0173153
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's l1: 3.44116	valid_0's mean crps: 0.0173153
[3]	valid_0's l1: 3.43881	valid_0's mean crps: 0.0173153
[4]	valid_0's l1: 3.4364	valid_0's mean crps: 0.0173153
[5]	valid_0's l1: 3.43363	valid_0's mean crps: 0.0173153
[6]	valid_0's l1: 3.43126	valid_0's mean crps: 0.0173153
[7]	valid_0's l1: 3.42867	valid_0's mean crps: 0.0173153
[8]	valid_0's l1: 3.42648	valid_0's mean crps: 0.0173153
[9]	valid_0's l1: 3.42414	valid_0's mean crps: 0.0173153
[10]	valid_0's l1: 3.42191	valid_0's mean crps: 0.0173153
[11]	valid_0's l1: 3.4201	valid_0's mean crps: 0.0173153
[12]	valid_0's l1: 3.41782	valid_0's mean crps: 0.0173153
[13]	valid_0's l1: 3.41562	valid_0's mean crps: 0.0173153
[14]	valid_0's l1: 3.41382	valid_0's mean crps: 0.0173153
[15]	valid_0's l1: 3.41152	valid_0's mean crps: 0.0173153
[16]	valid_0's l1: 3.40914	valid_0's mean crps: 0.0173153
[17]	valid_0's l1: 3.

[143]	valid_0's l1: 3.25976	valid_0's mean crps: 0.0163005
[144]	valid_0's l1: 3.25928	valid_0's mean crps: 0.0163016
[145]	valid_0's l1: 3.25888	valid_0's mean crps: 0.0163081
[146]	valid_0's l1: 3.25832	valid_0's mean crps: 0.0163005
[147]	valid_0's l1: 3.25781	valid_0's mean crps: 0.0163005
[148]	valid_0's l1: 3.2573	valid_0's mean crps: 0.0163081
[149]	valid_0's l1: 3.25672	valid_0's mean crps: 0.0163103
[150]	valid_0's l1: 3.2563	valid_0's mean crps: 0.0163037
[151]	valid_0's l1: 3.25589	valid_0's mean crps: 0.016307
[152]	valid_0's l1: 3.25556	valid_0's mean crps: 0.0163037
[153]	valid_0's l1: 3.25482	valid_0's mean crps: 0.0162928
[154]	valid_0's l1: 3.25434	valid_0's mean crps: 0.0162906
[155]	valid_0's l1: 3.25411	valid_0's mean crps: 0.0162873
[156]	valid_0's l1: 3.25383	valid_0's mean crps: 0.0162873
[157]	valid_0's l1: 3.25328	valid_0's mean crps: 0.0162906
[158]	valid_0's l1: 3.25281	valid_0's mean crps: 0.0162851
[159]	valid_0's l1: 3.25241	valid_0's mean crps: 0.0162873


[285]	valid_0's l1: 3.21643	valid_0's mean crps: 0.0160968
[286]	valid_0's l1: 3.21612	valid_0's mean crps: 0.0160903
[287]	valid_0's l1: 3.21589	valid_0's mean crps: 0.0160815
[288]	valid_0's l1: 3.21575	valid_0's mean crps: 0.0160859
[289]	valid_0's l1: 3.21564	valid_0's mean crps: 0.0160837
[290]	valid_0's l1: 3.21547	valid_0's mean crps: 0.0160826
[291]	valid_0's l1: 3.21518	valid_0's mean crps: 0.0160793
[292]	valid_0's l1: 3.21504	valid_0's mean crps: 0.016076
[293]	valid_0's l1: 3.21487	valid_0's mean crps: 0.0160738
[294]	valid_0's l1: 3.21478	valid_0's mean crps: 0.0160793
[295]	valid_0's l1: 3.21464	valid_0's mean crps: 0.0160738
[296]	valid_0's l1: 3.21455	valid_0's mean crps: 0.016076
[297]	valid_0's l1: 3.21444	valid_0's mean crps: 0.0160749
[298]	valid_0's l1: 3.21422	valid_0's mean crps: 0.016076
[299]	valid_0's l1: 3.21419	valid_0's mean crps: 0.0160749
[300]	valid_0's l1: 3.21411	valid_0's mean crps: 0.0160782
Did not meet early stopping. Best iteration is:
[300]	valid

# SUBMIT

In [14]:
# Create the competition environment object; it is used below to iterate
# over the test plays, submit predictions and write the submission file.
env = nflrush.make_env()

In [15]:
def get_predictions(df_test, y_test, model):
    """Predict step-function yard CDFs for a raw test frame.

    Args:
        df_test: raw test DataFrame in the same layout as the training data
            (without the target).
        y_test: unused; kept so existing callers can keep passing it.
        model: fitted model exposing ``predict(X)`` that returns yard values.

    Returns:
        Array of CDF rows (199 columns) produced by ``yard_to_cdf`` from the
        model's yard predictions.
    """
    features = make_x(preprocess_features(df_test))
    predicted_yards = model.predict(features)
    return yard_to_cdf(predicted_yards)

In [16]:
# Iterator over the test set; the loop below unpacks each item into a test
# frame and a prediction template frame.
iter_test = env.iter_test()

In [20]:
# Predict every test item in turn and hand the result back to the
# environment, reusing the column names of the provided template frame.
# NOTE(review): the saved output of this cell ends in a KeyboardInterrupt, so
# the recorded run did not finish the loop; re-run it to completion before
# writing the submission file.
for df_test, y_cdf_test in tqdm.tqdm(iter_test):
    y_pred = get_predictions(df_test, y_cdf_test, bst)
    env.predict(pd.DataFrame(data=y_pred,columns=y_cdf_test.columns))

1631it [03:24,  7.83it/s]

KeyboardInterrupt: 

In [None]:
# Write the collected predictions to the submission file. This cell has no
# execution count in the saved notebook, i.e. it was not run in the saved session.
env.write_submission_file()