This notebook summarises plays to add extra features to model completion probability.

The output of this notebook is passed to merge_clean_data, where it is merged back into the original df_merged, which is then used to build the final random forest.

In [1]:
library(tidyverse)
library(data.table)
library(dplyr, warn.conflicts = FALSE)
# Suppress summarise info
options(dplyr.summarise.inform = FALSE)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.2     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.4     [32m✔[39m [34mdplyr  [39m 1.0.2
[32m✔[39m [34mtidyr  [39m 1.1.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.0

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’:

    between, first, last


The following object is masked from ‘package:purrr’:

    transpose




In [2]:
# Drop every element of a list that consists solely of NA values.
#
# y: a list (for example a list of data frames in which failed computations
#    were replaced by a single NA).
# Returns: `y` without the elements for which all(is.na(element)) is TRUE;
#          names and order of the remaining elements are kept. An empty list
#          is returned unchanged.
na.omit.list <- function(y) {
  # vapply() always yields a logical vector, even when `y` is empty; sapply()
  # would return list() in that case and the negation below would fail.
  is_all_na <- vapply(y, function(x) all(is.na(x)), logical(1))
  y[!is_all_na]
}

In [3]:
# Read the nearest-defender statistics table. The column called V1 is dropped
# on load (presumably a row-index column left by an earlier write.csv() --
# verify against the upstream notebook).
nearest_stats <- fread(
  "../input/calculate-nearest-stats/nearest_stats.csv",
  drop = "V1"
)

# Quick sanity check of what was loaded: column names, first rows, row count.
names(nearest_stats)
head(nearest_stats)
nrow(nearest_stats)

gameId,playId,frameId,off_nflId,nflId,displayName,dist_from_off,nearest_is_target,distanceFromBall,off_distanceFromBall,ball_behind_db,diff_in_speed,diff_in_acceleration,diff_in_dir,receiver_behind_db,receiver_distance_from_qb,num_of_receivers_nearest,dist_rate_of_change
<int>,<int>,<int>,<int>,<int>,<chr>,<dbl>,<int>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<dbl>
2018090600,75,11,2533040,79848,Malcolm Jenkins,1.920339,0,9.726813,9.334077,0,0.01,0.01,258.48,1.89,9.538433,2,0.0
2018090600,75,12,2533040,79848,Malcolm Jenkins,1.922134,0,9.736735,9.334077,0,0.01,0.01,276.29,1.89,9.530519,2,0.0
2018090600,75,13,2533040,79848,Malcolm Jenkins,1.912302,0,9.762797,9.365452,0,-0.02,-0.29,280.46,1.88,9.533651,2,0.0
2018090600,75,14,2533040,79848,Malcolm Jenkins,1.908743,0,9.814622,9.353614,0,-0.07,-0.5,34.84,1.88,9.544658,2,-0.006038243
2018090600,75,15,2533040,79848,Malcolm Jenkins,1.889047,0,9.884452,9.351048,0,-0.22,-1.2,9.95,1.86,9.563472,2,-0.017213601
2018090600,75,16,2533040,79848,Malcolm Jenkins,1.859516,0,9.927724,9.341333,0,-0.4,-1.57,29.29,1.83,9.600417,2,-0.027603487


In [4]:
# Split the table into a list holding one data frame per play, i.e. one per
# unique (gameId, playId) combination.
# NOTE(review): an earlier comment here read "omit Inf vals", but no filtering
# of infinite values is performed in this step -- confirm whether it is needed.
nearest_stats <- group_split(nearest_stats, gameId, playId)

In [5]:
# Flag, within every frame of a play, the row(s) whose `dist_from_off` is the
# largest value recorded for that frame.
#
# play_df: data frame with the columns gameId, playId, frameId and
#          dist_from_off (one row per tracked pairing per frame).
# Returns: `play_df` with the same rows in the same order plus one numeric
#          column, `his_off_most_open`, which is 1 where the row's
#          dist_from_off equals the maximum dist_from_off of its
#          (gameId, playId, frameId) group and 0 otherwise. Ties all receive 1.
#          If a frame contains an NA dist_from_off, the flag is NA for every
#          row of that frame because max() is computed without na.rm.
#          The result is ungrouped.
play_summary_stats <- function(play_df) {
  play_df %>%
    group_by(gameId, playId, frameId) %>%
    # A grouped mutate compares each row with its own frame's maximum directly,
    # so no temporary summary table, join or helper column is required.
    mutate(
      his_off_most_open = if_else(dist_from_off == max(dist_from_off), 1, 0)
    ) %>%
    ungroup()
}

In [6]:
# Add the `his_off_most_open` flag to every play. A play whose processing
# raises an error is reported with a warning and skipped, so a single bad play
# neither aborts the run nor disappears unnoticed.
nearest_stats <- lapply(nearest_stats, function(play_df) {
  tryCatch(
    play_summary_stats(play_df),
    error = function(err) {
      warning(
        "play_summary_stats() failed for gameId ", play_df[["gameId"]][1],
        ", playId ", play_df[["playId"]][1], ": ", conditionMessage(err),
        call. = FALSE
      )
      # NA marks the failed play; it is removed by na.omit.list() below.
      NA
    }
  )
})

# Remove the NA placeholders of failed plays, then stack the remaining plays
# back into a single table and report its row count.
nearest_stats <- na.omit.list(nearest_stats)
nearest_stats <- bind_rows(nearest_stats)
nrow(nearest_stats)

In [7]:
# Persist the enriched table to the working directory. write.csv() keeps its
# default row.names = TRUE, so the file starts with an unnamed row-index column.
write.csv(x = nearest_stats, file = "nearest_stats.csv")