# Objective

In this notebook, I'll extract 10% of the game plays to use them for training neural networks.

# Load Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt

from sklearn.metrics import matthews_corrcoef
from pathlib import Path

from nflutils.dataprep import *

import glob

SEED = 19951204
CREATE_FRAMES_DF = True
EXTRACT_FRAMES = False
VIEWS = ["Sideline"]

# Read Files

In [2]:
LS = !ls
IS_KAGGLE = 'init.sh' not in LS
IS_KAGGLE

False

In [3]:
if IS_KAGGLE:
    # Read in data files
    BASE_DIR = Path("../input/nfl-player-contact-detection")
    OUT_DIR = Path("/kaggle/working/")
else:
    BASE_DIR = Path("nfl-player-contact-detection")
    OUT_DIR = Path("nfl-player-contact-detection/frames")

In [4]:
# Labels and sample submission
labels = pd.read_csv(BASE_DIR/"train_labels.csv", parse_dates=["datetime"])

ss = pd.read_csv(BASE_DIR/"sample_submission.csv")

# Player tracking data
tr_tracking = pd.read_csv(
    BASE_DIR/"train_player_tracking.csv", parse_dates=["datetime"]
)
te_tracking = pd.read_csv(
    BASE_DIR/"test_player_tracking.csv", parse_dates=["datetime"]
)

# Baseline helmet detection labels
tr_helmets = pd.read_csv(BASE_DIR/"train_baseline_helmets.csv")
te_helmets = pd.read_csv(BASE_DIR/"test_baseline_helmets.csv")

# Video metadata with start/stop timestamps
tr_video_metadata = pd.read_csv(
    BASE_DIR/"train_video_metadata.csv",
    parse_dates=["start_time", "end_time", "snap_time"],
)

In [5]:
df_combo = compute_distance(labels, tr_tracking)

In [6]:
from sklearn.model_selection import GroupKFold

np.random.seed(SEED)

kf = GroupKFold()
kf_dict = {}

for i, (train_index, test_index) in enumerate(kf.split(tr_video_metadata, None, tr_video_metadata['game_key'])):
    print(f"Fold {i}:")
    kf_dict[i] = {'train_games': list(tr_video_metadata.iloc[train_index].game_play.unique()),
                  'val_games': list(tr_video_metadata.iloc[test_index].game_play.unique())}

Fold 0:
Fold 1:
Fold 2:
Fold 3:
Fold 4:


I should save the validation data games in order to use them further on during validation in different strategies.

In [7]:
import pickle

with open('kf_dict', 'wb') as f:
    pickle.dump(kf_dict, f)

In [8]:
with open('kf_dict', 'rb') as f:
    kf_dict = pickle.load(f)

Now let's extract the validation set as a starter. I'll extract from only sideline view for starters.

In [9]:
import subprocess, os
from tqdm.notebook import tqdm

val_games = kf_dict[0]['val_games']

val_game_plays = tr_video_metadata.query('game_key in @val_games').game_play

g_paths = []

views_regex = '|'.join(VIEWS)

for g in tqdm(val_game_plays):
    paths = glob.glob(f'{(BASE_DIR/"train"/g).as_posix()}_[{views_regex}]*')
    g_paths.extend(paths)

0it [00:00, ?it/s]

In [10]:
if EXTRACT_FRAMES:
    for g_path in tqdm(g_paths[:4]):
        game_play = g_path.split('/')[-1].split('/')[-1][:-4]
        (OUT_DIR/'validation'/game_play).mkdir(parents=True, exist_ok=True)
        # print("mkdir -p $OUT_DIR/validation/$game_play && chmod 777 $OUT_DIR/validation/$game_play")
        # Source: https://www.kaggle.com/code/zzy990106/nfl-2-5d-cnn-baseline-inference
        !echo "ffmpeg -i $g_path -q:v 2 -f image2 $OUT_DIR/validation/$game_play/frame-%04d.jpg -hide_banner -loglevel error"
        break

<s>Now let's extract 10% of train game_plays</s>

Since I'm only creating the dataframe now, I'll use the time to create the dataframe needed for all subsequent training.

In [11]:
train_games = kf_dict[0]['train_games']

train_game_plays = tr_video_metadata.query('game_key in @train_games').game_play#.sample(frac=0.1, replace=False, random_state=SEED).game_play

In [12]:
if EXTRACT_FRAMES:
    !mkdir -p $OUT_DIR/train
    !chmod 777 $OUT_DIR/train

    for g in train_game_plays:
        g_paths = !ls $BASE_DIR/train/$g*
        for g_path in g_paths:
            game_play = g_path.split('/')[-1].split('/')[-1][:-4]
            !mkdir -p $OUT_DIR/train/$game_play && chmod 777 $OUT_DIR/train/$game_play
            !ffmpeg -i "$g_path" -q:v 2 -f image2 "$OUT_DIR/train/$game_play/frame-%04d.jpg" -hide_banner -loglevel error

In [13]:
%%time

df_combo_2 = merge_tracking_and_helmets(df_combo.query('distance <= 1.6'), tr_helmets)
df_combo_2 = calc_two_players_helmets_center(df_combo_2)

CPU times: user 11.7 s, sys: 3.66 s, total: 15.3 s
Wall time: 15.4 s


In [26]:
df_combo_2 = df_combo_2.query("view != 'Endzone2'")
df_combo_2 = df_combo_2[~df_combo_2.view.isna()]
df_combo_2 = df_combo_2[~df_combo_2.left_2.isna()]

In [28]:
df_combo_2.to_parquet("df_combo_with_helmets.parquet", index=False)

In [29]:
df_combo.to_parquet("df_combo.parquet", index=False)