# Data Parsing and Investigation

Parse the JSONL files for relevant info and store in a more useful format.
Event data stored in a pandas DF
Tracking data stored in a dict, keyed by file name (no suffix)

In [1]:
%matplotlib notebook
import json
import pickle
import matplotlib.pyplot as plt
import pandas as pd
plt.ion()
import os
import util

In [2]:
import numpy as np
from numpy.polynomial import Polynomial
import math


def bat_elevation_angle(pitch_frames):
    """
    determine the angle (in radians) the bat makes against the horizontal plane at the moment of contact
    :param pitch_frames: PitchFrames object
    """
    t_contact = pitch_frames.hit_time
    handle = ([f for f in pitch_frames.handle if f.time == t_contact])[0]
    head = ([f for f in pitch_frames.head if f.time == t_contact])[0]
    x = head.x-handle.x
    y = head.y-handle.x
    z = head.z-handle.z
    bat_length = math.sqrt(x**2 + y**2 + z**2) 
    return math.degrees(math.asin(z/bat_length))


def bat_forward_tilt_angle(pitch_frames):
    """
    determine the angle (in radians) the bat makes against the plane of the strike box (x-z plane)
    :param pitch_frames: PitchFrames object
    """
    t_contact = pitch_frames.hit_time
    handle = ([f for f in pitch_frames.handle if f.time == t_contact])[0]
    head = ([f for f in pitch_frames.head if f.time == t_contact])[0]
    x = head.x-handle.x
    y = head.y-handle.y
    z = head.z-handle.z
    bat_length = math.sqrt(x**2 + y**2 + z**2) # calculated from the tracking data
    return math.degrees(math.asin(y/bat_length))

def get_speed(frames, hit_time):
    """determine the speed of the head at the moment of contact
    :param frames: list of Frames for the bat head OR handle
    returns: speed, [x,y,z components] in miles per hour"""

    x, y, z, t = util.extract_coords_from_frames(frames, hit_time-0.01, hit_time+0.01)
    x_speed = 0.682 * (Polynomial.fit(t, x, 1)).convert().coef[1]  # have to .convert() otherwise values are scaled all funky
    y_speed = 0.682 * (Polynomial.fit(t, y, 1)).convert().coef[1]  # 0.682 converts from feet per second to miles per hour
    z_speed = 0.682 * (Polynomial.fit(t, z, 1)).convert().coef[1]
    
    return np.sqrt(x_speed**2 + y_speed**2 + z_speed**2), [x_speed, y_speed, z_speed]



In [3]:
names = [
    "Alice Atkins",
    "Bob Batterson",
    "Clara Clapham",
    "David Dunkins",
    "Ernest Engels",
    "Frankie Fisher",
    "Gabby Galway",
    "Harry Hitterson",
    "Ivy Isles",
    "James Jacobson",
    "Kyle Kevins",
    "Liam Lowes",
    "Mia Martinez",
    "Nolan Nash",
    "Olivia Olsen",
    "Peter Parker",
    "Quinn Quest",
    "Ryan Reeve",
    "Stacey Statkin",
    "Talyor Tomlinson",
    "Umberto Umbridge",
    "Vivian Valley",
    "Will Watkins",
    "Xavier Xi",
    "Yuri Yi",
    "Zoey Zoolander",
    "Arthur Bills",
    "Chris Dane",
    "Evan French",
    "Gus Howe",
    "John Knowles",
    "Larry Michaels",
    "Nina Owens"
]

In [6]:
from collections import defaultdict

names_iter = iter(names)
name_lookup = defaultdict(lambda: next(names_iter))
df_rows = []
tracking_frames = {} # to look up tracking data by file id
for fname in os.listdir("private/raw wisd data/"):
    with open("private/raw wisd data/"+fname, "r") as f:
        dat = json.load(f)
    fileID = fname[:-6] # drop .jsonl suffix
    
    # tracking data
    tracking_frames[fileID] = util.PitchFrames.from_dict(dat["samples_ball"], dat["samples_bat"])
    
    # event data
    row_dict = {
        # "fileID": fileID,
        "result": dat["summary_acts"]["pitch"]["result"],
        "action": dat["summary_acts"]["pitch"]["action"],
        "has_hit": bool(len([True for event in dat["events"] if "Hit" in event.values()])),
        "has_bat": tracking_frames[fileID].has_bat()
    }
    if row_dict["action"] == {}: # "no action" is encoded in JSON as an empty dict, instead of None.
        row_dict["action"] = None
        
    if not row_dict["has_hit"]:   # include extra metrics on hits
        continue

    # name_lookup is a defaultdict, so it will pull the next un-used name when it encounters a new ID
    batterID = dat["events"][0]["personId"]["mlbId"]
    row_dict["batter_name"] = name_lookup[batterID]
    
    row_dict["spray_angle"], row_dict["launch_angle"] = dat["events"][0]["start"]["angle"]
    exit_velocity = dat["summary_acts"]["hit"]["speed"]["mph"]
    row_dict["exit_velocity"] = exit_velocity
        
    # bat properties at contact
    try:
        row_dict["bat_elevation"] = bat_elevation_angle(tracking_frames[fileID])
        row_dict["bat_forward_tilt"] = bat_forward_tilt_angle(tracking_frames[fileID])
        row_dict["head_speed"], [row_dict["head_speed_x"], row_dict["head_speed_y"], row_dict["head_speed_z"]] = get_speed(tracking_frames[fileID].head, tracking_frames[fileID].hit_time)
        row_dict["handle_speed"], [row_dict["handle_speed_x"], row_dict["handle_speed_y"], row_dict["handle_speed_z"]] = get_speed(tracking_frames[fileID].handle, tracking_frames[fileID].hit_time)
    except Exception as e:
        # row_dict["parse_error"] = repr(e)] # store errors in df for inspection
        continue
    df_rows.append(row_dict)

event_df = pd.DataFrame.from_records(df_rows)


  off = (old[1]*new[0] - old[0]*new[1])/oldlen
  scl = newlen/oldlen
  return off + scl*x


In [7]:
# load xxBA model and add to df
with open("xxba.pickle", "rb") as f:
    xxba_model = pickle.load(f)
is_hit = event_df.has_hit & event_df.has_bat
event_df['xxBA'] = np.nan
event_df.loc[is_hit, 'xxBA'] = xxba_model.predict(event_df[is_hit][["exit_velocity", "launch_angle"]])

In [12]:
# only save rows/cols which are needed for Streamlit
# df = event_df[event_df.parse_error.isna()]
# df = df.drop("parse_error", axis=1)
df = event_df
df = df[df.has_bat & df.has_hit]

df.to_csv("data/WISD events.csv", index=False)
# 
# with open("data/WISD tracking.pickle", "wb") as f:
#     pickle.dump(tracking_frames, f)

# Player Stats

In [7]:
# make stats df
def count_hits(result_col):
    counts = result_col.value_counts()
    try:
        return counts["HitIntoPlay"]
    except KeyError:
        return 0


def count_fouls(result_col):
    counts = result_col.value_counts()
    try:
        return counts["Strike"]
    except KeyError:
        return 0
    
df = event_df[event_df.parse_error.isna()]  # only retain rows without parse errors
df = df.drop("parse_error", axis=1)  # remove parse_error column
gp = df.groupby(["batter_name"])
stats = gp.agg(
    hits=pd.NamedAgg(column="result", aggfunc=count_hits),
    fouls=pd.NamedAgg(column="result", aggfunc=count_fouls),
    avg_xxBA=pd.NamedAgg(column="xxBA", aggfunc="mean")
)
stats["pitches_received"] = gp.size()
stats["fair_foul_ratio"] = stats.hits / stats.fouls
stats.replace([np.inf], np.nan, inplace=True)
stats.to_csv("data/WISD stats.csv")

In [8]:
event_df[~event_df.parse_error.isna()]

Unnamed: 0,fileID,result,action,has_hit,has_bat,batterID,batter_name,spray_angle,launch_angle,exit_velocity,...,head_speed,head_speed_x,head_speed_y,head_speed_z,handle_speed,handle_speed_x,handle_speed_y,handle_speed_z,parse_error,xxBA
30,12345634_14070,HitIntoPlay,,True,False,474808052.0,Kyle Kevins,4.649429,10.713578,85.0,...,,,,,,,,,"TypeError(""'NoneType' object is not iterable"")",
979,12345645_23493,Ball,HitByPitch,True,False,174158975.0,Umberto Umbridge,129.770833,-12.589174,34.0,...,,,,,,,,,"TypeError(""'NoneType' object is not iterable"")",
987,12345645_24682,HitIntoPlay,,True,True,174158975.0,Umberto Umbridge,-20.67205,-37.347368,25.0,...,,,,,,,,,LinAlgError('SVD did not converge in Linear Le...,0.269867
1065,12345645_37152,Ball,HitByPitch,True,False,765710437.0,Olivia Olsen,102.967958,46.772069,27.0,...,,,,,,,,,"TypeError(""'NoneType' object is not iterable"")",
1081,12345645_39809,Ball,HitByPitch,True,False,474808052.0,Kyle Kevins,178.276243,21.334844,52.0,...,,,,,,,,,"TypeError(""'NoneType' object is not iterable"")",


In [9]:
event_df.batterID.value_counts()

batterID
797957728.0    23
459722179.0    19
352830460.0    18
719146721.0    18
412098649.0    17
485007791.0    16
719210239.0    14
174158975.0    14
360906992.0    13
765710437.0    12
474808052.0    11
590082479.0    11
451871192.0    11
558675411.0    11
172804761.0    10
618024297.0    10
290569727.0     9
805688901.0     9
545569723.0     9
854238128.0     9
568527038.0     8
849653732.0     7
654287703.0     7
505414610.0     7
686425745.0     6
223971350.0     5
617522563.0     5
432216743.0     5
518481551.0     4
797796542.0     1
563942271.0     1
Name: count, dtype: int64

In [10]:
temp = event_df[event_df.has_hit==True]
temp

Unnamed: 0,fileID,result,action,has_hit,has_bat,batterID,batter_name,spray_angle,launch_angle,exit_velocity,...,head_speed,head_speed_x,head_speed_y,head_speed_z,handle_speed,handle_speed_x,handle_speed_y,handle_speed_z,parse_error,xxBA
2,12345634_10282,HitIntoPlay,,True,True,797796542.0,Alice Atkins,27.482589,65.837960,24.0,...,84.317537,-15.306269,82.620339,7.003204,14.193727,-13.724390,-2.990165,-2.040080,,0.072600
3,12345634_10475,HitIntoPlay,,True,True,352830460.0,Bob Batterson,37.200279,-9.146789,96.0,...,67.015054,8.786029,62.691727,21.990237,8.860497,6.891844,1.445608,5.377835,,0.226067
6,12345634_10962,Strike,Foul,True,True,412098649.0,Clara Clapham,-47.134220,29.573660,86.0,...,61.891088,-20.159199,58.377782,-4.018468,9.710465,3.260890,9.145496,-0.140149,,0.053333
10,12345634_11227,HitIntoPlay,,True,True,797957728.0,David Dunkins,8.567380,36.176988,89.0,...,73.607843,-17.802723,71.394181,2.012081,13.674726,-9.934519,-4.624728,8.180181,,0.024267
14,12345634_11462,Strike,Foul,True,True,459722179.0,Ernest Engels,-78.757461,-44.256670,67.0,...,69.373395,-25.820465,61.270731,19.795684,8.934200,-7.247050,1.746598,-4.924388,,0.166000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1176,12345645_7688,HitIntoPlay,,True,True,474808052.0,Kyle Kevins,-1.583902,70.512403,82.0,...,73.022539,-27.290506,66.899896,10.579384,12.581675,-11.548788,4.030805,2.945614,,0.002733
1177,12345645_7717,Strike,Foul,True,True,618024297.0,Xavier Xi,-129.760609,64.208298,76.0,...,77.365996,10.794742,75.625736,12.235964,7.306535,4.552919,5.714542,-0.019950,,0.003000
1180,12345645_8063,HitIntoPlay,,True,True,352830460.0,Bob Batterson,15.014613,31.432694,103.0,...,44.457782,-21.139064,39.110093,-0.187209,30.589680,-21.115396,22.075472,-1.594411,,0.700000
1181,12345645_8329,HitIntoPlay,,True,True,618024297.0,Xavier Xi,22.143770,12.095113,101.0,...,66.105160,9.156616,63.516617,15.864676,5.900034,3.928948,2.432455,3.668369,,0.934000


In [11]:
event_df.has_hit.value_counts()

has_hit
False    876
True     320
Name: count, dtype: int64

In [12]:
event_df.result.value_counts()

result
Strike         563
Ball           432
HitIntoPlay    181
Pickoff         20
Name: count, dtype: int64

In [13]:
event_df.action.value_counts()

action
Called           610
Foul             192
BallInDirt        23
ThrowToFirst      19
FoulTip           16
HitByPitch         8
Blocked            6
ThrowToSecond      1
Pitchout           1
Name: count, dtype: int64

# Broken Bat Tracking
Some files have an error with their bat tracking, where the head and handle have the same coordinates at every time step.
Those files have since been removed from the dataset, but this cell an always be used to detect them if somehow they sneak back in

In [14]:
bat_df = event_df[event_df.has_bat==True]
bad_files = []
for _, fileID in bat_df.fileID.items():
    frames = tracking_frames[fileID]
    head_x, head_y, head_z, _ = util.extract_coords_from_frames(frames.head)
    handle_x, handle_y, handle_z, _ = util.extract_coords_from_frames(frames.handle)
    allx = all([a == b for (a, b) in zip(head_x, handle_x)])
    ally = all([a == b for (a, b) in zip(head_y, handle_y)])
    allz = all([a == b for (a, b) in zip(head_z, handle_z)])
    if allx and ally and allz:
        bad_files.append(fileID)
print(len(bat_df))
print(len(bad_files))

570
0
