In [1]:
import numpy as np
import os
from os import listdir
from os.path import isfile, join
import pandas as pd
pd.options.mode.chained_assignment = None
from pitch_path.processing.data_processing import PitcherDataProcessor
import logging
import sys
from scipy.spatial import distance
# Grab the root logger and route INFO-and-above records to stdout so that
# log messages show up in the cell output.
logger = logging.getLogger()
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

In [2]:
# Root of the data directory and the folder that holds the processed pitcher files.
data = "../data"
inputs = f"{data}/output/processed"
# listdir() returns entries in arbitrary order; sort the paths so that
# pitcher_files[0] (used below) is the same file on every machine and every run.
pitcher_files = sorted(join(inputs, f) for f in listdir(inputs) if isfile(join(inputs, f)))

In [3]:
# Build a processor from the first file in pitcher_files.
# NOTE(review): is_processed_file=True presumably tells the processor the file
# has already been processed — confirm against PitcherDataProcessor.
ppd = PitcherDataProcessor(file_name=pitcher_files[0], is_processed_file=True)

INFO:pitch_path.utils.data_processing:Initializing from processed pitcher file: ../data/output/processed/sched_id429650_pitcher680689
INFO:pitch_path.utils.data_processing:File name: ../data/output/processed/sched_id429650_pitcher680689
INFO:pitch_path.utils.data_processing:Pitcher id: 680689
INFO:pitch_path.utils.data_processing:Sched id: 429650
INFO:pitch_path.utils.data_processing:Throws: R
INFO:pitch_path.utils.data_processing:Front Leg: l
INFO:pitch_path.utils.data_processing:Finished processing.....


In [4]:
# Pull the pitcher DataFrame out of the processor; every cell below works from it.
df = ppd.get_pitcher_df()

# Exploring some features

### Velocity and Distance

In [5]:
# Rows belonging to pitch 1 only. Take an explicit copy: the following cells add
# columns to p1, and writing into a filtered slice of df is ambiguous (view vs.
# copy) — the warning for it is only hidden by the chained_assignment option
# set in the imports cell.
p1 = df[df['astros_pitch_id'] == 1].copy()

In [6]:
# Names of the wrist columns, their "prev_"-prefixed counterparts, and the full
# list of columns that get lagged by one row (wrist columns plus time).
wrist_cols = [name for name in p1.columns if "wrist" in name]
prev_wrist_cols = ["prev_" + name for name in wrist_cols]
cols_to_shift = [*wrist_cols, 'time']

In [7]:
# Wrist values of the first row of the pitch, wrapped in an outer list (the 2-D
# shape the cdist call in the next cell takes). The row is picked by position
# (.iloc) rather than by the label 0, so the cell does not depend on the
# slice's index labels happening to start at 0.
[p1[wrist_cols].iloc[0].to_list()]

[[-1.0854, 60.3719, 5.1761]]

In [8]:
# Sanity check: Euclidean distance between the wrist values of the first two rows.
# Rows are picked by position (.iloc) so the check works whatever the index labels are.
distance.cdist([p1[wrist_cols].iloc[0].to_list()], [p1[wrist_cols].iloc[1].to_list()], 'euclidean')

array([[0.01948551]])

In [9]:
# Lag every column in cols_to_shift by one row and store it as "prev_<col>";
# the first row has no predecessor, so its prev_* values are NaN.
lagged = p1[cols_to_shift].shift(1)
for name in cols_to_shift:
    p1[f"prev_{name}"] = lagged[name]

In [10]:
# Euclidean distance from each row's wrist values to the previous row's.
# Computed for the whole column at once instead of a row-wise apply with one
# cdist call per row; rows without a previous row (prev_wrist_x is NaN) get 0.
wrist_step = (p1[wrist_cols].to_numpy(dtype=float)
              - p1[prev_wrist_cols].to_numpy(dtype=float))
p1['distance_to_prev'] = np.where(p1['prev_wrist_x'].isna(),
                                  0.0,
                                  np.linalg.norm(wrist_step, axis=1))

In [11]:
# Average wrist speed between consecutive rows: distance to the previous row
# divided by the difference in time. Plain column arithmetic replaces the
# row-wise apply; the first row has no prev_time, so its value is NaN.
p1['get_avg_velocity'] = p1['distance_to_prev'] / (p1['time'] - p1['prev_time'])

## Arm Angle

## Feature locations at key times

In [12]:
def get_col_values_at_time(df: pd.DataFrame, time_col: str, joint_loc_cols: list) -> pd.DataFrame:
    """Return the requested columns for the rows flagged by ``time_col``.

    Args:
        df: Frame containing ``time_col`` and every column in ``joint_loc_cols``.
        time_col: Name of an indicator column; rows where it equals 1 are kept.
        joint_loc_cols: Names of the columns to extract.

    Returns:
        A new DataFrame with a fresh 0..n-1 index holding the selected rows,
        each column renamed to ``"<column>_<time_col>"``. ``df`` is not modified.
    """
    # drop=True discards the old index whatever it is called; the earlier
    # reset_index().drop(columns=['index']) raised KeyError for a named index.
    loc_df = df.loc[df[time_col] == 1, joint_loc_cols].reset_index(drop=True)
    loc_df.columns = [f"{x}_{time_col}" for x in loc_df.columns]
    return loc_df

In [13]:
# Start again from a fresh slice of pitch 1 (without the prev_*/distance columns
# added to the earlier p1) and re-derive the wrist column names.
pitch_mask = df['astros_pitch_id'] == 1
p1 = df[pitch_mask]
wrist_cols = [name for name in p1.columns if "wrist" in name]

In [14]:
# Preview the first rows to check the joint columns and the 0/1 marker columns
# (start, release, time_25, time_5, time_75) used in the next cell.
p1.head()

Unnamed: 0,astros_pitch_id,sched_id,pitcher_id,bats,throws,time,shoulder_x,shoulder_y,shoulder_z,elbow_x,...,wrist_y,wrist_z,knee_x,knee_y,knee_z,start,release,time_25,time_5,time_75
0,1,429650,680689,L,R,-1.016,-0.2081,60.561501,5.9219,-0.4616,...,60.371899,5.1761,-1.0037,59.581402,3.0212,1.0,0,0,0,0
1,1,429650,680689,L,R,-1.013,-0.2056,60.560799,5.925,-0.4618,...,60.388,5.1865,-1.0312,59.582802,3.0469,0.0,0,0,0,0
2,1,429650,680689,L,R,-1.01,-0.1998,60.558701,5.927,-0.4661,...,60.402802,5.1984,-1.0579,59.584702,3.0722,0.0,0,0,0,0
3,1,429650,680689,L,R,-1.006,-0.1964,60.556,5.928,-0.4733,...,60.414101,5.2146,-1.0854,59.582802,3.0984,0.0,0,0,0,0
4,1,429650,680689,L,R,-1.003,-0.1956,60.553101,5.9301,-0.4789,...,60.4203,5.2253,-1.1095,59.580601,3.1239,0.0,0,0,0,0


In [15]:
# Wrist columns at each flagged moment of the pitch: one call per marker column,
# unpacked in the same order as the marker names.
wrist_start, wrist_25, wrist_5, wrist_75, wrist_release = (
    get_col_values_at_time(p1, marker, wrist_cols)
    for marker in ('start', 'time_25', 'time_5', 'time_75', 'release')
)

## Merge all Feature DFs together
Merge the DataFrames together to get a single feature row per pitch.

In [16]:
# Cross-join the per-marker frames one after another so that all wrist snapshots
# sit side by side in a single frame.
pitch_feature = wrist_start
for snapshot in (wrist_25, wrist_5, wrist_75, wrist_release):
    pitch_feature = pitch_feature.merge(snapshot, how='cross')

# Tag the result with the pitcher, sched and pitch identifiers.
pitch_feature['pitcher_id'] = ppd.pitcher_id
pitch_feature['sched_id'] = ppd.sched_id
pitch_feature['astros_pitch_id'] = p1['astros_pitch_id'].unique()[0]

In [17]:
# Inspect the merged result: wrist columns per marker plus the id columns.
pitch_feature.head()

Unnamed: 0,wrist_x_start,wrist_y_start,wrist_z_start,wrist_x_time_25,wrist_y_time_25,wrist_z_time_25,wrist_x_time_5,wrist_y_time_5,wrist_z_time_5,wrist_x_time_75,wrist_y_time_75,wrist_z_time_75,wrist_x_release,wrist_y_release,wrist_z_release,pitcher_id,sched_id,astros_pitch_id
0,-1.0854,60.371899,5.1761,-1.2036,60.391399,5.6939,-0.9734,60.0471,5.0402,0.642,60.049599,4.2122,-1.4514,54.180801,4.6715,680689,429650,1


## Generate all features for df

In [18]:
import pitch_path.utils.features as feat

In [19]:
# Reload the first pitcher file and slice out pitch 1 again for the feature run.
ppd = PitcherDataProcessor(file_name=pitcher_files[0], is_processed_file=True)
df = ppd.get_pitcher_df()
# Explicit copy: a later cell adds prev_* columns to p1, and writing into a
# filtered slice of df is ambiguous (view vs. copy).
p1 = df[df['astros_pitch_id'] == 1].copy()

INFO:pitch_path.utils.data_processing:Initializing from processed pitcher file: ../data/output/processed/sched_id429650_pitcher680689
INFO:pitch_path.utils.data_processing:File name: ../data/output/processed/sched_id429650_pitcher680689
INFO:pitch_path.utils.data_processing:Pitcher id: 680689
INFO:pitch_path.utils.data_processing:Sched id: 429650
INFO:pitch_path.utils.data_processing:Throws: R
INFO:pitch_path.utils.data_processing:Front Leg: l
INFO:pitch_path.utils.data_processing:Finished processing.....


In [20]:
# Wrist column names taken from the full frame, their "prev_" counterparts, and
# the list of columns to lag by one row (wrist columns plus time).
wrist_cols = [name for name in df.columns if "wrist" in name]
prev_wrist_cols = ["prev_" + name for name in wrist_cols]
cols_to_shift = [*wrist_cols, 'time']

In [21]:
# Lag every column in cols_to_shift by one row and store it as "prev_<col>";
# the first row has no predecessor, so its prev_* values are NaN.
lagged = p1[cols_to_shift].shift(1)
for name in cols_to_shift:
    p1[f"prev_{name}"] = lagged[name]