In [1]:
#https://www.kaggle.com/pschale/mlb-pitch-data-20152018

import pandas as pd
import numpy as np

#setting up dataset 'scherzer' to have desired features
# Pitch-level table: discard the first 26 columns (by position) plus
# 'event_num' and 'type'; the remaining columns are the features used below.
pitches = (
    pd.read_csv('pitches.csv')
    .pipe(lambda raw: raw.drop(columns=raw.columns[:26]))
    .drop(columns=['event_num', 'type'])
)

#Only selecting pitches from Max Scherzer
scherzer_id = 453286
at_bats = pd.read_csv('atbats.csv').drop(columns=['o', 'event', 'p_throws'])
just_scherzer = at_bats[at_bats['pitcher_id'] == scherzer_id]

# The inner join keeps only pitches that belong to one of Scherzer's at-bats.
scherzer = pitches.merge(just_scherzer, how='inner', on=['ab_id'])


#Removing intentional walks (code 'I'), Two-Seam Fastballs ('FT') and the 'UN' pitch type
keep_rows = (
    (scherzer.code != 'I')
    & (scherzer.pitch_type != 'FT')
    & (scherzer.pitch_type != 'UN')
)
scherzer = scherzer[keep_rows]

#reads in games csv, which gives home team, start_time, and weather features
unused_game_columns = [
    'attendance', 'away_final_score', 'away_team', 'date', 'elapsed_time',
    'home_final_score', 'umpire_1B', 'umpire_2B', 'umpire_3B', 'umpire_HP',
    'venue_name', 'delay',
]
games = pd.read_csv('games.csv').drop(columns=unused_game_columns)

# Attach the per-game columns and renumber the rows 0..n-1, since the helper
# functions further down index rows by position.
scherzer = scherzer.merge(games, how='inner', on=['g_id']).reset_index(drop=True)

print(scherzer.shape)
scherzer

(13434, 23)


Unnamed: 0,zone,code,pitch_type,b_score,ab_id,b_count,s_count,outs,pitch_num,on_1b,...,g_id,inning,p_score,pitcher_id,stand,top,home_team,start_time,weather,wind
0,13.0,C,FF,0.0,2.015001e+09,0.0,0.0,0.0,1.0,0.0,...,201500010,1,0,453286,L,True,was,4:09 PM,"76 degrees, sunny","14 mph, Out to CF"
1,11.0,F,FF,0.0,2.015001e+09,0.0,1.0,0.0,2.0,0.0,...,201500010,1,0,453286,L,True,was,4:09 PM,"76 degrees, sunny","14 mph, Out to CF"
2,11.0,B,FF,0.0,2.015001e+09,0.0,2.0,0.0,3.0,0.0,...,201500010,1,0,453286,L,True,was,4:09 PM,"76 degrees, sunny","14 mph, Out to CF"
3,14.0,B,CH,0.0,2.015001e+09,1.0,2.0,0.0,4.0,0.0,...,201500010,1,0,453286,L,True,was,4:09 PM,"76 degrees, sunny","14 mph, Out to CF"
4,11.0,B,FF,0.0,2.015001e+09,2.0,2.0,0.0,5.0,0.0,...,201500010,1,0,453286,L,True,was,4:09 PM,"76 degrees, sunny","14 mph, Out to CF"
5,11.0,F,CH,0.0,2.015001e+09,3.0,2.0,0.0,6.0,0.0,...,201500010,1,0,453286,L,True,was,4:09 PM,"76 degrees, sunny","14 mph, Out to CF"
6,2.0,F,FF,0.0,2.015001e+09,3.0,2.0,0.0,7.0,0.0,...,201500010,1,0,453286,L,True,was,4:09 PM,"76 degrees, sunny","14 mph, Out to CF"
7,11.0,B,FF,0.0,2.015001e+09,3.0,2.0,0.0,8.0,0.0,...,201500010,1,0,453286,L,True,was,4:09 PM,"76 degrees, sunny","14 mph, Out to CF"
8,12.0,B,FF,0.0,2.015001e+09,0.0,0.0,0.0,1.0,1.0,...,201500010,1,0,453286,R,True,was,4:09 PM,"76 degrees, sunny","14 mph, Out to CF"
9,4.0,S,FF,0.0,2.015001e+09,1.0,0.0,0.0,2.0,1.0,...,201500010,1,0,453286,R,True,was,4:09 PM,"76 degrees, sunny","14 mph, Out to CF"


In [2]:
#Converting the code column to binary: '1' if the batter reached base without an out and '0' otherwise
def convert_code_column(dataframe):
    """Replace the pitch-result 'code' column with a binary 0/1 target.

    A row is scored 1 when either
      * its code is 'B' or '*B' and 'b_count' equals 3, or
      * its code is 'D', 'E' or 'H';
    every other row is scored 0.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a 'code' column and, unless 'code' is already numeric,
        a 'b_count' column.

    Returns
    -------
    pandas.DataFrame
        The input itself when 'code' is already numeric (so re-running is a
        no-op); otherwise a re-indexed (0..n-1) copy with 'code' replaced by
        integers. The input frame is not modified in that case.
    """
    # is_numeric_dtype also copes with pandas extension dtypes (e.g. string
    # dtype), which np.issubdtype cannot interpret.
    if pd.api.types.is_numeric_dtype(dataframe['code']):
        return dataframe

    dataframe = dataframe.reset_index(drop=True)
    code = dataframe['code']

    # A missing ball count compares unequal to 3, so such rows score 0
    # instead of raising the way int(nan) would in a row-by-row loop.
    ball_on_three = code.isin(['B', '*B']) & (dataframe['b_count'] == 3)
    reached_codes = code.isin(['D', 'E', 'H'])

    dataframe['code'] = (ball_on_three | reached_codes).astype(int)
    return dataframe


#function that vectorizes a column of classes
def featurize(dataframe, column_name):
    """One-hot encode `column_name` into float indicator columns.

    For every distinct value v in the column a new column named
    ``column_name + '_' + str(v)`` is appended, holding 1.0 on rows where
    the value occurred and 0.0 elsewhere. New columns are added in order of
    first appearance and the original column is dropped.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing `column_name`. It is not modified.
    column_name : str
        Name of the categorical column to expand.

    Returns
    -------
    pandas.DataFrame
        A re-indexed (0..n-1) copy with the indicator columns appended.
    """
    dataframe = dataframe.reset_index(drop=True)
    headers = [column_name + '_' + str(instance) for instance in dataframe[column_name]]
    dataframe = dataframe.drop([column_name], axis=1)

    # Compare the whole label vector at once instead of growing one array per
    # class with np.append on every row, which was quadratic in row count.
    header_array = np.array(headers, dtype=object)

    # dict.fromkeys de-duplicates while keeping first-appearance order.
    for header in dict.fromkeys(headers):
        dataframe[header] = (header_array == header).astype(float)

    return dataframe

#function designed to combine the zone and pitch-type features
def Combine_Features(dataframe, zone, pitch_type):
    """One-hot encode every (zone, pitch type) pair that occurs in the frame.

    Each distinct pair becomes a float indicator column named
    ``'zone_<zone value>_pitch_<pitch value>'`` (1.0 where the pair occurs,
    0.0 elsewhere), appended in order of first appearance. The two source
    columns are dropped.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame; it is not modified.
    zone : str
        Name of the zone column.
    pitch_type : str
        Name of the pitch-type column.

    Returns
    -------
    pandas.DataFrame
        The input itself if either source column is missing (for example
        because the function already ran); otherwise a new frame with the
        original index.
    """
    # Use the column names passed in for both the lookup and the drop;
    # previously the lookup was hard-coded to 'zone' / 'pitch_type'.
    if zone not in dataframe.columns or pitch_type not in dataframe.columns:
        return dataframe

    # Iterate by position so the result does not depend on the frame having
    # a default 0..n-1 index.
    headers = [
        'zone_' + str(this_zone) + '_pitch_' + str(this_pitch)
        for this_zone, this_pitch in zip(dataframe[zone], dataframe[pitch_type])
    ]
    header_array = np.array(headers, dtype=object)

    dataframe = dataframe.drop([zone, pitch_type], axis=1)

    # dict.fromkeys de-duplicates while keeping first-appearance order.
    for header in dict.fromkeys(headers):
        dataframe[header] = (header_array == header).astype(float)

    return dataframe

#function that creates two new features, 'pitch_height' and 'pitch_width' which gives the coordinates of the pitch
def create_coordinates(dataframe):
    """Add integer 'pitch_height' and 'pitch_width' columns derived from 'zone'.

    Zones 1-9 map onto a 3x3 grid with coordinates in {-1, 0, 1}; zones
    11-14 map to the four corners at +/-2. Any other zone value (including
    a missing one) maps to (0, 0). For rows whose 'stand' is not 'L' the
    width is negated, mirroring the horizontal axis (presumably so width is
    expressed relative to the batter's side -- confirm).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain 'zone' and 'stand'. The two new columns are added to
        this frame in place.

    Returns
    -------
    pandas.DataFrame
        The same frame object (unchanged if 'pitch_height' already exists).
    """
    if 'pitch_height' in dataframe.columns:
        return dataframe

    width_dct = {1.0:-1,2.0:0,3.0:1,4.0:-1,5.0:0,6.0:1,7.0:-1,8.0:0,9.0:1,11.0:-2,12.0:2,13.0:-2,14.0:2}
    height_dct = {1.0:1,2.0:1,3.0:1,4.0:0,5.0:0,6.0:0,7.0:-1,8.0:-1,9.0:-1,11.0:2,12.0:2,13.0:-2,14.0:-2}

    heights = []
    widths = []
    # zip walks the columns by position, so this works for any index
    # (the label-based dataframe['zone'][i] needed a default RangeIndex).
    for zone, stand in zip(dataframe['zone'], dataframe['stand']):
        width = width_dct.get(zone, 0)
        heights.append(height_dct.get(zone, 0))
        widths.append(width if stand == 'L' else -width)

    dataframe['pitch_height'] = heights
    dataframe['pitch_width'] = widths
    return dataframe

#Creates an extra 10 features where each feature is a pitch type and either height or width.  This feature will
#be either 0 if the instance is a different pitch type or its corresponding height or width if the instance is the
#same pitch type
def creat_zone_coordinates(dataframe):
    """Add per-pitch-type coordinate columns for FF, CH, SL, FC and CU.

    For each of the five pitch types two columns are appended,
    '<type>_height' and '<type>_width'. A row carries its own
    'pitch_height' / 'pitch_width' value in the columns of its pitch type
    and 0 in all the others.

    Returns the frame unchanged when 'FF_height' already exists; otherwise
    returns a re-indexed (0..n-1) copy with the ten columns appended.
    Expects 'pitch_type', 'pitch_height' and 'pitch_width' to be present.
    """
    if 'FF_height' in dataframe.columns:
        return dataframe

    dataframe = dataframe.reset_index(drop=True)
    thrown = dataframe['pitch_type']

    # Column order matches FF, CH, SL, FC, CU with height before width.
    for label in ('FF', 'CH', 'SL', 'FC', 'CU'):
        for axis in ('height', 'width'):
            coordinate = dataframe['pitch_' + axis]
            dataframe[label + '_' + axis] = [
                value if kind == label else 0
                for kind, value in zip(thrown, coordinate)
            ]

    return dataframe

#creates two new features: 'run_dif', the score difference between the batter's team and the pitcher's team
#(b_score - p_score), and 'run_total', the sum of the two scores
def run_dif(dataframe):
    """Derive 'run_dif' and 'run_total' from the two score columns.

    'run_dif' is b_score - p_score and 'run_total' is b_score + p_score.
    Both are written onto the frame that was passed in; the returned frame
    is a copy of it without 'b_score' and 'p_score'. If 'run_dif' already
    exists the input is returned as-is.
    """
    if 'run_dif' in dataframe:
        return dataframe

    batter_runs = dataframe['b_score']
    pitcher_runs = dataframe['p_score']
    dataframe['run_dif'] = batter_runs - pitcher_runs
    dataframe['run_total'] = batter_runs + pitcher_runs
    return dataframe.drop(columns=['b_score', 'p_score'])

#makes "home_team" feature numerical by changing home games to 1 and away games to zero
def home_team(dataframe, team='was'):
    """Turn the 'home_team' column into a 0/1 indicator.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain 'home_team'. The column is overwritten in place.
    team : str, optional
        Team code that counts as a home game. Defaults to 'was', the value
        this notebook has always compared against.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with 'home_team' equal to 1 where the
        original value matched `team` and 0 elsewhere. If the column is
        already numeric it is left untouched, so re-running is a no-op.
    """
    column = dataframe['home_team']

    # Decide by dtype rather than by peeking at the row labelled 0, which
    # fails on an empty frame or one whose index does not contain 0.
    if pd.api.types.is_numeric_dtype(column):
        return dataframe

    dataframe['home_team'] = (column == team).astype(int)
    return dataframe

#function that takes in a clock time like "4:05 PM" and converts it to decimal hours on a 24-hour clock
def convert_time(string):
    """Convert a 12-hour clock string to decimal hours.

    Parameters
    ----------
    string : str
        Text of the form "<hour>:<minute> <AM|PM>", e.g. "4:09 PM".
        The AM/PM marker is matched case-insensitively.

    Returns
    -------
    float or int
        Hours since midnight as a float (e.g. "4:30 PM" -> 16.5,
        "12:05 PM" -> about 12.08, "12:30 AM" -> 0.5). If the input cannot
        be parsed (not a string, wrong shape, non-numeric parts) the
        integer 0 is returned, as before.
    """
    try:
        clock, meridiem = string.split()
        hour_text, minute_text = clock.split(':')
        hour = int(hour_text)
        minute = int(minute_text) / 60
    except (AttributeError, TypeError, ValueError):
        # Best-effort parsing: unparseable entries fall back to 0.
        return 0

    # The data uses upper-case "PM"; comparing against lower-case 'pm' only
    # meant afternoon games were never shifted by 12 hours.
    period = meridiem.lower()
    if period == 'pm' and hour != 12:
        hour += 12
    elif period == 'am' and hour == 12:
        # 12:xx AM is just after midnight.
        hour = 0

    return hour + minute

#function that converts start_time column        
def start_time(dataframe):
    """Replace each 'start_time' entry with the result of convert_time.

    The column is overwritten on the frame passed in, and that same frame
    is returned. If the entry labelled 0 is already a float the column is
    treated as converted and nothing is changed.
    """
    if isinstance(dataframe['start_time'][0], float):
        return dataframe

    dataframe['start_time'] = [convert_time(raw) for raw in dataframe['start_time']]
    return dataframe

#function that takes in a weather string like "76 degrees, sunny" and returns the leading number as an int
def convert_weather(string):
    """Extract the leading integer from a weather description.

    Parameters
    ----------
    string : str
        Text whose first space-separated token is an integer,
        e.g. "76 degrees, sunny".

    Returns
    -------
    int
        The parsed number, or the default 70 when the input is not a
        string or does not start with an integer.
    """
    try:
        return int(string.split(' ')[0])
    except (AttributeError, TypeError, ValueError):
        # Only parsing failures fall back to the default; anything else
        # (e.g. KeyboardInterrupt) is no longer swallowed.
        return 70

#function that converts the weather column
def weather(dataframe):
    """Replace each 'weather' entry with the result of convert_weather.

    The column is overwritten on the frame passed in, and that same frame
    is returned. If the entry labelled 0 is not a string the column is
    treated as already converted and nothing is changed.
    """
    if not isinstance(dataframe['weather'][0], str):
        return dataframe

    dataframe['weather'] = [convert_weather(entry) for entry in dataframe['weather']]
    return dataframe

#function that converts the wind column
def wind(dataframe):
    """Replace each 'wind' entry with its leading integer.

    The definition previously had no body, which is a SyntaxError and kept
    the whole cell from running. This implementation mirrors weather():
    it keeps the first space-separated token of strings such as
    "14 mph, Out to CF" as an int and discards the rest.
    NOTE(review): dropping the direction part and using 0 for unparseable
    entries are assumptions -- confirm the intended encoding.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain 'wind'. The column is overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same frame object. It is returned unchanged when it is empty or
        when its first 'wind' entry is not a string (already converted).
    """
    if dataframe.empty or not isinstance(dataframe['wind'].iloc[0], str):
        return dataframe

    def _leading_int(text):
        # Best-effort parse; entries without a leading integer become 0.
        try:
            return int(text.split(' ')[0])
        except (AttributeError, TypeError, ValueError):
            return 0

    dataframe['wind'] = [_leading_int(entry) for entry in dataframe['wind']]
    return dataframe

    

In [3]:
# Run the feature-engineering steps in order; each step takes the frame and
# returns the frame to hand to the next one.
scherzer = (
    scherzer
    .pipe(convert_code_column)
    .pipe(create_coordinates)
    .pipe(creat_zone_coordinates)
    .pipe(run_dif)
    .pipe(home_team)
    .pipe(start_time)
    .pipe(weather)
)

In [4]:
#given the id of the batter, the zone of the pitch, and the type of pitch this function will give the
#batter a score based on their historical performance with that pitch.  If the batter has faced less than
#100 such pitches, the function returns Max Scherzer's score for that pitch
# Cache of merged pitch/at-bat frames, keyed by batter id, so the merge is
# done only once per batter.
player_dct = {}
def look_up_hitter(batter_id, zone, pitch):
    """Score a batter against one (zone, pitch type) combination.

    Looks up every pitch the batter has seen (merging the global `pitches`
    and `at_bats` tables on first use and caching the result in
    `player_dct`), then keeps the rows matching `zone` and `pitch`.

    * With at least 100 matching rows, rows with code 'I' are removed, the
      codes are converted with convert_code_column, and the function
      returns sum(code) / number of rows.
    * With fewer than 100, the same ratio is computed over the matching
      rows of the global `scherzer` frame instead (assumes its 'code'
      column has already been converted to numbers -- confirm cell order).
    """
    if batter_id not in player_dct:
        at_bats_for_batter = at_bats[at_bats['batter_id'] == batter_id]
        player_dct[batter_id] = pitches.merge(at_bats_for_batter, how='inner', on=['ab_id'])

    history = player_dct[batter_id]
    matching = history[(history['zone'] == zone) & (history['pitch_type'] == pitch)]

    if matching.shape[0] >= 100:
        matching = matching[matching.code != 'I'].reset_index(drop=True)
        matching = convert_code_column(matching)
        return matching.code.sum()/matching.code.size

    # Too few observations for this batter: fall back to the pitcher-wide rate.
    fallback = scherzer[(scherzer['zone'] == zone) & (scherzer['pitch_type'] == pitch)]
    return fallback.code.sum()/fallback.code.size

look_up_hitter(451594,5,'FF')

0.15263157894736842

In [5]:
#Adds the feature 'batter_score' to dataframe, calculated by the look_up_hitter_function
#player_dct keeps track of the dataframe for each hitter so that it does not need to be recalculated for each new pitch

def batter_scores(dataframe):
    """Append a 'batter_score' column computed row by row with look_up_hitter.

    Each row's 'batter_id', 'zone' and 'pitch_type' are passed to
    look_up_hitter and the result is stored. The row number is printed
    every 100 rows as a progress indicator. Returns the input unchanged
    when 'batter_score' already exists; otherwise returns a re-indexed
    (0..n-1) copy with the new column.
    """
    if 'batter_score' in dataframe.columns:
        return dataframe

    dataframe = dataframe.reset_index(drop=True)
    scores = []
    rows = zip(dataframe['batter_id'], dataframe['zone'], dataframe['pitch_type'])
    for position, (batter_id, zone, pitch) in enumerate(rows):
        if position % 100 == 0:
            print(position)
        scores.append(look_up_hitter(batter_id, zone, pitch))

    dataframe['batter_score'] = scores
    return dataframe


scherzer = batter_scores(scherzer)

scherzer.columns

0
100
200




300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800
12900
13000
13100
13200
13300
13400


Index(['zone', 'code', 'pitch_type', 'ab_id', 'b_count', 's_count', 'outs',
       'pitch_num', 'on_1b', 'on_2b', 'on_3b', 'batter_id', 'g_id', 'inning',
       'pitcher_id', 'stand', 'top', 'home_team', 'start_time', 'weather',
       'wind', 'pitch_height', 'pitch_width', 'FF_height', 'FF_width',
       'CH_height', 'CH_width', 'SL_height', 'SL_width', 'FC_height',
       'FC_width', 'CU_height', 'CU_width', 'run_dif', 'run_total',
       'batter_score'],
      dtype='object')

In [6]:
# Share of rows whose converted 'code' equals 1.
reach_fraction = scherzer.code.sum()/scherzer.code.size
print('The fraction of pitches resulting the runner reaching:', reach_fraction)

The fraction of pitches resulting the runner reaching: 0.06297454220634212


In [67]:
# Column groups for the final feature matrix.
# 'p_throws' is removed from at_bats when the data is loaded, so it never
# reaches `scherzer`; it is therefore left out of both lists below, where
# naming it would raise a KeyError as soon as the lists are used.
to_be_vectorized = ['pitch_type', 'stand']
to_be_dropped = ['zone', 'ab_id', 'batter_id', 'g_id', 'pitcher_id', 'top']
ready_to_go = ['code', 'b_count', 's_count', 'outs', 'pitch_num', 'on_1b', 'on_2b', 'on_3b', 'inning', 'home_team',
              'start_time', 'weather']
print(scherzer.wind)
scherzer.columns

0        14 mph, Out to CF
1        14 mph, Out to CF
2        14 mph, Out to CF
3        14 mph, Out to CF
4        14 mph, Out to CF
5        14 mph, Out to CF
6        14 mph, Out to CF
7        14 mph, Out to CF
8        14 mph, Out to CF
9        14 mph, Out to CF
10       14 mph, Out to CF
11       14 mph, Out to CF
12       14 mph, Out to CF
13       14 mph, Out to CF
14       14 mph, Out to CF
15       14 mph, Out to CF
16       14 mph, Out to CF
17       14 mph, Out to CF
18       14 mph, Out to CF
19       14 mph, Out to CF
20       14 mph, Out to CF
21       14 mph, Out to CF
22       14 mph, Out to CF
23       14 mph, Out to CF
24       14 mph, Out to CF
25       14 mph, Out to CF
26       14 mph, Out to CF
27       14 mph, Out to CF
28       14 mph, Out to CF
29       14 mph, Out to CF
               ...        
13404     4 mph, Out to LF
13405     4 mph, Out to LF
13406     4 mph, Out to LF
13407     4 mph, Out to LF
13408     4 mph, Out to LF
13409     4 mph, Out to LF
1

Index(['zone', 'code', 'pitch_type', 'ab_id', 'b_count', 's_count', 'outs',
       'pitch_num', 'on_1b', 'on_2b', 'on_3b', 'batter_id', 'g_id', 'inning',
       'pitcher_id', 'stand', 'top', 'home_team', 'start_time', 'weather',
       'wind', 'pitch_height', 'pitch_width', 'FF_height', 'FF_width',
       'CH_height', 'CH_width', 'SL_height', 'SL_width', 'FC_height',
       'FC_width', 'CU_height', 'CU_width', 'run_dif', 'run_total'],
      dtype='object')