Baseball Prediction: 5a - Getting (Raw) Individual Pitcher Data

In the previous notebook, we compared our simple, hitting-only model to the Las Vegas odds. We concluded that incorporating the starting pitcher information would be a crucial next step to improve our model.

In this notebook we will learn how to scrape individual, game-level pitching data from Retrosheet. We will then write a loop that goes through every starting pitcher and downloads their data. This will enable us to augment our game-level dataframe with features derived from the previous performance of the starting pitcher.

Let's start by going to Retrosheet and finding the stats for Corey Kluber (one of my favorite pitchers from my childhood).

www.retrosheet.org

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('display.max_columns',1000)
pd.set_option('display.max_rows',1000)

import lxml
import html5lib
from urllib.request import urlopen
import time

from bs4 import BeautifulSoup
import requests

In [None]:
# Fetch one pitcher-season page (the file name carries the pitcher id
# "klubc001" and the year 2016). `page.content` holds the raw HTML that the
# next cell parses.
url = 'https://www.retrosheet.org/boxesetc/2016/Kklubc0010062016.htm'
page = requests.get(url)

In [None]:
# Parse the downloaded HTML with the 'html.parser' backend and display the
# whole tree so its structure can be inspected.
soup = BeautifulSoup(page.content, 'html.parser')
soup

In [None]:
# Drill down one level: keep the last top-level node of the document
# (presumably the <html> element -- confirm from the displayed output).
soup1 = list(soup.children)[-1]
soup1

In [None]:
# One level deeper: the last child of soup1
# (presumably the <body> element -- confirm from the displayed output).
soup2 = list(soup1.children)[-1]
soup2

In [None]:
# Materialise the direct children of soup2 as a list so they can be
# searched and indexed by position.
soup3 = list(soup2.children)
soup3

In [None]:
# Position of the first child whose string form contains "Opponent".
# The trailing [0][0] raises IndexError when no child matches.
index_num = np.where(["Opponent" in str(x) for x in soup3])[0][0]
index_num

In [None]:
# The node that contains the "Opponent" text located in the previous cell.
soup4 = soup3[index_num]
soup4

In [None]:
# Direct children of that node, as a list; the next cell prints the first
# few of them to reveal the repeating layout.
soup5 = list(soup4.children)
soup5

In [None]:
# Preview the whitespace-split text of the first 12 nodes. Slicing (rather
# than indexing 0..11) keeps the cell from raising IndexError when the
# section has fewer than 12 nodes.
for node in soup5[:12]:
    print(node.get_text().split())

In [None]:
def get_season_pitching_data(url, timeout=30):
    """Scrape one pitcher-season page into a DataFrame of raw strings.

    Parameters
    ----------
    url : str
        Address of a pitcher-season page, e.g. one of the links returned by
        get_daily_season_links.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout a
        single stalled connection would block a long scraping run forever.

    Returns
    -------
    pandas.DataFrame
        One row per game line. The first 29 whitespace-separated fields of
        each line are stored under the names in ``mod_header``; the columns
        'Date' and 'dblhead_num' are appended. Every value is a string.

    Raises
    ------
    IndexError
        If no section of the page contains the text "Opponent".
    AttributeError, KeyError
        If a node in a link position is not a tag carrying an ``href``.
    requests.exceptions.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    mod_header = ['at_vs', 'Opponent', 'League', 'GS', 'CG', 'SHO', 'GF', 'SV', 'IP', 'H',
                  'BFP', 'HR', 'R', 'ER', 'BB', 'IB', 'SO', 'SH', 'SF', 'WP', 'HBP',
                  'BK', '2B', '3B', 'GDP', 'ROE', 'W', 'L', 'ERA']

    time.sleep(1)  # pause before every request
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')
    html = list(soup.children)[-1]
    body = list(html.children)[-1]
    sec_next = list(body.children)
    # First child of the body whose string form mentions "Opponent".
    secnum = np.where(["Opponent" in str(x) for x in sec_next])[0][0]
    working_part = list(sec_next[secnum].children)

    # working_part[0] (the header text) is skipped. After it the nodes repeat
    # in groups of four: date link, text stored as dblhead_num, game link,
    # text holding the stat line.
    #
    # The hrefs are not part of the returned frame. They are still read so
    # that a page whose nodes do not follow this layout fails here instead of
    # being parsed into misaligned rows.
    date_list = []
    day_href_list = []
    for date_link in working_part[1::4]:
        date_list.append(date_link.get_text().strip())
        day_href_list.append(date_link.attrs['href'])

    dblhead_num_list = [text.strip() for text in working_part[2::4]]

    game_href_list = [game_link.attrs['href'] for game_link in working_part[3::4]]

    n_fields = len(mod_header)
    main_data_matrix = [text.strip().split()[:n_fields] for text in working_part[4::4]]

    out_df = pd.DataFrame(main_data_matrix, columns=mod_header)
    out_df['Date'] = date_list
    out_df['dblhead_num'] = dblhead_num_list
    return out_df

In [None]:
# Try the scraper on the page stored in `url` and display the resulting frame.
get_season_pitching_data(url)

In [None]:
# The pitcher's main page; the per-season 'Daily' links are extracted from it
# in the cells below. Note that this reassigns `url` and `page`.
url = 'https://www.retrosheet.org/boxesetc/K/Pklubc001.htm'
page = requests.get(url)
sup = BeautifulSoup(page.content, 'html.parser')
sup

In [None]:
# Third top-level node of the parsed document (position found by inspecting
# the output above).
sup2 = list(sup.children)[2]
sup2

In [None]:
# Sixth child of sup2; the following cells search it for <pre> blocks.
sup3 = list(sup2.children)[5]
sup3

In [None]:
# Plan - find the <pre> tag that starts with 'Pitching Record' (after stripping whitespace)
# Get the href attribute for all the <a> tags with the word "Daily"

# Collect every <pre> tag under sup3 and, in parallel, its stripped text.
pre_tags = [x for x in sup3.find_all('pre')]
pre_tag_text = [x.get_text().strip() for x in pre_tags]
pre_tag_text

In [None]:
# Position of the first <pre> block whose text starts with 'Pitching Record'.
np.where([x.startswith('Pitching Record') for x in pre_tag_text])[0][0]

In [None]:
# Same lookup as the previous cell, now stored, then every <a> tag inside
# that <pre> block is collected.
ind = np.where([x.startswith('Pitching Record') for x in pre_tag_text])[0][0]
a_tags = pre_tags[ind].find_all('a')
a_tags

In [None]:
# Keep the href of every link whose visible text is exactly 'Daily'.
links = [x.attrs['href'] for x in a_tags if x.get_text()=='Daily']
links

In [None]:
def get_daily_season_links(pitcher_id, timeout=30):
    """Return the URLs of the per-season 'Daily' pages for one pitcher.

    Parameters
    ----------
    pitcher_id : str
        Pitcher id such as 'klubc001'. Its first letter, upper-cased, selects
        the directory of the pitcher's page.
    timeout : float, optional
        Seconds to wait for the server before giving up.

    Returns
    -------
    list of str
        Absolute URLs, one for each link labelled 'Daily' inside the <pre>
        block whose text starts with 'Pitching Record'.

    Raises
    ------
    ValueError
        If ``pitcher_id`` is empty or the page has no 'Pitching Record' block.
    requests.exceptions.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    from urllib.parse import urljoin  # local import keeps this cell self-contained

    if not pitcher_id:
        raise ValueError('pitcher_id must be a non-empty string')

    url_prefix = 'https://www.retrosheet.org/boxesetc/'
    url = url_prefix + pitcher_id.upper()[0] + '/P' + pitcher_id + '.htm'
    time.sleep(1)  # pause before every request
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Search the whole document for <pre> blocks instead of walking to fixed
    # child positions, which shift with any whitespace node in the page.
    record_section = None
    for pre_tag in soup.find_all('pre'):
        if pre_tag.get_text().strip().startswith('Pitching Record'):
            record_section = pre_tag
            break
    if record_section is None:
        raise ValueError("no 'Pitching Record' section found at " + url)

    # urljoin resolves each href against the page address, so '../2016/x.htm'
    # becomes '<url_prefix>2016/x.htm' and absolute hrefs pass through as-is.
    return [urljoin(url, link.attrs['href'])
            for link in record_section.find_all('a')
            if link.get_text() == 'Daily']

In [None]:
# Check the helper on the example pitcher id: displays the list of season links.
get_daily_season_links('klubc001')

In [None]:
# Chain the two helpers: scrape the fifth season link ([4]) for the same pitcher.
get_season_pitching_data(get_daily_season_links('klubc001')[4])

In [None]:
def get_full_pitching_data(pitcher_id):
    """Scrape every season page of one pitcher and stack them into one frame.

    Parameters
    ----------
    pitcher_id : str
        Pitcher id passed on to get_daily_season_links.

    Returns
    -------
    pandas.DataFrame
        The frames returned by get_season_pitching_data for each season link,
        concatenated in link order (each season keeps its own row index).
        An empty DataFrame when the pitcher has no season links.
    """
    season_frames = [get_season_pitching_data(url)
                     for url in get_daily_season_links(pitcher_id)]
    if not season_frames:
        # pd.concat raises ValueError on an empty list; keep returning an
        # empty frame for a pitcher without links.
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop, which
    # re-copies all earlier rows on every iteration.
    return pd.concat(season_frames)

In [None]:
# Download every season for the example pitcher (one request per season link,
# each preceded by a one-second pause inside the helpers).
ck_data = get_full_pitching_data('klubc001')

In [None]:
# Column names, non-null counts and dtypes of the scraped frame.
ck_data.info()

In [None]:
# Five random rows as a sanity check (no seed is set, so the rows differ between runs).
ck_data.sample(5)

LOAD IN GAME LEVEL DATA

In [None]:
# Game-level table, read from a path relative to the working directory. The
# cells below use its pitcher_start_id_h and pitcher_start_id_v columns.
df = pd.read_csv('df_bp3.csv')

In [None]:
# Distinct starting-pitcher ids from each of the two id columns
# (presumably _h = home and _v = visiting -- confirm against the data), and how many of each.
start_pitchers_h = df.pitcher_start_id_h.unique()
start_pitchers_v = df.pitcher_start_id_v.unique()
len(start_pitchers_h), len(start_pitchers_v)

In [143]:
# Sorted union of both id sets. astype(str) turns every entry into a string
# first, so a missing id would show up as the literal string 'nan'.
# NOTE(review): check whether the id columns contain NaN before scraping.
start_pitchers_all = np.union1d(start_pitchers_h.astype(str), start_pitchers_v.astype(str))
len(start_pitchers_all), start_pitchers_all[:25]

(6212,
 array(['aased001', 'abadf001', 'abboc001', 'abbog001', 'abboj001',
        'abbok001', 'abbop001', 'abera101', 'abert101', 'abert102',
        'aberw101', 'ableh101', 'abrej001', 'aceva001', 'acevj001',
        'acevj002', 'ackej001', 'acket101', 'acklf101', 'acose101',
        'acosj101', 'adama002', 'adama101', 'adamb102', 'adamb104'],
       dtype='<U8'))

In [144]:
# Spot-check a single id by its position in the sorted array.
start_pitchers_all[196]

'bacsm001'

In [None]:
# Run this for everyone in the list - may take a while: every page request
# inside the helpers is preceded by a one-second pause.

# NOTE: machine-specific output folder; point this at your own directory.
SP_DATA_DIR = '/Users/antiprotons/Desktop/DA/SP_Data/'
failed_ids = []

for p_id in start_pitchers_all:
    print(p_id)
    try:
        df_temp = get_full_pitching_data(p_id)
    except (AttributeError, AssertionError, ValueError) as err:
        # Skip pitchers whose pages cannot be parsed. Without `continue` the
        # frame left over from the previous pitcher would be written under
        # this pitcher's file name (or df_temp would be undefined on the
        # first iteration).
        failed_ids.append(p_id)
        print('  skipped:', repr(err))
        continue

    fname_out = SP_DATA_DIR + 'pitching_data_' + p_id + '.csv'
    df_temp.to_csv(fname_out, index=False)

print('skipped', len(failed_ids), 'of', len(start_pitchers_all), 'pitchers')