Baseball Prediction: 5a - Getting (Raw) Individual Pitcher Data

    - In the previous notebook, we compared our simple, hitting-only model to the Las Vegas odds. We concluded that incorporating the starting pitcher information would be a crucial next step to improve our model.

    - In this notebook we will learn how to scrape individual, game-level, pitching data from retrosheet. We will write a loop to go through and download the data. This will enable us to augment our game-level dataframe with features derived from the previous performance of the starting pitcher.

    - Let's start by going to retrosheet and finding the stats for CC Sabathia (one of my favorite pitchers from my childhood).

www.retrosheet.org

In [21]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('display.max_columns',1000)
pd.set_option('display.max_rows',1000)

import lxml
import html5lib
from urllib.request import urlopen
import time

from bs4 import BeautifulSoup
import requests

In [22]:
url = 'https://www.retrosheet.org/boxesetc/2007/Ksabac0010072007.htm'
page = requests.get(url)

In [23]:
soup = BeautifulSoup(page.content, 'html.parser')
soup

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0//EN" "https://www.w3.org/TR/REC-html40/strict.dtd">

<html dir="LTR" lang="EN">
<pre><a href="../MISC/Kdescr.htm">Read Me</a></pre>
<head>
<title>The 2007 CLE A Regular Season Pitching Log for CC Sabathia</title>
<link href="https://www.retrosheet.org/menubar/menubar.css" rel="stylesheet" type="text/css"/>
<script src="https://www.retrosheet.org/menubar/menubar.js" type="text/javascript"></script>
</head>
<body>
<p class="nopad"><a href="https://www.retrosheet.org"><img alt="Retrosheet" class="bancenter" height="46" src="https://www.retrosheet.org/menubar/retro-logo.gif" width="400"/></a></p>
<div class="mbcenter">
<ul class="nav">
<li><a href="https://www.retrosheet.org/">Home</a>
<li><a href="https://www.retrosheet.org/searches/search.html">Search</a></li>
<li><a href="#">Games/People/Parks ↓</a>
<ul>
<li><a href="#">People →</a>
<ul>
<li><a href="https://www.retrosheet.org/boxesetc/index.html#Players">Players</a>
<li><a href="https://www.r

In [24]:
#soup1 = list(soup.children)[-1]
#soup1

In [25]:
#soup2 = list(soup1.children)[-1]
#soup2

In [26]:
#soup3 = list(soup2.children)
#soup3

In [27]:
#index_num = np.where(["Opponent" in str(x) for x in soup3])[0][0]
#index_num

In [28]:
#soup4 = soup3[index_num]
#soup4

In [29]:
#soup5 = list(soup4.children)
#soup5

In [30]:
#for i in range(12):
 #   print(soup5[i].get_text().split())

In [31]:
## Given the url that refers to a specific pitcher and season
## we scrape the data and process it a bit

def get_season_pitching_data(url):    
    time.sleep(1)
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    html=list(soup.children)[-1]
    body = list(html.children)[-1]
    sec_next = list(body.children)
    secnum = np.where(["Opponent" in str(x) for x in sec_next])[0][0]
    key_section = sec_next[secnum]
    working_part = list(key_section.children)
    p_header = working_part[0].strip().split()
    mod_header= ['at_vs','Opponent','League', 'GS', 'CG', 'SHO', 'GF', 'SV', 'IP', 'H',
            'BFP', 'HR', 'R', 'ER', 'BB', 'IB', 'SO', 'SH', 'SF', 'WP', 'HBP',
            'BK', '2B', '3B', 'GDP', 'ROE', 'W', 'L', 'ERA']

    date_list = []
    day_href_list = []
    for k in range(1,len(working_part),4):
        date_list.append(working_part[k].get_text().strip())
        day_href_list.append(working_part[k].attrs['href'])

    dblhead_num_list = []
    for k in range(2,len(working_part),4):
        dblhead_num_list.append(working_part[k].strip())

    game_href_list = []
    for k in range(3,len(working_part),4):
        game_href_list.append(working_part[k].attrs['href'])

    main_data_matrix = []
    for k in range(4,len(working_part),4):
        main_data_row = (working_part[k].strip().split())[:29]
        main_data_matrix.append(main_data_row)

    out_df = pd.DataFrame(main_data_matrix, columns = mod_header)
    out_df['date'] = date_list
    out_df['dblhead_num'] = dblhead_num_list
    return(out_df)

In [32]:
#url = 'https://www.retrosheet.org/boxesetc/S/Psabac001.htm'
#page = requests.get(url)
#sup = BeautifulSoup(page.content, 'html.parser')
#sup

In [33]:
#sup2 = list(sup.children)[2]
#sup2

In [34]:
#sup3 = list(sup2.children)[5]
#sup3

In [35]:
# Plan - find the <pre> tag that starts with 'Pitching Record' (after stripping whitespace)
# Get the href attribute for all the <a> tags with the word "Daily"

#pre_tags = [x for x in sup3.find_all('pre')]
#pre_tag_text = [x.get_text().strip() for x in pre_tags]
#pre_tag_text

In [36]:
#np.where([x.startswith('Pitching Record') for x in pre_tag_text])[0][0]

In [37]:
#ind = np.where([x.startswith('Pitching Record') for x in pre_tag_text])[0][0]
#a_tags = pre_tags[ind].find_all('a')
#a_tags

In [38]:
#links = [x.attrs['href'] for x in a_tags if x.get_text()=='Daily']
#links

In [39]:
### Get the links to the pitcher-season tables given the pitcher id

def get_daily_season_links(pitcher_id):
    letter = pitcher_id.upper()[0]
    url_prefix = 'https://www.retrosheet.org/boxesetc/'
    url = url_prefix+letter+'/P'+pitcher_id+'.htm'
    time.sleep(1)
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    html=list(soup.children)
    body = list(html[2].children)[5]
    pre_texts = [x for x in body.find_all('pre')]
    secnum = np.where([x.get_text().strip().startswith('Pitching Record') for x in pre_texts])[0][0]
    a_pre_texts = pre_texts[secnum].find_all('a')
    daily_season_links = [url_prefix+x.attrs['href'][3:] for x in a_pre_texts if x.get_text()=='Daily']
    return(daily_season_links)

In [40]:
# Selects one of the links and displays season data

#get_season_pitching_data(get_daily_season_links('sabac001')[4])

In [41]:
# Get all the data for a particular pitcher
def get_full_pitching_data(pitcher_id):
    link_list = get_daily_season_links(pitcher_id)
    df_pitching = pd.DataFrame()
    for url in link_list:
        df_pitching = pd.concat((df_pitching, get_season_pitching_data(url)))
    return(df_pitching)

In [42]:
def append_2023_data(pitcher_id):
    # Get the last link from get_daily_season_links
    last_link = get_daily_season_links(pitcher_id)[-1]
    
    # Print the last link to check its format
    #print("Last Link:", last_link)
    
    # Check if the last link contains '2023'
    if '2023' in last_link:
        print("Found data for 2023.")
        
        # Load the CSV file for the pitcher ID
        filename = f'/Volumes/CharmedXi/beatVegas/SP_data/pitching_data_{pitcher_id}.csv'
        
        # Check if the CSV file already contains 2023 data
        if os.path.isfile(filename):
            df = pd.read_csv(filename)
            if '2023' in df['date'].values:
                print('2023 data already in file... Skipping...')
                return
        
        # Fetch data from the last link
        new_data = get_season_pitching_data(last_link)
        
        # Append data from the last link to the DataFrame
        if os.path.isfile(filename):
            df = pd.concat([df, new_data], ignore_index=True)
        else:
            df = new_data
        
        # Save the updated DataFrame back to the CSV file
        df.to_csv(filename, index=False)
        print(f'Data appended and saved successfully for pitcher ID {pitcher_id}.')
    else:
        print(f'No pitching data found for {pitcher_id} in 2023.')


In [43]:
cc_data = get_full_pitching_data('sabac001')

In [44]:
cc_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 561 entries, 0 to 22
Data columns (total 31 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   at_vs        561 non-null    object
 1   Opponent     561 non-null    object
 2   League       561 non-null    object
 3   GS           561 non-null    object
 4   CG           561 non-null    object
 5   SHO          561 non-null    object
 6   GF           561 non-null    object
 7   SV           561 non-null    object
 8   IP           561 non-null    object
 9   H            561 non-null    object
 10  BFP          561 non-null    object
 11  HR           561 non-null    object
 12  R            561 non-null    object
 13  ER           561 non-null    object
 14  BB           561 non-null    object
 15  IB           561 non-null    object
 16  SO           561 non-null    object
 17  SH           561 non-null    object
 18  SF           561 non-null    object
 19  WP           561 non-null    o

In [45]:
cc_data.sample(5)

Unnamed: 0,at_vs,Opponent,League,GS,CG,SHO,GF,SV,IP,H,BFP,HR,R,ER,BB,IB,SO,SH,SF,WP,HBP,BK,2B,3B,GDP,ROE,W,L,ERA,date,dblhead_num
5,VS,TOR,A,1,0,0,0,0,6,4,22,3,3,3,0,0,9,0,0,0,0,0,0,0,0,0,1,0,3.38,5- 1-2007,
21,VS,TEX,A,1,0,0,0,0,6,1,23,0,0,0,3,0,7,0,0,0,0,0,0,0,1,2,1,0,3.32,8-12-2018,
8,VS,BOS,A,1,0,0,0,0,7,4,27,1,1,1,3,0,5,0,0,0,0,0,0,0,1,0,0,0,3.43,5-18-2010,
1,AT,BAL,A,1,0,0,0,0,6,6,27,0,3,2,4,0,3,0,0,1,0,0,0,0,1,0,0,0,1.64,4- 9-2017,
19,VS,OAK,A,1,0,0,0,0,3,1,12,1,1,1,2,0,2,0,0,0,1,0,0,0,1,0,0,0,4.93,8-30-2019,


LOAD IN GAME LEVEL DATA

In [46]:
df = pd.read_csv('df_bp3.csv', low_memory=False)

In [47]:
start_pitchers_h = df.pitcher_start_id_h.unique()
start_pitchers_v = df.pitcher_start_id_v.unique()
len(start_pitchers_h), len(start_pitchers_v)

(1872, 1895)

In [48]:
start_pitchers_all = np.union1d(start_pitchers_h.astype(str), start_pitchers_v.astype(str))
len(start_pitchers_all), start_pitchers_all[:25]

(2047,
 array(['abadf001', 'abboa001', 'abboc001', 'abbop001', 'aceva001',
        'acevj002', 'adama002', 'adamc002', 'adamt001', 'adcon001',
        'adenn001', 'adlet001', 'adonj001', 'affej001', 'agrad001',
        'ainsk001', 'akink001', 'albea001', 'albem001', 'alcar001',
        'alcas001', 'alexa001', 'alexj001', 'alexs001', 'alext001'],
       dtype='<U8'))

In [49]:
'''
for p_id in start_pitchers_all:
    append_2023_data(p_id)
'''

'\nfor p_id in start_pitchers_all:\n    append_2023_data(p_id)\n'

In [50]:
# run this for everyone in the list - may take a bit to run... (This is the data I provide for you in SP_Data)
'''
for p_id in start_pitchers_all:
    print(p_id)
    try:
        df_temp = get_full_pitching_data(p_id)
    except (AttributeError, AssertionError, ValueError):
        pass

    fname_out = '/Volumes/CharmedXi/beatVegas/SP_new/pitching_data_'+p_id+'.csv'
    df_temp.to_csv(fname_out, index=False)    
'''

"\nfor p_id in start_pitchers_all:\n    print(p_id)\n    try:\n        df_temp = get_full_pitching_data(p_id)\n    except (AttributeError, AssertionError, ValueError):\n        pass\n\n    fname_out = '/Volumes/CharmedXi/beatVegas/SP_new/pitching_data_'+p_id+'.csv'\n    df_temp.to_csv(fname_out, index=False)    \n"

In [51]:
def count_files(directory):
    """
    Count the number of files in a directory.
    
    Args:
    - directory (str): The path to the directory.
    
    Returns:
    - int: The number of files in the directory.
    """
    # Initialize a counter variable
    file_count = 0
    
    # Iterate through the files in the directory
    for file in os.listdir(directory):
        # Check if the path is a file
        if os.path.isfile(os.path.join(directory, file)):
            # Increment the counter
            file_count += 1
    
    return file_count

In [54]:
# Example usage:
directory_path = '/Volumes/CharmedXi/beatVegas/SP_2000'
print("Number of files in directory:", count_files(directory_path))

Number of files in directory: 2047
