Baseball Prediction: 5a - Getting (Raw) Individual Pitcher Data

    - In the previous notebook, we compared our simple, hitting-only model to the Las Vegas odds. We concluded that incorporating the starting pitcher information would be a crucial next step to improve our model.

    - In this notebook we will learn how to scrape individual, game-level pitching data from Retrosheet. We will write a loop to go through and download the data. This will enable us to augment our game-level dataframe with features derived from the previous performance of the starting pitcher.

    - Let's start by going to Retrosheet and finding the stats for CC Sabathia (one of my favorite pitchers from my childhood).

www.retrosheet.org

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('display.max_columns',1000)
pd.set_option('display.max_rows',1000)

import lxml
import html5lib
from urllib.request import urlopen
import time

from bs4 import BeautifulSoup
import requests

In [2]:
# Worked example: fetch a single daily pitching log page (the page title in the
# output below reads "The 2007 CLE A Regular Season Pitching Log for CC Sabathia").
url = 'https://www.retrosheet.org/boxesetc/2007/Ksabac0010072007.htm'
page = requests.get(url)

In [3]:
# Parse the downloaded HTML; the bare `soup` on the last line displays the
# whole parsed document as the cell output.
soup = BeautifulSoup(page.content, 'html.parser')
soup

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0//EN" "https://www.w3.org/TR/REC-html40/strict.dtd">

<html dir="LTR" lang="EN">
<pre><a href="../MISC/Kdescr.htm">Read Me</a></pre>
<head>
<title>The 2007 CLE A Regular Season Pitching Log for CC Sabathia</title>
<link href="https://www.retrosheet.org/menubar/menubar.css" rel="stylesheet" type="text/css"/>
<script src="https://www.retrosheet.org/menubar/menubar.js" type="text/javascript"></script>
</head>
<body>
<p class="nopad"><a href="https://www.retrosheet.org"><img alt="Retrosheet" class="bancenter" height="46" src="https://www.retrosheet.org/menubar/retro-logo.gif" width="400"/></a></p>
<div class="mbcenter">
<ul class="nav">
<li><a href="https://www.retrosheet.org/">Home</a>
<li><a href="https://www.retrosheet.org/searches/search.html">Search</a></li>
<li><a href="#">Games/People/Parks ↓</a>
<ul>
<li><a href="#">People →</a>
<ul>
<li><a href="https://www.retrosheet.org/boxesetc/index.html#Players">Players</a>
<li><a href="https://www.r

In [None]:
#soup1 = list(soup.children)[-1]
#soup1

In [None]:
#soup2 = list(soup1.children)[-1]
#soup2

In [None]:
#soup3 = list(soup2.children)
#soup3

In [None]:
#index_num = np.where(["Opponent" in str(x) for x in soup3])[0][0]
#index_num

In [None]:
#soup4 = soup3[index_num]
#soup4

In [None]:
#soup5 = list(soup4.children)
#soup5

In [None]:
#for i in range(12):
 #   print(soup5[i].get_text().split())

In [4]:
## Given the url that refers to a specific pitcher and season
## we scrape the data and process it a bit

def get_season_pitching_data(url):
    """Download one daily pitching log page and return it as a DataFrame.

    Args:
    - url (str): Address of the page, fetched with ``requests.get``.

    Returns:
    - pandas.DataFrame: One row per data line in the page section that
      mentions "Opponent". The first 29 whitespace-separated tokens of each
      line are stored as strings under the fixed column names listed below,
      followed by ``date`` and ``dblhead_num`` columns taken from the
      neighbouring nodes.

    Notes:
    - Sleeps one second before every request.
    - Assumes the section's children are a header text node followed by
      repeating groups of four nodes (date link, text, game link, data text)
      -- TODO confirm against the current page layout.
    """
    time.sleep(1)
    response = requests.get(url)
    document = BeautifulSoup(response.content, 'html.parser')

    # Walk down: last child of the document, then the last child of that node.
    outer_node = list(document.children)[-1]
    inner_node = list(outer_node.children)[-1]
    candidates = list(inner_node.children)

    # Use the first child whose markup contains the word "Opponent".
    has_opponent = ["Opponent" in str(node) for node in candidates]
    first_hit = np.where(has_opponent)[0][0]
    nodes = list(candidates[first_hit].children)

    # The page's own header tokens are parsed here but are not used for the
    # column names; the fixed list below is used instead.
    p_header = nodes[0].strip().split()
    column_names = [
        'at_vs', 'Opponent', 'League', 'GS', 'CG', 'SHO', 'GF', 'SV', 'IP', 'H',
        'BFP', 'HR', 'R', 'ER', 'BB', 'IB', 'SO', 'SH', 'SF', 'WP', 'HBP',
        'BK', '2B', '3B', 'GDP', 'ROE', 'W', 'L', 'ERA',
    ]

    # Nodes 1, 5, 9, ...: the link text becomes the date; the href is read
    # but is not part of the returned frame.
    date_list = []
    day_href_list = []
    for date_link in nodes[1::4]:
        date_list.append(date_link.get_text().strip())
        day_href_list.append(date_link.attrs['href'])

    # Nodes 2, 6, 10, ...: text stored in the dblhead_num column.
    dblhead_num_list = [text_node.strip() for text_node in nodes[2::4]]

    # Nodes 3, 7, 11, ...: href is read but is not part of the returned frame.
    game_href_list = [game_link.attrs['href'] for game_link in nodes[3::4]]

    # Nodes 4, 8, 12, ...: the data line itself, truncated to 29 tokens.
    rows = [text_node.strip().split()[:29] for text_node in nodes[4::4]]

    out_df = pd.DataFrame(rows, columns=column_names)
    out_df['date'] = date_list
    out_df['dblhead_num'] = dblhead_num_list
    return out_df

In [None]:
#url = 'https://www.retrosheet.org/boxesetc/S/Psabac001.htm'
#page = requests.get(url)
#sup = BeautifulSoup(page.content, 'html.parser')
#sup

In [None]:
#sup2 = list(sup.children)[2]
#sup2

In [None]:
#sup3 = list(sup2.children)[5]
#sup3

In [None]:
# Plan - find the <pre> tag that starts with 'Pitching Record' (after stripping whitespace)
# Get the href attribute for all the <a> tags with the word "Daily"

#pre_tags = [x for x in sup3.find_all('pre')]
#pre_tag_text = [x.get_text().strip() for x in pre_tags]
#pre_tag_text

In [None]:
#np.where([x.startswith('Pitching Record') for x in pre_tag_text])[0][0]

In [None]:
#ind = np.where([x.startswith('Pitching Record') for x in pre_tag_text])[0][0]
#a_tags = pre_tags[ind].find_all('a')
#a_tags

In [None]:
#links = [x.attrs['href'] for x in a_tags if x.get_text()=='Daily']
#links

In [5]:
### Get the links to the pitcher-season tables given the pitcher id

def get_daily_season_links(pitcher_id):
    """Return the URLs of a pitcher's per-season "Daily" pitching log pages.

    Args:
    - pitcher_id (str): Player id such as 'sabac001'. Its upper-cased first
      character selects the sub-directory of the player page.

    Returns:
    - list of str: One absolute URL per <a> tag whose text is exactly
      'Daily' inside the first <pre> block that starts with
      'Pitching Record'.

    Notes:
    - Sleeps one second before the request.
    - The player page is navigated by fixed child positions (index 2, then
      index 5), so it depends on the page layout staying the same.
    """
    url_prefix = 'https://www.retrosheet.org/boxesetc/'
    initial = pitcher_id.upper()[0]
    player_url = f"{url_prefix}{initial}/P{pitcher_id}.htm"

    time.sleep(1)
    response = requests.get(player_url)
    document = BeautifulSoup(response.content, 'html.parser')

    top_nodes = list(document.children)
    content_node = list(top_nodes[2].children)[5]
    pre_blocks = list(content_node.find_all('pre'))

    # Position of the first <pre> whose stripped text starts with the heading.
    starts_with_record = [
        block.get_text().strip().startswith('Pitching Record')
        for block in pre_blocks
    ]
    record_pos = np.where(starts_with_record)[0][0]

    daily_season_links = []
    for anchor in pre_blocks[record_pos].find_all('a'):
        if anchor.get_text() == 'Daily':
            # Drop the first three characters of the href (presumably a
            # leading '../' -- confirm) before joining it to the prefix.
            daily_season_links.append(url_prefix + anchor.attrs['href'][3:])
    return daily_season_links

In [None]:
# Selects one of the links and displays season data

#get_season_pitching_data(get_daily_season_links('sabac001')[4])

In [6]:
# Get all the data for a particular pitcher
def get_full_pitching_data(pitcher_id):
    """Download and stack every season's daily pitching log for one pitcher.

    Args:
    - pitcher_id (str): Player id passed to ``get_daily_season_links``.

    Returns:
    - pandas.DataFrame: The per-season frames from
      ``get_season_pitching_data`` stacked in link order. Each season keeps
      its own 0-based row labels, so the index is not unique. An empty
      DataFrame is returned when the pitcher has no season links.
    """
    # Collect the season frames first and concatenate once: growing a frame
    # with pd.concat inside the loop re-copies all earlier rows on every pass
    # and starts from an empty frame, which recent pandas versions warn about.
    season_frames = [
        get_season_pitching_data(url)
        for url in get_daily_season_links(pitcher_id)
    ]
    if not season_frames:
        # pd.concat raises on an empty list; keep the original empty result.
        return pd.DataFrame()
    return pd.concat(season_frames)

In [15]:
def append_2023_data(pitcher_id, data_dir='/Volumes/CharmedXi/beatVegas/SP_data'):
    """Append a pitcher's 2023 daily log to their CSV file, if not already there.

    Args:
    - pitcher_id (str): Player id passed to ``get_daily_season_links``.
    - data_dir (str): Directory holding ``pitching_data_<pitcher_id>.csv``.
      Defaults to the location used by the original notebook.

    Returns:
    - None. Progress is reported with ``print``; the CSV is created or
      rewritten in place when new rows are added.

    Behaviour:
    - Only the last season link is considered, and only if its URL contains
      '2023'.
    - If the CSV already holds any row whose ``date`` ends in '2023', nothing
      is downloaded or written.
    """
    season = '2023'

    season_links = get_daily_season_links(pitcher_id)
    if not season_links:
        # No season pages at all: report it instead of failing on [-1].
        print(f'No pitching data found for {pitcher_id} in 2023.')
        return

    last_link = season_links[-1]
    # Print the last link to check its format
    print("Last Link:", last_link)

    if season not in last_link:
        print(f'No pitching data found for {pitcher_id} in 2023.')
        return
    print("Found data for 2023.")

    filename = os.path.join(data_dir, f'pitching_data_{pitcher_id}.csv')
    existing = pd.read_csv(filename) if os.path.isfile(filename) else None

    if existing is not None:
        # Dates are stored as 'M-D-YYYY' strings, so look at the year suffix.
        # Testing `'2023' in values` compares whole elements and never
        # matches, which let repeated runs append duplicate 2023 rows.
        dates = existing['date'].astype(str).str.strip()
        if dates.str.endswith(season).any():
            print('2023 data already in file... Skipping...')
            return

    # Fetch data from the last link
    new_data = get_season_pitching_data(last_link)

    if existing is not None:
        df = pd.concat([existing, new_data], ignore_index=True)
    else:
        df = new_data

    # Save the updated DataFrame back to the CSV file
    df.to_csv(filename, index=False)
    print(f'Data appended and saved successfully for pitcher ID {pitcher_id}.')


In [8]:
# Download every season log for pitcher id 'sabac001': one request for the
# player page plus one per season link, each preceded by a 1-second sleep.
cc_data = get_full_pitching_data('sabac001')

In [9]:
cc_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 561 entries, 0 to 22
Data columns (total 31 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   at_vs        561 non-null    object
 1   Opponent     561 non-null    object
 2   League       561 non-null    object
 3   GS           561 non-null    object
 4   CG           561 non-null    object
 5   SHO          561 non-null    object
 6   GF           561 non-null    object
 7   SV           561 non-null    object
 8   IP           561 non-null    object
 9   H            561 non-null    object
 10  BFP          561 non-null    object
 11  HR           561 non-null    object
 12  R            561 non-null    object
 13  ER           561 non-null    object
 14  BB           561 non-null    object
 15  IB           561 non-null    object
 16  SO           561 non-null    object
 17  SH           561 non-null    object
 18  SF           561 non-null    object
 19  WP           561 non-null    o

In [10]:
cc_data.sample(5)

Unnamed: 0,at_vs,Opponent,League,GS,CG,SHO,GF,SV,IP,H,BFP,HR,R,ER,BB,IB,SO,SH,SF,WP,HBP,BK,2B,3B,GDP,ROE,W,L,ERA,date,dblhead_num
8,VS,TEX,A,1,0,0,0,0,2.1,7,15,0,6,6,1,0,4,0,0,1,0,0,0,0,0,0,0,1,5.47,5-23-2015,
7,AT,TEX,A,1,0,0,0,0,6.0,5,29,0,5,3,4,0,2,0,1,0,0,0,0,0,0,3,1,0,2.89,5- 8-2011,
3,VS,OAK,A,1,0,0,0,0,6.2,6,31,1,7,6,4,0,2,1,0,0,0,0,0,0,0,1,0,0,4.81,4-22-2009,
16,AT,MIN,A,1,1,0,0,0,9.0,4,34,1,1,1,1,0,5,1,0,0,2,0,0,0,0,0,1,0,3.27,7- 3-2003,
33,AT,TB,A,1,0,0,0,0,2.2,8,22,0,9,5,5,1,3,0,0,0,1,0,2,1,0,1,0,1,3.37,10- 2-2009,


LOAD IN GAME LEVEL DATA

In [11]:
df = pd.read_csv('df_bp3.csv', low_memory=False)

In [12]:
# Distinct starting-pitcher ids from each of the two id columns
# (presumably _h = home and _v = visitor -- confirm against the df_bp3 schema).
start_pitchers_h = df.pitcher_start_id_h.unique()
start_pitchers_v = df.pitcher_start_id_v.unique()
len(start_pitchers_h), len(start_pitchers_v)

(1872, 1895)

In [13]:
# Merge both id arrays into one sorted array of unique ids (np.union1d).
# NOTE(review): astype(str) turns a missing id (NaN) into the string 'nan';
# check whether the combined list contains such an entry before scraping.
start_pitchers_all = np.union1d(start_pitchers_h.astype(str), start_pitchers_v.astype(str))
len(start_pitchers_all), start_pitchers_all[:25]

(2047,
 array(['abadf001', 'abboa001', 'abboc001', 'abbop001', 'aceva001',
        'acevj002', 'adama002', 'adamc002', 'adamt001', 'adcon001',
        'adenn001', 'adlet001', 'adonj001', 'affej001', 'agrad001',
        'ainsk001', 'akink001', 'albea001', 'albem001', 'alcar001',
        'alcas001', 'alexa001', 'alexj001', 'alexs001', 'alext001'],
       dtype='<U8'))

In [None]:
# Try to append the 2023 season log to every starting pitcher's CSV.
# Each call makes at least one HTTP request, and nothing here catches
# exceptions, so a single failing pitcher stops the loop.
for p_id in start_pitchers_all:
    append_2023_data(p_id)

In [17]:
# Run this for everyone in the list - may take a bit to run...
# (This is the data I provide for you in SP_Data)

for p_id in start_pitchers_all:
    print(p_id)
    try:
        df_temp = get_full_pitching_data(p_id)
    except (AttributeError, AssertionError, ValueError) as err:
        # Skip this pitcher. Falling through would write the previous
        # pitcher's frame under this pitcher's filename (or raise NameError
        # if the very first download failed).
        print(f'  skipped {p_id}: {type(err).__name__}: {err}')
        continue

    fname_out = '/Volumes/CharmedXi/beatVegas/SP_new/pitching_data_'+p_id+'.csv'
    df_temp.to_csv(fname_out, index=False)

abadf001
abboa001
abboc001
abbop001
aceva001
acevj002
adama002
adamc002
adamt001
adcon001
adenn001
adlet001
adonj001
affej001
agrad001
ainsk001
akink001
albea001
albem001
alcar001
alcas001
alexa001
alexj001
alexs001
alext001
allak001
allel002
allel003
almac001
alvaa001
alvah001
alvaj003
alvaj004
alvav001
alvaw001
alzoa001
andeb002
andeb004
andec001
andec002
anded003
andei001
andej002
andes002
andet002
andet003
andrc001
andrm001
ankir001
antot001
appik001
aquij001
archc001
ariaa002
arihk001
armat002
armer001
armss001
arnoj001
arrij001
arrob001
arror001
asenm001
ashba002
ashba003
ashcg001
ashea001
assaj001
astae001
astap001
atchs001
atill001
atkim001
augeb001
aumop001
austj002
avilp001
axeld001
axfoj001
aybam001
backb001
bacsm001
badeb001
baekc001
baezd001
baezm001
bailh001
bakeb001
bakes002
baldj002
balec001
balej001
balfg001
banda001
bankj001
bankt001
bannb001
banum001
barac001
barcl001
bardd001
bardl001
barnc002
barnj002
barnm001
barod001
barrj003
barrk001
bartj002
bassa001
bassb001
b

In [18]:
def count_files(directory):
    """
    Return how many entries directly inside a directory are regular files.

    Sub-directories are not counted and are not descended into.

    Args:
    - directory (str): The path to the directory.

    Returns:
    - int: The number of files in the directory.
    """
    entry_paths = (os.path.join(directory, name) for name in os.listdir(directory))
    return sum(1 for path in entry_paths if os.path.isfile(path))

In [20]:
# Example usage: count the CSV files written by the download loop, to compare
# against the number of pitcher ids in start_pitchers_all.
directory_path = '/Volumes/CharmedXi/beatVegas/SP_new'
print("Number of files in directory:", count_files(directory_path))

Number of files in directory: 2047
