This notebook scrapes game-by-game performance data for individual starting pitchers from Baseball-Reference.

In [25]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import matplotlib.pyplot as plt
import pandas as pd
import requests
import urllib.request
import numpy as np
from datetime import date
import time

from bs4 import BeautifulSoup

In [27]:
# Roster of starting pitchers (display name + Baseball-Reference player code).
# The CSV was assembled by hand from b-ref; it changes rarely enough that
# automating the export has not been worth it so far.
all_starters = pd.read_csv('all_starters.csv')

# Preview the first five rows as a sanity check.
all_starters.head()

Unnamed: 0,name,code
0,Miles Mikolas,mikolmi01
1,Chris Bassitt,bassich01
2,Dylan Cease,ceasedy01
3,Gerrit Cole,colege01
4,Zac Gallen,galleza01


In [29]:
# Scrape each starter's game log from Baseball-Reference, clean it, and stack
# the per-pitcher logs into one frame, `pitcher_data` (one row per appearance).

# Season to scrape; used for both the request URL and the rebuilt dates.
SEASON = 2023

# Share of `all_starters` to process. 0.02 is a small trial run;
# set this to 1 when you're ready to go full scale.
SAMPLE_FRACTION = 0.02

# Pause between requests, b/c baseball-reference has a 20 request/min limit.
REQUEST_DELAY_SECONDS = 3.2

# Game-log columns to keep; everything else is dropped.
KEEP_COLS = ['Date', 'Opp', 'Dec', 'DR',
             'IP', 'H', 'ER', 'BB', 'SO', 'HR', 'HBP',
             'FIP']

# Month abbreviation -> month number (as text, for building a date string).
MONTH_NUMBERS = {
    'Mar': '3', 'Apr': '4', 'May': '5', 'Jun': '6',
    'Jul': '7', 'Aug': '8', 'Sep': '9', 'Oct': '10',
}

# First letter of the decision -> score used in the fantasy-points formula.
# A missing decision is filled with '0' before this mapping is applied.
DECISION_CODES = {'W': '1', 'L': '-1', 'B': '-1', 'S': '2', 'H': '0'}

# Collect the cleaned per-pitcher frames and concatenate once at the end.
# (DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop re-copies all accumulated rows on every iteration.)
pitcher_frames = []

n_pitchers = round(SAMPLE_FRACTION * len(all_starters))

for i in range(n_pitchers):

    code = all_starters.loc[i, 'code']
    name = all_starters.loc[i, 'name']

    # Request setup: the first table on the page is used as the game log.
    url = f'https://www.baseball-reference.com/players/gl.fcgi?id={code}&t=p&year={SEASON}'
    df = pd.read_html(url)[0]

    # # DATA CLEANING # #

    # Drop rows whose 'Rk' is not numeric, then renumber rows 0..n-1.
    df = df[pd.to_numeric(df['Rk'], errors='coerce').notnull()]
    df = df.reset_index(drop=True)

    # Keep only the desired columns. Copy so the column assignments below
    # write to an independent frame rather than a slice of the scraped table.
    df = df[KEEP_COLS].copy()

    # Now let's make the date column usable.
    # Strip the "(n)" suffix that marks double-header games.
    df['Date'] = df['Date'].str.split("(", expand=True).loc[:, 0]

    # Split "<month> <day>" into its two parts.
    date_parts = df['Date'].str.split(n=1, expand=True)
    months = date_parts.loc[:, 0]
    days = date_parts.loc[:, 1]

    # Convert month abbreviation to numerical month.
    for abbreviation, number in MONTH_NUMBERS.items():
        months = months.str.replace(abbreviation, number, regex=False)

    # Build the date string "<season>-<month>-<day>".
    df['Date'] = str(SEASON) + "-" + months + "-" + days

    # Clean Decision series: keep the first letter, treat missing as '0',
    # then translate letters to their numeric codes.
    df['Dec'] = df['Dec'].str[0].fillna('0')
    for letter, value in DECISION_CODES.items():
        df['Dec'] = df['Dec'].str.replace(letter, value, regex=False)

    # Make every cell a string before the steps below; numeric dtypes are
    # restored further down. The .str operations need string input, and the
    # scraper may have inferred some columns as numbers.
    # (Column-wise Series.map replaces the deprecated DataFrame.applymap.)
    df = df.apply(lambda column: column.map(str))

    # Convert IP ("whole.part", where part counts thirds of an inning) to Outs,
    # then drop the IP series.
    ip_parts = df['IP'].str.split(".", expand=True)
    df['Outs'] = 3*ip_parts[0].astype('int') + ip_parts[1].astype('int')
    df = df.drop(columns=['IP'])

    # DR can arrive as a float rendered as a string (e.g. "5.0"), which can't
    # be cast straight to int; go through float first.
    df['DR'] = df['DR'].astype('float')

    # Cast numerical datatypes as numbers.
    df = df.astype({
        'Dec': 'int',
        'DR': 'int',
        'H': 'int',
        'ER': 'int',
        'BB': 'int',
        'SO': 'int',
        'HR': 'int',
        'HBP': 'int',
        'FIP': 'float',
        'Outs': 'int',
    })

    # Convert date string to date type.
    df['Date'] = pd.to_datetime(df['Date'])

    # Calculate fantasy points.
    df['points'] = 3*df['Dec'] - df['H'] - 2*df['ER'] - df['BB'] + df['SO'] - df['HBP'] + df['Outs']

    # Quality start flag QS: at most 3 earned runs over at least 18 outs.
    # Computed for every row at once; the earlier row loop started at index 1
    # and left the first game of each log without a value. Stored as float
    # (1.0 / 0.0) to keep the column's dtype the same as before.
    df['QS'] = ((df['ER'] <= 3) & (df['Outs'] >= 18)).astype('float')

    # Add in pitcher name and code.
    df['pitcher'] = name
    df['code'] = code

    # Drop starts following 30+ days rest.
    df = df[df['DR'] < 30]

    # Queue it for the overall pitcher data df.
    pitcher_frames.append(df)

    # Let me know we're done with that player.
    print(f'Done adding {name}')

    # Respect the request limit before fetching the next page.
    time.sleep(REQUEST_DELAY_SECONDS)

# Combine everything; fall back to an empty frame when nothing was scraped
# (pd.concat raises on an empty list).
pitcher_data = (
    pd.concat(pitcher_frames, ignore_index=True) if pitcher_frames else pd.DataFrame()
)

print("Done")
pitcher_data


Done adding Miles Mikolas
Done adding Chris Bassitt
Done adding Dylan Cease
Done adding Gerrit Cole
Done adding Zac Gallen
Done adding Logan Webb
Done adding Sandy Alcantara
Done


Unnamed: 0,Date,Opp,Dec,DR,H,ER,BB,SO,HR,HBP,FIP,Outs,points,QS,pitcher,code
0,2023-04-05,ATL,-1,5,9,5,1,6,1,0,2.72,18,1,0.0,Miles Mikolas,mikolmi01
1,2023-04-11,COL,0,5,10,6,2,3,3,0,5.63,15,-6,0.0,Miles Mikolas,mikolmi01
2,2023-04-16,PIT,0,4,7,2,2,4,0,0,4.86,17,8,0.0,Miles Mikolas,mikolmi01
3,2023-04-22,SEA,0,5,5,3,2,4,2,0,5.47,16,7,0.0,Miles Mikolas,mikolmi01
4,2023-04-27,SFG,1,4,4,0,2,6,0,1,4.93,19,21,1.0,Miles Mikolas,mikolmi01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197,2023-08-12,NYY,1,5,5,1,2,10,0,0,3.78,27,31,1.0,Sandy Alcantara,alcansa01
198,2023-08-18,LAD,1,5,7,3,1,6,3,0,3.95,18,13,1.0,Sandy Alcantara,alcansa01
199,2023-08-23,SDP,-1,4,7,4,3,3,1,0,4.01,20,2,0.0,Sandy Alcantara,alcansa01
200,2023-08-29,TBR,-1,5,7,4,2,4,0,1,4.00,17,0,0.0,Sandy Alcantara,alcansa01


In [30]:
# Persist the combined game logs; the row index is not written to the file.
pitcher_data.to_csv('pitcher_data.csv', index=False)