In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import matplotlib.pyplot as plt
import pandas as pd

import requests
import urllib.request

import numpy as np
import math
import time
from datetime import date, datetime, timedelta

from bs4 import BeautifulSoup

start_time = time.time()


In [2]:
# How much do you want to scale back this run? Pick a number between 0 (none) and 1 (all)

t = 1


This is the section where I pull the list of probable pitchers from b-ref

In [3]:
# Request setup
url = "https://www.baseball-reference.com/previews/"

with urllib.request.urlopen(url) as response:
    html = response.read()

soup = BeautifulSoup(html, 'html.parser')

# Get the list of probables
probables = []
probables = pd.DataFrame()

# We're going to loop through this list of matchups
matchups = soup.find_all('div', class_='game_summary nohover')

# For each matchup in the full list of matchups
for matchup in matchups:

    # Get the first pitcher's name
    p1_name = matchup.find_all('a')[3].text

    # Get the first pitcher's code
    try: p1_code = matchup.find_all('a')[3].get('href').split("/")[5].split(".")[0]
    except: p1_code = np.nan

    # Get the first pitcher's team
    team1 = matchup.find_all('strong')[0].text

    # Get the second pitcher's name
    try: p2_name = matchup.find_all('a')[4].text
    except: p2_name = np.nan

    # Get the second pitcher's code
    try: p2_code = matchup.find_all('a')[4].get('href').split("/")[5].split(".")[0]
    except: p2_code = np.nan

    # Get the second pitcher's team
    # Had to add a function that handles if there is a debut in the matchup,
    # Which would otherwise change the list of 'strong' divs in the matchup
    debut_adj = math.ceil(len(matchup.find_all('strong'))/2)
    try: team2 = matchup.find_all('strong')[debut_adj].text
    except: team2 = np.nan

    # Create a dictionary for the first pitcher and their matchup
    pitcher1 = {
        'date':date.today(),
        'name':p1_name,
        'code':p1_code,
        'for':team1,
        'against':team2
    }

    # Create a dictionary for the second pitcher and their matchup
    pitcher2 = {
        'date':date.today(),
        'name':p2_name,
        'code':p2_code,
        'for':team2,
        'against':team1
    }

    # Put both pitchers into a list of probable matchups, separately
    probables = probables.append(pitcher1, ignore_index=True)
    probables = probables.append(pitcher2, ignore_index=True)

# Export to csv
probables.to_csv('probables.csv',index = False)

probables

Unnamed: 0,date,name,code,for,against
0,2023-09-14,Josiah Gray,grayjo03,WSN,PIT
1,2023-09-14,Mitch Keller,kellemi03,PIT,WSN
2,2023-09-14,Derek Law,lawde01,CIN,DET
3,2023-09-14,Reese Olson,olsonre01,DET,CIN
4,2023-09-14,Michael King,kingmi01,NYY,BOS
5,2023-09-14,Tanner Houck,houckta01,BOS,NYY
6,2023-09-14,Eury Pérez,perezeu02,MIA,MIL
7,2023-09-14,Adrian Houser,housead01,MIL,MIA
8,2023-09-14,Merrill Kelly,kellyme01,ARI,NYM
9,2023-09-14,Kodai Senga,sengako01,NYM,ARI


### This is the section where I scrape individual starting pitchers' individual game performance

#### Things it'd be cool to add here:
Add a counter that shows the total number of pitchers logged, compared to the total number fed in, so I can see how much loss there was when we triggered the except/pass code\
\
Add in a we could try to scrape away some time by ignoring pitchchers who haven't pitched since the last time we logged? Probably won't save that much time, depending on how much time it has been.\
\
How valuable is the information we are getting from the guys who are towards the end of the list? The list of pitchers is sorted by number of starts, so cutting the bottom 20% of the list will take away 20% of the run time, but only ~10% of the data. But maybe it's important to have representation for those pitchers with only a few starts?\
\
Some way for me to more easily pick which variables I want to choose. Like maybe have a dictionary with stats as the keys, and booleans as the values, so I can just go through, try out different combinations.

In [4]:
# This is a dataset drawn manually from b-ref.
# There is probably a slick way to scrape this
# but this only has to be updated every once in a while, not all the time
# So manually works for now.
all_starters = pd.read_csv('all_starters.csv')

# Initialize df
pitcher_data = pd.DataFrame()

# For every starter we have
for i in range(round(t*len(all_starters))):
    try:
        code = all_starters.loc[i,'code']
        name = all_starters.loc[i,'name']

        # Request setup
        url = f'https://www.baseball-reference.com/players/gl.fcgi?id={code}&t=p&year=2023'
        df = pd.read_html(url)
        df = pd.DataFrame(df[0])

        # # DATA CLEANING # #

        # Drop weird rows
        df = df[pd.to_numeric(df['Rk'], errors = 'coerce').notnull()]

        # Reset index after dropping some rows
        df = df.reset_index()

        # Identify desired columns to keep and drop others
        keep_cols = ['Date','Opp','Dec', 'DR',
                     'IP', 'H', 'ER', 'BB', 'SO', 'HR', 'HBP',
                     'FIP']

        df = df[keep_cols]

        # Now let's make the date column usable
        # Clean away situations where there's a double-header impacting the date
        df['Date'] = df['Date'].str.split("(", expand = True).loc[:,0]

        # Extract day from date
        days = df['Date'].str.split(n=1,expand = True).loc[:,1]

        # Extract month from date
        months = df['Date'].str.split(n=1,expand = True).loc[:,0]

        # Convert month to numerical month
        months = months.str.replace('Oct','10')
        months = months.str.replace('Sep','9')
        months = months.str.replace('Aug','8')
        months = months.str.replace('Jul','7')
        months = months.str.replace('Jun','6')
        months = months.str.replace('May','5')
        months = months.str.replace('Apr','4')
        months = months.str.replace('Mar','3')

        # Create year series
        df['years'] = '2023'

        # Build date string
        df['Date'] = df['years'] + "-" + months + "-" + days

        # Drop year series
        df = df.drop(columns = ['years'])

        # Clean Decision series
        df['Dec'] = df['Dec'].str[0]

        df['Dec'] = df['Dec'].fillna('0')

        df['Dec'] = df['Dec'].str.replace('W','1')
        df['Dec'] = df['Dec'].str.replace('L','-1')
        df['Dec'] = df['Dec'].str.replace('B','-1')
        df['Dec'] = df['Dec'].str.replace('S','2')
        df['Dec'] = df['Dec'].str.replace('H','0')

        # Make everything a string before the step below. Don't worry, we'll fix this later on.
        # This is because some of our data processing functions require string inputs,
        # and our data scraper might accidentally infer some things as numbers.
        # We have to do this
        df = df.applymap(str)

        # Make outs series, convert IP to Outs, then drop IP series
        whole = df['IP'].str.split(".", expand = True)[0].astype('int')
        part = df['IP'].str.split(".", expand = True)[1].astype('int')

        df['Outs'] = 3*whole + part

        df = df.drop(columns = ['IP'])

        # Sometimes my DR column gets interpreted as a float, that gets turned into a string,
        # that can't be turned directly into an int. But it can be interpreted as a float, from a string,
        # so we do that first, and then we can convert it to an int without issue.
        df['DR'] = df['DR'].astype('float')

        # Cast numerical datatypes as numbers
        df = df.astype({
            'Dec':'int',
            'DR':'int',
            'H':'int',
            'ER':'int',
            'BB':'int',
            'SO':'int',
            'HR':'int',
            'HBP':'int',
            'FIP':'float',
            'Outs':'int',
        })

        # Convert date string to date type
        df['Date'] = pd.to_datetime(df['Date'])

        # Calculate fantasy points
        df['points'] = 3*df['Dec'] - df['H'] - 2*df['ER'] - df['BB'] + df['SO'] - df['HBP'] + df['Outs']

        # Create quality start series QS
        for row in range(1,len(df)):
            if df.loc[row,'ER']<=3 and df.loc[row,'Outs']>=18: df.loc[row,'QS'] = 1
            else: df.loc[row,'QS'] = 0

        # Add in pitcher name and code
        df['pitcher'] = name
        df['code'] = code

        # Drop starts following 30+ days rest
        df = df[df['DR'] <30]

        # Add it to the overall pitcher data df
        pitcher_data = pitcher_data.append(df, ignore_index=True)

        # Let me know we're done with that player
        print(f'Done adding {name} {i}/{len(all_starters)}')

        # Wait for 3.2 seconds, b/c baseball-reference has a 20 request/min limit
        time.sleep(3.2)
    except: pass

# Export to csv
pitcher_data.to_csv('pitcher_data.csv',index = False)

print("Done")

pitcher_data


Done adding Miles Mikolas 0/357
Done adding Chris Bassitt 1/357
Done adding Dylan Cease 2/357
Done adding Gerrit Cole 3/357
Done adding Zac Gallen 4/357
Done adding Logan Webb 5/357
Done adding Sandy Alcantara 6/357
Done adding José Berríos 7/357
Done adding Corbin Burnes 8/357
Done adding Luis Castillo 9/357
Done adding Patrick Corbin 10/357
Done adding Kyle Gibson 11/357
Done adding Lucas Giolito 12/357
Done adding Sonny Gray 13/357
Done adding Mitch Keller 14/357
Done adding Dean Kremer 15/357
Done adding Pablo López 16/357
Done adding Jesús Luzardo 17/357
Done adding Aaron Nola 18/357
Done adding Johan Oviedo 19/357
Done adding Blake Snell 20/357
Done adding Zach Eflin 21/357
Done adding Bryce Elder 22/357
Done adding Kyle Freeland 23/357
Done adding Kevin Gausman 24/357
Done adding Logan Gilbert 25/357
Done adding Austin Gomber 26/357
Done adding Josiah Gray 27/357
Done adding Andrew Heaney 28/357
Done adding Rich Hill 29/357
Done adding Yusei Kikuchi 30/357
Done adding Lance Lynn

Done adding AJ Smith-Shawver 248/357
Done adding Randy Vásquez 249/357
Done adding Kolby Allard 250/357
Done adding Tanner Banks 251/357
Done adding Jake Bird 252/357
Done adding Kris Bubic 253/357
Done adding Austin Cox 254/357
Done adding Josh Fleming 255/357
Done adding Ian Hamilton 256/357
Done adding Emerson Hancock 257/357
Done adding Kyle Harrison 258/357
Done adding Drey Jameson 259/357
Done adding Karl Kauffmann 260/357
Done adding Trevor Kelley 261/357
Done adding Connor Overton 262/357
Done adding Denyi Reyes 263/357
Done adding Lyon Richardson 264/357
Done adding Trevor Richards 265/357
Done adding Drew Rom 266/357
Done adding Jeffrey Springs 267/357
Done adding Gavin Stone 268/357
Done adding Will Vest 269/357
Done adding Allan Winans 270/357
Done adding Ángel Zerpa 271/357
Done adding Tristan Beck 272/357
Done adding Ryan Borucki 273/357
Done adding José Butto 274/357
Done adding Taylor Clarke 275/357
Done adding José Cuas 276/357
Done adding Javy Guerra 277/357
Done addi

Unnamed: 0,Date,Opp,Dec,DR,H,ER,BB,SO,HR,HBP,FIP,Outs,points,QS,pitcher,code
0,2023-04-05,ATL,-1,5,9,5,1,6,1,0,2.72,18,1,0.0,Miles Mikolas,mikolmi01
1,2023-04-11,COL,0,5,10,6,2,3,3,0,5.63,15,-6,0.0,Miles Mikolas,mikolmi01
2,2023-04-16,PIT,0,4,7,2,2,4,0,0,4.86,17,8,0.0,Miles Mikolas,mikolmi01
3,2023-04-22,SEA,0,5,5,3,2,4,2,0,5.47,16,7,0.0,Miles Mikolas,mikolmi01
4,2023-04-27,SFG,1,4,4,0,2,6,0,1,4.93,19,21,1.0,Miles Mikolas,mikolmi01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7122,2023-06-09,NYM,0,1,2,0,0,2,0,0,4.37,5,5,0.0,Rob Zastryzny,zastrro01
7123,2023-06-13,CHC,0,3,3,3,1,2,1,0,5.01,5,-3,0.0,Rob Zastryzny,zastrro01
7124,2023-06-15,CHC,0,1,1,0,0,1,0,0,4.79,3,3,0.0,Rob Zastryzny,zastrro01
7125,2023-08-26,CHC,0,1,0,0,0,0,0,0,4.52,3,3,0.0,Rob Zastryzny,zastrro01


### This is the notebook where I take pitcher starting performance, and create a df of "previous" performance before each start, for each pitcher.

In [5]:
df = pitcher_data

# This is me getting a df of how the given pitcher performed
# in the outing immediately before the given date
df_last = pd.DataFrame()

for i in range(len(df)):
    # For a given row i, extract the code and date for that row
    date = df.loc[i,:].Date
    code = df.loc[i,:].code

    # Given a code/date, extract the previous game data
    df_last1 = df.loc[(df.Date < date) & (df.code == code)].sort_values('Date').tail(1)
    
    df_last1 = df_last1.drop(columns = ['Date', 'Opp','pitcher','code'])

    # Get the average of those 1 rows, find the averages of averageable, transpose it from series to row
    df_last1_avg = pd.DataFrame(df_last1.mean(axis=0)).transpose()

    # Rename cols to indicate
    df_last1_avg.columns = [f"{col}_1" for col in df_last1_avg.columns]

    # Assign it to the date and code we're working with
    df_last1_avg['Date'] = date
    df_last1_avg['code'] = code

    # Append into the last few games df
    df_last = df_last.append(df_last1_avg)

# Rest the index so that we can .loc through it
df_last = df_last.reset_index(drop=True)

# Put in the performance buckets
for row in range(1,len(df_last)):
    if df_last.loc[row,'points_1']<=-10: df_last.loc[row,'bucket_1'] = '.<=-10'
    elif df_last.loc[row,'points_1']<=0: df_last.loc[row,'bucket_1'] = '-10<.<=0'
    elif df_last.loc[row,'points_1']<=10: df_last.loc[row,'bucket_1'] = '0<.<=10'
    elif df_last.loc[row,'points_1']<=20: df_last.loc[row,'bucket_1'] = '10<.<=20'
    elif df_last.loc[row,'points_1']<=30: df_last.loc[row,'bucket_1'] = '20<.<=30'
    elif df_last.loc[row,'points_1']>30: df_last.loc[row,'bucket_1'] = '30<.'
    else: df_last.loc[row,'bucket_1'] = np.nan

df_last.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7127 entries, 0 to 7126
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Dec_1     6778 non-null   float64       
 1   DR_1      6778 non-null   float64       
 2   H_1       6778 non-null   float64       
 3   ER_1      6778 non-null   float64       
 4   BB_1      6778 non-null   float64       
 5   SO_1      6778 non-null   float64       
 6   HR_1      6778 non-null   float64       
 7   HBP_1     6778 non-null   float64       
 8   FIP_1     6778 non-null   float64       
 9   Outs_1    6778 non-null   float64       
 10  points_1  6778 non-null   float64       
 11  QS_1      6778 non-null   float64       
 12  Date      7127 non-null   datetime64[ns]
 13  code      7127 non-null   object        
 14  bucket_1  6778 non-null   object        
dtypes: datetime64[ns](1), float64(12), object(2)
memory usage: 835.3+ KB


In [6]:
# Variable to determine the length of our rolling average
N=5

df_lastfew = pd.DataFrame()

for i in range(len(df)):
    # For a given row i, extract the code and date for that row
    date = df.loc[i,:].Date
    code = df.loc[i,:].code

    # Given a code/date, extract the previous N games of data
    df_lastN = df.loc[(df.Date < date) & (df.code == code)].sort_values('Date').tail(N)
    
    df_lastN = df_lastN.drop(columns = ['Date', 'Opp','pitcher','code'])

    # Get the average of those N rows, find the averages of averageable series, transpose it from series to row
    df_lastN_avg = pd.DataFrame(df_lastN.mean(axis=0)).transpose()

    # Rename cols to indicate what it's the average of, and over how many games
    df_lastN_avg.columns = [f"{col}_{N}" for col in df_lastN_avg.columns]

    # Assign it to the date and code we're working with
    df_lastN_avg['Date'] = date
    df_lastN_avg['code'] = code

    # Append into the last few games df
    df_lastfew = df_lastfew.append(df_lastN_avg)

# Rest the index so that we can .loc through it
df_lastfew = df_lastfew.reset_index(drop=True)

# Put in the performance buckets
for row in range(1,len(df_lastfew)):
    if df_lastfew.loc[row,f'points_{N}']<=-10: df_lastfew.loc[row,f'bucket_{N}'] = '.<=-10'
    elif df_lastfew.loc[row,f'points_{N}']<=0: df_lastfew.loc[row,f'bucket_{N}'] = '-10<.<=0'
    elif df_lastfew.loc[row,f'points_{N}']<=10: df_lastfew.loc[row,f'bucket_{N}'] = '0<.<=10'
    elif df_lastfew.loc[row,f'points_{N}']<=20: df_lastfew.loc[row,f'bucket_{N}'] = '10<.<=20'
    elif df_lastfew.loc[row,f'points_{N}']<=30: df_lastfew.loc[row,f'bucket_{N}'] = '20<.<=30'
    elif df_lastfew.loc[row,f'points_{N}']>30: df_lastfew.loc[row,f'bucket_{N}'] = '30<.'
    else: df_lastfew.loc[row,f'bucket_{N}'] = np.nan

df_lastfew.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7127 entries, 0 to 7126
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Dec_5     6778 non-null   float64       
 1   DR_5      6778 non-null   float64       
 2   H_5       6778 non-null   float64       
 3   ER_5      6778 non-null   float64       
 4   BB_5      6778 non-null   float64       
 5   SO_5      6778 non-null   float64       
 6   HR_5      6778 non-null   float64       
 7   HBP_5     6778 non-null   float64       
 8   FIP_5     6778 non-null   float64       
 9   Outs_5    6778 non-null   float64       
 10  points_5  6778 non-null   float64       
 11  QS_5      6778 non-null   float64       
 12  Date      7127 non-null   datetime64[ns]
 13  code      7127 non-null   object        
 14  bucket_5  6778 non-null   object        
dtypes: datetime64[ns](1), float64(12), object(2)
memory usage: 835.3+ KB


In [7]:
df_cummulative = pd.DataFrame()

for i in range(len(df)):
    # For a given row i, extract the code and date for that row
    date = df.loc[i,:].Date
    code = df.loc[i,:].code

    # Given a code/date, extract the previous game data
    df_cummu = df.loc[(df.Date < date) & (df.code == code)].sort_values('Date')
    
    df_cummu = df_cummu.drop(columns = ['Date', 'Opp','pitcher','code'])

    # Get the average of all rows, find the averages of averageable, transpose it from series to row
    df_cummu_avg = pd.DataFrame(df_cummu.mean(axis=0)).transpose()

    # Rename cols to indicate
    df_cummu_avg.columns = [f"{col}_all" for col in df_cummu_avg.columns]

    # Assign it to the date and code we're working with
    df_cummu_avg['Date'] = date
    df_cummu_avg['code'] = code

    # Append into the last few games df
    df_cummulative = df_cummulative.append(df_cummu_avg)

# Rest the index so that we can .loc through it
df_cummulative = df_cummulative.reset_index(drop=True)

# Put in the performance buckets
for row in range(1,len(df_cummulative)):
    if df_cummulative.loc[row,'points_all']<=-10: df_cummulative.loc[row,'bucket_all'] = '.<=-10'
    elif df_cummulative.loc[row,'points_all']<=0: df_cummulative.loc[row,'bucket_all'] = '-10<.<=0'
    elif df_cummulative.loc[row,'points_all']<=10: df_cummulative.loc[row,'bucket_all'] = '0<.<=10'
    elif df_cummulative.loc[row,'points_all']<=20: df_cummulative.loc[row,'bucket_all'] = '10<.<=20'
    elif df_cummulative.loc[row,'points_all']<=30: df_cummulative.loc[row,'bucket_all'] = '20<.<=30'
    elif df_cummulative.loc[row,'points_all']>30: df_cummulative.loc[row,'bucket_all'] = '30<.'
    else: df_cummulative.loc[row,'bucket_all'] = np.nan

df_cummulative.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7127 entries, 0 to 7126
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Dec_all     6778 non-null   float64       
 1   DR_all      6778 non-null   float64       
 2   H_all       6778 non-null   float64       
 3   ER_all      6778 non-null   float64       
 4   BB_all      6778 non-null   float64       
 5   SO_all      6778 non-null   float64       
 6   HR_all      6778 non-null   float64       
 7   HBP_all     6778 non-null   float64       
 8   FIP_all     6778 non-null   float64       
 9   Outs_all    6778 non-null   float64       
 10  points_all  6778 non-null   float64       
 11  QS_all      6778 non-null   float64       
 12  Date        7127 non-null   datetime64[ns]
 13  code        7127 non-null   object        
 14  bucket_all  6778 non-null   object        
dtypes: datetime64[ns](1), float64(12), object(2)
memory usage: 835.3+ KB


In [8]:
# Merge in all those different windows of data

# Merge in last 1 game of data
compiled_df = pd.merge(df,df_last,
                       on = ['Date','code'])

# Merge in last N games of data
compiled_df = pd.merge(compiled_df,df_lastfew,
                       on = ['Date','code'])

# Merge in all season games of data
compiled_df = pd.merge(compiled_df,df_cummulative,
                       on = ['Date','code'])

# Drop the data from that day, so we're only training on data that is known at the time before the given date
pitcher_perf = compiled_df.drop(columns = [
    'Dec', 'DR', 'H', 'ER', 'BB', 'SO',
    'HR', 'HBP', 'FIP', 'Outs', 'QS'])

# Drop NaN rows
# These rows result from asking for performance on the last 1, N games when there have been fewer than 1, N games
pitcher_perf = pitcher_perf.dropna()

pitcher_perf.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 6792 entries, 1 to 7140
Data columns (total 44 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Date        6792 non-null   datetime64[ns]
 1   Opp         6792 non-null   object        
 2   points      6792 non-null   int64         
 3   pitcher     6792 non-null   object        
 4   code        6792 non-null   object        
 5   Dec_1       6792 non-null   float64       
 6   DR_1        6792 non-null   float64       
 7   H_1         6792 non-null   float64       
 8   ER_1        6792 non-null   float64       
 9   BB_1        6792 non-null   float64       
 10  SO_1        6792 non-null   float64       
 11  HR_1        6792 non-null   float64       
 12  HBP_1       6792 non-null   float64       
 13  FIP_1       6792 non-null   float64       
 14  Outs_1      6792 non-null   float64       
 15  points_1    6792 non-null   float64       
 16  QS_1        6792 non-nul

In [9]:
# Rest the index so that we can .loc through it
pitcher_perf = pitcher_perf.reset_index(drop=True)

## Put in buckets for actual performance to make the target
# Put in the performance buckets
for row in range(1,len(pitcher_perf)):
    if pitcher_perf.loc[row,'points']<=-10: pitcher_perf.loc[row,'bucket'] = '.<=-10'
    elif pitcher_perf.loc[row,'points']<=0: pitcher_perf.loc[row,'bucket'] = '-10<.<=0'
    elif pitcher_perf.loc[row,'points']<=10: pitcher_perf.loc[row,'bucket'] = '0<.<=10'
    elif pitcher_perf.loc[row,'points']<=20: pitcher_perf.loc[row,'bucket'] = '10<.<=20'
    elif pitcher_perf.loc[row,'points_1']<=30: pitcher_perf.loc[row,'bucket'] = '20<.<=30'
    elif pitcher_perf.loc[row,'points_1']>30: pitcher_perf.loc[row,'bucket'] = '30<.'
    else: pitcher_perf.loc[row,'bucket'] = np.nan

# Export to csv
pitcher_perf.to_csv('pitcher_previous_perf.csv',index = False)

pitcher_perf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6792 entries, 0 to 6791
Data columns (total 45 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Date        6792 non-null   datetime64[ns]
 1   Opp         6792 non-null   object        
 2   points      6792 non-null   int64         
 3   pitcher     6792 non-null   object        
 4   code        6792 non-null   object        
 5   Dec_1       6792 non-null   float64       
 6   DR_1        6792 non-null   float64       
 7   H_1         6792 non-null   float64       
 8   ER_1        6792 non-null   float64       
 9   BB_1        6792 non-null   float64       
 10  SO_1        6792 non-null   float64       
 11  HR_1        6792 non-null   float64       
 12  HBP_1       6792 non-null   float64       
 13  FIP_1       6792 non-null   float64       
 14  Outs_1      6792 non-null   float64       
 15  points_1    6792 non-null   float64       
 16  QS_1        6792 non-nul

### This is the section where for each team, before a given game, I calculate their previous performance

Things it'd be cool to add here:\
In the progress printout, maybe include the number of records in each team?\
\
Some way for me to more easily pick which variables I want to choose. Like maybe have a dictionary with stats as the keys, and booleans as the values, so I can just go through, try out different combinations.

In [10]:
# Get team codes
team_codes = pd.read_csv('team_codes.csv')
codes = team_codes['code'].tolist()

## Make last 30 days df
df = pd.DataFrame()

# For every team in our list of team codes
for i in range(round(t*len(codes))):

    # Loop through teams
    team = codes[i]

    # Request setup
    url = f'https://www.baseball-reference.com/teams/tgl.cgi?team={team}&t=b&year=2023'

    team_df = pd.read_html(url)
    team_df = pd.DataFrame((team_df[0]))

    # Add team identifier
    team_df['Team'] = team

    # Append into overall team dataframe
    df = df.append(team_df)

    # Print status
    print(f'Done: {team}')
    
    # Wait for >3 seconds b/c site limits requests to 20/min
    time.sleep(3.2)

## Now we clean the data!

# Identify key columns
keep_cols = ['Team', 'Date','PA', 'AB', 'R', 'H', 'RBI', 'BB', 'SO', 'BA', 'OBP', 'SLG', 'OPS']

# Select for those key columns
df = df[keep_cols]

# Drop rows that contain mid-table row headers
df = df[pd.to_numeric(df['PA'], errors='coerce').notnull()]

# Now let's make the date column usable
# Clean away situations where there's a double-header impacting the date
df['Date'] = df['Date'].str.split("(", expand = True).loc[:,0]

# Extract day from date
days = df['Date'].str.split(n=1,expand = True).loc[:,1]

# Extract month from date
months = df['Date'].str.split(n=1,expand = True).loc[:,0]

# Convert month to numerical month
months = months.str.replace('Oct','10')
months = months.str.replace('Sep','9')
months = months.str.replace('Aug','8')
months = months.str.replace('Jul','7')
months = months.str.replace('Jun','6')
months = months.str.replace('May','5')
months = months.str.replace('Apr','4')
months = months.str.replace('Mar','3')

# Create year series
df['years'] = '2023'

# Build date string
df['Date'] = df['years'] + "-" + months + "-" + days

# Drop year series
df = df.drop(columns = ['years'])

# Cast series as numeric/datetime
df = df.apply(pd.to_numeric, errors = 'ignore')
df['Date'] = pd.to_datetime(df['Date'])

df = df.reset_index(drop = True)

##
# Variable to determine the length of our rolling average
N=30

# Now let's make a df showing the average performance over the last N days

# Initialize our df
df_LN = pd.DataFrame()

for i in range(len(df)):

    # For a given row i, extract the code and date for that row
    date = df.loc[i,:].Date
    team = df.loc[i,:].Team

    # Given a code/date, extract the previous game data
    # WHAT AM I GOING TO DO IF THERE IS NO PREVIOUS GAME DATA?!??!!
    df_lastN = df.loc[(df['Date'] < date) & (df['Team'] == team)].sort_values('Date').tail(N)

    # Get the average of those N rows, find the averages of averageable, transpose it from series to row
    df_lastN_avg = pd.DataFrame(df_lastN.mean(axis=0)).transpose()

    # Rename cols to indicate
    df_lastN_avg.columns = [f"{col}_{N}" for col in df_lastN_avg.columns]

    # Assign it to the date and code we're working with
    df_lastN_avg['Date'] = date
    df_lastN_avg['code'] = team

    # Append into the last few games df
    df_LN = df_LN.append(df_lastN_avg)

# Drop errant team name column
df_LN = df_LN.drop(columns = [f'Team_{N}'])

# Relabel code column as true team name column
df_LN = df_LN.rename(columns={"code": "Team"})

df_LN = df_LN.reset_index(drop=True)

##

# Now let's make the season running total df
df_Lall = pd.DataFrame()

for i in range(len(df)):

    # For a given row i, extract the code and date for that row
    date = df.loc[i,:].Date
    team = df.loc[i,:].Team

    # Given a code/date, extract the previous game data
    df_lastall = df.loc[(df['Date'] < date) & (df['Team'] == team)]

    # Get the average of those 5 rows, find the averages of averageable, transpose it from series to row
    df_lastall_avg = pd.DataFrame(df_lastall.mean(axis=0)).transpose()

    # Rename cols to indicate
    df_lastall_avg.columns = [f"{col}_all" for col in df_lastall_avg.columns]

    # Assign it to the date and code we're working with
    df_lastall_avg['Date'] = date
    df_lastall_avg['code'] = team

    # Append into the last few games df
    df_Lall = df_Lall.append(df_lastall_avg)

# Drop errant team name column
df_Lall = df_Lall.drop(columns = ['Team_all'])

# Relabel code column as true team name column
df_Lall = df_Lall.rename(columns={"code": "Team"})

df_Lall = df_Lall.reset_index(drop=True)

##

# Now let's combine these to dataframes to make an overall 'previous performance'
opp_prev_perf = df_LN.merge(df_Lall, on = ['Team', 'Date'])

# Drop rows that are NaN bc there weren't any previous days of performance to pull (i.e., first day in window)
opp_prev_perf = opp_prev_perf.dropna()

# Export to csv
opp_prev_perf.to_csv('opp_previous_perf.csv', index = False)

print('Done')

opp_prev_perf.info()


Done: ARI
Done: ATL
Done: BAL
Done: BOS
Done: CHC
Done: CHW
Done: CIN
Done: CLE
Done: COL
Done: DET
Done: HOU
Done: KCR
Done: LAA
Done: LAD
Done: MIA
Done: MIL
Done: MIN
Done: NYM
Done: NYY
Done: OAK
Done: PHI
Done: PIT
Done: SDP
Done: SEA
Done: SFG
Done: STL
Done: TBR
Done: TEX
Done: TOR
Done: WSN
Done
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4466 entries, 1 to 4495
Data columns (total 24 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   PA_30    4466 non-null   float64       
 1   AB_30    4466 non-null   float64       
 2   R_30     4466 non-null   float64       
 3   H_30     4466 non-null   float64       
 4   RBI_30   4466 non-null   float64       
 5   BB_30    4466 non-null   float64       
 6   SO_30    4466 non-null   float64       
 7   BA_30    4466 non-null   float64       
 8   OBP_30   4466 non-null   float64       
 9   SLG_30   4466 non-null   float64       
 10  OPS_30   4466 non-null   float64       
 11

In [11]:
# df of opponent performance
opp_prev_perf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4466 entries, 1 to 4495
Data columns (total 24 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   PA_30    4466 non-null   float64       
 1   AB_30    4466 non-null   float64       
 2   R_30     4466 non-null   float64       
 3   H_30     4466 non-null   float64       
 4   RBI_30   4466 non-null   float64       
 5   BB_30    4466 non-null   float64       
 6   SO_30    4466 non-null   float64       
 7   BA_30    4466 non-null   float64       
 8   OBP_30   4466 non-null   float64       
 9   SLG_30   4466 non-null   float64       
 10  OPS_30   4466 non-null   float64       
 11  Date     4466 non-null   datetime64[ns]
 12  Team     4466 non-null   object        
 13  PA_all   4466 non-null   float64       
 14  AB_all   4466 non-null   float64       
 15  R_all    4466 non-null   float64       
 16  H_all    4466 non-null   float64       
 17  RBI_all  4466 non-null   float64 

In [12]:
# df of pitcher performance
pitcher_perf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6792 entries, 0 to 6791
Data columns (total 45 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Date        6792 non-null   datetime64[ns]
 1   Opp         6792 non-null   object        
 2   points      6792 non-null   int64         
 3   pitcher     6792 non-null   object        
 4   code        6792 non-null   object        
 5   Dec_1       6792 non-null   float64       
 6   DR_1        6792 non-null   float64       
 7   H_1         6792 non-null   float64       
 8   ER_1        6792 non-null   float64       
 9   BB_1        6792 non-null   float64       
 10  SO_1        6792 non-null   float64       
 11  HR_1        6792 non-null   float64       
 12  HBP_1       6792 non-null   float64       
 13  FIP_1       6792 non-null   float64       
 14  Outs_1      6792 non-null   float64       
 15  points_1    6792 non-null   float64       
 16  QS_1        6792 non-nul

In [13]:
# df of probable pitchers
probables.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   date     20 non-null     object
 1   name     19 non-null     object
 2   code     19 non-null     object
 3   for      19 non-null     object
 4   against  19 non-null     object
dtypes: object(5)
memory usage: 928.0+ bytes


This is where we make the combined df of all data to train on

In [17]:
# Read in opponent performance
probables_df = probables

# Read in opponent performance
opp_perf_df = opp_prev_perf

# Extract column names
cols = opp_perf_df.columns

# Rearrange columns to get date/team at front
cols = ['Date', 'Team',
        'PA_30', 'AB_30', 'R_30', 'H_30', 'RBI_30', 'BB_30',
        'SO_30', 'BA_30','OBP_30', 'SLG_30', 'OPS_30',
        'PA_all', 'AB_all','R_all', 'H_all', 'RBI_all', 'BB_all',
        'SO_all', 'BA_all', 'OBP_all','SLG_all', 'OPS_all']

opp_perf_df = opp_perf_df[cols]

# Read in pitcher performance
pitcher_perf_df = pitcher_perf

# Perform merge
df = pitcher_perf_df.merge(opp_perf_df,
                           left_on = ['Date', 'Opp'], right_on = ['Date', 'Team'],
                           suffixes = ('_pitcher', '_opp')
                          )

df = df.dropna()

# Reorganize columns
cols = ['bucket', 'points', 'Date', 'Opp', 'pitcher', 'code', 'Dec_1', 'DR_1', 'H_1',
       'ER_1', 'BB_1', 'SO_1', 'HR_1', 'HBP_1', 'FIP_1', 'Outs_1', 'points_1',
       'QS_1', 'bucket_1', 'Dec_5', 'DR_5', 'H_5', 'ER_5', 'BB_5', 'SO_5',
       'HR_5', 'HBP_5', 'FIP_5', 'Outs_5', 'points_5', 'QS_5', 'bucket_5',
       'Dec_all', 'DR_all', 'H_all_pitcher', 'ER_all', 'BB_all_pitcher',
       'SO_all_pitcher', 'HR_all', 'HBP_all', 'FIP_all', 'Outs_all',
       'points_all', 'QS_all', 'bucket_all', 'PA_30',
       'AB_30', 'R_30', 'H_30', 'RBI_30', 'BB_30', 'SO_30', 'BA_30', 'OBP_30',
       'SLG_30', 'OPS_30', 'PA_all', 'AB_all', 'R_all', 'H_all_opp', 'RBI_all',
       'BB_all_opp', 'SO_all_opp', 'BA_all', 'OBP_all', 'SLG_all', 'OPS_all']


df = df[cols]

df.to_csv('training_data.csv')

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7397 entries, 1 to 7397
Data columns (total 67 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   bucket          7397 non-null   object        
 1   points          7397 non-null   int64         
 2   Date            7397 non-null   datetime64[ns]
 3   Opp             7397 non-null   object        
 4   pitcher         7397 non-null   object        
 5   code            7397 non-null   object        
 6   Dec_1           7397 non-null   float64       
 7   DR_1            7397 non-null   float64       
 8   H_1             7397 non-null   float64       
 9   ER_1            7397 non-null   float64       
 10  BB_1            7397 non-null   float64       
 11  SO_1            7397 non-null   float64       
 12  HR_1            7397 non-null   float64       
 13  HBP_1           7397 non-null   float64       
 14  FIP_1           7397 non-null   float64       
 15  Outs

In [16]:
print("--- %s minutes ---" % round((time.time() - start_time)/60,2))

--- 46.21 minutes ---
