In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import matplotlib.pyplot as plt
import pandas as pd

import requests
import urllib.request

import numpy as np
import math
import time
from datetime import date, datetime, timedelta

from bs4 import BeautifulSoup

start_time = time.time()


In [8]:
# How much do you want to scale back this run? Pick a number between 0 (none) and 1 (all)

t = 1


This is the section where I pull the list of probable pitchers from b-ref

### This is the section where I scrape individual starting pitchers' individual game performance

#### Things it'd be cool to add here:
All calculations NOW are going to be with respect to TODAY'S date. There is only one date.

In [10]:
# This is a dataset drawn from the probable pitchers scrape
all_starters = probables[['name', 'code']]
all_starters = all_starters.dropna()

# Initialize df
pitcher_data = pd.DataFrame()

# For every starter we have
for i in range(round(t*len(all_starters))):
    try:
        code = all_starters.loc[i,'code']
        name = all_starters.loc[i,'name']

        # Request setup
        url = f'https://www.baseball-reference.com/players/gl.fcgi?id={code}&t=p&year=2023'
        df = pd.read_html(url)
        df = pd.DataFrame(df[0])

        # # DATA CLEANING # #

        # Drop weird rows
        df = df[pd.to_numeric(df['Rk'], errors = 'coerce').notnull()]

        # Reset index after dropping some rows
        df = df.reset_index()

        # Identify desired columns to keep and drop others
        keep_cols = ['Date','Opp','Dec', 'DR',
                     'IP', 'H', 'ER', 'BB', 'SO', 'HR', 'HBP',
                     'FIP']

        df = df[keep_cols]

        # Now let's make the date column usable
        # Clean away situations where there's a double-header impacting the date
        df['Date'] = df['Date'].str.split("(", expand = True).loc[:,0]

        # Extract day from date
        days = df['Date'].str.split(n=1,expand = True).loc[:,1]

        # Extract month from date
        months = df['Date'].str.split(n=1,expand = True).loc[:,0]

        # Convert month to numerical month
        months = months.str.replace('Oct','10')
        months = months.str.replace('Sep','9')
        months = months.str.replace('Aug','8')
        months = months.str.replace('Jul','7')
        months = months.str.replace('Jun','6')
        months = months.str.replace('May','5')
        months = months.str.replace('Apr','4')
        months = months.str.replace('Mar','3')

        # Create year series
        df['years'] = '2023'

        # Build date string
        df['Date'] = df['years'] + "-" + months + "-" + days

        # Drop year series
        df = df.drop(columns = ['years'])

        # Clean Decision series
        df['Dec'] = df['Dec'].str[0]

        df['Dec'] = df['Dec'].fillna('0')

        df['Dec'] = df['Dec'].str.replace('W','1')
        df['Dec'] = df['Dec'].str.replace('L','-1')
        df['Dec'] = df['Dec'].str.replace('B','-1')
        df['Dec'] = df['Dec'].str.replace('S','2')
        df['Dec'] = df['Dec'].str.replace('H','0')

        # Make everything a string before the step below. Don't worry, we'll fix this later on.
        # This is because some of our data processing functions require string inputs,
        # and our data scraper might accidentally infer some things as numbers.
        # We have to do this
        df = df.applymap(str)

        # Make outs series, convert IP to Outs, then drop IP series
        whole = df['IP'].str.split(".", expand = True)[0].astype('int')
        part = df['IP'].str.split(".", expand = True)[1].astype('int')

        df['Outs'] = 3*whole + part

        df = df.drop(columns = ['IP'])

        # Sometimes my DR column gets interpreted as a float, that gets turned into a string,
        # that can't be turned directly into an int. But it can be interpreted as a float, from a string,
        # so we do that first, and then we can convert it to an int without issue.
        df['DR'] = df['DR'].astype('float')

        # Cast numerical datatypes as numbers
        df = df.astype({
            'Dec':'int',
            'DR':'int',
            'H':'int',
            'ER':'int',
            'BB':'int',
            'SO':'int',
            'HR':'int',
            'HBP':'int',
            'FIP':'float',
            'Outs':'int',
        })

        # Convert date string to date type
        df['Date'] = pd.to_datetime(df['Date'])

        # Calculate fantasy points
        df['points'] = 3*df['Dec'] - df['H'] - 2*df['ER'] - df['BB'] + df['SO'] - df['HBP'] + df['Outs']

        # Create quality start series QS
        for row in range(1,len(df)):
            if df.loc[row,'ER']<=3 and df.loc[row,'Outs']>=18: df.loc[row,'QS'] = 1
            else: df.loc[row,'QS'] = 0

        # Add in pitcher name and code
        df['pitcher'] = name
        df['code'] = code

        # Drop starts following 30+ days rest
        df = df[df['DR'] <30]

        # Add it to the overall pitcher data df
        pitcher_data = pitcher_data.append(df, ignore_index=True)

        # Let me know we're done with that player
        print(f'Done adding {name} {i}/{len(all_starters)}')

        # Wait for 3.2 seconds, b/c baseball-reference has a 20 request/min limit
        time.sleep(3.2)
    except: pass

# Export to csv
pitcher_data.to_csv('STARTERS_pitcher_data.csv',index = False)

print("Done")

pitcher_data


Done adding Josiah Gray 0/19
Done adding Mitch Keller 1/19
Done adding Derek Law 2/19
Done adding Reese Olson 3/19
Done adding Michael King 4/19
Done adding Tanner Houck 5/19
Done adding Eury Pérez 6/19
Done adding Adrian Houser 7/19
Done adding Merrill Kelly 8/19
Done adding Kodai Senga 9/19
Done adding Nathan Eovaldi 10/19
Done adding Kevin Gausman 11/19
Done adding Aaron Civale 12/19
Done adding Kyle Bradish 13/19
Done adding Clarke Schmidt 14/19
Done adding Kenta Maeda 16/19
Done adding José Ureña 17/19
Done adding Logan Webb 18/19
Done


Unnamed: 0,Date,Opp,Dec,DR,H,ER,BB,SO,HR,HBP,FIP,Outs,points,QS,pitcher,code
0,2023-04-06,COL,-1,4,8,1,1,6,0,0,5.80,18,10,1.0,Josiah Gray,grayjo03
1,2023-04-11,LAA,-1,4,4,2,2,3,1,2,6.08,17,5,0.0,Josiah Gray,grayjo03
2,2023-04-18,BAL,-1,6,4,1,4,3,0,0,5.70,15,5,0.0,Josiah Gray,grayjo03
3,2023-04-25,NYM,1,6,4,0,1,9,0,0,4.63,18,25,1.0,Josiah Gray,grayjo03
4,2023-04-30,PIT,1,4,3,1,3,6,0,0,4.30,18,19,1.0,Josiah Gray,grayjo03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
419,2023-08-19,ATL,0,5,9,4,0,5,1,0,3.26,18,6,0.0,Logan Webb,webblo01
420,2023-08-25,ATL,-1,5,6,5,1,1,1,0,3.34,16,-3,0.0,Logan Webb,webblo01
421,2023-08-30,CIN,-1,4,7,2,0,6,0,0,3.27,18,10,1.0,Logan Webb,webblo01
422,2023-09-04,CHC,-1,4,5,3,1,4,1,0,3.31,20,9,1.0,Logan Webb,webblo01


### This is the notebook where I take pitcher starting performance, and create a df of "previous" performance before each start, for each pitcher.

In [11]:
df = pitcher_data

# This is me getting a df of how the given pitcher performed
# in the outing immediately before the given date
df_last = pd.DataFrame()

for i in range(len(df)):
    # For a given row i, extract the code and date for that row
    date = df.loc[i,:].Date
    code = df.loc[i,:].code

    # Given a code/date, extract the previous game data
    df_last1 = df.loc[(df.Date < date) & (df.code == code)].sort_values('Date').tail(1)
    
    df_last1 = df_last1.drop(columns = ['Date', 'Opp','pitcher','code'])

    # Get the average of those 1 rows, find the averages of averageable, transpose it from series to row
    df_last1_avg = pd.DataFrame(df_last1.mean(axis=0)).transpose()

    # Rename cols to indicate
    df_last1_avg.columns = [f"{col}_1" for col in df_last1_avg.columns]

    # Assign it to the date and code we're working with
    df_last1_avg['Date'] = date
    df_last1_avg['code'] = code

    # Append into the last few games df
    df_last = df_last.append(df_last1_avg)

# Rest the index so that we can .loc through it
df_last = df_last.reset_index(drop=True)

# Put in the performance buckets
for row in range(1,len(df_last)):
    if df_last.loc[row,'points_1']<=-10: df_last.loc[row,'bucket_1'] = '.<=-10'
    elif df_last.loc[row,'points_1']<=0: df_last.loc[row,'bucket_1'] = '-10<.<=0'
    elif df_last.loc[row,'points_1']<=10: df_last.loc[row,'bucket_1'] = '0<.<=10'
    elif df_last.loc[row,'points_1']<=20: df_last.loc[row,'bucket_1'] = '10<.<=20'
    elif df_last.loc[row,'points_1']<=30: df_last.loc[row,'bucket_1'] = '20<.<=30'
    elif df_last.loc[row,'points_1']>30: df_last.loc[row,'bucket_1'] = '30<.'
    else: df_last.loc[row,'bucket_1'] = np.nan

df_last.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 424 entries, 0 to 423
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Dec_1     406 non-null    float64       
 1   DR_1      406 non-null    float64       
 2   H_1       406 non-null    float64       
 3   ER_1      406 non-null    float64       
 4   BB_1      406 non-null    float64       
 5   SO_1      406 non-null    float64       
 6   HR_1      406 non-null    float64       
 7   HBP_1     406 non-null    float64       
 8   FIP_1     406 non-null    float64       
 9   Outs_1    406 non-null    float64       
 10  points_1  406 non-null    float64       
 11  QS_1      406 non-null    float64       
 12  Date      424 non-null    datetime64[ns]
 13  code      424 non-null    object        
 14  bucket_1  406 non-null    object        
dtypes: datetime64[ns](1), float64(12), object(2)
memory usage: 49.8+ KB


In [12]:
# Variable to determine the length of our rolling average
N=5

df_lastfew = pd.DataFrame()

for i in range(len(df)):
    # For a given row i, extract the code and date for that row
    date = df.loc[i,:].Date
    code = df.loc[i,:].code

    # Given a code/date, extract the previous N games of data
    df_lastN = df.loc[(df.Date < date) & (df.code == code)].sort_values('Date').tail(N)
    
    df_lastN = df_lastN.drop(columns = ['Date', 'Opp','pitcher','code'])

    # Get the average of those N rows, find the averages of averageable series, transpose it from series to row
    df_lastN_avg = pd.DataFrame(df_lastN.mean(axis=0)).transpose()

    # Rename cols to indicate what it's the average of, and over how many games
    df_lastN_avg.columns = [f"{col}_{N}" for col in df_lastN_avg.columns]

    # Assign it to the date and code we're working with
    df_lastN_avg['Date'] = date
    df_lastN_avg['code'] = code

    # Append into the last few games df
    df_lastfew = df_lastfew.append(df_lastN_avg)

# Rest the index so that we can .loc through it
df_lastfew = df_lastfew.reset_index(drop=True)

# Put in the performance buckets
for row in range(1,len(df_lastfew)):
    if df_lastfew.loc[row,f'points_{N}']<=-10: df_lastfew.loc[row,f'bucket_{N}'] = '.<=-10'
    elif df_lastfew.loc[row,f'points_{N}']<=0: df_lastfew.loc[row,f'bucket_{N}'] = '-10<.<=0'
    elif df_lastfew.loc[row,f'points_{N}']<=10: df_lastfew.loc[row,f'bucket_{N}'] = '0<.<=10'
    elif df_lastfew.loc[row,f'points_{N}']<=20: df_lastfew.loc[row,f'bucket_{N}'] = '10<.<=20'
    elif df_lastfew.loc[row,f'points_{N}']<=30: df_lastfew.loc[row,f'bucket_{N}'] = '20<.<=30'
    elif df_lastfew.loc[row,f'points_{N}']>30: df_lastfew.loc[row,f'bucket_{N}'] = '30<.'
    else: df_lastfew.loc[row,f'bucket_{N}'] = np.nan

df_lastfew.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 424 entries, 0 to 423
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Dec_5     406 non-null    float64       
 1   DR_5      406 non-null    float64       
 2   H_5       406 non-null    float64       
 3   ER_5      406 non-null    float64       
 4   BB_5      406 non-null    float64       
 5   SO_5      406 non-null    float64       
 6   HR_5      406 non-null    float64       
 7   HBP_5     406 non-null    float64       
 8   FIP_5     406 non-null    float64       
 9   Outs_5    406 non-null    float64       
 10  points_5  406 non-null    float64       
 11  QS_5      406 non-null    float64       
 12  Date      424 non-null    datetime64[ns]
 13  code      424 non-null    object        
 14  bucket_5  406 non-null    object        
dtypes: datetime64[ns](1), float64(12), object(2)
memory usage: 49.8+ KB


In [13]:
df_cummulative = pd.DataFrame()

for i in range(len(df)):
    # For a given row i, extract the code and date for that row
    date = df.loc[i,:].Date
    code = df.loc[i,:].code

    # Given a code/date, extract the previous game data
    df_cummu = df.loc[(df.Date < date) & (df.code == code)].sort_values('Date')
    
    df_cummu = df_cummu.drop(columns = ['Date', 'Opp','pitcher','code'])

    # Get the average of all rows, find the averages of averageable, transpose it from series to row
    df_cummu_avg = pd.DataFrame(df_cummu.mean(axis=0)).transpose()

    # Rename cols to indicate
    df_cummu_avg.columns = [f"{col}_all" for col in df_cummu_avg.columns]

    # Assign it to the date and code we're working with
    df_cummu_avg['Date'] = date
    df_cummu_avg['code'] = code

    # Append into the last few games df
    df_cummulative = df_cummulative.append(df_cummu_avg)

# Rest the index so that we can .loc through it
df_cummulative = df_cummulative.reset_index(drop=True)

# Put in the performance buckets
for row in range(1,len(df_cummulative)):
    if df_cummulative.loc[row,'points_all']<=-10: df_cummulative.loc[row,'bucket_all'] = '.<=-10'
    elif df_cummulative.loc[row,'points_all']<=0: df_cummulative.loc[row,'bucket_all'] = '-10<.<=0'
    elif df_cummulative.loc[row,'points_all']<=10: df_cummulative.loc[row,'bucket_all'] = '0<.<=10'
    elif df_cummulative.loc[row,'points_all']<=20: df_cummulative.loc[row,'bucket_all'] = '10<.<=20'
    elif df_cummulative.loc[row,'points_all']<=30: df_cummulative.loc[row,'bucket_all'] = '20<.<=30'
    elif df_cummulative.loc[row,'points_all']>30: df_cummulative.loc[row,'bucket_all'] = '30<.'
    else: df_cummulative.loc[row,'bucket_all'] = np.nan

df_cummulative.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 424 entries, 0 to 423
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Dec_all     406 non-null    float64       
 1   DR_all      406 non-null    float64       
 2   H_all       406 non-null    float64       
 3   ER_all      406 non-null    float64       
 4   BB_all      406 non-null    float64       
 5   SO_all      406 non-null    float64       
 6   HR_all      406 non-null    float64       
 7   HBP_all     406 non-null    float64       
 8   FIP_all     406 non-null    float64       
 9   Outs_all    406 non-null    float64       
 10  points_all  406 non-null    float64       
 11  QS_all      406 non-null    float64       
 12  Date        424 non-null    datetime64[ns]
 13  code        424 non-null    object        
 14  bucket_all  406 non-null    object        
dtypes: datetime64[ns](1), float64(12), object(2)
memory usage: 49.8+ KB


In [14]:
# Merge in all those different windows of data

# Merge in last 1 game of data
compiled_df = pd.merge(df,df_last,
                       on = ['Date','code'])

# Merge in last N games of data
compiled_df = pd.merge(compiled_df,df_lastfew,
                       on = ['Date','code'])

# Merge in all season games of data
compiled_df = pd.merge(compiled_df,df_cummulative,
                       on = ['Date','code'])

# Drop the data from that day, so we're only training on data that is known at the time before the given date
pitcher_perf = compiled_df.drop(columns = [
    'Dec', 'DR', 'H', 'ER', 'BB', 'SO',
    'HR', 'HBP', 'FIP', 'Outs', 'QS'])

# Drop NaN rows
# These rows result from asking for performance on the last 1, N games when there have been fewer than 1, N games
pitcher_perf = pitcher_perf.dropna()

pitcher_perf.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 406 entries, 1 to 423
Data columns (total 44 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Date        406 non-null    datetime64[ns]
 1   Opp         406 non-null    object        
 2   points      406 non-null    int64         
 3   pitcher     406 non-null    object        
 4   code        406 non-null    object        
 5   Dec_1       406 non-null    float64       
 6   DR_1        406 non-null    float64       
 7   H_1         406 non-null    float64       
 8   ER_1        406 non-null    float64       
 9   BB_1        406 non-null    float64       
 10  SO_1        406 non-null    float64       
 11  HR_1        406 non-null    float64       
 12  HBP_1       406 non-null    float64       
 13  FIP_1       406 non-null    float64       
 14  Outs_1      406 non-null    float64       
 15  points_1    406 non-null    float64       
 16  QS_1        406 non-null  

In [15]:
# Rest the index so that we can .loc through it
pitcher_perf = pitcher_perf.reset_index(drop=True)

## Put in buckets for actual performance to make the target
# Put in the performance buckets
for row in range(1,len(pitcher_perf)):
    if pitcher_perf.loc[row,'points']<=-10: pitcher_perf.loc[row,'bucket'] = '.<=-10'
    elif pitcher_perf.loc[row,'points']<=0: pitcher_perf.loc[row,'bucket'] = '-10<.<=0'
    elif pitcher_perf.loc[row,'points']<=10: pitcher_perf.loc[row,'bucket'] = '0<.<=10'
    elif pitcher_perf.loc[row,'points']<=20: pitcher_perf.loc[row,'bucket'] = '10<.<=20'
    elif pitcher_perf.loc[row,'points_1']<=30: pitcher_perf.loc[row,'bucket'] = '20<.<=30'
    elif pitcher_perf.loc[row,'points_1']>30: pitcher_perf.loc[row,'bucket'] = '30<.'
    else: pitcher_perf.loc[row,'bucket'] = np.nan

# Export to csv
pitcher_perf.to_csv('STARTERS_pitcher_previous_perf.csv',index = False)

pitcher_perf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 406 entries, 0 to 405
Data columns (total 45 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Date        406 non-null    datetime64[ns]
 1   Opp         406 non-null    object        
 2   points      406 non-null    int64         
 3   pitcher     406 non-null    object        
 4   code        406 non-null    object        
 5   Dec_1       406 non-null    float64       
 6   DR_1        406 non-null    float64       
 7   H_1         406 non-null    float64       
 8   ER_1        406 non-null    float64       
 9   BB_1        406 non-null    float64       
 10  SO_1        406 non-null    float64       
 11  HR_1        406 non-null    float64       
 12  HBP_1       406 non-null    float64       
 13  FIP_1       406 non-null    float64       
 14  Outs_1      406 non-null    float64       
 15  points_1    406 non-null    float64       
 16  QS_1        406 non-null  

### This is the section where for each team, before a given game, I calculate their previous performance

Things it'd be cool to add here:\
In the progress printout, maybe include the number of records in each team?\
\
Some way for me to more easily pick which variables I want to choose. Like maybe have a dictionary with stats as the keys, and booleans as the values, so I can just go through, try out different combinations.

In [69]:
# Get team codes
team_codes = pd.read_csv('team_codes.csv')
codes = team_codes['code'].tolist()

## Make last 30 days df
df = pd.DataFrame()

# For every team in our list of team codes
for i in range(round(t*len(codes))):

    # Loop through teams
    team = codes[i]

    # Request setup
    url = f'https://www.baseball-reference.com/teams/tgl.cgi?team={team}&t=b&year=2023'

    team_df = pd.read_html(url)
    team_df = pd.DataFrame((team_df[0]))

    # Add team identifier
    team_df['Team'] = team

    # Append into overall team dataframe
    df = df.append(team_df)

    # Print status
    print(f'Done: {team}')
    
    # Wait for >3 seconds b/c site limits requests to 20/min
    time.sleep(3.2)

## Now we clean the data!

# Identify key columns
keep_cols = ['Team', 'Date','PA', 'AB', 'R', 'H', 'RBI', 'BB', 'SO', 'BA', 'OBP', 'SLG', 'OPS']

# Select for those key columns
df = df[keep_cols]

# Drop rows that contain mid-table row headers
df = df[pd.to_numeric(df['PA'], errors='coerce').notnull()]

# Now let's make the date column usable
# Clean away situations where there's a double-header impacting the date
df['Date'] = df['Date'].str.split("(", expand = True).loc[:,0]

# Extract day from date
days = df['Date'].str.split(n=1,expand = True).loc[:,1]

# Extract month from date
months = df['Date'].str.split(n=1,expand = True).loc[:,0]

# Convert month to numerical month
months = months.str.replace('Oct','10')
months = months.str.replace('Sep','9')
months = months.str.replace('Aug','8')
months = months.str.replace('Jul','7')
months = months.str.replace('Jun','6')
months = months.str.replace('May','5')
months = months.str.replace('Apr','4')
months = months.str.replace('Mar','3')

# Create year series
df['years'] = '2023'

# Build date string
df['Date'] = df['years'] + "-" + months + "-" + days

# Drop year series
df = df.drop(columns = ['years'])

# Cast series as numeric/datetime
df = df.apply(pd.to_numeric, errors = 'ignore')
df['Date'] = pd.to_datetime(df['Date'])

df = df.reset_index(drop = True)

##
# Variable to determine the length of our rolling average
N=30

# Now let's make a df showing the average performance over the last N days

# Initialize our df
df_LN = pd.DataFrame()

for i in range(len(df)):

    # For a given row i, extract the code and date for that row
    date = df.loc[i,:].Date
    team = df.loc[i,:].Team

    # Given a code/date, extract the previous game data
    # WHAT AM I GOING TO DO IF THERE IS NO PREVIOUS GAME DATA?!??!!
    df_lastN = df.loc[(df['Date'] < date) & (df['Team'] == team)].sort_values('Date').tail(N)

    # Get the average of those N rows, find the averages of averageable, transpose it from series to row
    df_lastN_avg = pd.DataFrame(df_lastN.mean(axis=0)).transpose()

    # Rename cols to indicate
    df_lastN_avg.columns = [f"{col}_{N}" for col in df_lastN_avg.columns]

    # Assign it to the date and code we're working with
    df_lastN_avg['Date'] = date
    df_lastN_avg['code'] = team

    # Append into the last few games df
    df_LN = df_LN.append(df_lastN_avg)

# Drop errant team name column
df_LN = df_LN.drop(columns = [f'Team_{N}'])

# Relabel code column as true team name column
df_LN = df_LN.rename(columns={"code": "Team"})

df_LN = df_LN.reset_index(drop=True)

##

# Now let's make the season running total df
df_Lall = pd.DataFrame()

for i in range(len(df)):

    # For a given row i, extract the code and date for that row
    date = df.loc[i,:].Date
    team = df.loc[i,:].Team

    # Given a code/date, extract the previous game data
    df_lastall = df.loc[(df['Date'] < date) & (df['Team'] == team)]

    # Get the average of those 5 rows, find the averages of averageable, transpose it from series to row
    df_lastall_avg = pd.DataFrame(df_lastall.mean(axis=0)).transpose()

    # Rename cols to indicate
    df_lastall_avg.columns = [f"{col}_all" for col in df_lastall_avg.columns]

    # Assign it to the date and code we're working with
    df_lastall_avg['Date'] = date
    df_lastall_avg['code'] = team

    # Append into the last few games df
    df_Lall = df_Lall.append(df_lastall_avg)

# Drop errant team name column
df_Lall = df_Lall.drop(columns = ['Team_all'])

# Relabel code column as true team name column
df_Lall = df_Lall.rename(columns={"code": "Team"})

df_Lall = df_Lall.reset_index(drop=True)

##

# Now let's combine these to dataframes to make an overall 'previous performance'
opp_prev_perf = df_LN.merge(df_Lall, on = ['Team', 'Date'])

# Drop rows that are NaN bc there weren't any previous days of performance to pull (i.e., first day in window)
opp_prev_perf = opp_prev_perf.dropna()

# Export to csv
opp_prev_perf.to_csv('STARTERS_opp_previous_perf.csv', index = False)

print('Done')

opp_prev_perf.info()


Done: ARI
Done: ATL
Done: BAL
Done: BOS
Done: CHC
Done: CHW
Done: CIN
Done: CLE
Done: COL
Done: DET
Done: HOU
Done: KCR
Done: LAA
Done: LAD
Done: MIA
Done: MIL
Done: MIN
Done: NYM
Done: NYY
Done: OAK
Done: PHI
Done: PIT
Done: SDP
Done: SEA
Done: SFG
Done: STL
Done: TBR
Done: TEX
Done: TOR
Done: WSN
Done
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4466 entries, 1 to 4495
Data columns (total 24 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   PA_30    4466 non-null   float64       
 1   AB_30    4466 non-null   float64       
 2   R_30     4466 non-null   float64       
 3   H_30     4466 non-null   float64       
 4   RBI_30   4466 non-null   float64       
 5   BB_30    4466 non-null   float64       
 6   SO_30    4466 non-null   float64       
 7   BA_30    4466 non-null   float64       
 8   OBP_30   4466 non-null   float64       
 9   SLG_30   4466 non-null   float64       
 10  OPS_30   4466 non-null   float64       
 11

In [70]:
# df of opponent performance
opp_prev_perf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4466 entries, 1 to 4495
Data columns (total 24 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   PA_30    4466 non-null   float64       
 1   AB_30    4466 non-null   float64       
 2   R_30     4466 non-null   float64       
 3   H_30     4466 non-null   float64       
 4   RBI_30   4466 non-null   float64       
 5   BB_30    4466 non-null   float64       
 6   SO_30    4466 non-null   float64       
 7   BA_30    4466 non-null   float64       
 8   OBP_30   4466 non-null   float64       
 9   SLG_30   4466 non-null   float64       
 10  OPS_30   4466 non-null   float64       
 11  Date     4466 non-null   datetime64[ns]
 12  Team     4466 non-null   object        
 13  PA_all   4466 non-null   float64       
 14  AB_all   4466 non-null   float64       
 15  R_all    4466 non-null   float64       
 16  H_all    4466 non-null   float64       
 17  RBI_all  4466 non-null   float64 

In [71]:
# df of pitcher performance
pitcher_perf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 406 entries, 0 to 405
Data columns (total 45 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Date        406 non-null    datetime64[ns]
 1   Opp         406 non-null    object        
 2   points      406 non-null    int64         
 3   pitcher     406 non-null    object        
 4   code        406 non-null    object        
 5   Dec_1       406 non-null    float64       
 6   DR_1        406 non-null    float64       
 7   H_1         406 non-null    float64       
 8   ER_1        406 non-null    float64       
 9   BB_1        406 non-null    float64       
 10  SO_1        406 non-null    float64       
 11  HR_1        406 non-null    float64       
 12  HBP_1       406 non-null    float64       
 13  FIP_1       406 non-null    float64       
 14  Outs_1      406 non-null    float64       
 15  points_1    406 non-null    float64       
 16  QS_1        406 non-null  

In [72]:
# df of probable pitchers
probables.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   date     20 non-null     object
 1   name     19 non-null     object
 2   code     19 non-null     object
 3   for      19 non-null     object
 4   against  19 non-null     object
dtypes: object(5)
memory usage: 928.0+ bytes


This is where we make the combined df of all data to predict from

In [73]:
# Read in opponent performance
probables_df = probables

# Read in opponent performance
opp_perf_df = opp_prev_perf

# Extract column names
cols = opp_perf_df.columns

# Rearrange columns to get date/team at front
cols = ['Date', 'Team',
        'PA_30', 'AB_30', 'R_30', 'H_30', 'RBI_30', 'BB_30',
        'SO_30', 'BA_30','OBP_30', 'SLG_30', 'OPS_30',
        'PA_all', 'AB_all','R_all', 'H_all', 'RBI_all', 'BB_all',
        'SO_all', 'BA_all', 'OBP_all','SLG_all', 'OPS_all']

opp_perf_df = opp_perf_df[cols]

# Read in pitcher performance
pitcher_perf_df = pitcher_perf

# Perform merge
df = pitcher_perf_df.merge(opp_perf_df,
                           left_on = ['Date', 'Opp'], right_on = ['Date', 'Team'],
                           suffixes = ('_pitcher', '_opp')
                          )

df = df.dropna()

# Reorganize columns
cols = ['bucket', 'points', 'Date', 'Opp', 'pitcher', 'code', 'Dec_1', 'DR_1', 'H_1',
       'ER_1', 'BB_1', 'SO_1', 'HR_1', 'HBP_1', 'FIP_1', 'Outs_1', 'points_1',
       'QS_1', 'bucket_1', 'Dec_5', 'DR_5', 'H_5', 'ER_5', 'BB_5', 'SO_5',
       'HR_5', 'HBP_5', 'FIP_5', 'Outs_5', 'points_5', 'QS_5', 'bucket_5',
       'Dec_all', 'DR_all', 'H_all_pitcher', 'ER_all', 'BB_all_pitcher',
       'SO_all_pitcher', 'HR_all', 'HBP_all', 'FIP_all', 'Outs_all',
       'points_all', 'QS_all', 'bucket_all', 'PA_30',
       'AB_30', 'R_30', 'H_30', 'RBI_30', 'BB_30', 'SO_30', 'BA_30', 'OBP_30',
       'SLG_30', 'OPS_30', 'PA_all', 'AB_all', 'R_all', 'H_all_opp', 'RBI_all',
       'BB_all_opp', 'SO_all_opp', 'BA_all', 'OBP_all', 'SLG_all', 'OPS_all']


df = df[cols]


441

In [83]:
# Initialize df
prediction_input = pd.DataFrame()

# Extract list of starters from probables df
starters = df['code'].unique()

# Loop through all starters
for starter in starters:
    # Get most recent row of data
    most_recent = df.loc[df['code'] == starter].sort_values('Date').tail(1)

    # Put into df to use as prediciton inputs
    prediction_input = prediction_input.append(most_recent)

# Drop target columns
prediction_input = prediction_input.drop(columns=['bucket', 'points'])

df.to_csv('STARTERS_input_data.csv')

prediction_input.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 18 entries, 27 to 441
Data columns (total 65 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Date            18 non-null     datetime64[ns]
 1   Opp             18 non-null     object        
 2   pitcher         18 non-null     object        
 3   code            18 non-null     object        
 4   Dec_1           18 non-null     float64       
 5   DR_1            18 non-null     float64       
 6   H_1             18 non-null     float64       
 7   ER_1            18 non-null     float64       
 8   BB_1            18 non-null     float64       
 9   SO_1            18 non-null     float64       
 10  HR_1            18 non-null     float64       
 11  HBP_1           18 non-null     float64       
 12  FIP_1           18 non-null     float64       
 13  Outs_1          18 non-null     float64       
 14  points_1        18 non-null     float64       
 15  QS_1  

In [16]:
print("--- %s minutes ---" % round((time.time() - start_time)/60,2))

--- 46.21 minutes ---
