In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import matplotlib.pyplot as plt
import pandas as pd

import requests
import urllib.request

import numpy as np
import math
import time
from datetime import date, datetime, timedelta

from bs4 import BeautifulSoup

import sklearn as skl
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder,LabelBinarizer
from sklearn.metrics import accuracy_score
import tensorflow as tf

# Wall-clock start of the run; the final cell subtracts this to report elapsed minutes
start_time = time.time()


# Select your data

In [None]:
# Fraction of the pitcher roster / team list to process when scraping.
# Pick a number between 0 (none) and 1 (all); the scraping loops run over
# round(t * number_of_entries) entries.
t = 1

# Re-scrape PITCHER game logs (y/n)? If so, it'll take approx t*30 minutes.
# Anything other than 'y' loads the previously saved pitcher_data.csv instead.
pit_update = 'n'

# Re-scrape OPPOSING TEAM game logs (y/n)? If so, it'll take approx 3 minutes.
# Anything other than 'y' loads the previously saved opp_previous_perf.csv instead.
opp_update = 'n'

# How wide do you want the predicted performance buckets to be?
# Bucket thresholds are multiples of this width and are compared against fantasy points.
bucket_width = 12

# How many games do you want to include in a pitcher's "recent performance" (think, rolling avg)
N=5

# How many games do you want to include in an opposing team's "recent performance" (think, rolling avg)
M=30

# Which PITCHER stats do you want included?
# These are in addition to the required stats (reqd_cols in the pitcher scraping cell),
# which are always kept.
# 1 for yes, 0 for no

stat_selector = {
 'index': 0,
 'Rk': 0,
 'Gcar': 0,
 'Gtm': 0,
 'Tm': 0,
 'Rslt': 0,
 'Inngs': 0,
 'R': 0,
 'ERA': 0,
 'FIP': 1,
 'BF': 0,
 'Pit': 0,
 'Str': 0,
 'StL': 0,
 'StS': 0,
 'GB': 0,
 'FB': 0,
 'LD': 0,
 'PU': 0,
 'Unk': 0,
 'GSc': 1,
 'IR': 0,
 'IS': 0,
 'SB': 0,
 'CS': 0,
 'PO': 0,
 'AB': 0,
 '2B': 1,
 '3B': 0,
 'IBB': 0,
 'GDP': 0,
 'SF': 0,
 'ROE': 0,
 'aLI': 0,
 'WPA': 0,
 'acLI': 0,
 'cWPA': 0,
 'RE24': 0,
 'DFS(DK)': 0,
 'DFS(FD)': 0,
 'Entered': 0,
 'Exited': 0,
 'Outs': 0}

# Which OPPOSING TEAM stats do you want?
# These are in addition to the columns the team cell always keeps ('Team', 'Date', 'PA').
# 1 for yes, 0 for no
opp_stat_selector = {'Rk': 0,
 'Gtm': 0,
 'Opp': 0,
 'Rslt': 0,
 'AB': 0,
 'R': 1,
 'H': 1,
 '2B': 0,
 '3B': 0,
 'HR': 1,
 'RBI': 0,
 'BB': 1,
 'IBB': 0,
 'SO': 1,
 'HBP': 0,
 'SH': 0,
 'SF': 0,
 'ROE': 0,
 'GDP': 0,
 'SB': 0,
 'CS': 0,
 'BA': 0,
 'OBP': 1,
 'SLG': 1,
 'OPS': 0,
 'LOB': 1,
 '#': 0,
 'Thr': 0,
 'Opp. Starter (GmeSc)': 0}


In [None]:
# We need a full list of datatypes for all potential data selections, so we can assign dtype appropriately.
# Pitcher game-log column -> dtype. The pitcher scraping cell applies only the entries
# whose key is in keep_cols (required + selected stats), after every value has been
# converted to a string.
pitcher_data_cols_dtypes = {
 'index': 'int',
 'Rk': 'int',
 'Gcar': 'int',
 'Gtm': 'int',
#  'Date': 'datetime64[ns]' is left out on purpose: dates need custom parsing and are
#  converted separately with pd.to_datetime.
 'Tm': 'object',
 'Unnamed: 5': 'object',
 'Opp': 'object',
 'Rslt': 'object',
 'Inngs': 'object',
 'Dec': 'int',
 'DR': 'int',
 'H': 'int',
 'R': 'int',
 'ER': 'int',
 'BB': 'int',
 'SO': 'int',
 'HR': 'int',
 'HBP': 'int',
 'ERA': 'float',
 'FIP': 'float',
 'BF': 'int',
 'Pit': 'int',
 'Str': 'int',
 'StL': 'int',
 'StS': 'int',
 'GB': 'int',
 'FB': 'int',
 'LD': 'int',
 'PU': 'int',
 'Unk': 'int',
 'GSc': 'int',
 'IR': 'int',
 'IS': 'int',
 'SB': 'int',
 'CS': 'int',
 'PO': 'int',
 'AB': 'int',
 '2B': 'int',
 '3B': 'int',
 'IBB': 'int',
 'GDP': 'int',
 'SF': 'int',
 'ROE': 'int',
 'aLI': 'float',
 'WPA': 'float',
 'acLI': 'float',
 'cWPA': 'float',
 'RE24': 'float',
 'DFS(DK)': 'float',
 'DFS(FD)': 'float',
 'Entered': 'object',
 'Exited': 'object',
 'Outs': 'int'}

# Opposing-team game-log column -> dtype.
# NOTE(review): this mapping is not referenced anywhere in the visible notebook; the
# team cell converts columns with pd.to_numeric instead. Confirm whether it is still needed.
opp_data_cols_dtypes ={
 'Rk': 'int',
 'Gtm': 'int',
#  'Date': 'int' is left out on purpose: the date is converted separately later on.
 'Unnamed: 3': 'object',
 'Opp': 'object',
 'Rslt': 'object',
 'PA': 'int',
 'AB': 'int',
 'R': 'int',
 'H': 'int',
 '2B': 'int',
 '3B': 'int',
 'HR': 'int',
 'RBI': 'int',
 'BB': 'int',
 'IBB': 'int',
 'SO': 'int',
 'HBP': 'int',
 'SH': 'int',
 'SF': 'int',
 'ROE': 'int',
 'GDP': 'int',
 'SB': 'int',
 'CS': 'int',
 'BA': 'float',
 'OBP': 'float',
 'SLG': 'float',
 'OPS': 'float',
 'LOB': 'int',
 '#': 'int',
 'Thr': 'object',
 'Opp. Starter (GmeSc)': 'object',
 'Team': 'object'}

### This is the section where I scrape individual starting pitchers' individual game performance

#### Things to consider:
Add a counter that shows the total number of pitchers logged compared to the total number fed in, so I can see how many pitchers were lost whenever the `except` branch was triggered.\
\
How valuable is the information we get from the pitchers towards the end of the list? The list of pitchers is sorted by number of starts, so cutting the bottom 20% of the list removes 20% of the run time but only ~10% of the data. But maybe it's important to have representation for pitchers with only a few starts? Much of the time, it's those little-known pitchers that we most need to judge, because they're the most available.


In [None]:
# Roster of starting pitchers, exported by hand from baseball-reference.
# There is probably a slick way to scrape it, but it only needs refreshing once in a
# while, so a manual export works for now. The 'code' and 'name' columns are read below.
all_starters = pd.read_csv('all_starters.csv')


## Identify desired columns to keep and drop others
# Stats that are always kept (needed for dates, fantasy points, Outs, quality starts, ...)
reqd_cols = ['Date','Opp','Dec', 'DR','IP', 'H', 'ER', 'BB', 'SO', 'HR', 'HBP']

# Optional stats switched on (value 1) in stat_selector
wanted_cols = [key for key, value in stat_selector.items() if value == 1]

# Combine with list of required stats
keep_cols = reqd_cols + wanted_cols


if pit_update == 'y':
    # One cleaned frame per pitcher; they are concatenated once after the loop.
    # (DataFrame.append no longer exists in pandas >= 2.0, and growing a frame inside
    # a loop is quadratic anyway.)
    pitcher_frames = []

    # Only the first t-fraction of the roster is scraped
    n_to_scrape = round(t*len(all_starters))

    # Month abbreviation -> month number, used to rebuild a parseable date string
    month_numbers = {'Oct': '10', 'Sep': '9', 'Aug': '8', 'Jul': '7',
                     'Jun': '6', 'May': '5', 'Apr': '4', 'Mar': '3'}

    # Decision letter -> numeric score (applied to the first letter of 'Dec')
    decision_scores = (('W', '1'), ('L', '-1'), ('B', '-1'), ('S', '2'), ('H', '0'))

    # dtype to cast each kept column to; loop-invariant, so built once
    dtype_applier = {key: value for key, value in pitcher_data_cols_dtypes.items()
                     if key in keep_cols}

    for i in range(n_to_scrape):
        # Fallback label so the error report below never reads a stale or unbound name
        name = f'row {i}'
        try:
            code = all_starters.loc[i,'code']
            name = all_starters.loc[i,'name']

            # Request setup: first table on the pitcher's 2023 game-log page
            url = f'https://www.baseball-reference.com/players/gl.fcgi?id={code}&t=p&year=2023'
            df = pd.read_html(url)[0]

            # # DATA CLEANING # #

            # Drop rows that are mid-table column names (their 'Rk' is not numeric)
            df = df[pd.to_numeric(df['Rk'], errors = 'coerce').notnull()]

            # Reset index after dropping some rows (this also creates an 'index' column)
            df = df.reset_index()

            # Keep only the selected columns; copy so the assignments below act on an
            # independent frame
            df = df[keep_cols].copy()

            # Now let's make the date column usable
            # Strip the "(1)"/"(2)" double-header marker from the date
            df['Date'] = df['Date'].str.split("(", expand = True).loc[:,0]

            # Split "Mon D" into month and day
            date_parts = df['Date'].str.split(n=1,expand = True)
            months = date_parts.loc[:,0]
            days = date_parts.loc[:,1]

            # Convert month abbreviation to numerical month
            for abbr, number in month_numbers.items():
                months = months.str.replace(abbr, number)

            # Build date string (the season year is fixed at 2023, matching the URL)
            df['Date'] = '2023' + "-" + months + "-" + days

            # Clean Decision series: keep the first letter, treat "no decision" as 0
            df['Dec'] = df['Dec'].str[0]
            df['Dec'] = df['Dec'].fillna('0')
            for letter, score in decision_scores:
                df['Dec'] = df['Dec'].str.replace(letter, score)

            # Make everything a string before the steps below; the real dtypes are
            # restored further down. Some of the processing needs string inputs and
            # the scraper may have inferred numbers for some columns.
            df = df.astype(str)

            # Convert innings pitched ("6.2" = 6 innings + 2 outs) to total outs, then drop IP
            ip_parts = df['IP'].str.split(".", expand = True)
            whole = ip_parts[0].astype('int')
            part = ip_parts[1].astype('int')
            df['Outs'] = 3*whole + part
            df = df.drop(columns = ['IP'])

            # DR can arrive as a float rendered as a string ("4.0"), which cannot be
            # cast straight to int; going through float first makes the int cast work.
            df['DR'] = df['DR'].astype('float')

            # Cast numerical datatypes as numbers
            df = df.astype(dtype_applier)

            # Convert date string to date type
            df['Date'] = pd.to_datetime(df['Date'])

            # Calculate fantasy points
            df['points'] = 3*df['Dec'] - df['H'] - 2*df['ER'] - df['BB'] + df['SO'] - df['HBP'] + df['Outs']

            # Quality start: at most 3 earned runs over at least 18 outs (6 innings).
            # Computed for every row; a row-by-row loop starting at 1 would leave the
            # pitcher's first start without a value.
            df['QS'] = ((df['ER'] <= 3) & (df['Outs'] >= 18)).astype(int)

            # Add in pitcher name and code
            df['pitcher'] = name
            df['code'] = code

            # Drop starts following 30+ days rest
            df = df[df['DR'] <30]

            # Queue it for the overall pitcher data df
            pitcher_frames.append(df)

            # Let me know we're done with that player
            print(f'Done adding {name} {i}/{len(all_starters)}')

        except Exception as err:
            # Best effort: report the pitcher and the reason, then carry on with the next one
            print(f'ERROR with {name} {i}/{len(all_starters)}: {err!r}')

        finally:
            # Wait 3.5 seconds after every request, successful or not,
            # b/c baseball-reference has a 20 request/min limit
            time.sleep(3.5)

    # How many pitchers made it through vs. how many were attempted
    print(f'Logged {len(pitcher_frames)}/{n_to_scrape} pitchers')

    # Combine all pitchers into one frame with a fresh 0..n-1 index
    if pitcher_frames:
        pitcher_data = pd.concat(pitcher_frames, ignore_index=True)
    else:
        pitcher_data = pd.DataFrame()

    # Export to csv
    pitcher_data.to_csv('pitcher_data.csv',index = False)

    print("Done")

# If we aren't pulling/updating data, just load whatever we had from the last time we ran it
else: pitcher_data = pd.read_csv('pitcher_data.csv')

# Dates come back from the CSV as strings, so (re)convert them
pitcher_data['Date'] = pd.to_datetime(pitcher_data['Date'])

pitcher_data.info()

pitcher_data


### This is the section where I take each pitcher's start-by-start performance and build a df of their "previous" performance before each start.

In [None]:
# Calculate performance buckets
# Thresholds are 0, bucket_width, 2*bucket_width, ...
# The number of thresholds gets its own name: storing it in N would overwrite the
# rolling-window size N chosen in the configuration cell, which later cells use for
# the "last N games" averages and column names.
n_buckets = math.ceil(30/bucket_width)+1
th = [bucket_width*n for n in range(n_buckets+1)]

# Build ordered list of bucket labels: the catch-all for non-positive points first,
# then one "low<.<=high" label per pair of neighbouring thresholds
bucket_labels = ['.<=0'] + [f'{low}<.<={high}' for low, high in zip(th[:-1], th[1:])]

print(bucket_labels)


In [None]:
df = pitcher_data

# For every start, summarise how the same pitcher performed in the outing
# immediately before that date. One summary row is produced per start.
last_rows = []

for i in range(len(df)):
    # Date and pitcher code of the start in row i.
    # Deliberately not named `date`: that would rebind the `date` class imported from
    # datetime, which a later cell calls as date.today().
    game_date = df.loc[i, 'Date']
    pitcher_code = df.loc[i, 'code']

    # The single most recent earlier game by the same pitcher (empty for a first start)
    df_last1 = df.loc[(df.Date < game_date) & (df.code == pitcher_code)].sort_values('Date').tail(1)

    df_last1 = df_last1.drop(columns = ['Date', 'Opp','pitcher','code'])

    # Column means of that one row, turned from a Series into a one-row frame
    df_last1_avg = pd.DataFrame(df_last1.mean(axis=0)).transpose()

    # Rename cols to mark them as "previous 1 game" values
    df_last1_avg.columns = [f"{col}_1" for col in df_last1_avg.columns]

    # Assign it to the date and code we're working with
    df_last1_avg['Date'] = game_date
    df_last1_avg['code'] = pitcher_code

    last_rows.append(df_last1_avg)

# Build the frame in one go (DataFrame.append no longer exists in pandas >= 2.0)
# and reset the index so that we can .loc through it
df_last = pd.concat(last_rows).reset_index(drop=True) if last_rows else pd.DataFrame()

# Put in the performance buckets (every row, including row 0)
for row in range(len(df_last)):
    prev_points = df_last.loc[row,'points_1']

    if prev_points <= 0:
        df_last.loc[row,'bucket_1'] = '.<=0'
        continue

    # NaN and values above the last threshold match nothing and stay unlabelled
    for j in range(len(th)-1):
        if th[j] < prev_points <= th[j+1]:
            df_last.loc[row,'bucket_1'] = f'{th[j]}<.<={th[j+1]}'

df_last.info()


In [None]:
# For every start, average the same pitcher's previous N games (fewer if fewer exist).
# One summary row is produced per start.
lastfew_rows = []

for i in range(len(df)):
    # Date and pitcher code of the start in row i.
    # Deliberately not named `date`: that would rebind the `date` class imported from
    # datetime, which a later cell calls as date.today().
    game_date = df.loc[i, 'Date']
    pitcher_code = df.loc[i, 'code']

    # Up to N most recent earlier games by the same pitcher
    df_lastN = df.loc[(df.Date < game_date) & (df.code == pitcher_code)].sort_values('Date').tail(N)

    df_lastN = df_lastN.drop(columns = ['Date', 'Opp','pitcher','code'])

    # Column means over those rows, turned from a Series into a one-row frame
    df_lastN_avg = pd.DataFrame(df_lastN.mean(axis=0)).transpose()

    # Rename cols to indicate what it's the average of, and over how many games
    df_lastN_avg.columns = [f'{col}_{N}' for col in df_lastN_avg.columns]

    # Assign it to the date and code we're working with
    df_lastN_avg['Date'] = game_date
    df_lastN_avg['code'] = pitcher_code

    lastfew_rows.append(df_lastN_avg)

# Build the frame in one go (DataFrame.append no longer exists in pandas >= 2.0)
# and reset the index so that we can .loc through it
df_lastfew = pd.concat(lastfew_rows).reset_index(drop=True) if lastfew_rows else pd.DataFrame()

# Put in the performance buckets (every row, including row 0)
for row in range(len(df_lastfew)):
    recent_points = df_lastfew.loc[row,f'points_{N}']

    if recent_points <= 0:
        df_lastfew.loc[row,f'bucket_{N}'] = '.<=0'
        continue

    # NaN and values above the last threshold match nothing and stay unlabelled
    for j in range(len(th)-1):
        if th[j] < recent_points <= th[j+1]:
            df_lastfew.loc[row,f'bucket_{N}'] = f'{th[j]}<.<={th[j+1]}'

df_lastfew.info()


In [None]:
# For every start, average ALL of the same pitcher's earlier games in the data.
# One summary row is produced per start.
cumulative_rows = []

for i in range(len(df)):
    # Date and pitcher code of the start in row i.
    # Deliberately not named `date`: that would rebind the `date` class imported from
    # datetime, which a later cell calls as date.today().
    game_date = df.loc[i, 'Date']
    pitcher_code = df.loc[i, 'code']

    # Every earlier game by the same pitcher
    df_cummu = df.loc[(df.Date < game_date) & (df.code == pitcher_code)].sort_values('Date')

    df_cummu = df_cummu.drop(columns = ['Date', 'Opp','pitcher','code'])

    # Column means over all those rows, turned from a Series into a one-row frame
    df_cummu_avg = pd.DataFrame(df_cummu.mean(axis=0)).transpose()

    # Rename cols to mark them as to-date averages
    df_cummu_avg.columns = [f"{col}_all" for col in df_cummu_avg.columns]

    # Assign it to the date and code we're working with
    df_cummu_avg['Date'] = game_date
    df_cummu_avg['code'] = pitcher_code

    cumulative_rows.append(df_cummu_avg)

# Build the frame in one go (DataFrame.append no longer exists in pandas >= 2.0)
# and reset the index so that we can .loc through it
df_cummulative = pd.concat(cumulative_rows).reset_index(drop=True) if cumulative_rows else pd.DataFrame()

# Put in the performance buckets (every row, including row 0)
for row in range(len(df_cummulative)):
    season_points = df_cummulative.loc[row,'points_all']

    if season_points <= 0:
        df_cummulative.loc[row,'bucket_all'] = '.<=0'
        continue

    # NaN and values above the last threshold match nothing and stay unlabelled
    for j in range(len(th)-1):
        if th[j] < season_points <= th[j+1]:
            df_cummulative.loc[row,'bucket_all'] = f'{th[j]}<.<={th[j+1]}'

df_cummulative.info()


In [None]:
# Attach every window of prior performance to the game rows.
# Each merge matches on the (Date, code) pair, in this order:
# previous game, last N games, all earlier games.
compiled_df = df
for window_df in (df_last, df_lastfew, df_cummulative):
    compiled_df = compiled_df.merge(window_df, on = ['Date','code'])

# Remove the stats of the game itself, so training only sees what was known
# before the given date. 'Date' and 'Opp' stay as keys; 'IP' is already gone
# (it was converted to 'Outs'); 'Dec' is listed explicitly below.
dont_drop_from_reqs = ['IP','Dec','Date','Opp']
reqs_to_drop = list(filter(lambda col: col not in dont_drop_from_reqs, reqd_cols))
drop_cols = ['QS', 'Outs', 'Dec'] + wanted_cols + reqs_to_drop

pitcher_perf = compiled_df.drop(columns = drop_cols)

pitcher_perf.info()


In [None]:
# Reset the index so that we can .loc through it
pitcher_perf = pitcher_perf.reset_index(drop=True)

# Label each start with the bucket its actual fantasy points fall into.
# Every row is visited, including row 0; starting at 1 would leave the first row
# without a bucket and it would then be discarded by dropna() below.
for row in range(len(pitcher_perf)):
    game_points = pitcher_perf.loc[row,'points']

    if game_points <= 0:
        pitcher_perf.loc[row,'bucket'] = '.<=0'
        continue

    # NaN and values above the last threshold match nothing and stay unlabelled
    for j in range(len(th)-1):
        if th[j] < game_points <= th[j+1]:
            pitcher_perf.loc[row,'bucket'] = f'{th[j]}<.<={th[j+1]}'

# These NaNs come from asking for performance on the last 1, N games when there have been fewer than 1, N games
pitcher_perf = pitcher_perf.dropna()

# Export to csv
pitcher_perf.to_csv('pitcher_previous_perf.csv',index = False)

pitcher_perf


### This is the section where for each team, before a given game, I calculate their previous performance

Things it'd be cool to add here:\
In the progress printout, maybe include the number of records for each team?

In [None]:
# Get team codes
team_codes = pd.read_csv('team_codes.csv')
codes = team_codes['code'].tolist()

if opp_update == 'y':
    ## Scrape every team's 2023 batting game log
    # One raw frame per team; concatenated once after the loop
    # (DataFrame.append no longer exists in pandas >= 2.0).
    team_frames = []

    # Only the first t-fraction of the team list is scraped
    for i in range(round(t*len(codes))):

        team = codes[i]

        # Request setup: first table on the team's batting game-log page
        url = f'https://www.baseball-reference.com/teams/tgl.cgi?team={team}&t=b&year=2023'
        team_df = pd.read_html(url)[0]

        # Add team identifier
        team_df['Team'] = team

        team_frames.append(team_df)

        # Print status
        print(f'Done: {team}')

        # Wait for >3 seconds b/c site limits requests to 20/min
        time.sleep(4)

    df = pd.concat(team_frames, ignore_index=True)

    ## Now we clean the data!
    # Column lists get their own names here so the pitcher-side reqd_cols / wanted_cols /
    # keep_cols are not overwritten (the pitcher merge cell reads them when re-run).
    opp_wanted_cols = [key for key, value in opp_stat_selector.items() if value == 1]
    opp_reqd_cols = ['Team', 'Date', 'PA']
    opp_keep_cols = opp_reqd_cols + opp_wanted_cols

    # Select for those key columns
    df = df[opp_keep_cols]

    # Drop rows that contain mid-table row headers (their 'PA' is not numeric);
    # copy so the assignments below act on an independent frame
    df = df[pd.to_numeric(df['PA'], errors='coerce').notnull()].copy()

    # Now let's make the date column usable
    # Strip the "(1)"/"(2)" double-header marker from the date
    df['Date'] = df['Date'].str.split("(", expand = True).loc[:,0]

    # Split "Mon D" into month and day
    date_parts = df['Date'].str.split(n=1,expand = True)
    months = date_parts.loc[:,0]
    days = date_parts.loc[:,1]

    # Convert month abbreviation to numerical month
    month_numbers = {'Oct': '10', 'Sep': '9', 'Aug': '8', 'Jul': '7',
                     'Jun': '6', 'May': '5', 'Apr': '4', 'Mar': '3'}
    for abbr, number in month_numbers.items():
        months = months.str.replace(abbr, number)

    # Build date string (the season year is fixed at 2023, matching the URL)
    df['Date'] = '2023' + "-" + months + "-" + days

    # Cast every column that parses as numeric; columns that do not (Team, Date)
    # are left exactly as they are
    for col in df.columns:
        try:
            df[col] = pd.to_numeric(df[col])
        except (ValueError, TypeError):
            pass
    df['Date'] = pd.to_datetime(df['Date'])

    df = df.reset_index(drop = True)

    ## Previous performance before every game: last M games and all earlier games.
    # M is the rolling-window length chosen in the configuration cell.
    recent_rows = []
    season_rows = []

    for i in range(len(df)):

        # Date and team of the game in row i.
        # Deliberately not named `date`: that would rebind the `date` class imported
        # from datetime, which a later cell calls as date.today().
        game_date = df.loc[i, 'Date']
        team = df.loc[i, 'Team']

        # All earlier games by the same team (empty for the first game in the window;
        # the resulting all-NaN averages are removed by dropna() further down)
        prior_games = df.loc[(df['Date'] < game_date) & (df['Team'] == team)]

        # Average of the M most recent earlier games. numeric_only=True keeps the
        # text 'Team' and datetime 'Date' columns out of the mean, so no stray
        # Team_/Date_ columns appear in the result.
        df_lastM = prior_games.sort_values('Date').tail(M)
        df_lastM_avg = pd.DataFrame(df_lastM.mean(axis=0, numeric_only=True)).transpose()
        df_lastM_avg.columns = [f"{col}_{M}" for col in df_lastM_avg.columns]
        df_lastM_avg['Date'] = game_date
        df_lastM_avg['Team'] = team
        recent_rows.append(df_lastM_avg)

        # Average of all earlier games
        df_lastall_avg = pd.DataFrame(prior_games.mean(axis=0, numeric_only=True)).transpose()
        df_lastall_avg.columns = [f"{col}_all" for col in df_lastall_avg.columns]
        df_lastall_avg['Date'] = game_date
        df_lastall_avg['Team'] = team
        season_rows.append(df_lastall_avg)

    df_LM = pd.concat(recent_rows).reset_index(drop=True)
    df_Lall = pd.concat(season_rows).reset_index(drop=True)

    ##

    # Now let's combine these two dataframes to make an overall 'previous performance'
    opp_prev_perf = df_LM.merge(df_Lall, on = ['Team', 'Date'])

    # Both games of a double-header share one (Team, Date) and identical "previous"
    # values, so the merge above yields repeated rows for them; keep a single row
    opp_prev_perf = opp_prev_perf.drop_duplicates(subset = ['Team', 'Date'])

    # Drop rows that are NaN bc there weren't any previous days of performance to pull (i.e., first day in window)
    opp_prev_perf = opp_prev_perf.dropna()

    # Export to csv
    opp_prev_perf.to_csv('opp_previous_perf.csv', index = False)

    print('Done')

# If we aren't pulling/updating data, just load whatever we had from the last time we ran it
else: opp_prev_perf = pd.read_csv('opp_previous_perf.csv')

# Dates come back from the CSV as strings, so (re)convert them
opp_prev_perf['Date'] = pd.to_datetime(opp_prev_perf['Date'])

opp_prev_perf.info()


This is where we make the combined df of all data to train on

In [None]:
# Opponent side: previous performance of every team before every game
opp_perf_df = opp_prev_perf

# Reorder so the columns start with Team, Date
cols = opp_perf_df.columns.tolist()
for key_col in ('Date', 'Team'):
    cols.remove(key_col)
    cols.insert(0, key_col)
opp_perf_df = opp_perf_df[cols]

# Pitcher side: previous performance of every pitcher before every start
pitcher_perf_df = pitcher_perf

# Pair each start with the team the pitcher faced that day
df = pd.merge(
    pitcher_perf_df,
    opp_perf_df,
    left_on = ['Date', 'Opp'],
    right_on = ['Date', 'Team'],
    suffixes = ('_pitcher', '_opp'),
)

# Discard rows with any missing value
df = df.dropna()

# Final column order: the 'bucket' target first, the raw 'points' left out
cols = df.columns.tolist()
cols.remove('points')
cols.remove('bucket')
cols = ['bucket'] + cols
df = df[cols]

training_df = df

# Export to csv
training_df.to_csv('training_data.csv', index=False)

df.info()

## This is where we build the prediction input dataset

In [None]:
# Request setup
url = "https://www.baseball-reference.com/previews/"

with urllib.request.urlopen(url) as response:
    html = response.read()

soup = BeautifulSoup(html, 'html.parser')


def _text_at(elements, position):
    """Return the text of elements[position], or NaN when there is no such element."""
    try:
        return elements[position].text
    except IndexError:
        return np.nan


def _player_code_at(links, position):
    """Return the player code taken from the href of links[position], or NaN.

    The code is the 6th "/"-separated piece of the href with its extension removed.
    NaN is returned when the link, its href, or that piece is missing.
    """
    try:
        return links[position].get('href').split("/")[5].split(".")[0]
    except (IndexError, AttributeError):
        return np.nan


# We're going to loop through this list of matchups
matchups = soup.find_all('div', class_='game_summary nohover')

# Every row is stamped with today's date. It is taken from the datetime class
# because the name `date` may have been rebound by loop variables in earlier cells.
today = datetime.today().date()

# One dict per probable pitcher; turned into a frame once after the loop
# (DataFrame.append no longer exists in pandas >= 2.0)
probable_rows = []

# For each matchup in the full list of matchups
for matchup in matchups:
    links = matchup.find_all('a')
    strongs = matchup.find_all('strong')

    # First pitcher: name, code, team
    p1_name = _text_at(links, 3)
    p1_code = _player_code_at(links, 3)
    team1 = _text_at(strongs, 0)

    # Second pitcher: name, code
    p2_name = _text_at(links, 4)
    p2_code = _player_code_at(links, 4)

    # Second pitcher's team.
    # A debut in the matchup changes the list of 'strong' elements, so the
    # position of the second team is derived from how many there are.
    debut_adj = math.ceil(len(strongs)/2)
    team2 = _text_at(strongs, debut_adj)

    # One entry per pitcher, each pointing at the other team as the opponent
    probable_rows.append({
        'date':today,
        'name':p1_name,
        'code':p1_code,
        'for':team1,
        'against':team2
    })
    probable_rows.append({
        'date':today,
        'name':p2_name,
        'code':p2_code,
        'for':team2,
        'against':team1
    })

# Explicit columns keep the frame usable downstream even when no matchups were found
probables = pd.DataFrame(probable_rows, columns=['date', 'name', 'code', 'for', 'against'])

# Export to csv
probables.to_csv('probables.csv',index = False)

# NOTE: NaN usually means the pitcher hasn't been posted on b-ref yet

probables

In [None]:
# Today's probable starters and their matchups
probables_df = probables

# Previous-performance rows of just the probable starters, to be used for predictions
all_starters = probables_df['code']
pitcher_data = pitcher_perf[pitcher_perf['code'].isin(all_starters.tolist())]

# Previous-performance rows of just the teams those starters are facing
all_opps = probables_df['against']
all_opps = opp_prev_perf[opp_prev_perf['Team'].isin(all_opps.tolist())]

# Pair each pitcher row with the opponent row of the same date
df = pd.merge(
    pitcher_data,
    all_opps,
    left_on = ['Date', 'Opp'],
    right_on = ['Date', 'Team'],
    suffixes = ('_pitcher', '_opp'),
)

# Bring the identifying/target columns to the front. Each one is moved to
# position 0 in turn, so the frame ends up starting with
# bucket, points, Date, Opp, pitcher, code.
df_cols = df.columns.tolist()
for front_col in ('code', 'pitcher', 'Opp', 'Date', 'points', 'bucket'):
    df_cols.remove(front_col)
    df_cols.insert(0, front_col)

df = df[df_cols]
df


In [None]:
# Extract list of starters from the merged probables data
starters = df['code'].unique()

# For each starter keep only the row with the latest Date.
# NOTE(review): that row belongs to the pitcher's most recent past start, so its
# opponent columns describe that game's opponent rather than today's
# probables['against'] team. Confirm this is the intended prediction input.
latest_rows = [
    df.loc[df['code'] == starter].sort_values('Date').tail(1)
    for starter in starters
]

# Build the frame in one go (DataFrame.append no longer exists in pandas >= 2.0);
# with no starters, fall back to an empty frame that still has all the columns
prediction_input = pd.concat(latest_rows) if latest_rows else df.iloc[0:0]

# Drop target columns
prediction_input = prediction_input.drop(columns=['bucket', 'points'])

prediction_input.to_csv('STARTERS_input_data.csv', index=False)

prediction_input.info()


# This is where we actually run the actual model


In [None]:
# PREDICTION INPUT DATA PREP

# Start from the per-starter rows built in the previous cell
input_df = prediction_input

# Keep the identifying columns aside; they are needed to read the predictions later
id_info = input_df[['pitcher','code']]

# Remove columns that are not model features. 'Opp' is listed last on purpose:
# it may be worth keeping one day, since for some teams/pitchers the TEAM itself
# seems to matter (i.e., Devers vs. NYY) on top of the opponent's performance.
input_df = input_df.drop(columns = ['Date','pitcher','code', 'Opp'])

# Prediction features
Xp = input_df

# Non-numerical (object dtype) columns
Xp_cat = Xp.select_dtypes(include = 'object').columns.tolist()

# Dummy-encode them, then remove the originals from the feature frame
one_hot = pd.get_dummies(Xp[Xp_cat])
Xp = Xp.drop(columns = Xp_cat)

# Add a zero-filled column for EVERY possible bucket of every window
# (previous game, last N games, all games), so the layout does not depend on
# which buckets happen to occur in this data
buckets = [f'bucket_{window}_{label}'
           for window in (1, N, 'all')
           for label in bucket_labels]

Xp[buckets] = 0

# Fill in the dummy values; update() only touches columns that already exist in Xp
Xp.update(one_hot)

# This is officially the data we're going to put into our predictor
Xp.info()


In [None]:
## TRAINING DATA PREP

# Remove columns that are not model features. 'Opp' is listed last on purpose:
# it may be worth keeping one day, since for some teams/pitchers the TEAM itself
# seems to matter (i.e., Devers vs. NYY) on top of the opponent's performance.
training_df = training_df.drop(columns = ['Date','pitcher','code', 'Opp'])

# Target (y) and features (X)
y = training_df['bucket']
X = training_df.drop(columns = 'bucket')

# Non-numerical (object dtype) columns
X_cat = X.select_dtypes(include = 'object').columns.tolist()

# Dummy-encode them, then remove the originals from the feature frame
one_hot = pd.get_dummies(X[X_cat])
X = X.drop(columns = X_cat)

# Add a zero-filled column for EVERY possible bucket of every window
# (previous game, last N games, all games), so the layout does not depend on
# which buckets happen to occur in this data
buckets = [f'bucket_{window}_{label}'
           for window in (1, N, 'all')
           for label in bucket_labels]

X[buckets] = 0

# Fill in the dummy values; update() only touches columns that already exist in X
X.update(one_hot)

# This is officially the data we're going to put into our predictor
X.info()


In [None]:
# This is us converting one vector of 6 distinct classes into basically get-dummies
from tensorflow.keras.utils import to_categorical

# Class order of the target: the ordered bucket labels
outcomes = bucket_labels

# Turn the bucket label of every row into its position in `outcomes`,
# then into a one-hot row with one column per bucket label
y_codes = pd.Categorical(y, categories = outcomes).codes
y_factors = to_categorical(y_codes, len(bucket_labels))

# Split training/test datasets (fixed random_state keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y_factors, random_state=42)

## Preprocess numerical data for neural network
# Fit the scaler on the training features only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaling to both splits
X_train_scaled, X_test_scaled = X_scaler.transform(X_train), X_scaler.transform(X_test)


In [None]:
# Fix TensorFlow's random state so weight initialisation and batch shuffling
# repeat from run to run
tf.random.set_seed(42)

# Layer sizes taper from 60% of the feature count down to the number of classes
n_features = len(X.columns)
n_classes = len(bucket_labels)

hidden_layers = [
    (round(0.6*n_features), "relu"),
    (round(0.75*0.6*n_features+0.25*n_classes), "tanh"),
    (round(0.50*0.6*n_features+0.50*n_classes), "relu"),
    (round(0.25*0.6*n_features+0.75*n_classes), "tanh"),
]

# Define the deep learning model
nn_model = tf.keras.models.Sequential()

for position, (units, activation) in enumerate(hidden_layers):
    if position == 0:
        # The first layer also declares the input width
        nn_model.add(tf.keras.layers.Dense(units=units, activation=activation,
                                           input_dim = X_train.shape[1]))
    else:
        nn_model.add(tf.keras.layers.Dense(units=units, activation=activation))

# Output layer: one probability per bucket label
nn_model.add(tf.keras.layers.Dense(units=n_classes, activation="softmax"))


# Compile the Sequential model together and customize metrics
nn_model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn_model.fit(X_train_scaled, y_train, epochs=50)

# Evaluate the model using the test data
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")


In [None]:
# The network was trained on standardized features, so the prediction inputs must go
# through the same fitted scaler; feeding raw values gives meaningless probabilities.
# Columns are put in the training order first (a missing column raises a KeyError).
Xp_scaled = X_scaler.transform(Xp[X_train.columns])

# One row per starter, one probability column per bucket label
predicted_perf = nn_model.predict(Xp_scaled)
predicted_perf = pd.DataFrame(predicted_perf, columns=outcomes)
predicted_perf


In [None]:
# Round the predicted probabilities to one decimal for readability
predicted_perf = predicted_perf.round(1)

# Give the pitcher identifiers a fresh 0..n-1 index so they line up
# with the prediction rows
id_info = id_info.reset_index(drop=True)

# Pitcher name/code next to the predicted probability of each bucket
output = id_info.join(predicted_perf)
output


In [None]:
# Total wall-clock run time since the first cell, in minutes (two decimals)
elapsed_minutes = round((time.time() - start_time)/60,2)
print("--- %s minutes ---" % elapsed_minutes)