In [267]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time

In [268]:
## To simplify web scraping, I'm going to loop through each day of the 2023 season and pull all
## "Advanced Stats" for all players for that day. I'll then do the same for pitchers. Once I have
## all that data, I'll worry about creating a clean data set for training a model.

In [289]:
## Using options while getting the url to avoid issues with grabbing url and saving html
options = webdriver.ChromeOptions() # Adding specifications on the driver to avoid timeouts loading the url
options.add_argument('--headless')  # Keeps from opening a chrome tab
# NOTE(review): this is handed to Chrome as a command-line switch; Selenium's page load
# strategy is normally configured through `options.page_load_strategy`, so this line
# presumably does not change how long driver.get() waits -- TODO confirm. It is kept as-is
# because really switching to strategy 'none' would need an explicit wait for the stats
# table before the page source is read.
options.add_argument('--pageLoadStrategy=none')
options.add_argument("--blink-settings=imagesEnabled=false")  # Disable images
# Require a user gesture before media may start, i.e. no autoplay in this unattended session.
# Gotcha: `no-user-gesture-required` is the value that *permits* autoplay.
options.add_argument("--autoplay-policy=user-gesture-required")
options.add_argument("--disable-notifications") # Avoiding one of Fangraphs pop ups

In [290]:
## Function for getting the html from a url for hitters stats between a start date and end date
## Date window to request -- placeholder values for testing, NEED TO COME BACK AND EDIT THIS
start_date, end_date = '2023-03-29', '2023-04-03'

## Leaderboard address: the query string is fixed except for the date window at the end
url = (
    'https://www.fangraphs.com/leaders/major-league'
    '?pos=all&stats=bat&lg=all&type=1&season=2023&month=1000&season1=2023'
    '&ind=0&pageitems=2000000000&qual=0&team=0'
    f'&startdate={start_date}&enddate={end_date}'
)
def get_html(url, start_date, end_date):
    """Load ``url`` in a Chrome session and return the page's HTML source.

    The browser is configured with the module-level ``options`` object and is
    always shut down before this function returns or raises.

    Parameters
    ----------
    url : str
        Address of the page to load.
    start_date, end_date
        Not used inside this function; they stay in the signature so existing
        calls keep working.

    Returns
    -------
    str
        ``driver.page_source`` as it stands after the load attempt. If
        ``driver.get`` raises (for example on a page-load timeout), the
        exception is printed and whatever source the browser holds is still
        returned.
    """
    driver = webdriver.Chrome(options = options)
    try:
        driver.set_script_timeout(30)
        driver.set_page_load_timeout(30)
        try:
            driver.get(url)
        except Exception as e:
            # Deliberately best-effort: report the problem and still hand back
            # whatever markup was loaded.
            print("Exception occured: ", e)
        return driver.page_source
    finally:
        # Runs on every exit path so a failure above cannot leave a stray
        # Chrome process behind.
        driver.quit()

In [291]:
## Pull the stats table's markup out of the full page html
def table_parse(html):
    """Return the first ``<table>`` inside the first ``div`` with class ``table-scroll``.

    ``html`` is parsed with the 'html.parser' backend. If the page has no such
    ``div`` the lookup yields ``None`` and the nested search raises
    ``AttributeError``; if the ``div`` exists but holds no table, ``None`` is
    returned.
    """
    document = BeautifulSoup(html, 'html.parser')
    scroll_container = document.find('div', class_='table-scroll')
    stats_table = scroll_container.find('table')
    return stats_table

In [323]:
## Helper: visible label of one header cell, without the hidden tooltip text
def _header_label(cell):
    """Return the column label carried by a single ``<th>`` cell.

    Header cells may embed a hidden tooltip (``div`` with class ``th-tooltip``)
    whose text is glued onto the label by ``get_text``; e.g. the label ``PA``
    comes out as ``PAPA - Plate Appearances``.
    """
    text = cell.get_text(strip=True)

    # Preferred: when the tooltip element is present and its text is the tail
    # of the cell text, cut off exactly that tail. This also works for labels
    # that themselves contain a space.
    tooltip = cell.find('div', class_='th-tooltip')
    if tooltip is not None:
        hidden = tooltip.get_text(strip=True)
        if hidden and text.endswith(hidden):
            return text[:len(text) - len(hidden)]

    # Fallback heuristic for cells without a recognisable tooltip element: the
    # hidden text is assumed to repeat the label before its first space, so
    # everything before that space is the label written twice; keep half.
    # Known limitation: a plain label containing a space is truncated here.
    first_space = text.find(' ')
    if first_space == -1:
        return text
    return text[:first_space // 2]


## convert table html into a dataframe
def html_converter(html_table):
    """Turn a parsed HTML table into a DataFrame of strings.

    Parameters
    ----------
    html_table
        Parsed table element exposing ``find_all`` (e.g. the value returned by
        ``table_parse``). Every ``<th>`` in it supplies one column label and
        the first ``<tr>`` is treated as the header row.

    Returns
    -------
    pandas.DataFrame
        One row per data ``<tr>`` and one column per ``<th>``; every cell is
        the stripped text of its ``<td>``, stored as ``object``. Rows are
        numbered 0..n-1 in document order.

    Raises
    ------
    ValueError
        If a data row's number of cells differs from the number of columns.
    """
    columns = [_header_label(cell) for cell in html_table.find_all('th')]

    # Collect all rows first and build the frame once; growing a DataFrame one
    # row at a time re-allocates it on every append.
    records = []
    for row in html_table.find_all('tr')[1:]:  # skip the header row
        cells = row.find_all('td')
        if not cells:
            # No data cells (e.g. an extra header or spacer row): nothing to add.
            continue
        values = [cell.get_text(strip=True) for cell in cells]
        if len(values) != len(columns):
            raise ValueError(
                f"row has {len(values)} cells but the table has {len(columns)} columns"
            )
        records.append(values)

    # dtype=object keeps every cell as the scraped string.
    return pd.DataFrame(records, columns=columns, dtype=object)
        
   

In [294]:
## Fetch the page for the configured url / date window, then pull the stats table out of its html
html = get_html(url, start_date, end_date)
html_table = table_parse(html)

<table><thead><tr><th class="th-rank fixed">#</th><th class="align-left fixed" data-col="0" data-col-id="Name" data-stat="Name">Name</th><th class="align-left" data-col="1" data-col-id="Team" data-stat="Team">Team</th><th class="align-right" data-col="2" data-col-id="PA" data-stat="PA">PA<div class="th-tooltip undefined" style="position:fixed;max-width:300px;text-wrap:wrap;visibility:hidden"><div class="arrow-down"></div><div>PA - Plate Appearances</div></div></th><th class="align-right" data-col="3" data-col-id="BB%" data-stat="BB%">BB%<div class="th-tooltip undefined" style="position:fixed;max-width:300px;text-wrap:wrap;visibility:hidden"><div class="arrow-down"></div><div>BB% - Walk Percentage (BB/PA)</div></div></th><th class="align-right" data-col="4" data-col-id="K%" data-stat="K%">K%<div class="th-tooltip undefined" style="position:fixed;max-width:300px;text-wrap:wrap;visibility:hidden"><div class="arrow-down"></div><div>K% - Strikeout Percentage (SO/PA)</div></div></th><th clas

In [324]:
## Convert the scraped table into a dataframe; the bare name on the last line displays it as the cell output
foo = html_converter(html_table)
foo

Unnamed: 0,#,Name,Team,PA,BB%,K%,BB/K,-,AVG,OBP,...,BABIP,-.1,UBR,wGDP,wSB,-.2,wRC,wRAA,wOBA,wRC+
0,1,Bryce Johnson,SFG,2,0.0%,0.0%,0.00,,.500,.500,...,.000,,0.0,0.0,0.0,,1,1.1,1.002,561
1,2,Seby Zavala,CHW,3,0.0%,33.3%,0.00,,.667,.667,...,1.000,,0.0,0.0,0.0,,2,1.6,.962,547
2,3,Carlos Pérez,OAK,1,0.0%,0.0%,0.00,,1.000,1.000,...,1.000,,0.0,0.0,0.0,,1,0.5,.883,497
3,4,Travis Jankowski,TEX,1,0.0%,0.0%,0.00,,1.000,1.000,...,1.000,,0.0,0.0,0.0,,1,0.5,.883,490
4,5,Adam Duvall,BOS,20,10.0%,15.0%,0.67,,.588,.650,...,.667,,0.0,0.1,-0.1,,10,7.1,.748,393
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
756,757,Graham Ashcraft,CIN,0,0.0%,0.0%,0.00,,.000,.000,...,.000,,,,0.0,,0,0.0,.000,
757,758,Ken Waldichuk,OAK,0,0.0%,0.0%,0.00,,.000,.000,...,.000,,,,0.0,,0,0.0,.000,
758,759,Brett Wisely,SFG,0,0.0%,0.0%,0.00,,.000,.000,...,.000,,0.0,0.0,0.0,,0,0.0,.000,
759,760,Kodai Senga,NYM,0,0.0%,0.0%,0.00,,.000,.000,...,.000,,,,0.0,,0,0.0,.000,
