### Data Wrangling MLB Project Scraped Data
Allison Elmore 

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

### Scrape Data from ESPN

The data in this notebook was scraped from ESPN's 2024 MLB season batting leaders page: 
https://www.espn.com/mlb/history/leaders/_/breakdown/season/year/2024 

In [6]:
# Scrape the 2024 season batting leaders from ESPN one page at a time and
# collect every player row into the `espn_data` DataFrame.

# Page the scrape starts from
START_URL = "https://www.espn.com/mlb/history/leaders/_/breakdown/season/year/2024"
# The "NEXT" link keeps showing at the bottom even when there is no more data,
# so the loop stops on a known URL fragment instead of waiting for the link to vanish
LAST_PAGE_MARKER = "start/352"
# oddrow and evenrow are the classes the website uses for the table rows
ROW_SELECTOR = "tr.oddrow, tr.evenrow"
# Seconds to wait for elements / navigation before giving up
PAGE_TIMEOUT = 20
# A player row is expected to have at least this many cells (usually 16)
MIN_COLUMNS = 16
# Column headers for the stat cells that follow the player and years cells
STAT_NAMES = ["G", "AB", "R", "H", "2B", "3B", "HR", "RBI", "BB", "SO", "SB", "CS", "BA"]

# Create a list to store all the scraped data (one dictionary per player row)
all_data = []

# Initialize Selenium WebDriver (normal visible browser)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
try:
    driver.get(START_URL)

    # Wait for the table on the first page to load
    WebDriverWait(driver, PAGE_TIMEOUT).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'table.tablehead'))
    )

    # Loop to scrape data across multiple pages
    while True:
        # Check the current page URL to see whether this is the page to stop on
        current_url = driver.current_url
        if LAST_PAGE_MARKER in current_url:
            print("Reached the last page: " + current_url)
            break

        # Wait for the rows to be available on the page
        WebDriverWait(driver, PAGE_TIMEOUT).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, ROW_SELECTOR))
        )

        # Select all rows (both odd and even) in the table
        rows = driver.find_elements(By.CSS_SELECTOR, ROW_SELECTOR)

        # The table is laid out left to right across by player, one row each
        for row in rows:
            cols = row.find_elements(By.TAG_NAME, 'td')

            # Skip anything that does not have the expected number of cells
            if len(cols) < MIN_COLUMNS:
                continue

            # Cells 1 and 2 hold the player and years; every cell after that is a stat
            player_data = {'Player': cols[1].text.strip(), 'Yrs': cols[2].text.strip()}
            stats = [col.text.strip() for col in cols[3:]]
            # zip pairs each column header with the value that belongs to it
            player_data.update(zip(STAT_NAMES, stats))

            # Append the player's data to the list
            all_data.append(player_data)

        # Check for the "Next" button and click it to go to the next page
        try:
            next_button = driver.find_element(By.XPATH, '//a[contains(text(), "NEXT")]')
            next_button.click()
            # Wait until the browser has actually navigated away from this page
            # instead of sleeping for a fixed amount of time
            WebDriverWait(driver, PAGE_TIMEOUT).until(EC.url_changes(current_url))
        except Exception as e:
            # Best effort: keep whatever was scraped so far and stop paging
            print(f"Error while clicking NEXT: {e}")
            break  # If no "NEXT" button, exit the loop
finally:
    # Always close the browser, even if a wait above times out
    driver.quit()

# Convert the list of player data into a Pandas DataFrame
espn_data = pd.DataFrame(all_data)

# Display the scraped data
print(espn_data)

Reached the last page: https://www.espn.com/mlb/history/leaders/_/breakdown/season/year/2024/start/352
                    Player Yrs    G   AB    R    H  2B  3B  HR  RBI   BB   SO  \
0            Jose Iglesias  12   85  270   39   91  16   1   4   26   12   39   
1           Bobby Witt Jr.   3  161  636  125  211  45  11  32  109   57  106   
2           Xavier Edwards   2   70  265   39   87  12   5   1   26   33   52   
3    Vladimir Guerrero Jr.   6  159  616   98  199  44   1  30  103   72   96   
4              Aaron Judge   9  158  559  122  180  36   1  58  144  133  171   
..                     ...  ..  ...  ...  ...  ...  ..  ..  ..  ...  ...  ...   
346          Eddie Rosario  10   91  297   33   52  12   0  10   35   16   77   
347           Mitch Garver   8  114  367   37   63  17   0  15   51   53  133   
348          Brandon Drury  10   97  325   28   55   7   0   4   15   27   81   
349             Joey Gallo  10   76  223   24   36   9   0  10   27   32  102   
350   

In [8]:
# Show the scraped DataFrame with the notebook's rich table display
display(espn_data)

Unnamed: 0,Player,Yrs,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,CS,BA
0,Jose Iglesias,12,85,270,39,91,16,1,4,26,12,39,6,2,.337
1,Bobby Witt Jr.,3,161,636,125,211,45,11,32,109,57,106,31,12,.332
2,Xavier Edwards,2,70,265,39,87,12,5,1,26,33,52,31,4,.328
3,Vladimir Guerrero Jr.,6,159,616,98,199,44,1,30,103,72,96,2,2,.323
4,Aaron Judge,9,158,559,122,180,36,1,58,144,133,171,10,0,.322
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
346,Eddie Rosario,10,91,297,33,52,12,0,10,35,16,77,9,1,.175
347,Mitch Garver,8,114,367,37,63,17,0,15,51,53,133,0,0,.172
348,Brandon Drury,10,97,325,28,55,7,0,4,15,27,81,1,0,.169
349,Joey Gallo,10,76,223,24,36,9,0,10,27,32,102,3,1,.161


In [10]:
# Write the scraped table to disk as a CSV, leaving out the DataFrame index column
output_csv = 'scraped_espn_data_dwproject.csv'
espn_data.to_csv(output_csv, index=False)