In [5]:
import requests
from bs4 import BeautifulSoup
import json
from urllib.parse import urljoin, parse_qs, urlparse

# Define the base URL for the site
base_url = "https://www.statmuse.com"

# URL of the page to scrape
url = "https://www.statmuse.com/mlb/ask/whip-leaders-2024"

# Keys assigned, in order, to the stat cells of each row (4th <td> onward).
# NOTE(review): this order is hard-coded and never checked against the page's
# own header row. The run captured with this script shows the "Strikeouts" key
# holding "208.2", which looks like an innings-pitched value, so the mapping is
# probably misaligned with the live table -- confirm against the page's <thead>
# before trusting the keys.
correct_headers = [
    "WHIP", "Year", "Team", "Games Played", "Wins", "Losses", "Win %",
    "Games Started", "ERA", "Innings Pitched", "Saves", "Blown Saves",
    "Shutouts", "Strikeouts", "Walks", "Hits", "Earned Runs", "Runs",
    "Home Runs", "Hit Batters", "Games Finished", "Intentional Walks",
    "Total Batters Faced", "Wild Pitches", "K/9", "BB/9", "K/BB", "HR/9",
    "K%", "BB%", "BABIP", "Run Average (RA)", "ERA-", "FIP", "FIP-", "H/9",
    "Pitcher Runs Above Average (PRAA)"
]


def _parse_row(row):
    """Convert one table row into a dict with rank, name, image and stats.

    Args:
        row: A BeautifulSoup ``<tr>`` tag from the results table.

    Returns:
        A dict with keys ``"rank"``, ``"name"``, ``"image"`` and ``"stats"``.

    Raises:
        ValueError: If the row has no first ``<td>`` to read the rank from.
    """
    # Extract player rank
    rank_cell = row.select_one('td:nth-of-type(1)')
    if rank_cell is None:
        raise ValueError("row has no rank cell")
    rank = rank_cell.text.strip()

    # Extract player image: the real image URL is read from the "href" query
    # parameter of the <img> src. A missing src or parameter yields None
    # instead of discarding the whole row.
    img_tag = row.select_one('td:nth-of-type(2) img')
    img_src = None
    if img_tag and img_tag.get('src'):
        img_query = parse_qs(urlparse(img_tag['src']).query)
        img_src = img_query.get('href', [None])[0]

    # Extract player name
    name_tag = row.select_one('td:nth-of-type(3) a')
    name = name_tag.get('title', '').strip() if name_tag else "Unknown"

    # Extract other stats. Cells beyond the known headers get a positional
    # key (1-based <td> index) so an extra column on the page does not raise
    # IndexError and cost us the entire row.
    stats = {}
    for i, cell in enumerate(row.select('td:nth-of-type(n+4)')):
        key = correct_headers[i] if i < len(correct_headers) else f"column_{i + 4}"
        stats[key] = cell.text.strip()

    return {
        "rank": rank,
        "name": name,
        "image": img_src,
        "stats": stats,
    }


# Make a request to the page; the timeout keeps the cell from hanging forever
# if the server never answers.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the HTML content with BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Find the table rows
rows = soup.select('tbody > tr')

# Extract data from each row; a malformed row is reported and skipped so one
# bad row does not abort the whole scrape.
data = []
for row in rows:
    try:
        data.append(_parse_row(row))
    except (AttributeError, KeyError, TypeError, ValueError) as e:
        print(f"Error processing row: {row}, error: {e}")

# Convert data to JSON
output_json = json.dumps(data, indent=4)

# Print the JSON
print(output_json)

# Save to a file (explicit encoding so the result does not depend on the
# platform's default locale)
with open("whip_leaders_2024.json", "w", encoding="utf-8") as file:
    file.write(output_json)


[
    {
        "rank": "1",
        "name": "Logan Gilbert",
        "image": "https://cdn.statmuse.com/img/mlb/players/seattle-mariners-silhouette--hov_pdm7.png",
        "stats": {
            "WHIP": "0.89",
            "Year": "2024",
            "Team": "SEA",
            "Games Played": "33",
            "Wins": "8",
            "Losses": "12",
            "Win %": ".400",
            "Games Started": "33",
            "ERA": "3.23",
            "Innings Pitched": "220",
            "Saves": "1",
            "Blown Saves": "0",
            "Shutouts": "0",
            "Strikeouts": "208.2",
            "Walks": "148",
            "Hits": "75",
            "Earned Runs": "83",
            "Runs": "26",
            "Home Runs": "37",
            "Hit Batters": "4",
            "Games Finished": "0",
            "Intentional Walks": "1",
            "Total Batters Faced": "803",
            "Wild Pitches": "11",
            "K/9": "9.49",
            "BB/9": "1.60",
            "K/