In [2]:
import requests
from bs4 import BeautifulSoup
import json
from urllib.parse import urljoin, parse_qs, urlparse

# Root of the StatMuse site.
base_url = "https://www.statmuse.com"

# StatMuse query page for the 2024 MLB wRC+ leaderboard.
url = "https://www.statmuse.com/mlb/ask?q=2024+mlb+stats+wrc%2B+leaders"

# Human-readable labels for the leaderboard columns. The list is positional:
# entry N names the N-th logical column, so the order must not change.
correct_headers = [
    # Identity columns
    "Rank",
    "Name",
    # Headline stat and context
    "wRC+",
    "Season",
    "Team",
    "Games Played",
    # Counting stats
    "At-Bats",
    "Runs",
    "Hits",
    "Doubles",
    "Triples",
    "Home Runs",
    "RBIs",
    "Walks",
    "Hit By Pitch",
    "Strikeouts",
    "Stolen Bases",
    "Caught Stealing",
    "Plate Appearances",
    "Total Bases",
    "Extra Base Hits",
    "Sac Hits",
    "Sac Flies",
    "Intentional Walks",
    "Grounded Into Double Play",
    # Rate stats
    "Batting Average",
    "On-Base Percentage",
    "Slugging Percentage",
    "OPS",
    # Weighted / advanced stats
    "Weighted Runs Created",
    "Weighted Runs Above Average",
    "wOBA",
    "BRAA",
]

def extract_image_url(src):
    """Return the real image URL embedded in an image-proxy ``src`` value.

    The ``src`` attribute is expected to carry the true image location in
    its ``href`` query parameter; ``parse_qs`` percent-decodes it.

    Args:
        src: Raw ``src`` attribute value, or ``None``/empty if absent.

    Returns:
        The decoded ``href`` value, or ``None`` when *src* is missing or has
        no ``href`` query parameter.
    """
    if not src:
        return None
    query = parse_qs(urlparse(src).query)
    return query.get('href', [None])[0]


def map_stats(values, headers):
    """Pair stat cell values with their column labels.

    The first two labels in *headers* (rank and name) are skipped because
    those columns are extracted separately. ``zip`` stops at the shorter
    input, so a row with more cells than known labels keeps its labelled
    values instead of failing.

    Args:
        values: Stat cell texts, in table order, starting at the first stat.
        headers: Full list of column labels, including rank and name.

    Returns:
        A dict mapping label to cell text.
    """
    return dict(zip(headers[2:], values))


def parse_row(row, headers):
    """Convert one table row element into a record dict.

    Args:
        row: A parsed ``<tr>`` element supporting ``select``/``select_one``.
        headers: Full list of column labels, including rank and name.

    Returns:
        A dict with the keys ``rank``, ``name``, ``image`` and ``stats``.

    Raises:
        ValueError: If the row has no first cell to read the rank from.
    """
    rank_cell = row.select_one('td:nth-of-type(1)')
    if rank_cell is None:
        raise ValueError("row has no rank cell")

    # A missing <img> or a missing src attribute yields image=None rather
    # than discarding the whole row.
    img_tag = row.select_one('td:nth-of-type(2) img')
    image = extract_image_url(img_tag.get('src')) if img_tag is not None else None

    name_tag = row.select_one('td:nth-of-type(3) a')
    name = name_tag.get('title', '').strip() if name_tag is not None else "Unknown"

    stat_values = [cell.text.strip() for cell in row.select('td:nth-of-type(n+4)')]

    return {
        "rank": rank_cell.text.strip(),
        "name": name,
        "image": image,
        "stats": map_stats(stat_values, headers),
    }


# Only touch the network and the filesystem when executed as a script or
# notebook cell (where __name__ is "__main__"), not when imported.
if __name__ == "__main__":
    # Fetch the page; the timeout keeps an unresponsive server from
    # blocking the run forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # Parse the HTML and locate the table body rows
    soup = BeautifulSoup(response.text, 'html.parser')
    rows = soup.select('tbody > tr')

    # Extract data from each row; a malformed row is reported and skipped
    # so that one bad row does not abort the whole scrape.
    data = []
    for row in rows:
        try:
            data.append(parse_row(row, correct_headers))
        except (AttributeError, IndexError, KeyError, ValueError) as e:
            print(f"Error processing row: {row}, error: {e}")

    # Convert data to JSON
    output_json = json.dumps(data, indent=4)

    # Print the JSON
    print(output_json)

    # Save to a file with an explicit encoding so the result does not
    # depend on the platform default.
    with open("wrc_plus_leaders_2024.json", "w", encoding="utf-8") as file:
        file.write(output_json)


[
    {
        "rank": "1",
        "name": "Aaron Judge",
        "image": "https://cdn.statmuse.com/img/mlb/players/new-york-yankees-aaron-judge2022-min--6erpzlwb.png",
        "stats": {
            "wRC+": "219",
            "Season": "2024",
            "Team": "NYY",
            "Games Played": "158",
            "At-Bats": "559",
            "Runs": "122",
            "Hits": "180",
            "Doubles": "36",
            "Triples": "1",
            "Home Runs": "58",
            "RBIs": "144",
            "Walks": "133",
            "Hit By Pitch": "9",
            "Strikeouts": "171",
            "Stolen Bases": "10",
            "Caught Stealing": "0",
            "Plate Appearances": "703",
            "Total Bases": "392",
            "Extra Base Hits": "95",
            "Sac Hits": "0",
            "Sac Flies": "2",
            "Intentional Walks": "20",
            "Grounded Into Double Play": "22",
            "Batting Average": ".322",
            "On-Base Percentage"