In [1]:
import os
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import time
from tqdm import tqdm
try:
    from curl_cffi import requests as crequests
except:
    ! pip install curl_cffi
    from curl_cffi import requests as crequests

#### Functions

In [2]:
def scrape_mvps(str_url):
    """Download an awards page and return its MVP table as a DataFrame.

    The page at ``str_url`` is fetched with ``crequests`` using the
    module-level ``BROWSER_HEADERS``, parsed with BeautifulSoup, and the body
    rows of the table whose id is ``mvp_NBA`` are read.

    Parameters
    ----------
    str_url : str
        URL of the page to download.

    Returns
    -------
    pandas.DataFrame
        One row per usable table row with two string columns: ``year`` (text
        of the row's first cell) and ``player`` (text of the row's third
        cell). Both columns are present even when no rows were found.

    Raises
    ------
    ValueError
        If the page contains no ``mvp_NBA`` table, or that table has no
        ``<tbody>``.

    Notes
    -----
    ``r.raise_for_status()`` is called on the response, so an unsuccessful
    HTTP status propagates as that call's exception.
    """
    # get request
    r = crequests.get(
        str_url,
        headers=BROWSER_HEADERS,
        impersonate="chrome",
        timeout=60,
    )
    r.raise_for_status()
    # decode: only fall back when the response reports no encoding
    if not r.encoding:
        r.encoding = r.apparent_encoding
    # get soup
    soup = BeautifulSoup(r.text, 'html.parser')

    # find() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line
    table = soup.find("table", id="mvp_NBA")
    if table is None:
        raise ValueError(f"No table with id 'mvp_NBA' found at {str_url}")
    # get the body
    tbody = table.find("tbody")
    if tbody is None:
        raise ValueError(f"Table 'mvp_NBA' at {str_url} has no <tbody>")

    list_dict_row = []
    for row in tbody.find_all("tr"):
        # header cell (th) and data cells (td), in document order
        tds = row.find_all(['th', 'td'])
        # skip rows too short to hold both a season and a player cell
        if len(tds) < 3:
            continue
        list_dict_row.append({
            'year': tds[0].get_text(strip=True),
            'player': tds[2].get_text(strip=True),
        })

    # explicit columns keep the schema stable when no rows were collected
    return pd.DataFrame(list_dict_row, columns=['year', 'player'])

#### Constants

In [3]:
# directory (relative to the notebook) where results are written
str_dirname_output = './output'

# pause length; not referenced by any cell shown in this notebook
int_time_sleep = 2

# request headers sent with every page download
BROWSER_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://www.basketball-reference.com/",
    "Upgrade-Insecure-Requests": "1",
}

#### Create output directory

In [4]:
# Create the output directory if it is missing. exist_ok=True makes the cell
# safe to re-run, and makedirs also creates any missing parent directories.
os.makedirs(str_dirname_output, exist_ok=True)

#### Scrape NBA MVPs

In [5]:
# page holding the NBA MVP award table
str_url = 'https://www.basketball-reference.com/awards/mvp.html'

# download the page and parse it into a DataFrame
df = scrape_mvps(str_url)

# last expression of the cell, so the frame is rendered as the cell output
df

Unnamed: 0,year,player
0,2024-25,Shai Gilgeous-Alexander
1,2023-24,Nikola Jokić
2,2022-23,Joel Embiid
3,2021-22,Nikola Jokić
4,2020-21,Nikola Jokić
...,...,...
65,1959-60,Wilt Chamberlain
66,1958-59,Bob Pettit
67,1957-58,Bill Russell
68,1956-57,Bob Cousy


#### Save output

In [6]:
# Write the scraped table to <output dir>/df.csv.
str_filename = 'df.csv'
# os.path.join builds the path with the platform's separator
str_filepath = os.path.join(str_dirname_output, str_filename)
# index=False: the positional row index is not written to the file
df.to_csv(str_filepath, index=False)