In [2]:
!pip install requests bs4 selenium



In [3]:
import os
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Scraping MVP data 

In [31]:
# Download the awards page of every season (1991-2024) and cache the raw HTML on disk,
# so the parsing cell can be re-run without requesting the pages again.
os.makedirs("data/mvp", exist_ok=True)  # a fresh checkout has no cache directory yet
for i in range(1991, 2025):
    data = requests.get("https://www.basketball-reference.com/awards/awards_{}.html".format(i))
    # Stop on HTTP errors instead of caching an error page as if it were season data.
    data.raise_for_status()
    # Write UTF-8 explicitly: the parsing cell reads these files with encoding='utf-8',
    # while the platform default encoding is not UTF-8 everywhere.
    with open("data/mvp/{}.html".format(i), "w", encoding="utf-8") as f:
        f.write(data.text)
    # Pause between requests so the loop does not hammer the site.
    # NOTE(review): 3 s is meant to stay under the site's published scraping limit - confirm.
    time.sleep(3)

In [26]:
# Parse the cached awards pages and stack the MVP voting table of every season
# into a single CSV, tagging each row with its season in a "year" column.
list_of_dfs = []
for i in range(1991, 2025):
    html_file_path = 'data/mvp/{}.html'.format(i)
    with open(html_file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    soup = BeautifulSoup(html_content, 'html.parser')
    table = soup.find('table', {'id': 'mvp'})

    if table:
        # Remove the grouping header row(s) of the MVP table itself before parsing.
        # Searching inside `table` (rather than the whole document) guarantees the row
        # belongs to this table, and iterating over find_all() is a no-op when no such
        # row exists instead of raising AttributeError on None.
        for header_row in table.find_all('tr', class_="over_header"):
            header_row.decompose()
        df = pd.read_html(StringIO(str(table)))[0]
        df["year"] = i
        list_of_dfs.append(df)
    else:
        print("Table not found in the HTML file.")

pd.concat(list_of_dfs).to_csv('mvps.csv')

# ChromeDriver + Selenium setup

In [3]:
# Location of the chromedriver binary. Set the CHROMEDRIVER_PATH environment variable to
# run this notebook on another machine; the original local path remains the default.
chromedriver_path = os.environ.get(
    "CHROMEDRIVER_PATH",
    '/Users/arjunj/Desktop/sideProjects/nbaMvp/chromedriver-mac-arm64/chromedriver',
)
service = Service(chromedriver_path)
# Starts a Chrome session; `driver` is reused by the download cells below.
driver = webdriver.Chrome(service=service)

# Scraping and parsing player stats and advanced stats data

In [5]:
# Load each season's per-game stats page in the Selenium-driven browser and cache the
# rendered HTML on disk for the parsing cell.
os.makedirs("data/playerStats", exist_ok=True)  # a fresh checkout has no cache directory yet
for i in range(1991, 2025):
    driver.get("https://www.basketball-reference.com/leagues/NBA_{}_per_game.html".format(i))
    # Scroll down and wait before taking the snapshot.
    # NOTE(review): presumably gives scripted/lazy content time to render - confirm.
    driver.execute_script("window.scrollTo(1,10000)")
    time.sleep(2)
    html = driver.page_source
    # Write UTF-8 explicitly: the parsing cell reads these files with encoding='utf-8',
    # and non-ASCII characters in the page can fail to encode under other platform defaults.
    with open("data/playerStats/{}.html".format(i), "w", encoding="utf-8") as f:
        f.write(html)

In [27]:
# Parse the cached per-game pages and stack the player table of every season
# into a single CSV, tagging each row with its season in a "year" column.
list_of_dfs = []
for i in range(1991, 2025):
    html_file_path = 'data/playerStats/{}.html'.format(i)
    with open(html_file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    soup = BeautifulSoup(html_content, 'html.parser')
    table = soup.find('table', {'id': 'per_game_stats'})

    if table:
        # Remove EVERY 'thead'-classed row of this table, not just the first one in the
        # document: any that remain would be parsed by read_html as ordinary data rows.
        # Iterating over find_all() is also a no-op when there is no such row, instead
        # of raising AttributeError on None.
        for repeated_header in table.find_all('tr', class_="thead"):
            repeated_header.decompose()
        df = pd.read_html(StringIO(str(table)))[0]
        df["year"] = i
        list_of_dfs.append(df)
    else:
        print("Table not found in the HTML file.")

pd.concat(list_of_dfs).to_csv('playerStats.csv')

In [5]:
# Load each season's advanced stats page in the Selenium-driven browser and cache the
# rendered HTML on disk for the parsing cell.
os.makedirs("data/playerStatsAdvanced", exist_ok=True)  # a fresh checkout has no cache directory yet
for i in range(1991, 2025):
    driver.get("https://www.basketball-reference.com/leagues/NBA_{}_advanced.html".format(i))
    # Scroll down and wait before taking the snapshot.
    # NOTE(review): presumably gives scripted/lazy content time to render - confirm.
    driver.execute_script("window.scrollTo(1,10000)")
    time.sleep(1)
    html = driver.page_source
    # Write UTF-8 explicitly: the parsing cell reads these files with encoding='utf-8',
    # and non-ASCII characters in the page can fail to encode under other platform defaults.
    with open("data/playerStatsAdvanced/{}.html".format(i), "w", encoding="utf-8") as f:
        f.write(html)

In [3]:
# Parse the cached advanced-stats pages and stack the player table of every season
# into a single CSV, tagging each row with its season in a "year" column.
list_of_dfs = []
for i in range(1991, 2025):
    html_file_path = 'data/playerStatsAdvanced/{}.html'.format(i)
    with open(html_file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    soup = BeautifulSoup(html_content, 'html.parser')
    table = soup.find('table', {'id': 'advanced'})

    if table:
        # Remove EVERY 'thead'-classed row of this table, not just the first one in the
        # document: any that remain would be parsed by read_html as ordinary data rows.
        # Iterating over find_all() is also a no-op when there is no such row, instead
        # of raising AttributeError on None.
        for repeated_header in table.find_all('tr', class_="thead"):
            repeated_header.decompose()
        df = pd.read_html(StringIO(str(table)))[0]
        df["year"] = i
        list_of_dfs.append(df)
    else:
        print("Table not found in the HTML file.")

pd.concat(list_of_dfs).to_csv('playerStatsAdvanced.csv')

# Scraping Team record data

In [4]:
# Download the standings page of every season (1991-2024) and cache the raw HTML on disk,
# so the parsing cell can be re-run without requesting the pages again.
os.makedirs("data/standings", exist_ok=True)  # a fresh checkout has no cache directory yet
for i in range(1991, 2025):
    data = requests.get("https://www.basketball-reference.com/leagues/NBA_{}_standings.html".format(i))
    # Stop on HTTP errors instead of caching an error page as if it were season data.
    data.raise_for_status()
    # Write UTF-8 explicitly: the parsing cell reads these files with encoding='utf-8',
    # while the platform default encoding is not UTF-8 everywhere.
    with open("data/standings/{}.html".format(i), "w", encoding="utf-8") as f:
        f.write(data.text)
    # Pause between requests so the loop does not hammer the site.
    # NOTE(review): 3 s is meant to stay under the site's published scraping limit - confirm.
    time.sleep(3)

In [19]:
def _conference_standings(table, conference_column, year):
    """Turn one conference standings <table> into a tidy DataFrame.

    Parameters
    ----------
    table : the parsed HTML table element for one conference.
    conference_column : name of the team column as it appears in that table
        ('Eastern Conference' or 'Western Conference'); renamed to 'Team'.
    year : season identifier stored in the added 'year' column.

    Returns
    -------
    A new DataFrame without the division banner rows, with the team column
    named 'Team' and a trailing 'year' column.
    """
    standings = pd.read_html(StringIO(str(table)))[0]
    # Division banner rows are recognised by their 'W' cell ending in 'Division'.
    # Casting to str keeps the .str accessor valid even if 'W' parses as numeric.
    is_division_banner = standings['W'].astype(str).str.endswith('Division', na=False)
    # Build a new frame rather than assigning/renaming in place on a filtered slice,
    # which is what triggers pandas' SettingWithCopyWarning.
    return (
        standings[~is_division_banner]
        .rename(columns={conference_column: 'Team'})
        .assign(year=year)
    )


# Parse the cached standings pages and stack both conferences of every season
# into a single CSV.
list_of_dfs = []
for i in range(1991, 2025):
    html_file_path = 'data/standings/{}.html'.format(i)
    with open(html_file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    soup = BeautifulSoup(html_content, 'html.parser')
    tableEast = soup.find('table', {'id': 'divs_standings_E'})
    tableWest = soup.find('table', {'id': 'divs_standings_W'})
    if tableEast and tableWest:
        dfEast = _conference_standings(tableEast, 'Eastern Conference', i)
        dfWest = _conference_standings(tableWest, 'Western Conference', i)
        # West first, then East, with a fresh 0..n-1 index per season (as before).
        list_of_dfs.append(pd.concat([dfWest, dfEast], ignore_index=True))
    else:
        print("Table not found in the HTML file.")


pd.concat(list_of_dfs).to_csv('standings.csv')