In [2]:
# Awards page URL template; `{}` is filled in with a season year via .format().
mvp_url = "https://www.basketball-reference.com/awards/awards_{}.html"
# Seasons to process: 1991 through 2022 (the range stop value is exclusive).
years = [*range(1991, 2023)]

In [3]:
import requests

import os
import time

# Download the awards page for every season in `years` and cache the raw HTML
# on disk, so the parsing cells can run without touching the network.
os.makedirs('raw_data/mvp', exist_ok=True)  # open() does not create missing folders

for year in years:
  url = mvp_url.format(year)
  data = requests.get(url)
  # Stop on an HTTP error status instead of saving an error page that would
  # only fail later, during parsing.
  data.raise_for_status()

  # store html in raw_data folder; explicit UTF-8 keeps the write independent
  # of the platform's default text encoding
  with open('raw_data/mvp/{}.html'.format(year), 'w', encoding='utf-8') as f:
    f.write(data.text)

  # Pause between requests. NOTE(review): the site presumably throttles
  # rapid-fire clients; tune the delay to its published limits.
  time.sleep(4)



In [4]:
from bs4 import BeautifulSoup
import pandas as pd

from io import StringIO

# Parse the table with id "mvp" out of each cached awards page. The result is
# one DataFrame per season, tagged with a "Year" column, collected in `mvp_dfs`.
mvp_dfs = []
for year in years:
  # explicit UTF-8 so decoding does not depend on the platform default
  with open('raw_data/mvp/{}.html'.format(year), 'r', encoding='utf-8') as f:
    page = f.read()
  soup = BeautifulSoup(page, 'html.parser') # using bs4 to create parse class

  # Remove the first row with class "over_header" (presumably the grouped
  # super-header that would complicate the parsed column names). find()
  # returns None when no such row exists, so guard before decomposing.
  over_header = soup.find('tr', class_='over_header')
  if over_header is not None:
    over_header.decompose()

  mvp_table = soup.find(id='mvp')
  if mvp_table is None:
    # Fail with a pointed message rather than letting read_html choke on "None".
    raise ValueError('no element with id="mvp" in raw_data/mvp/{}.html'.format(year))

  # Wrap the markup in StringIO: newer pandas deprecates passing literal HTML
  # text to read_html, while file-like input works on old and new versions.
  mvp = pd.read_html(StringIO(str(mvp_table)))[0] # getting mvp table in dataframe
  mvp["Year"] = year

  mvp_dfs.append(mvp)

In [5]:
# Stack the per-season MVP frames into a single table, then save it as CSV.
mvps = pd.concat(objs=mvp_dfs)
mvps.to_csv(path_or_buf='mvps.csv')

In [8]:
import sys

# Colab setup for the chrome webdriver: put the chromedriver location at the
# very front of sys.path.
sys.path[:0] = ['/usr/lib/chromium-browser/chromedriver']

In [9]:
from selenium import webdriver

# options to make sure colab doesn't crash
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

# Do not pass 'chromedriver' positionally. Older Selenium releases already
# default the driver executable to "chromedriver", so omitting it is equivalent
# there. Current Selenium 4 releases take `options` as the first parameter, so
# the positional string would collide with the `options=` keyword and raise
# TypeError.
wd = webdriver.Chrome(options=chrome_options)

In [10]:
# URL template for a season's per-game player stats page; `{}` is replaced
# with the season year via .format().
player_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html"

In [11]:
import time

import os

# use selenium to scrape player stats with javascript
# Each season's rendered page source is cached under raw_data/player_stats/.
os.makedirs('raw_data/player_stats', exist_ok=True)  # open() does not create missing folders

for year in years:
  url = player_stats_url.format(year)
  wd.get(url)
  # Scroll down and wait a moment before grabbing the source, presumably so
  # script-rendered content has time to load.
  wd.execute_script('window.scrollTo(1,10000)')
  time.sleep(2)

  html = wd.page_source
  # Explicit UTF-8 keeps the write independent of the platform's default text
  # encoding, which may be unable to represent every character in the page.
  with open('raw_data/player_stats/{}.html'.format(year), 'w', encoding='utf-8') as f:
    f.write(html)

In [12]:
from io import StringIO

# Parse the table with id "per_game_stats" out of each cached player stats
# page. The result is one DataFrame per season, tagged with a "Year" column,
# collected in `player_dfs`.
player_dfs = []

for year in years:
  # explicit UTF-8 so decoding does not depend on the platform default
  with open('raw_data/player_stats/{}.html'.format(year), encoding='utf-8') as f:
    page = f.read()
  soup = BeautifulSoup(page, 'html.parser') # using bs4 to create parse class

  # Remove the first row with class "thead"; find() returns None when there is
  # no such row, so guard before decomposing.
  # NOTE(review): find() matches only the first such row. Any further rows with
  # this class stay in the parsed frame; confirm downstream cleaning drops them.
  header_row = soup.find('tr', class_='thead')
  if header_row is not None:
    header_row.decompose()

  player_table = soup.find(id='per_game_stats')
  if player_table is None:
    # Fail with a pointed message rather than letting read_html choke on "None".
    raise ValueError(
      'no element with id="per_game_stats" in raw_data/player_stats/{}.html'.format(year))

  # Wrap the markup in StringIO: newer pandas deprecates passing literal HTML
  # text to read_html, while file-like input works on old and new versions.
  player = pd.read_html(StringIO(str(player_table)))[0] # getting player table in dataframe
  player["Year"] = year

  player_dfs.append(player)

In [13]:
# Stack the per-season player frames into a single table, then save it as CSV.
players = pd.concat(objs=player_dfs)
players.to_csv(path_or_buf="players.csv")

In [16]:
# URL template for a season's standings page; `{}` is replaced with the
# season year via .format().
team_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_standings.html"

In [17]:
import os
import time

# Download the standings page for every season in `years` and cache the raw
# HTML on disk, so the parsing cell can run without touching the network.
os.makedirs('raw_data/team', exist_ok=True)  # open() does not create missing folders

for year in years:
  url = team_stats_url.format(year)
  data = requests.get(url)
  # Stop on an HTTP error status instead of saving an error page that would
  # only fail later, during parsing.
  data.raise_for_status()

  # store html in raw_data folder; explicit UTF-8 keeps the write independent
  # of the platform's default text encoding
  with open('raw_data/team/{}.html'.format(year), 'w', encoding='utf-8') as f:
    f.write(data.text)

  # Pause between requests. NOTE(review): the site presumably throttles
  # rapid-fire clients; tune the delay to its published limits.
  time.sleep(4)

In [18]:
from io import StringIO

# Parse both conference standings tables out of each cached standings page.
# Every season contributes two DataFrames to `team_dfs` (East, then West). Each
# gets a "Year" column, and its conference-named column is renamed to "Team".
team_dfs = []
for year in years:
  # explicit UTF-8 so decoding does not depend on the platform default
  with open('raw_data/team/{}.html'.format(year), 'r', encoding='utf-8') as f:
    page = f.read()

  soup = BeautifulSoup(page, 'html.parser') # using bs4 to create parse class

  # Remove the first row with class "thead"; find() returns None when there is
  # no such row, so guard before decomposing.
  # NOTE(review): find() matches only the first such row in the document. Any
  # further rows with this class stay in the parsed frames; confirm downstream
  # cleaning drops them.
  header_row = soup.find('tr', class_='thead')
  if header_row is not None:
    header_row.decompose()

  # for eastern conference
  e_table = soup.find(id='divs_standings_E')
  if e_table is None:
    raise ValueError(
      'no element with id="divs_standings_E" in raw_data/team/{}.html'.format(year))
  # StringIO: newer pandas deprecates passing literal HTML text to read_html
  e_df = pd.read_html(StringIO(str(e_table)))[0] # getting team table in dataframe
  e_df["Year"] = year
  # Copy the conference-named column to "Team" so both conferences share a
  # schema, then drop the original column.
  e_df["Team"] = e_df["Eastern Conference"]
  del e_df["Eastern Conference"]
  team_dfs.append(e_df)

  # for western conference
  w_table = soup.find(id='divs_standings_W')
  if w_table is None:
    raise ValueError(
      'no element with id="divs_standings_W" in raw_data/team/{}.html'.format(year))
  w_df = pd.read_html(StringIO(str(w_table)))[0] # getting team table in dataframe
  w_df["Year"] = year
  w_df["Team"] = w_df["Western Conference"]
  del w_df["Western Conference"]
  team_dfs.append(w_df)

In [22]:
# Stack the per-season conference frames into a single table, then save it as CSV.
teams = pd.concat(objs=team_dfs)
teams.to_csv(path_or_buf="teams.csv")