In [1]:
import requests

In [2]:
years = [i for i in range(1991, 2023)]

Start off with MVP votings from 1991 to 2022, replace year value with curly brackets for faster formatting.

In [3]:
mvpUrl = 'https://www.basketball-reference.com/awards/awards_{}.html'

In [4]:
for i in years:
    url = mvpUrl.format(i)
    data = requests.get(url)
    
    with open('MVPs/{}.html'.format(i), 'w+') as file:
        file.write(data.text)

In [5]:
from bs4 import BeautifulSoup

Read tables as .html files, use the 'Inspect' feature to identify the wanted table with its corresponding HTML ID.

In [6]:
import pandas as pd
df = []

for i in years:
    with open('MVPs/{}.html'.format(i)) as file:
        page = file.read()
    
    soup = BeautifulSoup(page, 'html.parser')
    soup.find('tr', class_ = 'over_header').decompose()
    mvpTables = soup.find(id = 'mvp')
    
    mvp = pd.read_html(str(mvpTables))[0]
    
    mvp["MVP Year"] = i
    
    df.append(mvp)

Concate and merge into a singular DataFrame, then convert to .csv file.

In [7]:
mvps = pd.concat(df)
mvps.to_csv('mvps.csv')

The website uses JavaScript to render the table, thus for efficiency purposes the table is not displayed in full. 
There a chrome driver was used to 'force' the website to load table in entirety.

In [8]:
from selenium import webdriver

In [9]:
driver = webdriver.Chrome(executable_path='/Users/paco/Downloads/chromedriver')

  driver = webdriver.Chrome(executable_path='/Users/paco/Downloads/chromedriver')


Same steps to process Player's per-game stats and team records. -> Cell 10 to 20.

In [10]:
playerUrl = 'https://www.basketball-reference.com/leagues/NBA_{}_per_game.html'

In [11]:
import time

In [12]:
for i in years:
    url = playerUrl.format(i)
    
    driver.get(url)
    driver.execute_script('window.scrollTo(1,9999)')
    time.sleep(3)
    
    link = driver.page_source
    with open('Player/{}.html'.format(i), 'w+') as file:
        file.write(link)

In [13]:
dfs = []
for i in years:
    with open('Player/{}.html'.format(i)) as file:
        page = file.read()
        
    soup = BeautifulSoup(page, 'html.parser')
    soup.find('tr', class_ = 'thead').decompose()
    playerTables = soup.find(id = 'per_game_stats')
    
    player = pd.read_html(str(playerTables))[0]
    player['Year'] = i
    dfs.append(player)

In [14]:
players = pd.concat(dfs)
players.rename(columns={'MVP Year':'Year'}, inplace=True)

In [15]:
players.to_csv('players.csv')

In [16]:
teamUrl = 'https://www.basketball-reference.com/leagues/NBA_{}_standings.html'

In [17]:
for i in years:
    url = teamUrl.format(i)
    data = requests.get(url)
    
    with open('Team/{}.html'.format(i), 'w+') as file:
        file.write(data.text)

In [18]:
dfs = []
for i in years:
    with open('Team/{}.html'.format(i)) as file:
        page = file.read()
        
    soup = BeautifulSoup(page, 'html.parser')
    soup.find('tr', class_ = 'thead').decompose()
    teamTables = soup.find(id = 'divs_standings_E')
    
    team = pd.read_html(str(teamTables))[0]
    team['Year'] = i
    team["Team"] = team['Eastern Conference']
    del team['Eastern Conference']
    dfs.append(team)
    
    soup = BeautifulSoup(page, 'html.parser')
    soup.find('tr', class_ = 'thead').decompose()
    teamTables = soup.find(id = 'divs_standings_W')
    
    team = pd.read_html(str(teamTables))[0]
    team['Year'] = i
    team["Team"] = team['Western Conference']
    del team['Western Conference']
    dfs.append(team)

In [19]:
teams = pd.concat(dfs)

In [20]:
teams.to_csv('teams.csv')

Note: Team record was added because it's more of a common sense for people who watch the NBA, that team record to a certain extent matters when it comes to MVP voting. In other words, one might get robbed MVP due to a less satisfactory team record, despite having god-like stats. For example, Kobe Bryant in 2005, 2006 and 2007.