# Predicting the 2022 NBA MVP

## Web Scraping Basketball Reference

In [21]:
import os

import requests

## Seasons to scrape: 1999 through 2021 inclusive (range() excludes its stop value).
years = list(range(1999, 2022))

url_start = "https://www.basketball-reference.com/awards/awards_{}.html"

## The raw pages are cached under mvp/; create it so the cell also runs on a fresh checkout.
os.makedirs("mvp", exist_ok=True)

## Download the awards page for every season and cache the raw HTML on disk.
for year in years:
    url = url_start.format(year)
    data = requests.get(url)
    ## Fail loudly on an HTTP error instead of silently caching an error page
    ## that would only break later, in the parsing cell.
    data.raise_for_status()

    ## Explicit UTF-8 so the write does not depend on the platform's default encoding.
    with open("mvp/{}.html".format(year), "w", encoding="utf-8") as f:
        f.write(data.text)

In [22]:
## Parse the cached award pages into one pandas DataFrame.
from io import StringIO

from bs4 import BeautifulSoup
import pandas as pd

dfs = []

for year in years:
    ## Explicit UTF-8 so decoding does not depend on the platform's default encoding.
    with open("mvp/{}.html".format(year), encoding="utf-8") as f:
        page = f.read()
    soup = BeautifulSoup(page, "html.parser")

    ## Drop the first "over_header" row from the document before the table is read.
    over_header = soup.find('tr', class_="over_header")
    if over_header is not None:
        over_header.decompose()

    mvp_table = soup.find(id="mvp")
    if mvp_table is None:
        ## A cached page without the table would otherwise fail with an opaque parser error.
        raise ValueError("no element with id='mvp' found in mvp/{}.html".format(year))

    ## Wrap the markup in StringIO: passing literal HTML strings to read_html is deprecated.
    mvp = pd.read_html(StringIO(str(mvp_table)))[0]
    mvp["year"] = year  ## add year column
    dfs.append(mvp)

## Combine the per-season frames into one; each frame keeps its own 0-based index.
mvps = pd.concat(dfs)

Let's take a look at the data:

In [23]:
## Display the combined MVP-voting frame.
mvps

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,...,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,year
0,1,Karl Malone,35,UTA,44.0,827.0,1180,0.701,49,37.4,...,9.4,4.1,1.3,0.6,0.493,0.000,0.788,9.6,0.252,1999
1,2,Alonzo Mourning,28,MIA,36.0,773.0,1180,0.655,46,38.1,...,11.0,1.6,0.7,3.9,0.511,0.000,0.652,7.9,0.216,1999
2,3,Tim Duncan,22,SAS,30.0,740.0,1180,0.627,50,39.3,...,11.4,2.4,0.9,2.5,0.495,0.143,0.690,8.7,0.213,1999
3,4,Allen Iverson,23,PHI,5.0,319.0,1180,0.270,48,41.5,...,4.9,4.6,2.3,0.1,0.412,0.291,0.751,7.2,0.173,1999
4,5,Jason Kidd,25,PHO,2.0,159.0,1180,0.135,50,41.2,...,6.8,10.8,2.3,0.4,0.444,0.366,0.757,8.1,0.188,1999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10,11,Russell Westbrook,32,WAS,0.0,5.0,1010,0.005,65,36.4,...,11.5,11.7,1.4,0.4,0.439,0.315,0.656,3.7,0.075,2021
11,12,Ben Simmons,24,PHI,0.0,3.0,1010,0.003,58,32.4,...,7.2,6.9,1.6,0.6,0.557,0.300,0.613,6.0,0.153,2021
12,13T,James Harden,31,TOT,0.0,1.0,1010,0.001,44,36.6,...,7.9,10.8,1.2,0.8,0.466,0.362,0.861,7.0,0.208,2021
13,13T,LeBron James,36,LAL,0.0,1.0,1010,0.001,45,33.4,...,7.7,7.8,1.1,0.6,0.513,0.365,0.698,5.6,0.179,2021


In [24]:
mvps.to_csv("mvps.csv") ## store the data in a CSV file for later use

We also need to retrieve the player stats for the years 1999-2021, so let's do that now.

In [25]:
import os

## URL template for a season's per-game player stats page; reused by the Selenium loop below.
player_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html"

## NOTE(review): 1991 lies outside the 1999-2021 range scraped elsewhere in this notebook;
## this looks like a one-off trial download -- confirm whether it is still needed.
url = player_stats_url.format(1991)
data = requests.get(url)
## Fail loudly on an HTTP error instead of caching an error page.
data.raise_for_status()

## Create the cache directory if needed and write with an explicit encoding so the
## cell does not depend on the platform's default.
os.makedirs("player", exist_ok=True)
with open("player/1991.html", "w", encoding="utf-8") as f:
    f.write(data.text)

In [26]:
import os

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

## Location of the chromedriver binary. Set the CHROMEDRIVER_PATH environment variable
## to override it; the fallback is the path this notebook was originally written with.
chromedriver_path = os.environ.get(
    "CHROMEDRIVER_PATH", "/Users/aadhilmohamed/downloads/chromedriver"
)

## Pass the binary through a Service object: the executable_path= keyword of
## webdriver.Chrome is deprecated in Selenium 4.
driver = webdriver.Chrome(service=Service(executable_path=chromedriver_path))

  driver = webdriver.Chrome(executable_path = "/Users/aadhilmohamed/downloads/chromedriver")


In [27]:
import os
import time

## The rendered pages are cached under player/; create it so the cell runs on a fresh checkout.
os.makedirs("player", exist_ok=True)

## Load each season's per-game page in the browser and cache the rendered HTML.
for year in years:
    url = player_stats_url.format(year)
    driver.get(url)
    ## Scroll down and pause for two seconds before capturing the page source
    ## (presumably to let the page finish rendering -- confirm).
    driver.execute_script("window.scrollTo(1,10000)")
    time.sleep(2)

    html = driver.page_source
    ## Explicit UTF-8 so the write does not depend on the platform's default encoding.
    with open("player/{}.html".format(year), "w", encoding="utf-8") as f:
        f.write(html)

In [40]:
from io import StringIO

df = []

## Parse the cached per-game pages into one pandas DataFrame per season.
for year in years:
    ## Explicit UTF-8 so decoding does not depend on the platform's default encoding.
    with open("player/{}.html".format(year), encoding="utf-8") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")

    ## NOTE(review): find() returns only the first match, so a single tr.thead row is
    ## removed here; any further rows with that class stay in the parsed table.
    ## Kept as-is because later steps may rely on the current shape of players.csv.
    header_row = soup.find('tr', class_="thead")
    if header_row is not None:
        header_row.decompose()

    player_table = soup.find(id="per_game_stats")
    if player_table is None:
        ## A cached page without the table would otherwise fail with an opaque parser error.
        raise ValueError(
            "no element with id='per_game_stats' found in player/{}.html".format(year)
        )

    ## Wrap the markup in StringIO: passing literal HTML strings to read_html is deprecated.
    player = pd.read_html(StringIO(str(player_table)))[0]
    player["year"] = year
    df.append(player)

In [41]:
players = pd.concat(df) ## combine the per-season dataframes into one

Let's take a look at the player data:

In [42]:
## Display the combined per-game player frame.
players

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,year
0,1,Tariq Abdul-Wahad,SG,24,SAC,49,49,24.6,3.6,8.3,...,1.5,2.3,3.8,1.0,1.0,0.3,1.4,2.5,9.3,1999
1,2,Shareef Abdur-Rahim,SF,22,VAN,50,50,40.4,7.7,17.9,...,2.3,5.2,7.5,3.4,1.4,1.1,3.7,2.7,23.0,1999
2,3,Cory Alexander,PG,25,DEN,36,4,21.6,2.7,7.2,...,0.2,1.9,2.1,3.3,1.0,0.1,1.9,2.1,7.3,1999
3,4,Ray Allen*,SG,23,MIL,50,50,34.4,6.1,13.5,...,1.1,3.1,4.2,3.6,1.1,0.1,2.4,2.3,17.1,1999
4,5,Peter Aluma,C,25,SAC,2,0,2.5,0.5,1.0,...,0.5,0.5,1.0,0.0,0.5,0.5,1.0,2.0,1.0,1999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
725,536,Delon Wright,PG,28,SAC,27,8,25.8,3.9,8.3,...,1.0,2.9,3.9,3.6,1.6,0.4,1.3,1.1,10.0,2021
726,537,Thaddeus Young,PF,32,CHI,68,23,24.3,5.4,9.7,...,2.5,3.8,6.2,4.3,1.1,0.6,2.0,2.2,12.1,2021
727,538,Trae Young,PG,22,ATL,63,63,33.7,7.7,17.7,...,0.6,3.3,3.9,9.4,0.8,0.2,4.1,1.8,25.3,2021
728,539,Cody Zeller,C,28,CHO,48,21,20.9,3.8,6.8,...,2.5,4.4,6.8,1.8,0.6,0.4,1.1,2.5,9.4,2021


In [43]:
## Store the player data in a CSV file for later use.
players.to_csv("players.csv")

Finally let's scrape team records from the website:

We'll use the division standings tables: they are already present in the page's initial HTML, so we can fetch them with requests instead of Selenium.

In [32]:
import os

url_teams = "https://www.basketball-reference.com/leagues/NBA_{}_standings.html"

## The raw pages are cached under teams/; create it so the cell runs on a fresh checkout.
os.makedirs("teams", exist_ok=True)

## Download the standings page for every season and cache the raw HTML on disk.
for year in years:
    url = url_teams.format(year)
    data = requests.get(url)
    ## Fail loudly on an HTTP error instead of silently caching an error page.
    data.raise_for_status()

    ## Explicit UTF-8 so the write does not depend on the platform's default encoding.
    with open("teams/{}.html".format(year), "w", encoding="utf-8") as f:
        f.write(data.text)

In [33]:
from io import StringIO


def _parse_conference_table(page, table_id, conference_col, year):
    """Parse one conference standings table out of a cached standings page.

    Parameters
    ----------
    page : str
        Full HTML of the season's standings page.
    table_id : str
        ``id`` attribute of the table to extract.
    conference_col : str
        Name of the column holding the team names; it is renamed to ``"Team"``.
    year : int
        Season stored in the added ``"year"`` column.

    Returns
    -------
    pandas.DataFrame
        The parsed table with ``"year"`` and ``"Team"`` appended as the last columns.

    Raises
    ------
    ValueError
        If the page contains no element with the requested id.
    """
    soup = BeautifulSoup(page, "html.parser")

    ## NOTE(review): this removes only the first tr.thead row of the whole document,
    ## which is not necessarily inside the requested table; other tr.thead rows stay
    ## in the parsed data. Kept exactly as before because later steps may rely on
    ## the current shape of teams.csv.
    first_thead = soup.find('tr', class_="thead")
    if first_thead is not None:
        first_thead.decompose()

    team_table = soup.find(id=table_id)
    if team_table is None:
        raise ValueError(
            "no element with id='{}' found in the {} standings page".format(table_id, year)
        )

    ## Wrap the markup in StringIO: passing literal HTML strings to read_html is deprecated.
    team = pd.read_html(StringIO(str(team_table)))[0]
    team["year"] = year
    team["Team"] = team[conference_col]
    del team[conference_col]
    return team


dfs = []

for year in years:
    ## Explicit UTF-8 so decoding does not depend on the platform's default encoding.
    with open("teams/{}.html".format(year), encoding="utf-8") as f:
        page = f.read()

    ## Eastern conference first, then western, so the row order of the result is unchanged.
    dfs.append(_parse_conference_table(page, "divs_standings_E", "Eastern Conference", year))
    dfs.append(_parse_conference_table(page, "divs_standings_W", "Western Conference", year))

teams = pd.concat(dfs) ## combine data frames

Let's take a look at the team data

In [34]:
## Display the combined standings frame.
teams

Unnamed: 0,W,L,W/L%,GB,PS/G,PA/G,SRS,year,Team
0,33,17,.660,—,89.0,84.0,5.11,1999,Miami Heat*
1,33,17,.660,—,89.5,86.9,3.11,1999,Orlando Magic*
2,28,22,.560,5.0,89.7,87.6,2.56,1999,Philadelphia 76ers*
3,27,23,.540,6.0,86.4,85.4,1.45,1999,New York Knicks*
4,19,31,.380,14.0,93.0,94.9,-1.75,1999,Boston Celtics
...,...,...,...,...,...,...,...,...,...
13,42,30,.583,—,112.4,110.2,2.26,2021,Dallas Mavericks*
14,38,34,.528,4.0,113.3,112.3,1.07,2021,Memphis Grizzlies*
15,33,39,.458,9.0,111.1,112.8,-1.58,2021,San Antonio Spurs
16,31,41,.431,11.0,114.6,114.9,-0.20,2021,New Orleans Pelicans


In [35]:
## Store the team data in a CSV file for later use.
teams.to_csv("teams.csv")

# Summary

In this section we scraped MVP, player, and team data from Basketball Reference using requests, Selenium, and BeautifulSoup, and stored the results in CSV files for later use.
