In [1]:
import pandas as pd
import numpy
from bs4 import BeautifulSoup
import requests
from selenium import webdriver

# Scraping from Basketball Reference

In [2]:
#Obtain the data of MVP Voting from the last 20 years

years = list(range(2000, 2022))

url_mvp = "https://www.basketball-reference.com/awards/awards_{}.html"

for year in years:
    url = url_mvp.format(year)
    data = requests.get(url)
    
    with open("mvp/{}.html".format(year), "w+",  encoding="utf-8") as f:
        f.write(data.text)

In [3]:
#Read in html data 
with open("mvp/2000.html", encoding= "utf-8") as f:
    page = f.read()

In [4]:
#Parse pages with BeautifulSoup

soup = BeautifulSoup(page, "html.parser")

#Remove unnecssary header on the MVP page - this will create an extra row when loaded into Pandas which may cause headache down the line
#Find this extra header row to remove
soup.find("tr", class_="over_header").decompose()


In [5]:
#Find this specific table 
mvp_2000_html = soup.find(id = "mvp")

In [6]:
#Use Pandas to read html file into a dataframe
mvp_2000 = pd.read_html(str(mvp_2000_html))

mvp_2000[0] #Here's the dataframe for the 1999 - 2000 season


Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48
0,1,Shaquille O'Neal,27,LAL,120.0,1207.0,1210,0.998,79,40.0,29.7,13.6,3.8,0.5,3.0,0.574,0.0,0.524,18.6,0.283
1,2,Kevin Garnett,23,MIN,0.0,408.0,1210,0.337,81,40.0,22.9,11.8,5.0,1.5,1.6,0.497,0.37,0.765,11.6,0.172
2,3,Alonzo Mourning,29,MIA,0.0,367.0,1210,0.303,79,34.8,21.7,9.5,1.6,0.5,3.7,0.551,0.0,0.711,12.9,0.226
3,4,Karl Malone,36,UTA,0.0,312.0,1210,0.258,82,35.9,25.5,9.5,3.7,1.0,0.9,0.509,0.25,0.797,15.3,0.249
4,5,Tim Duncan,23,SAS,0.0,248.0,1210,0.205,74,38.9,23.2,12.4,3.2,0.9,2.2,0.49,0.091,0.761,13.0,0.218
5,6,Gary Payton,31,SEA,0.0,180.0,1210,0.149,82,41.8,24.2,6.5,8.9,1.9,0.2,0.448,0.34,0.735,13.9,0.195
6,7,Allen Iverson,24,PHI,1.0,132.0,1210,0.109,70,40.8,28.4,3.8,4.7,2.1,0.1,0.421,0.341,0.713,6.9,0.116
7,8,Grant Hill,27,DET,0.0,113.0,1210,0.093,74,37.5,25.8,6.6,5.2,1.4,0.6,0.489,0.347,0.795,10.7,0.185
8,9,Chris Webber,26,SAC,0.0,96.0,1210,0.079,75,38.4,24.5,10.5,4.6,1.6,1.7,0.483,0.284,0.751,10.7,0.179
9,10,Vince Carter,23,TOR,0.0,51.0,1210,0.042,82,38.1,25.7,5.8,3.9,1.3,1.1,0.465,0.403,0.791,11.8,0.182


In [7]:
#For loop to combine all the html into a list of dataframes
all_mvp_years = []

for year in years:
    with open("mvp/{}.html".format(year), encoding= "utf-8") as f:
        page = f.read()
    soup = BeautifulSoup(page, "html.parser")
    soup.find("tr", class_ = "over_header").decompose()
    mvp_table = soup.find(id= "mvp")
    mvp = pd.read_html(str(mvp_table))[0]
    mvp["Year"] = year
    
    all_mvp_years.append(mvp)

In [8]:
mvps_df = pd.concat(all_mvp_years)

mvps_df

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,...,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,Year
0,1,Shaquille O'Neal,27,LAL,120.0,1207.0,1210,0.998,79,40.0,...,13.6,3.8,0.5,3.0,0.574,0.000,0.524,18.6,0.283,2000
1,2,Kevin Garnett,23,MIN,0.0,408.0,1210,0.337,81,40.0,...,11.8,5.0,1.5,1.6,0.497,0.370,0.765,11.6,0.172,2000
2,3,Alonzo Mourning,29,MIA,0.0,367.0,1210,0.303,79,34.8,...,9.5,1.6,0.5,3.7,0.551,0.000,0.711,12.9,0.226,2000
3,4,Karl Malone,36,UTA,0.0,312.0,1210,0.258,82,35.9,...,9.5,3.7,1.0,0.9,0.509,0.250,0.797,15.3,0.249,2000
4,5,Tim Duncan,23,SAS,0.0,248.0,1210,0.205,74,38.9,...,12.4,3.2,0.9,2.2,0.490,0.091,0.761,13.0,0.218,2000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10,11,Russell Westbrook,32,WAS,0.0,5.0,1010,0.005,65,36.4,...,11.5,11.7,1.4,0.4,0.439,0.315,0.656,3.7,0.075,2021
11,12,Ben Simmons,24,PHI,0.0,3.0,1010,0.003,58,32.4,...,7.2,6.9,1.6,0.6,0.557,0.300,0.613,6.0,0.153,2021
12,13T,James Harden,31,TOT,0.0,1.0,1010,0.001,44,36.6,...,7.9,10.8,1.2,0.8,0.466,0.362,0.861,7.0,0.208,2021
13,13T,LeBron James,36,LAL,0.0,1.0,1010,0.001,45,33.4,...,7.7,7.8,1.1,0.6,0.513,0.365,0.698,5.6,0.179,2021


In [9]:
mvps_df.to_csv("files/mvps.csv")

# Scraping Player Stats by Season

In [10]:
player_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html"
url = player_stats_url.format(2021)
data = requests.get(url)

with open("player_per_game/2021.html", "w+", encoding= "utf-8") as f:
    f.write(data.text)

# Using Selenium to Scrape Javascript Pages

In [11]:
driver = webdriver.Chrome(executable_path="/Users/alexc/Desktop/NBA Scrape Project/chromedriver")

  driver = webdriver.Chrome(executable_path="/Users/alexc/Desktop/NBA Scrape Project/chromedriver")


In [12]:
import time 
year = 2000
url = player_stats_url.format(year)

driver.get(url)
driver.execute_script("window.scrollTo(1,10000)")
time.sleep(2)

html = driver.page_source

In [14]:
with open("player/{}.html".format(year), "w+", encoding='utf-8') as f:
          f.write(html)

In [16]:
for year in years:
    url = player_stats_url.format(year)

    driver.get(url)
    driver.execute_script("window.scrollTo(1,10000)")
    time.sleep(2)

    html = driver.page_source
    
    with open("player/{}.html".format(year), "w+", encoding = 'utf-8') as f:
        f.write(html)

In [17]:
#For loop to combine all the html into a list of dataframes
player_per_game = []

for year in years:
    with open("player/{}.html".format(year), encoding= "utf-8") as f:
        page = f.read()
    soup = BeautifulSoup(page, "html.parser")
    soup.find("tr", class_ = "thead").decompose()
    player_table = soup.find(id= "per_game_stats")
    player = pd.read_html(str(player_table))[0]
    player["Year"] = year
    
    player_per_game.append(player)

In [30]:
player_per_game_df = pd.concat(player_per_game)
player_per_game_df.to_csv("files/player_per_game.csv")

# Team Records Per Year

In [20]:
team_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_standings.html"

In [23]:
for year in years:
    url = team_stats_url.format(year)

    data = requests.get(url)

    with open ("team/{}.html".format(year), "w+", encoding = "utf-8") as f:
        f.write(data.text)

In [27]:
dfs = []

for year in years:
    with open("team/{}.html".format(year), encoding = "utf-8") as f:
        page = f.read()

        soup = BeautifulSoup(page, "html.parser")
        soup.find("tr", class_ = "thead").decompose()
        team_table = soup.find(id= "divs_standings_E")
        team = pd.read_html(str(team_table))[0]
        team["Year"] = year
        team["Team"] = team["Eastern Conference"] 
        del team["Eastern Conference"]
        dfs.append(team)

        soup = BeautifulSoup(page, "html.parser")
        soup.find("tr", class_ = "thead").decompose()
        team_table = soup.find(id= "divs_standings_W")
        team = pd.read_html(str(team_table))[0]
        team["Year"] = year
        team["Team"] = team["Western Conference"] 
        del team["Western Conference"] 
        dfs.append(team)

In [29]:
teams = pd.concat(dfs)
teams.to_csv("files/team_standings.csv")

In [None]:
cols = player_per_game_df.columns.drop(['Rk', 'Player', 'Pos', 'Tm'])



player_per_game_df[cols] = player_per_game_df[cols].apply(pd.to_numeric, errors='coerce')
# player_per_game_df = player_per_game_df.astype({"Age": "int", "G" : "int", "GS" : "int"}) 

player_per_game_df.isnull().sum()
player_per_game_df.shape