In [2]:
import os
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


In [3]:
# Working directory and the seasons (1991 through 2022) that every cell iterates over.
# NOTE: when running this as a plain .py script, derive the directory from __file__ instead.
years = [*range(1991, 2023)]
cwd = os.getcwd()


In [None]:
# Download the yearly awards pages (skip this cell if it has been done).
# The pages are saved into the "mvps" folder because that is where the
# MVP data-frame cell reads them from.
url_general = "https://www.basketball-reference.com/awards/awards_{}.html"
awards_folder = os.path.join(cwd, 'mvps')
os.makedirs(awards_folder, exist_ok=True)   # open() below fails if the folder is missing

for year in years:
    url = url_general.format(year)
    data = requests.get(url, timeout=30)    # never hang forever on a stalled connection
    data.raise_for_status()                 # fail loudly instead of saving an error page

    filename = os.path.join(awards_folder, '{}.html'.format(year))
    with open(filename, 'w', encoding="utf-8") as f:
        f.write(data.text)
     

In [39]:
# Build one data frame holding the "all_mvp" table of every season and save it as mvps.csv.
mvp_folder = os.path.join(cwd, 'mvps')
os.makedirs(mvp_folder, exist_ok=True)      # no-op when the folder already exists

dfs = []
for year in years:
    filename = os.path.join(mvp_folder, '{}.html'.format(year))
    with open(filename, 'r', encoding="utf-8") as f:
        page = f.read()

    soup = BeautifulSoup(page, 'html.parser')

    # Remove the first <tr class="over_header"> row (presumably a grouping row
    # above the real column names - TODO confirm). find() returns None when the
    # row is absent, so guard before calling decompose().
    over_header = soup.find('tr', class_='over_header')
    if over_header is not None:
        over_header.decompose()

    mvp_table = soup.find(id='all_mvp')
    if mvp_table is None:
        raise ValueError('no element with id "all_mvp" found in {}'.format(filename))

    # read_html returns a list of tables, hence the [0]. The markup is wrapped in
    # StringIO because passing a literal HTML string to read_html is deprecated.
    mvp_season = pd.read_html(StringIO(str(mvp_table)))[0]
    mvp_season['Year'] = year               # add a column called Year
    dfs.append(mvp_season)

mvps = pd.concat(dfs)

filename = os.path.join(cwd, 'mvps.csv')
mvps.to_csv(filename)

# Only the last expression of a cell is rendered, so show the first 3 and the
# last 2 rows together instead of discarding the head.
pd.concat([mvps.head(3), mvps.tail(2)])

In [41]:
# URL template and local folder for the per-game pages of ALL players
player_stats_url = 'https://www.basketball-reference.com/leagues/NBA_{}_per_game.html'
player_folder = os.path.join(cwd, 'players')    # portable: no hard-coded path separator
os.makedirs(player_folder, exist_ok=True)       # creates the folder only when it is missing


In [49]:
# Scrape the per-game pages with plain HTTP requests and save them.
#
# NOTE: according to the original author this approach does not work because of
# a "java" (presumably JavaScript - TODO confirm) issue on the website; the
# Selenium cells further down are used instead. Background:
# https://www.youtube.com/watch?v=JGQGd-oa0l4&ab_channel=Dataquest

for year in years:
    url = player_stats_url.format(year)
    data = requests.get(url, timeout=30)    # never hang forever on a stalled connection
    data.raise_for_status()                 # fail loudly instead of saving an error page

    # os.path.join instead of player_folder + '.\\...': the old form only resolved
    # to the right file because Windows ignores a trailing dot in a folder name.
    filename = os.path.join(player_folder, '{}.html'.format(year))
    with open(filename, 'w', encoding="utf-8") as f:
        f.write(data.text)
        

In [None]:
# Start the Chrome session that Selenium drives.
# The 'headless' argument keeps the browser window from opening
# (https://www.codegrepper.com/code-examples/python/selenium+without+opening+browser+python)
option = webdriver.ChromeOptions()
option.add_argument('headless')

chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=option)

In [None]:
# Load every per-game page in the browser and save the resulting HTML in the player folder.
# The browser is shut down when the loop ends (or fails) so no Chrome process is
# left running; re-run the driver cell before running this cell again.
try:
    for year in years:
        url = player_stats_url.format(year)
        print(url)
        driver.get(url)     # returns None, so there is nothing worth storing
        driver.execute_script('window.scrollTo(1,10000)')   # scroll down the page
        time.sleep(2)       # give the page time to finish loading after the scroll

        html = driver.page_source
        # os.path.join instead of player_folder + '.\\...': the old form only resolved
        # to the right file because Windows ignores a trailing dot in a folder name.
        filename = os.path.join(player_folder, '{}.html'.format(year))
        with open(filename, 'w', encoding="utf-8") as f:
            f.write(html)
finally:
    driver.quit()

print('finished scrapping!!')

In [43]:
# Build one data frame holding the "all_per_game_stats" table of every season
# and save it as player_stats.csv.
dfs = []
for year in years:
    filename = os.path.join(player_folder, '{}.html'.format(year))
    with open(filename, 'r', encoding="utf-8") as f:
        page = f.read()

    soup = BeautifulSoup(page, 'html.parser')

    # Remove the first <tr class="thead"> row. find() returns None when there is
    # no such row, so guard before calling decompose().
    header_row = soup.find('tr', class_='thead')
    if header_row is not None:
        header_row.decompose()

    player_table = soup.find(id='all_per_game_stats')
    if player_table is None:
        raise ValueError('no element with id "all_per_game_stats" found in {}'.format(filename))

    # read_html returns a list of tables, hence the [0]. The markup is wrapped in
    # StringIO because passing a literal HTML string to read_html is deprecated.
    player_season = pd.read_html(StringIO(str(player_table)))[0]
    player_season['Year'] = year            # add a column called Year
    dfs.append(player_season)

players = pd.concat(dfs)
filename = os.path.join(cwd, 'player_stats.csv')
players.to_csv(filename)
print('finished! Data saved in "player_stats.csv"')

finished! Data saved in "player_stats.csv"


In [20]:
# URL template and local folder for the STANDINGS pages
team_stats_url = 'https://www.basketball-reference.com/leagues/NBA_{}_standings.html'
team_folder = os.path.join(cwd, 'team')     # portable: no hard-coded path separator
os.makedirs(team_folder, exist_ok=True)     # creates the folder only when it is missing

In [21]:
# Download the standings page of every season into the team folder.
for year in years:
    url = team_stats_url.format(year)
    data = requests.get(url, timeout=30)    # never hang forever on a stalled connection
    data.raise_for_status()                 # fail loudly instead of saving an error page

    # os.path.join instead of team_folder + '.\\...': the old form only resolved
    # to the right file because Windows ignores a trailing dot in a folder name.
    filename = os.path.join(team_folder, '{}.html'.format(year))
    with open(filename, 'w', encoding="utf-8") as f:
        f.write(data.text)

In [44]:
# Build one data frame holding the east and west standings tables of every
# season and save it as team_stats.csv.

# (element id of the table, name of the column that holds the team names)
conference_tables = (
    ('all_divs_standings_E', 'Eastern Conference'),
    ('all_divs_standings_W', 'Western Conference'),
)

dfs = []
for year in years:
    filename = os.path.join(team_folder, '{}.html'.format(year))
    with open(filename, 'r', encoding="utf-8") as f:
        page = f.read()

    soup = BeautifulSoup(page, 'html.parser')

    # Remove the first <tr class="thead"> row. find() returns None when there is
    # no such row, so guard before calling decompose().
    header_row = soup.find('tr', class_='thead')
    if header_row is not None:
        header_row.decompose()

    for table_id, conference_column in conference_tables:
        table = soup.find(id=table_id)
        if table is None:
            raise ValueError('no element with id "{}" found in {}'.format(table_id, filename))

        # read_html returns a list of tables, hence the [0]. The markup is wrapped in
        # StringIO because passing a literal HTML string to read_html is deprecated.
        season = pd.read_html(StringIO(str(table)))[0]
        season['Year'] = year                               # add year column
        # Move the conference-named column to a common "Team" column (appended last).
        season['Team'] = season.pop(conference_column)
        dfs.append(season)

teams = pd.concat(dfs)
filename = os.path.join(cwd, 'team_stats.csv')
teams.to_csv(filename)
print('finished! Data saved as "team_stats.csv" ')

finished! Data saved as "team_stats.csv" 


In [14]:
# URL template and local folder for the hoopshype NBA 2K pages
url_2k = 'https://hoopshype.com/nba2k/{}'
folder_2k = os.path.join(cwd, '2k')         # portable: no hard-coded path separator
os.makedirs(folder_2k, exist_ok=True)       # creates the folder only when it is missing

In [23]:
# Download the NBA 2K page of every season from 1999-2000 to 2021-2022.
# A dedicated name is used for the season range so the global `years`, which the
# other cells iterate over, is not silently replaced when this cell runs.
years_2k = list(range(1999, 2022))
for year in years_2k:
    season = '{}-{}'.format(year, year + 1)     # e.g. '1999-2000'
    url = url_2k.format(season)
    data = requests.get(url, timeout=30)        # never hang forever on a stalled connection
    data.raise_for_status()                     # fail loudly instead of saving an error page

    # os.path.join instead of folder_2k + '.\\...': the old form only resolved
    # to the right file because Windows ignores a trailing dot in a folder name.
    filename = os.path.join(folder_2k, '{}.html'.format(year))
    with open(filename, 'w', encoding="utf-8") as f:
        f.write(data.text)

In [18]:
# Build one data frame from the saved NBA 2K pages and save it as 2k_stats.csv.
#
# This cell uses its own result variable and CSV file so the standings results
# (`teams`, team_stats.csv) are not overwritten, and it does not look up the
# standings-only elements (tr.thead, all_divs_standings_E/W) in these pages.
years_2k = list(range(1999, 2022))  # dedicated name: leaves the global `years` untouched
dfs = []
for year in years_2k:
    filename = os.path.join(folder_2k, '{}.html'.format(year))
    with open(filename, 'r', encoding="utf-8") as f:
        page = f.read()

    # NOTE(review): assumes the data of interest is the first <table> on the page -
    # confirm against one of the saved files and narrow the selection if needed.
    # read_html raises ValueError when the page contains no table at all.
    season_2k = pd.read_html(StringIO(page))[0]
    season_2k['Year'] = year        # first year of the season, e.g. 1999 for 1999-2000
    dfs.append(season_2k)

ratings_2k = pd.concat(dfs)
filename = os.path.join(cwd, '2k_stats.csv')
ratings_2k.to_csv(filename)
print('finished! Data saved as "2k_stats.csv" ')

'1999-2000'

{6}
