### From basketball-reference

In [1]:
import os
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


In [4]:
# Base directory for every file this notebook writes; in a plain .py script,
# derive it from '__file__' instead of the current working directory.
cwd = os.getcwd()

# Seasons to process: 1991 through 2022 inclusive.
years = [*range(1991, 2023)]


In [None]:
# Download the yearly awards pages from basketball-reference.
url_general = "https://www.basketball-reference.com/awards/awards_{}.html"

# The MVP parsing cell reads '<cwd>\mvps\<year>.html', so the pages have to be
# saved there (not in the 'team' folder). Create the folder up front so the
# first write cannot fail on a missing directory.
awards_folder = os.path.join(cwd, 'mvps')
os.makedirs(awards_folder, exist_ok=True)

# Fetch and store one HTML file per year.
for year in years:
    url = url_general.format(year)
    data = requests.get(url, timeout=30)  # never wait forever on a stalled request
    data.raise_for_status()               # do not store an error page as if it were data
    filename = os.path.join(awards_folder, '{}.html'.format(year))

    # Write the HTML to disk.
    with open(filename, 'w', encoding="utf-8") as f:
        f.write(data.text)
    print("File {} written".format(filename))
     

In [None]:
# Build one data frame holding the MVP table of every season and export it.
dfs = []
mvp_folder = os.path.join(cwd, 'mvps')
os.makedirs(mvp_folder, exist_ok=True)  # no-op when the folder already exists

for year in years:
    filename = os.path.join(mvp_folder, '{}.html'.format(year))
    with open(filename, 'r', encoding="utf-8") as f:
        page = f.read()

    soup = BeautifulSoup(page, 'html.parser')
    # Remove the first 'over_header' row before parsing
    # (presumably a grouping row above the real column names).
    soup.find('tr', class_='over_header').decompose()
    mvp_table = soup.find(id='all_mvp')

    # read_html returns a list of data frames, hence the [0]. The markup is
    # wrapped in StringIO because passing literal HTML strings is deprecated.
    mvp_season = pd.read_html(StringIO(str(mvp_table)))[0]
    mvp_season['Year'] = year  # remember which season the rows belong to

    dfs.append(mvp_season)

mvps = pd.concat(dfs)

filename = os.path.join(cwd, 'mvps.csv')
mvps.to_csv(filename)
mvps.head(3)  # first N rows (a notebook only renders the cell's last expression)
mvps.tail(2)  # last N rows

In [None]:
# URL template and download folder for the per-game stats of ALL players.
player_stats_url = 'https://www.basketball-reference.com/leagues/NBA_{}_per_game.html'
player_folder = os.path.join(cwd, 'players')
os.makedirs(player_folder, exist_ok=True)  # no-op when the folder already exists


In [None]:
# Download the per-game pages with plain HTTP requests and save them.

# NOTE: this does not produce usable pages because of a JavaScript
# (originally noted as "java") issue on the website; see
# https://www.youtube.com/watch?v=JGQGd-oa0l4&ab_channel=Dataquest

for year in years:
    url = player_stats_url.format(year)
    data = requests.get(url, timeout=30)  # never wait forever on a stalled request
    data.raise_for_status()               # do not store an error page as if it were data

    # os.path.join avoids the stray '.' that `folder + '.\\<year>.html'` left
    # after the folder name; Windows drops a trailing period from a path
    # segment, so the same file is addressed.
    filename = os.path.join(player_folder, '{}.html'.format(year))
    with open(filename, 'w', encoding="utf-8") as f:
        f.write(data.text)
        

In [None]:
# Start the Chrome driver used for scraping.
# The 'headless' argument keeps the browser window from opening, see
# https://www.codegrepper.com/code-examples/python/selenium+without+opening+browser+python
option = webdriver.ChromeOptions()
option.add_argument('headless')

# ChromeDriverManager().install() supplies the driver path handed to Service.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=option)

In [None]:
# Load every per-game page in the browser and save the resulting HTML in the
# player folder.
for year in years:
    url = player_stats_url.format(year)
    print(url)
    driver.get(url)  # returns None, so there is nothing worth assigning
    driver.execute_script('window.scrollTo(1,10000)')
    time.sleep(2)  # pause after scrolling, presumably so the page can finish loading

    html = driver.page_source
    # os.path.join avoids the stray '.' that `folder + '.\\<year>.html'` left
    # after the folder name; Windows drops a trailing period from a path
    # segment, so the same file is addressed.
    filename = os.path.join(player_folder, '{}.html'.format(year))
    with open(filename, 'w', encoding="utf-8") as f:
        f.write(html)

print('finished scrapping!!')
# NOTE(review): no visible cell closes the browser; call driver.quit() once
# scraping is done so the headless Chrome process does not linger.

In [None]:
# Build one data frame holding the per-game stats of ALL players and export it.
dfs = []
for year in years:
    # os.path.join avoids the stray '.' that `folder + '.\\<year>.html'` left
    # after the folder name; Windows drops a trailing period from a path
    # segment, so the same file is addressed.
    filename = os.path.join(player_folder, '{}.html'.format(year))
    with open(filename, 'r', encoding="utf-8") as f:
        page = f.read()

    soup = BeautifulSoup(page, 'html.parser')
    # Only the FIRST 'thead' row is removed here.
    # NOTE(review): if the table repeats its header row, the remaining copies
    # end up as data rows -- confirm they are filtered downstream.
    soup.find('tr', class_='thead').decompose()
    player_table = soup.find(id='all_per_game_stats')

    # read_html returns a list of data frames, hence the [0]. The markup is
    # wrapped in StringIO because passing literal HTML strings is deprecated.
    player_season = pd.read_html(StringIO(str(player_table)))[0]
    player_season['Year'] = year  # remember which season the rows belong to

    dfs.append(player_season)

players = pd.concat(dfs)
filename = os.path.join(cwd, 'player_stats.csv')
players.to_csv(filename)
print('finished! Data saved in "player_stats.csv"')

finished! Data saved in "player_stats.csv"


In [None]:
# URL template and download folder for the season STANDINGS.
team_stats_url = 'https://www.basketball-reference.com/leagues/NBA_{}_standings.html'
team_folder = os.path.join(cwd, 'team')
os.makedirs(team_folder, exist_ok=True)  # no-op when the folder already exists

In [None]:
# Download the standings page of every season into the team folder.
for year in years:
    url = team_stats_url.format(year)
    data = requests.get(url, timeout=30)  # never wait forever on a stalled request
    data.raise_for_status()               # do not store an error page as if it were data

    # os.path.join avoids the stray '.' that `folder + '.\\<year>.html'` left
    # after the folder name; Windows drops a trailing period from a path
    # segment, so the same file is addressed.
    filename = os.path.join(team_folder, '{}.html'.format(year))
    with open(filename, 'w', encoding="utf-8") as f:
        f.write(data.text)

In [None]:
# Build one data frame holding the standings of All teams and export it.

# (id of the element holding the table, name of the column with the team names)
conference_tables = (
    ('all_divs_standings_E', 'Eastern Conference'),
    ('all_divs_standings_W', 'Western Conference'),
)

dfs = []
for year in years:
    # os.path.join avoids the stray '.' that `folder + '.\\<year>.html'` left
    # after the folder name; Windows drops a trailing period from a path
    # segment, so the same file is addressed.
    filename = os.path.join(team_folder, '{}.html'.format(year))
    with open(filename, 'r', encoding="utf-8") as f:
        page = f.read()

    soup = BeautifulSoup(page, 'html.parser')
    soup.find('tr', class_='thead').decompose()  # drops only the first 'thead' row

    # Same treatment for both conferences, east first.
    for table_id, conference_column in conference_tables:
        conference_table = soup.find(id=table_id)

        # read_html returns a list of data frames, hence the [0]. The markup is
        # wrapped in StringIO because passing literal HTML strings is deprecated.
        season = pd.read_html(StringIO(str(conference_table)))[0]
        season['Year'] = year
        # Move the conference-named column to a common "Team" column; pop()
        # removes the old column, and the new one is appended after "Year".
        season['Team'] = season.pop(conference_column)

        dfs.append(season)

teams = pd.concat(dfs)
filename = os.path.join(cwd, 'team_stats.csv')
teams.to_csv(filename)
print('finished! Data saved as "team_stats.csv" ')

finished! Data saved as "team_stats.csv" 


In [None]:
# URL template and download folder for the hoopshype NBA 2K pages.
url_2k = 'https://hoopshype.com/nba2k/{}'
folder_2k = os.path.join(cwd, '2k')
os.makedirs(folder_2k, exist_ok=True)  # no-op when the folder already exists

In [None]:
# Download one hoopshype NBA 2K page per season into the 2k folder.
# NOTE: this rebinds the notebook-wide `years` to 1999..2021.
years = list(range(1999, 2022))
for year in years:
    # The URL ends in '<year>-<year + 1>', e.g. '1999-2000'.
    url = url_2k.format('{}-{}'.format(year, year + 1))
    data = requests.get(url, timeout=30)  # never wait forever on a stalled request
    data.raise_for_status()               # do not store an error page as if it were data

    # os.path.join avoids the stray '.' that `folder + '.\\<year>.html'` left
    # after the folder name; Windows drops a trailing period from a path
    # segment, so the same file is addressed.
    filename = os.path.join(folder_2k, '{}.html'.format(year))
    with open(filename, 'w', encoding="utf-8") as f:
        f.write(data.text)

In [None]:
# Build one data frame from the saved hoopshype NBA 2K pages and export it.
#
# The basketball-reference standings selectors ('all_divs_standings_E' /
# 'all_divs_standings_W', 'tr.thead') belong to a different site, so the first
# <table> of each page is parsed instead.
# NOTE(review): which table holds the 2K data could not be confirmed from
# here -- check the selector against one downloaded page.
#
# The result goes to its own variable and file so that the standings export
# ("team_stats.csv") is not overwritten.
years = list(range(1999, 2022))
dfs = []
for year in years:
    filename = os.path.join(folder_2k, '{}.html'.format(year))
    with open(filename, 'r', encoding="utf-8") as f:
        page = f.read()

    soup = BeautifulSoup(page, 'html.parser')
    ratings_table = soup.find('table')
    if ratings_table is None:
        # Fail with the offending file name instead of an opaque parser error.
        raise ValueError('No <table> found in {}'.format(filename))

    # read_html returns a list of data frames, hence the [0]. The markup is
    # wrapped in StringIO because passing literal HTML strings is deprecated.
    ratings_season = pd.read_html(StringIO(str(ratings_table)))[0]
    ratings_season['Year'] = year  # first year of the '<year>-<year + 1>' page

    dfs.append(ratings_season)

ratings_2k = pd.concat(dfs)
filename = os.path.join(cwd, '2k_ratings.csv')
ratings_2k.to_csv(filename)
print('finished! Data saved as "2k_ratings.csv" ')

'1999-2000'

## Load data from nba.com

In [None]:
# Download the nba.com "drives" stats page and keep a local copy of its HTML.
cwd = os.getcwd()  # change for '__file__' if using .py
url = 'https://www.nba.com/stats/players/drives'
data = requests.get(url, timeout=30)  # never wait forever on a stalled request
data.raise_for_status()               # do not store an error page as if it were data

filename = os.path.join(cwd, 'nba_drives.html')
with open(filename, 'w', encoding='utf-8') as f:
    f.write(data.text)

print(f"Data loaded and saved to {filename}")

Data loaded and saved to c:\Git\python_projects\data_science\NBA_machine_learning\nba_drives.html


In [None]:
# Read the saved HTML file
with open(filename, 'r', encoding='utf-8') as f:
    page = f.read()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(page, 'html.parser')

# Columns the stats table has to provide
required_columns = ['Player', 'Team', 'GP', 'W', 'L']

# The first <table> of the page is not necessarily the stats table (the saved
# copy can start with a date-picker calendar), so take the first table that
# actually contains the required columns.
table = soup.find('table')  # stays bound to the first table for inspection if nothing matches
df = None
for candidate in soup.find_all('table'):
    try:
        # Wrapped in StringIO because passing literal HTML strings is deprecated.
        parsed = pd.read_html(StringIO(str(candidate)))[0]
    except ValueError:
        continue  # read_html found nothing it could parse in this element
    if set(required_columns).issubset(parsed.columns):
        table = candidate
        df = parsed[required_columns]
        break

if df is None:
    # Explicit diagnosis instead of a KeyError from the column selection.
    raise ValueError(
        'No table with columns {} found in {}; the stats table may be rendered '
        'by JavaScript and therefore missing from the downloaded HTML'.format(
            required_columns, filename
        )
    )

# Display the DataFrame
print(df.head())

In [12]:
# Dump the raw HTML of `table` to inspect which element was selected
print(table)

<table class="DatePickerCalendar_table__I69dj"><thead class="DatePickerCalendar_head__4Yk22"><tr><th class="DatePickerCalendar_dayHeader__9dKoY">Sun</th><th class="DatePickerCalendar_dayHeader__9dKoY">Mon</th><th class="DatePickerCalendar_dayHeader__9dKoY">Tue</th><th class="DatePickerCalendar_dayHeader__9dKoY">Wed</th><th class="DatePickerCalendar_dayHeader__9dKoY">Thu</th><th class="DatePickerCalendar_dayHeader__9dKoY">Fri</th><th class="DatePickerCalendar_dayHeader__9dKoY">Sat</th></tr></thead><tbody><tr><td class="DatePickerCalendar_dpCell__7NRHn"><button class="DayButton_button__XeZqa DatePickerCalendar_dayBtn__pWgL_ DatePickerCalendar_diffMonth__d0LrV" data-content="2025-01-26" data-has-games="false" data-id="nba:games:main:select-date:date" data-section="calendar" data-text="undefined Games" data-track="click" data-type="date" disabled="" title="" type="button"><p class="DatePickerCalendar_date__6wBp8">26</p></button></td><td class="DatePickerCalendar_dpCell__7NRHn"><button clas