In [114]:
#Initialize a list of all NBA teams
nba_teams = [
    'ATL',  # Atlanta Hawks
    'BOS',  # Boston Celtics
    'BKN',  # Brooklyn Nets
    'CHA',  # Charlotte Hornets
    'CHI',  # Chicago Bulls
    'CLE',  # Cleveland Cavaliers
    'DAL',  # Dallas Mavericks
    'DEN',  # Denver Nuggets
    'DET',  # Detroit Pistons
    'GSW',  # Golden State Warriors
    'HOU',  # Houston Rockets
    'IND',  # Indiana Pacers
    'LAC',  # LA Clippers
    'LAL',  # Los Angeles Lakers
    'MEM',  # Memphis Grizzlies
    'MIA',  # Miami Heat
    'MIL',  # Milwaukee Bucks
    'MIN',  # Minnesota Timberwolves
    'NOP',  # New Orleans Pelicans
    'NYK',  # New York Knicks
    'OKC',  # Oklahoma City Thunder
    'ORL',  # Orlando Magic
    'PHI',  # Philadelphia 76ers
    'PHX',  # Phoenix Suns
    'POR',  # Portland Trail Blazers
    'SAC',  # Sacramento Kings
    'SAS',  # San Antonio Spurs
    'TOR',  # Toronto Raptors
    'UTA',  # Utah Jazz
    'WAS'   # Washington Wizards
]

In [115]:
#Define a url with a placeholder 
team_url = "https://www.basketball-reference.com/teams/{}/2024.html"

In [116]:
import os 

In [117]:
def create_folder(folder_name):
    try:

        # Check if the folder already exists
        if not os.path.exists(folder_name):
            
            # Create the folder
            os.makedirs(folder_name)
            
    except Exception as e:
        print(f"Error: {e}")

In [118]:
import requests
import time

#Create a new folder to holder team data if it does not already exist
folder_name = "team_data"
create_folder(folder_name)

#Iterate through teams and retrieve their data
for team in nba_teams:
    url = team_url.format(team)
    data = requests.get(url)

    #Save the data in the team_data folder as an html file
    with open("team_data/{}.html".format(team), "w+") as f:
        f.write(data.text)

    #To conform to the sports reference limit of 20req/min
    time.sleep(5)

In [119]:
from bs4 import BeautifulSoup

In [120]:
#Define the team you would like to scrape - to be replaced with web inputs
team = "TOR"

#Open and read html file from the file path
with open(f"team_data/{team}.html") as f:
    page = f.read()

In [121]:
#Create a beautiful soup html parser object
soup = BeautifulSoup(page, "html.parser")

In [122]:
# Find the table with id "roster"
roster_table = soup.find('table', {'id': 'roster'})

In [None]:
# Find all elements with the data-stat="player" attribute
roster = roster_table.find_all('td', {'data-stat': 'player'})

In [None]:
#Create a new folder to holder team data if it does not already exist
folder_name = f"{team}_player_data"
create_folder(folder_name)

# !Beware of 429 error, sports reference limits the amount of requests you can make per min. You will be jailed for one hour if run excessively

for player in roster:
    # Find the <a> tag within the <td> element
    link = player.find('a')
    
    # Check if an <a> tag is found
    if link:
        # Get the href attribute value
        url = link['href']

        game_log_url = url.replace(".html", "/gamelog/2024")
        
        # Load the content of the player's page using requests
        player_data = requests.get(f"https://www.basketball-reference.com/{game_log_url}") 

        # Get the text content (name of the player)
        player_name = link.text.strip()

        #Save the data in the team_data folder as an html file
        with open(folder_name + "/{}.html".format(player_name), "w+") as f:
            f.write(player_data.text)

         #To conform to the sports reference limit of 20req/min
        time.sleep(5)

In [129]:
import pandas as pd

In [159]:
from io import StringIO  # Import StringIO

# Structure subject to change if a dictionary does not work out
dfs = {}

# Iterate through files in the folder
for player in os.listdir(folder_name):
    file_path = os.path.join(folder_name, player)

    # Get the name of the file (excluding the path)
    player_name = os.path.basename(file_path).replace(".html","")

    #Open and read html file from the file path
    with open(f"{file_path}") as f:
        player_page = f.read()

    #Create a beautiful soup html parser object
    player_soup = BeautifulSoup(player_page, "html.parser") 

    # Remove unnecessary rows with class "thead"
    useless_rows = player_soup.find_all("tr",class_="thead")

    for row in useless_rows:
        row.decompose()

    # Find the table with id "pgl_basic"
    game_log_table = player_soup.find('table', {'id': 'pgl_basic'})

    # Convert game log html table into a dataframe
    game_log = pd.read_html(StringIO(str(game_log_table)))[0]

    dfs[player_name] = game_log

In [160]:
players = pd.concat(dfs)

In [161]:
players.tail()

Unnamed: 0,Unnamed: 1,Rk,G,Date,Age,Tm,Unnamed: 5,Opp,Unnamed: 7,GS,MP,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-
RJ Barrett,42,43,37.0,2024-01-20,23-220,TOR,@,NYK,L (-26),1,28:56,...,6,8,4,1,0,0,2,20,16.5,-14
RJ Barrett,43,44,38.0,2024-01-22,23-222,TOR,,MEM,L (-8),1,37:01,...,8,9,1,0,1,2,3,29,22.1,-9
RJ Barrett,44,45,39.0,2024-01-26,23-226,TOR,,LAC,L (-20),1,36:00,...,4,6,4,1,0,3,1,22,13.7,-27
RJ Barrett,45,46,,2024-01-28,23-228,TOR,@,ATL,L (-1),Inactive,Inactive,...,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive
RJ Barrett,46,47,,2024-01-30,23-230,TOR,@,CHI,W (+11),Inactive,Inactive,...,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive


In [162]:
#Convert to a csv file
players.to_csv("TOR_players.csv")