In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import lxml


def scrape_nba_game_data(year):
    url = f"https://www.basketball-reference.com/leagues/NBA_{year}_games.html"
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table', {'id': 'schedule'})
        df = pd.read_html(str(table))[0]
        return df
    else:
        print(f"Failed to retrieve data for {year}")
        return None

# Scraping NBA game data for the years 2020 to 2023
years = list(range(2020, 2025))
dfs = []

for year in years:
    print(f"Scraping data for {year}...")
    df = scrape_nba_game_data(year)
    if df is not None:
        dfs.append(df)

# Concatenate all dataframes into one
if len(dfs) > 0:
    nba_data = pd.concat(dfs, ignore_index=True)
    print("Data scraped successfully!")
    print(nba_data.head())
else:
    print("No data scraped.")

# Save the data to a CSV file
nba_data.to_csv('nba_game_data_2020_2023.csv', index=False)


Scraping data for 2020...
Scraping data for 2021...
Scraping data for 2022...
Scraping data for 2023...
Scraping data for 2024...
Data scraped successfully!
                Date Start (ET)       Visitor/Neutral  PTS  \
0  Tue, Oct 22, 2019      8:00p  New Orleans Pelicans  122   
1  Tue, Oct 22, 2019     10:30p    Los Angeles Lakers  102   
2  Wed, Oct 23, 2019      7:00p         Chicago Bulls  125   
3  Wed, Oct 23, 2019      7:00p       Detroit Pistons  119   
4  Wed, Oct 23, 2019      7:00p   Cleveland Cavaliers   85   

           Home/Neutral  PTS.1 Unnamed: 6 Unnamed: 7  Attend.  \
0       Toronto Raptors    130  Box Score         OT    20787   
1  Los Angeles Clippers    112  Box Score        NaN    19068   
2     Charlotte Hornets    126  Box Score        NaN    15424   
3        Indiana Pacers    110  Box Score        NaN    17923   
4         Orlando Magic     94  Box Score        NaN    18846   

                     Arena  Notes  
0         Scotiabank Arena    NaN  
1      

In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_box_score(game_url):
    response = requests.get(game_url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        box_score_table = soup.find('div', {'id': 'all_box_score'})
        if box_score_table:
            df = pd.read_html(str(box_score_table))[0]
            return df
    print(f"No box score found for {game_url}")
    return None

def scrape_nba_game_data(year):
    url = f"https://www.basketball-reference.com/leagues/NBA_{year}_games.html"
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table', {'id': 'schedule'})
        df = pd.read_html(str(table))[0]

        box_scores = []
        game_links = soup.select('.right gamelink')
        for link in game_links:
            box_score = scrape_box_score('https://www.basketball-reference.com' + link['href'])
            if box_score is not None:
                box_scores.append(box_score)
            else:
                box_scores.append(None)  # Placeholder for missing box score

        # Ensure the length of box_scores matches the length of the DataFrame's index
        while len(box_scores) < len(df):
            box_scores.append(None)

        df['Box Score'] = box_scores
        return df
    else:
        print(f"Failed to retrieve data for {year}")
        return None



# Scraping NBA game data for the years 2020 to 2023
years = list(range(2020, 2024))
dfs = []

for year in years:
    print(f"Scraping data for {year}...")
    df = scrape_nba_game_data(year)
    if df is not None:
        dfs.append(df)

# Concatenate all dataframes into one
if len(dfs) > 0:
    nba_data = pd.concat(dfs, ignore_index=True)
    print("Data scraped successfully!")
    print(nba_data.head())
else:
    print("No data scraped.")

# Save the data to a CSV file
nba_data.to_csv('nba_game_data_with_box_score_2020_2023.csv', index=False)


Scraping data for 2020...
Scraping data for 2021...
Scraping data for 2022...
Scraping data for 2023...
Data scraped successfully!
                Date Start (ET)       Visitor/Neutral  PTS  \
0  Tue, Oct 22, 2019      8:00p  New Orleans Pelicans  122   
1  Tue, Oct 22, 2019     10:30p    Los Angeles Lakers  102   
2  Wed, Oct 23, 2019      7:00p         Chicago Bulls  125   
3  Wed, Oct 23, 2019      7:00p       Detroit Pistons  119   
4  Wed, Oct 23, 2019      7:00p   Cleveland Cavaliers   85   

           Home/Neutral  PTS.1 Unnamed: 6 Unnamed: 7  Attend.  \
0       Toronto Raptors    130  Box Score         OT    20787   
1  Los Angeles Clippers    112  Box Score        NaN    19068   
2     Charlotte Hornets    126  Box Score        NaN    15424   
3        Indiana Pacers    110  Box Score        NaN    17923   
4         Orlando Magic     94  Box Score        NaN    18846   

                     Arena  Notes Box Score  
0         Scotiabank Arena    NaN      None  
1           S

In [10]:
async def scrape_game(standings_file):
    with open(standings_file, 'r') as f:
        html = f.read()

    soup = BeautifulSoup(html)
    links = soup.find_all("a")
    hrefs = [l.get('href') for l in links]
    box_scores = [f"https://www.basketball-reference.com{l}" for l in hrefs if l and "boxscore" in l and '.html' in l]

    for url in box_scores:
        save_path = os.path.join(SCORES_DIR, url.split("/")[-1])
        if os.path.exists(save_path):
            continue

        html = await get_html(url, "#content")
        if not html:
            continue
        with open(save_path, "w+") as f:
            f.write(html)

In [29]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

def scrape_box_score_stats():
    box_scores_data = []

    # Iterate over the range of seasons and dates
    for year in range(2024, 2025):
        for month in range(1, 13):
            for day in range(1, 32):
                date_str = f'{year}{month:02d}{day:02d}'
                box_score_url = f"https://www.basketball-reference.com/boxscores/{date_str}0.html"

                # Fetch HTML content of the box score page
                response = requests.get(box_score_url)
                if response.status_code != 200:
                    continue

                soup = BeautifulSoup(response.content, 'html.parser')

                # Extract team names and their basic box score statistics
                teams = soup.find_all("div", class_="scorebox")

                for team in teams:
                    team_name = team.find("a", class_="loser").text.strip() if team.find("a", class_="loser") else team.find("strong").text.strip()
                    basic_stats = team.find_all("tfoot")[0].find_all("td")[:18]  # Extract basic box score statistics

                    # Extract relevant basic box score statistics
                    points = basic_stats[0].text
                    field_goals = basic_stats[1].text
                    field_goal_attempts = basic_stats[2].text
                    # Add more statistics as needed

                    box_scores_data.append({
                        'Date': date_str,
                        'Team': team_name,
                        'Points': points,
                        'Field Goals': field_goals,
                        'Field Goal Attempts': field_goal_attempts,
                        # Add more statistics as needed
                    })

    # Convert the list of dictionaries into a pandas DataFrame
    df = pd.DataFrame(box_scores_data)
    return df

# Example usage:
box_score_stats_df = scrape_box_score_stats()
print(box_score_stats_df)


Empty DataFrame
Columns: []
Index: []
