# In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

# In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

_BASKETBALL_REFERENCE_URL = 'https://www.basketball-reference.com'

# Output column -> ``data-stat`` attribute of the <td> cell its text is read from.
# 'Home Points'/'Team Points' and 'Visitor Points'/'Opponent Points' deliberately
# read the same cells so the CSV schema stays unchanged.
_TD_TEXT_COLUMNS = {
    'Date': 'date_game',
    'Start Time': 'game_start_time',
    'Network': 'network',
    'Visitor/Neutral': 'opp_name',
    'Visitor Points': 'opp_pts',
    'Home Points': 'pts',
    'Game Result': 'game_result',
    'Overtime': 'overtimes',
    'Team Points': 'pts',
    'Opponent Points': 'opp_pts',
    'Wins': 'wins',
    'Losses': 'losses',
    'Game Streak': 'game_streak',
    'Notes': 'game_remarks',
}

# Column order of the returned DataFrame and of the saved CSV file.
_OUTPUT_COLUMNS = [
    'Game', 'Date', 'Start Time', 'Network', 'Visitor/Neutral', 'Visitor Points',
    'Home/Neutral', 'Home Points', 'Box Score Link', 'Arena', 'Game Result',
    'Overtime', 'Team Points', 'Opponent Points', 'Wins', 'Losses',
    'Game Streak', 'Notes',
]


def _cell_text(cell):
    """Return the stripped text of a table cell, or '' when the cell is missing."""
    return cell.text.strip() if cell is not None else ''


def _parse_game_row(row, team):
    """Convert one <tr> of the games table into a ``{column: value}`` dict."""
    record = {
        column: _cell_text(row.find('td', {'data-stat': stat}))
        for column, stat in _TD_TEXT_COLUMNS.items()
    }
    record['Game'] = _cell_text(row.find('th', {'data-stat': 'g'}))
    record['Home/Neutral'] = team

    # Build an absolute box-score URL; rows without a usable link get ''.
    link_cell = row.find('td', {'data-stat': 'box_score_text'})
    anchor = link_cell.a if link_cell is not None else None
    href = anchor.get('href') if anchor is not None else None
    record['Box Score Link'] = (
        _BASKETBALL_REFERENCE_URL + href if href is not None else ''
    )

    # An empty location cell is labelled 'Home'; any other content, or a
    # missing cell, is labelled 'Away'.
    location = row.find('td', {'data-stat': 'game_location'})
    is_home = location is not None and location.text.strip() == ''
    record['Arena'] = 'Home' if is_home else 'Away'
    return record


def scrape_data_season(team, year, timeout=30):
    """
    Scrape NBA game data for a specific team and season from the Basketball Reference website.

    Parameters:
        team (str): The abbreviation of the NBA team for which to scrape the data.
        year (int): The year of the NBA season to scrape the data for.
        timeout (float): Seconds to wait for the HTTP request before giving up.

    Returns:
        pandas.DataFrame: One row per game with the columns 'Game', 'Date',
        'Start Time', 'Network', 'Visitor/Neutral', 'Visitor Points',
        'Home/Neutral', 'Home Points', 'Box Score Link', 'Arena', 'Game Result',
        'Overtime', 'Team Points', 'Opponent Points', 'Wins', 'Losses',
        'Game Streak' and 'Notes'. All values are strings; missing cells are ''.

    Raises:
        requests.HTTPError: If the page request returns an error status.
        ValueError: If the page does not contain the expected games table.

    Functionality:
        1. Constructs the URL for the team's games page on the Basketball Reference website.
        2. Sends a GET request to the URL and fails early on an HTTP error status.
        3. Uses BeautifulSoup to find the first table with class 'sortable stats_table'.
        4. Converts every row of the table body into a record of cell texts.
        5. Creates a DataFrame from the records.
        6. Removes rows whose 'Game' value is "G" (presumably header rows
           repeated inside the table body).
        7. Creates the directory '<year>/<team>' if it does not exist.
        8. Saves the DataFrame as '<year>/<team>/<team>_<year>_games.csv'.
        9. Returns the DataFrame.

    Note:
        'Visitor/Neutral' always holds the opponent name and 'Home/Neutral'
        always holds `team`, regardless of where the game was played; use the
        'Arena' column to tell home games from away games.
    """
    url = f'{_BASKETBALL_REFERENCE_URL}/teams/{team}/{year}_games.html'
    # A timeout keeps a stalled connection from blocking forever.
    response = requests.get(url, timeout=timeout)
    # Surface 404/429/etc. directly instead of failing later while parsing.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table', class_='sortable stats_table')
    if table is None or table.tbody is None:
        raise ValueError(f"No games table found at {url}")

    records = [_parse_game_row(row, team) for row in table.tbody.find_all('tr')]
    df_team_games = pd.DataFrame(records, columns=_OUTPUT_COLUMNS)

    # Remove rows where the value in the 'Game' column is "G"
    df_team_games = df_team_games[df_team_games['Game'] != 'G']

    # Create the directory if it doesn't exist
    directory = f"{year}/{team}"
    os.makedirs(directory, exist_ok=True)

    # Save the DataFrame as a CSV file in the specified directory
    csv_file_path = f"{directory}/{team}_{year}_games.csv"
    df_team_games.to_csv(csv_file_path, index=False)

    print(f"CSV file saved: {csv_file_path}")

    return df_team_games