In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import logging

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Season segment used in every team-page URL; change it here to scrape another year.
SEASON = 2024

# Template for a team page; `team` is the site's team abbreviation.
TEAM_URL_TEMPLATE = "https://www.basketball-reference.com/teams/{team}/{season}.html"

# Team abbreviations, in the order the pages are scraped.
# Add more team codes as needed.
TEAM_CODES = [
    "CHI", "LAL", "BOS", "BRK", "TOR", "PHI", "NYK", "CLE", "DET", "IND",
    "MIL", "WAS", "ATL", "ORL", "MIA", "CHO", "MIN", "OKC", "DEN", "POR",
    "UTA", "LAC", "GSW", "PHO", "SAC", "MEM", "NOP", "HOU", "SAS", "DAL",
]

# List of URLs (one per team, same order as TEAM_CODES)
urls = [TEAM_URL_TEMPLATE.format(team=team, season=SEASON) for team in TEAM_CODES]

# Browser-like User-Agent header sent with every request.
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'

def scrape_stats(url):
    """Download a team page and extract selected columns of its per-game table.

    The page is fetched with the module-level ``user_agent`` header and a
    30-second timeout. The table with id ``per_game`` is located and, for each
    row, the ``<td>`` cells whose ``data-stat`` attribute matches one of the
    columns of interest are collected (text, whitespace-stripped).

    Args:
        url: Address of the team page to scrape.

    Returns:
        A ``pandas.DataFrame`` with one row per table row that had at least
        one matching cell and columns drawn from Player, G, TRB, AST, STL,
        BLK, TOV, PTS. All values are strings exactly as they appear on the
        page. An empty DataFrame is returned on a non-200 response, a missing
        table, a timeout, or any other error (the error is logged, never
        raised).
    """
    headers = {'User-Agent': user_agent}

    # Maps the page's data-stat attribute to the output column name.
    columns_of_interest = {
        "player": "Player", "g": "G", "trb_per_g": "TRB",
        "ast_per_g": "AST", "stl_per_g": "STL", "blk_per_g": "BLK",
        "tov_per_g": "TOV", "pts_per_g": "PTS"
    }

    logging.info(f"Attempting to scrape data from {url}")

    try:
        response = requests.get(url, headers=headers, timeout=30)
        logging.info(f"Received response with status code: {response.status_code}")

        if response.status_code != 200:
            logging.error(f"Failed to retrieve the webpage. Status code: {response.status_code}")
            return pd.DataFrame()

        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table', {'id': 'per_game'})
        if table is None:
            logging.warning("Table not found on the webpage.")
            return pd.DataFrame()

        # Some tables have no explicit <tbody>; fall back to the table itself.
        # Rows without matching <td> cells (e.g. header rows) yield an empty
        # dict and are skipped below.
        body = table.tbody if table.tbody is not None else table

        rows = []
        for row in body.find_all('tr'):
            row_data = {}
            for stat, col_name in columns_of_interest.items():
                # Look each cell up once instead of once to test and once to read.
                cell = row.find('td', {'data-stat': stat})
                if cell is not None:
                    row_data[col_name] = cell.text.strip()
            if row_data:
                rows.append(row_data)

        df = pd.DataFrame(rows)
        logging.info(f"Successfully extracted data for {len(df)} players.")
        return df
    except requests.Timeout:
        logging.error("Request timed out after 30 seconds.")
    except Exception as e:
        # Best-effort scraper: log and fall through so one bad page does not
        # abort the whole run.
        logging.error(f"An error occurred: {str(e)}")

    return pd.DataFrame()

# Per-team DataFrames that came back non-empty.
all_data = []

for position, url in enumerate(urls):
    # Throttle only *between* requests: wait 60 to 90 seconds before every
    # request except the first, so no time is wasted sleeping after the last
    # page has already been fetched.
    if position > 0:
        delay = random.uniform(60, 90)
        logging.info(f"Waiting {delay:.2f} seconds before the next request...")
        time.sleep(delay)

    df = scrape_stats(url)
    if not df.empty:
        all_data.append(df)

if all_data:
    # Stack all teams into one frame with a fresh 0..n-1 index and persist it.
    final_df = pd.concat(all_data, ignore_index=True)
    print(final_df)
    final_df.to_csv('nba_per_game_stats.csv', index=False)
    logging.info("Data saved to 'nba_per_game_stats.csv'")
else:
    logging.warning("No data was scraped. Please check the logs for errors.")

logging.info("Scraping process completed.")

2024-09-20 23:16:12,491 - INFO - Attempting to scrape data from https://www.basketball-reference.com/teams/CHI/2024.html
2024-09-20 23:16:13,020 - INFO - Received response with status code: 200
2024-09-20 23:16:13,157 - INFO - Successfully extracted data for 18 players.
2024-09-20 23:16:13,159 - INFO - Waiting 88.70 seconds before the next request...
2024-09-20 23:17:41,877 - INFO - Attempting to scrape data from https://www.basketball-reference.com/teams/LAL/2024.html
2024-09-20 23:17:42,191 - INFO - Received response with status code: 200
2024-09-20 23:17:42,390 - INFO - Successfully extracted data for 21 players.
2024-09-20 23:17:42,392 - INFO - Waiting 77.34 seconds before the next request...
2024-09-20 23:18:59,743 - INFO - Attempting to scrape data from https://www.basketball-reference.com/teams/BOS/2024.html
2024-09-20 23:19:00,107 - INFO - Received response with status code: 200
2024-09-20 23:19:00,287 - INFO - Successfully extracted data for 19 players.
2024-09-20 23:19:00,289

               Player   G   TRB  AST  STL  BLK  TOV   PTS
0       DeMar DeRozan  79   4.3  5.3  1.1  0.6  1.7  24.0
1          Coby White  79   4.5  5.1  0.7  0.2  2.1  19.1
2         Zach LaVine  25   5.2  3.9  0.8  0.3  2.1  19.5
3      Nikola Vučević  76  10.5  3.3  0.7  0.8  1.6  18.0
4         Ayo Dosunmu  76   2.8  3.2  0.9  0.5  1.4  12.2
..                ...  ..   ...  ...  ...  ...  ...   ...
652   Markieff Morris  26   1.5  0.6  0.2  0.1  0.5   2.5
653     Dexter Dennis   4   2.3  1.0  0.0  0.3  0.8   5.5
654       A.J. Lawson  42   1.2  0.5  0.2  0.1  0.3   3.2
655    Greg Brown III   6   1.5  0.7  0.0  0.7  0.7   2.5
656  Brandon Williams  17   0.8  1.0  0.1  0.1  0.4   3.2

[657 rows x 8 columns]


In [2]:
# NOTE(review): presumably the container that wraps the per-game table on the team page (confirm): <div class="table_container tabbed current is_setup" id="div_per_game">