# NBA stats scraper and analyzer

In [1]:
from bs4 import BeautifulSoup, Comment
import pathlib
import json
import time
from scraper.utils import find_text_of_p_with
from scraper.scraper import scrap_functions
from scraper.url import get_seasons_url, get_players_url, get_player_soup
from tqdm import tqdm

In [2]:
# --- Scrape configuration -------------------------------------------------
# Year range handed to get_seasons_url() when the seasons are enumerated.
START_YEAR = 2010
END_YEAR = 2020

# Directory that receives one '<year>.json' file per scraped season.
OUTPUT_DIR = pathlib.Path('data')

# Overwrite flag for already-existing output (intent inferred from the name).
OVERWRITE = True

# --- basketball-reference.com endpoints -----------------------------------
# Site root, plus a per-season template with a single str.format placeholder
# (presumably filled with the season year -- confirm against scraper.url).
BASE_URL = 'https://www.basketball-reference.com'
PLAYER_URL = 'https://www.basketball-reference.com'
SEASONS_URL = 'https://www.basketball-reference.com/leagues/NBA_{}_per_game.html'


In [3]:
# Download and scrape every player page of every season, writing one JSON
# file per season into OUTPUT_DIR.
seasons = get_seasons_url(START_YEAR, END_YEAR)

# Create the output directory up front; without it the first open() below
# fails with FileNotFoundError on a fresh checkout.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

for season in seasons:
    # The season year is the second '_'-separated token of the season URL
    # (presumably shaped like SEASONS_URL, e.g. '.../NBA_2010_per_game.html').
    season_year = int(season.split('_')[1])

    # Zero-pad the two-digit suffix so that 2000 yields '2000-01' rather
    # than '2000-1' (and 1999 yields '1999-00' rather than '1999-0').
    # NOTE(review): basketball-reference's NBA_<year> pages appear to cover
    # the season *ending* in <year>; if so this label is one season late
    # ('2010-11' for NBA_2010). Confirm against what scrap_functions expect.
    season_str = f'{season_year}-{(season_year + 1) % 100:02d}'

    # Honour OVERWRITE: when it is False, keep an existing season file and
    # skip the (slow) re-scrape entirely.
    destination = OUTPUT_DIR / f'{season_year}.json'
    if destination.exists() and not OVERWRITE:
        print(f'Skipping season {season_year}, {destination} already exists')
        continue

    # Get the players
    player_urls = get_players_url(season)
    print(f'Processing season {season_year}, found {len(player_urls)} players')

    # Maps player URL -> scraped record for this season.
    players_data = {}
    for url in tqdm(player_urls):
        soup_file = get_player_soup(url)

        record = {'season': season_year, 'player': url}

        # Scrape HTML: each entry of scrap_functions contributes one field.
        for key, function in scrap_functions.items():
            record[key] = function(soup_file, season=season_str)
            time.sleep(0.01)

        players_data[url] = record

        # Short pause between players (presumably to throttle requests).
        time.sleep(0.1)

    # Save the data to a JSON file. default=str stringifies any value that
    # json cannot serialise natively.
    with destination.open('w', encoding='utf-8') as f:
        print(f'Saving data to {destination}')
        json.dump(players_data, f, indent=4, default=str)



Processing season 2010, found 401 players


 19%|█▉        | 76/401 [02:33<10:06,  1.87s/it]