# NBA stats scraper and analyzer

In [6]:
from bs4 import BeautifulSoup, Comment
import pathlib
import json
import time
from scraper.utils import find_text_of_p_with
from scraper.scraper import scrap_functions
from scraper.url import get_seasons_url, get_players_url, get_player_soup
from tqdm import tqdm

In [7]:
# First and last season year passed to get_seasons_url().
START_YEAR = 2010
END_YEAR = 2020

# Directory, relative to the working directory, that receives the JSON output.
OUTPUT_DIR = pathlib.Path('data')
# NOTE(review): presumably controls whether existing output files may be
# replaced -- confirm against the code that reads it.
OVERWRITE = True

# Site URLs; SEASONS_URL has one '{}' placeholder for the season year.
BASE_URL = 'https://www.basketball-reference.com'
SEASONS_URL = BASE_URL + '/leagues/NBA_{}_per_game.html'
PLAYER_URL = BASE_URL


In [8]:
# Get the list of per-game season page URLs for START_YEAR through END_YEAR
# (both ends included, as the printed output below shows).
seasons = get_seasons_url(START_YEAR, END_YEAR)
# Print the URLs as a quick sanity check.
print(seasons)

['https://www.basketball-reference.com/leagues/NBA_2010_per_game.html', 'https://www.basketball-reference.com/leagues/NBA_2011_per_game.html', 'https://www.basketball-reference.com/leagues/NBA_2012_per_game.html', 'https://www.basketball-reference.com/leagues/NBA_2013_per_game.html', 'https://www.basketball-reference.com/leagues/NBA_2014_per_game.html', 'https://www.basketball-reference.com/leagues/NBA_2015_per_game.html', 'https://www.basketball-reference.com/leagues/NBA_2016_per_game.html', 'https://www.basketball-reference.com/leagues/NBA_2017_per_game.html', 'https://www.basketball-reference.com/leagues/NBA_2018_per_game.html', 'https://www.basketball-reference.com/leagues/NBA_2019_per_game.html', 'https://www.basketball-reference.com/leagues/NBA_2020_per_game.html']


In [9]:
# Dictionary keys under which each scraped player record stores its page URL
# and its season year.
URL = 'url'
SEASON = 'season'

In [None]:
# Scrape every player listed on each season page and write one JSON file per
# season into OUTPUT_DIR.

# open(..., 'w') does not create missing parent directories, so make sure the
# output directory exists before the first write.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

for season in seasons:
    # The year is the token after the first underscore in
    # '.../NBA_<year>_per_game.html'.
    season_year = int(season.split('_')[1])
    # Zero-pad the two-digit suffix so that e.g. 2000 gives '2000-01' rather
    # than '2000-1'.
    # NOTE(review): basketball-reference appears to name NBA_<year> pages after
    # the season *ending* in <year> (NBA_2010 -> 2009-10) -- confirm that the
    # scrap_functions really expect '<year>-<year+1>' here.
    season_str = f'{season_year}-{(season_year + 1) % 100:02d}'

    # Honour OVERWRITE before doing any scraping for this season.
    destination = OUTPUT_DIR / f'{season_year}.json'
    if destination.exists() and not OVERWRITE:
        print(f'Skipping season {season_year}, {destination} already exists')
        continue

    # Get the players
    player_urls = get_players_url(season)
    print(f'Processing season {season_year}, found {len(player_urls)} players')

    # Maps player URL -> scraped record for this season.
    players_data = {}
    for url in tqdm(player_urls):
        # Get the player's HTML page
        soup_file = get_player_soup(url)

        # Scrape HTML: every entry of scrap_functions contributes one field.
        player = {URL: url, SEASON: season_year}
        player.update({
            key: function(soup_file, season=season_str)
            for key, function in scrap_functions.items()
        })

        players_data[url] = player

    # Save the data to a JSON file
    with open(destination, 'w', encoding='utf-8') as f:
        print(f'Saving data to {destination}')
        # default=str stringifies any value json cannot serialise natively.
        json.dump(players_data, f, indent=4, default=str)