# NBA stats scraper and analyzer

In [1]:
from bs4 import BeautifulSoup, Comment
import pathlib
import json
import time
from scraper.utils import find_text_of_p_with, season_string
from scraper.scraper import scrap_functions, get_player_seasons
from scraper.url import get_seasons_url, get_players_url, get_player_soup
from tqdm import tqdm
import pandas as pd

In [2]:
# year: second half of the season - ex. 2010 --> 2009-10
START_YEAR = 2010
END_YEAR = 2020
OUTPUT_DIR = pathlib.Path() / 'data'
OVERWRITE = True

# dataset textual constants
SEASON = 'season'
URL = 'url'

# Set up constants
BASE_URL = 'https://www.basketball-reference.com'
SEASONS_URL = 'https://www.basketball-reference.com/leagues/NBA_{}_per_game.html'
PLAYER_URL = 'https://www.basketball-reference.com'


In [3]:
# Download HTML to scrape
seasons = get_seasons_url(START_YEAR, END_YEAR)

['https://www.basketball-reference.com/leagues/NBA_2010_per_game.html', 'https://www.basketball-reference.com/leagues/NBA_2011_per_game.html', 'https://www.basketball-reference.com/leagues/NBA_2012_per_game.html', 'https://www.basketball-reference.com/leagues/NBA_2013_per_game.html', 'https://www.basketball-reference.com/leagues/NBA_2014_per_game.html', 'https://www.basketball-reference.com/leagues/NBA_2015_per_game.html', 'https://www.basketball-reference.com/leagues/NBA_2016_per_game.html', 'https://www.basketball-reference.com/leagues/NBA_2017_per_game.html', 'https://www.basketball-reference.com/leagues/NBA_2018_per_game.html', 'https://www.basketball-reference.com/leagues/NBA_2019_per_game.html', 'https://www.basketball-reference.com/leagues/NBA_2020_per_game.html']


In [5]:
# for season in seasons:
#     # Get the players
#     player_urls = get_players_url(season)

#     season_year = int(season.split('_')[1])
#     season_str = f'{season_year}-{(season_year + 1) % 100}'
    
#     print(f'Processing season {season_year}, found {len(player_urls)} players')

#     players_data = {}
#     for url in tqdm(player_urls):
#         # Get the player's HTML page
#         soup_file = get_player_soup(url)
        
#         # Scrape HTML
#         player = {}
#         player[URL] = url
#         player[SEASON] = season_year
#         for key, function in scrap_functions.items():
#             player[key] = function(soup_file, season=season_str)

#         players_data[url] = player
    
#     # Save the data to a JSON file
#     destination = OUTPUT_DIR / f'{season_year}.json'
#     with open(str(destination), 'w') as f:
#         print(f'Saving data to {destination}')
#         json.dump(players_data, f, indent=4, default=str)

In [6]:
# define a player_df dataframe with columns name, season, team, url
player_df = pd.DataFrame(columns=['name', 'season', 'team', 'url'])

urls = []
# loop through the seasons
for season in seasons:
    player_urls = get_players_url(season)

    # add new player_urls to urls
    urls.extend(player_urls)

urls = list(set(urls))

In [11]:
players_data = {}
for year in range(START_YEAR, END_YEAR):
    players_data[str(year)] = {}

# loop through first 10 urls
for url in urls[:10]:
    # get the player's HTML page
    soup_file = get_player_soup(url)

    # Scrape HTML
    player = {}
    player[URL] = url

    seasons = get_player_seasons(soup_file)

    for season_str in seasons:
        year = int(season_str.split('-')[0]) 
        if year >= START_YEAR and year < END_YEAR:
            player[SEASON] = season_str
            print(season_str)
            for key, function in scrap_functions.items():
                player[key] = function(soup_file, season=season_str)

            players_data[str(year)][url] = player


# # Save the data to a JSON file
# destination = OUTPUT_DIR / f'{START_YEAR}-{END_YEAR}.json'
# with open(str(destination), 'w') as f:
#     print(f'Saving data to {destination}')
#     json.dump(players_data, f, indent=4, default=str)

# save each season to a different JSON file
for year in range(START_YEAR, END_YEAR):
    destination = OUTPUT_DIR / f'{year-1}-{str(year)[2:]}.json'
    with open(str(destination), 'w') as f:
        print(f'Saving data to {destination}')
        json.dump(players_data[str(year)], f, indent=4, default=str)

2013-14
2016-17
2012-13
2015-16
2017-18
2016-17
2018-19
2017-18
2019-20
2013-14
2012-13
2010-11
2011-12
2013-14
2016-17
2012-13
2018-19
2015-16
2010-11
2014-15
2017-18
2011-12
2019-20
2011-12
2012-13
2015-16
2017-18
2013-14
2014-15
2019-20
2018-19
2010-11
2011-12
2016-17
2013-14
2012-13
2010-11
2011-12
2010-11
2011-12
2016-17
2015-16
2014-15
2013-14
2012-13
Saving data to data/2009-10.json
Saving data to data/2010-11.json
Saving data to data/2011-12.json
Saving data to data/2012-13.json
Saving data to data/2013-14.json
Saving data to data/2014-15.json
Saving data to data/2015-16.json
Saving data to data/2016-17.json
Saving data to data/2017-18.json
Saving data to data/2018-19.json
