# NBA DX project

## Player scraper

In [1]:
from bs4 import BeautifulSoup, Comment
import pathlib
import json
import time
from scraper.scraper import scrap_functions
from scraper.scraper import get_player_seasons
from scraper.url import get_seasons_url, get_players_url, get_player_soup
from tqdm import tqdm
import pandas as pd
import pickle

from const import *
from utils import *

In [2]:
# Ensure the output directory and its players/urls subdirectories exist
# (missing parents are created, already-existing directories are left alone).
for _required_dir in (
    OUTPUT_DIR,
    OUTPUT_DIR / PLAYERS_SUBDIR,
    OUTPUT_DIR / URLS_SUBDIR,
):
    _required_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# Get the season URLs for the START_YEAR..END_YEAR range. Each entry is later
# passed to season_string_from_url() and get_players_url().
# NOTE(review): whether get_seasons_url() performs network requests or only
# builds the URLs is not visible from this notebook -- confirm.
seasons = get_seasons_url(START_YEAR, END_YEAR)

In [4]:
# Collect the player URLs of every season. Each season's URL list is cached
# as a pickle file under OUTPUT_DIR / URLS_SUBDIR so that it is fetched with
# get_players_url() only the first time.
urls = []
for season in seasons:
    # season string extracted from the season url; used as the cache file name
    season_str = season_string_from_url(season)
    cache_file = OUTPUT_DIR / URLS_SUBDIR / f'{season_str}.pkl'

    if cache_file.exists():
        # Season already downloaded: reuse the cached list.
        # NOTE: pickle.load executes arbitrary code from the file; only load
        # cache files that were written by this notebook.
        with open(cache_file, 'rb') as f:
            player_urls = pickle.load(f)
    else:
        player_urls = get_players_url(season)

        # Save the freshly retrieved urls so the next run can skip this season
        with open(cache_file, 'wb') as f:
            pickle.dump(player_urls, f)

    urls.extend(player_urls)

# Remove duplicates (the same player url can be returned for several seasons)
# while keeping first-seen order. list(set(...)) would yield an arbitrary
# order that changes between runs, making any later slice of `urls`
# non-reproducible.
urls = list(dict.fromkeys(urls))

In [4]:
# Scrape the per-season data of each player and group it by year:
# players_data[str(year)][url] -> dict of scraped values for that season.
players_data = {str(year): {} for year in range(START_YEAR, END_YEAR + 1)}

# loop through first 10 urls
for url in urls[:10]:
    # get the player's HTML page
    soup_file = get_player_soup(url)
    # Use a dedicated name here: rebinding `seasons` would overwrite the list
    # of season urls built in an earlier cell and break re-running that cell.
    player_seasons = get_player_seasons(soup_file)

    for season_str in player_seasons:
        year = season_number_from_string(season_str)
        # Ignore seasons outside the configured range before doing any work
        if not (START_YEAR <= year <= END_YEAR):
            continue

        # Scrape HTML: one value per entry of scrap_functions
        player = {URL: url, SEASON: season_str}
        for key, function in scrap_functions.items():
            player[key] = function(soup_file, season=season_str)

        players_data[str(year)][url] = player


# save each season to a different JSON file
for year in range(START_YEAR, END_YEAR + 1):
    destination = OUTPUT_DIR / f'{season_string_from_number(year)}.json'
    with open(str(destination), 'w') as f:
        print(f'Saving data to {destination}')
        # default=str: any value json cannot encode natively is written as
        # its str() representation instead of raising TypeError.
        json.dump(players_data[str(year)], f, indent=4, default=str)

Saving data to data/1959-60.json
Saving data to data/1960-61.json
Saving data to data/1961-62.json
Saving data to data/1962-63.json
Saving data to data/1963-64.json
Saving data to data/1964-65.json
Saving data to data/1965-66.json
Saving data to data/1966-67.json
Saving data to data/1967-68.json
Saving data to data/1968-69.json
Saving data to data/1969-70.json
Saving data to data/1970-71.json
Saving data to data/1971-72.json
Saving data to data/1972-73.json
Saving data to data/1973-74.json
Saving data to data/1974-75.json
Saving data to data/1975-76.json
Saving data to data/1976-77.json
Saving data to data/1977-78.json
Saving data to data/1978-79.json
Saving data to data/1979-80.json
Saving data to data/1980-81.json
Saving data to data/1981-82.json
Saving data to data/1982-83.json
Saving data to data/1983-84.json
Saving data to data/1984-85.json
Saving data to data/1985-86.json
Saving data to data/1986-87.json
Saving data to data/1987-88.json
Saving data to data/1988-89.json
Saving dat