# NBA stats scraper and analyzer

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import time
import os
import sys
import json
import argparse
import logging
import logging.config
import yaml
import datetime
import csv
import glob
import pathlib

In [3]:
START_YEAR = 2010
END_YEAR = 2020
OUTPUT_DIR = pathlib.Path() / 'data'
OVERWRITE = True

# Set up constants
BASE_URL = 'https://www.basketball-reference.com'
SEASONS_URL = 'https://www.basketball-reference.com/leagues/NBA_{}_per_game.html'
PLAYER_URL = 'https://www.basketball-reference.com'

# Set up dataframes
df_players = pd.DataFrame()
df_seasons = pd.DataFrame()
df_games = pd.DataFrame()


In [6]:

# Set up regex
re_player = re.compile(r'/players/([a-z])/([a-z]{5})([0-9]{2})\.html')
re_season = re.compile(r'/leagues/NBA_([0-9]{4})_per_game\.html')

# Set up headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}

# Set up session
session = requests.Session()
session.headers.update(headers)

# Set up functions
def get_seasons():
    """Get a list of seasons to scrape"""
    seasons = []
    for year in range(START_YEAR, END_YEAR + 1):
        seasons.append(SEASONS_URL.format(year))
    return seasons

def get_players(season):
    """Get a list of players to scrape"""
    players = []

    print(f'Getting players for {season}')
    r = session.get(season)
    soup = BeautifulSoup(r.text, 'html.parser')
    for a in soup.find_all('a', href=re_player):
        players.append(a['href'])
    return players

def get_player_data(player):
    """Get the player data"""
    print(f'Getting data for {player}')
    url = PLAYER_URL + player
    print(url)
    r = session.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    return soup

def get_player_info(player, soup):
    """Get the player info"""
    print(f'Getting info for {player}')
    
    info = {}
    for tr in soup.find_all('tr'):
        if tr.find('th', {'data-stat': 'season'}) is not None:
            season = tr.find('th', {'data-stat': 'season'}).text
            info[season] = {}
            for td in tr.find_all('td'):
                info[season][td['data-stat']] = td.text
    return info

def get_player_seasons(player, soup):
    """Get the player seasons"""
    print(f'Getting seasons for {player}')
    
    seasons = []
    for tr in soup.find_all('tr'):
        if tr.find('th', {'data-stat': 'season'}) is not None:
            season = tr.find('th', {'data-stat': 'season'}).text
            if '-' in season:
                seasons.append(season)
    
    # Remove duplicates
    return list(set(seasons))

def get_player_games(player, soup):
    """Get the player games"""
    print(f'Getting games for {player}')
    
    games = []
    for tr in soup.find_all('tr'):
        if tr.find('th', {'data-stat': 'game_num'}) is not None:
            game = {}
            for td in tr.find_all('td'):
                game[td['data-stat']] = td.text
            games.append(game)
    return games

def get_player_name(player, soup):
    """Get the player name"""
    print(f'Getting name for {player}')

    res = soup.find('div', {'id': 'meta'}).findAll('div')[1].find('h1').find('span').text
    print(res)
    return res


def get_player_height(player, soup):
    """Get the player height"""
    print(f'Getting height for {player}')
    
    # Find any nested <p> with 'cm,' inside
    ps = soup.findAll('p')
    for p in ps:
        if 'cm,' in p.text:
            return int(p.text.split('cm,')[0][-3:])


def get_player_weight(player, soup):
    """Get the player weight"""
    print(f'Getting weight for {player}')
    # Find any nested <p> with 'cm,' inside
    ps = soup.findAll('p')
    for p in ps:
        if 'cm,' in p.text:
            return int(p.text.split('kg)')[0][-3:])


def get_player_birth_date(player, soup):
    """Get the player birth date"""
    print(f'Getting birth date for {player}')
    # Click the button with id meta_more_button
    try:
        soup.find('button', {'id': 'meta_more_button'}).click()
    except:
        pass

    # Find any nested <p> with 'Born' inside
    ps = soup.findAll('p')
    for p in ps:
        if 'Born' in p.text:
            # Remove whitespaces and newlines from p.text
            text = ' '.join(p.text.split())
            str_date = re.search(r'(?<=Born: ).*(\s[0-9]{4})', text)[0]

            # Convert to datetime
            return datetime.datetime.strptime(str_date, '%B %d, %Y').date()


def get_player_birth_place(player, soup):
    """Get the player birth place"""
    print(f'Getting birth place for {player}')
    # Click the button with id meta_more_button
    try:
        soup.find('button', {'id': 'meta_more_button'}).click()
    except:
        pass

    # Find any nested <p> with 'Born' inside
    ps = soup.findAll('p')
    for p in ps:
        if 'Born' in p.text:
            # Remove whitespaces and newlines from p.text
            text = ' '.join(p.text.split())

            # Find text after 'in'
            str_birth = re.search(r'(?<=in ).*', text)[0]
            print(str_birth)
            return str_birth


def get_player_college(player, soup):
    """Get the player college"""
    print(f'Getting college for {player}')
    # Click the button with id meta_more_button
    try:
        soup.find('button', {'id': 'meta_more_button'}).click()
    except:
        pass

    # Find any nested <p> with 'Born' inside
    ps = soup.findAll('p')
    for p in ps:
        if 'College' in p.text:
            # Remove whitespaces and newlines from p.text
            text = ' '.join(p.text.split())

            college = text.split('College: ')[1]
            return college


def get_player_draft(player, soup):
    # Click the button with id meta_more_button
    try:
        soup.find('button', {'id': 'meta_more_button'}).click()
    except:
        pass

    # Find any nested <p> with 'Born' inside
    ps = soup.findAll('p')
    for p in ps:
        if 'Draft' in p.text:
            # Remove whitespaces and newlines from p.text
            text = ' '.join(p.text.split())

            draft = text.split('Draft: ')[1]
            return draft


def get_player_salary(player, soup):
    """Get the player salary"""
    print(f'Getting salary for {player}')

    try:
        soup.find('button', {'id': 'meta_more_button'}).click()
    except:
        pass

    # Load element with XPath //*[@id="all_salaries"]/tfoot
    salaries = soup.find('tfoot').findAll('tr')[1].findAll('td')[1].text
    
    return salaries

def get_player_experience(player, soup):
    """Get the player experience"""
    print(f'Getting experience for {player}')
    
    return soup.find('span', {'itemprop': 'experienceRequirements'}).text

def get_player_position(player, soup):
    """Get the player position"""
    print(f'Getting position for {player}')
    
    return soup.find('span', {'itemprop': 'position'}).text

def get_player_team(player, soup):
    """Get the player team"""
    print(f'Getting team for {player}')
    
    return soup.find('span', {'itemprop': 'memberOf'}).text

def get_player_stats(player, soup):
    """Get the player stats"""
    print(f'Getting stats for {player}')
    
    stats = {}
    for tr in soup.find_all('tr'):
        if tr.find('th', {'data-stat': 'season'}) is not None:
            season = tr.find('th', {'data-stat': 'season'}).text
            stats[season] = {}
            for td in tr.find_all('td'):
                stats[season][td['data-stat']] = td.text
    return stats

# Regroup functions in a dict
functions = {
    'name': get_player_name,
    'info': get_player_info,
    'seasons': get_player_seasons,
    'games': get_player_games,
    'height': get_player_height,
    'weight': get_player_weight,
    'birth_date': get_player_birth_date,
    'birth_place': get_player_birth_place,
    'college': get_player_college,
    'draft': get_player_draft,
    'salary': get_player_salary,
    'experience': get_player_experience,
    'position': get_player_position,
    'team': get_player_team,
    'stats': get_player_stats
}


In [7]:
# Download HTML to scrape
seasons = get_seasons()
for season in seasons:
    
    # Get the players
    players = get_players(season)

    season_year = season.split('_')[1]
    
    
    players_data = {}
    for player in players:
        print('player', player)
        players_data[player] = {}
        players_data[player]['season'] = season_year
        players_data[player]['player'] = player
        soup_file = get_player_data(player)
        
        # Scrape HTML
        for key, function in functions.items():
            players_data[player][key] = function(player, soup_file)
            #print(players_data[player][key])

    
    # Save the data to a JSON file
    destination = OUTPUT_DIR / f'{season_year}.json'
    with open(str(destination), 'w') as f:
        print(f'Saving data to {destination}')
        json.dump(players_data, f, indent=4)





Getting players for https://www.basketball-reference.com/leagues/NBA_2010_per_game.html
player /players/g/gayru01.html
Getting data for /players/g/gayru01.html
https://www.basketball-reference.com/players/g/gayru01.html
Getting name for /players/g/gayru01.html
Rudy Gay
Getting info for /players/g/gayru01.html
Getting seasons for /players/g/gayru01.html
Getting games for /players/g/gayru01.html
Getting height for /players/g/gayru01.html
Getting weight for /players/g/gayru01.html
Getting birth date for /players/g/gayru01.html
Getting birth place for /players/g/gayru01.html
Brooklyn, New York us
Getting college for /players/g/gayru01.html
Getting salary for /players/g/gayru01.html


AttributeError: 'NoneType' object has no attribute 'find'