# Import Libraries

In [1]:
import os
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Scrape NBA Stats

Request the Basketball Reference page of 2018-2019 NBA player totals, which includes the 3-point stats

In [2]:
# Browser-like User-Agent header sent with the request.
get_headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}
# Basketball Reference season-totals page that the following cells parse.
url  = 'https://www.basketball-reference.com/leagues/NBA_2019_totals.html#totals_stats::fg3'
# Give up after 5 seconds rather than hanging the notebook on a stalled connection.
response = requests.get(url, headers = get_headers, timeout=5)
# Fail here on an HTTP error status (e.g. 403/429/500) instead of letting an
# error page flow into the parsing cells, where it would surface as an
# unrelated-looking AttributeError.
response.raise_for_status()

Parse stats from Basketball Reference html into stats_list

In [3]:
# Parse the downloaded page, locate the totals table by its id, and select
# the rows carrying the 'full_table' class.
soup = BeautifulSoup(response.content, 'html.parser')
table = soup.find('table', attrs={'id': 'totals_stats'})
rows = table.find_all('tr', attrs={'class': 'full_table'})

# Build one list of cell strings per selected row.
stats_list = []
for player_row in rows:
    cell_texts = []
    for cell in player_row.find_all('td'):
        # When the cell contains a link, take the text of its first <a>;
        # otherwise take the text of the cell itself.
        link = cell.find('a')
        text_source = cell if link is None else link
        cell_texts.append(text_source.get_text())
    stats_list.append(cell_texts)

Parse column headers from Basketball Reference html into columns_list

In [4]:
# The first <tr> found in the stats table is treated as the header row;
# the text of each of its <th> cells becomes a column label.
header = table.find('tr')
columns_list = [label.get_text() for label in header.find_all('th')]

Create df from stats_list

In [5]:
# Drop the 'Rk' label before using the labels as column names (presumably
# because the parsed rows only contain <td> cells and so have no rank value --
# confirm against the page markup).
# Filtering into a new list, rather than calling list.remove(), keeps this
# cell re-runnable: remove() raises ValueError once 'Rk' is already gone.
columns_list = [label for label in columns_list if label != 'Rk']
stats_df = pd.DataFrame(stats_list, columns=columns_list)
print('stats_df shape: ', stats_df.shape)
stats_df.head()

stats_df shape:  (530, 29)


Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Álex Abrines,SG,25,OKC,31,2,588,56,157,0.357,...,0.923,5,43,48,20,17,6,14,53,165
1,Quincy Acy,PF,28,PHO,10,0,123,4,18,0.222,...,0.7,3,22,25,8,1,4,4,24,17
2,Jaylen Adams,PG,22,ATL,34,1,428,38,110,0.345,...,0.778,11,49,60,65,14,5,28,45,108
3,Steven Adams,C,25,OKC,80,80,2669,481,809,0.595,...,0.5,391,369,760,124,117,76,135,204,1108
4,Bam Adebayo,C,21,MIA,82,28,1913,280,486,0.576,...,0.735,165,432,597,184,71,65,121,203,729


Save df with player stats to csv

In [6]:
# Make sure the output folder exists so the save does not fail with an
# OSError on a fresh checkout; exist_ok makes this a no-op when it is there.
os.makedirs('Data', exist_ok=True)
# Write the player stats without the DataFrame index column.
stats_df.to_csv('Data/player_stats.csv', index=False)

# Scrape NBA Combine Data

Create Selenium webdriver

In [7]:
# Location of the chromedriver binary. Set the CHROMEDRIVER_PATH environment
# variable to point at your own copy; the fallback is the original
# machine-specific path, kept so existing setups continue to work.
driver_path = os.environ.get(
    'CHROMEDRIVER_PATH',
    '/Users/jesseblant/Documents/chromedriver/chromedriver',
)
options = Options()
# Fixed window size for the browser session.
options.add_argument("--window-size=1920,1200")
# NOTE(review): `executable_path` is deprecated in Selenium 4 in favour of a
# Service object -- confirm the installed Selenium version before upgrading.
driver = webdriver.Chrome(options=options, executable_path=driver_path)

Create list of nba.com draft combine stat webpage urls to scrape

In [8]:
# One combine-anthro url per season: SeasonYear runs from 2000-01 through
# 2018-19, with both halves zero-padded to two digits.
urls_list = [
    f'https://www.nba.com/stats/draft/combine-anthro/?sort=HEIGHT_WO_SHOES&dir=1&SeasonYear=20{start:02}-{start + 1:02}'
    for start in range(19)
]

Use the Selenium webdriver to load each NBA draft combine url. (This opens a browser window for each url, which allows nba.com to display its ad and lets the GET request succeed.)

In [9]:
# Load every combine page in the browser and keep [url, page html] pairs
# for parsing in later cells.
combine_scrapes = []
try:
    for url in urls_list:
        driver.get(url)
        combine_scrapes.append([url, driver.page_source])
finally:
    # Always shut the browser down, even if a page load fails, so Chrome and
    # chromedriver processes are not left running. To re-run this cell,
    # re-run the cell that creates `driver` first.
    driver.quit()

Parse draft combine measurements from combine_scrapes

In [10]:
# Flatten every scraped page into rows of the form [season, cell text, ...].
combine_data = []
for page_url, page_html in combine_scrapes:
    # The season label is whatever follows the last '=' in the url.
    season = page_url.split('=')[-1]
    page_soup = BeautifulSoup(page_html, 'html.parser')
    stat_table = page_soup.find('div', attrs={'class': 'nba-stat-table__overflow'})
    # The first <tr> is skipped; every remaining row contributes one record.
    for table_row in stat_table.find_all('tr')[1:]:
        measurements = [cell.text for cell in table_row.find_all('td')]
        combine_data.append([season] + measurements)
            

Parse draft combine datapoint headers from combine_scrapes

In [11]:
# Read the raw column labels from the second scraped page (index 1):
# the text of every <th> in the first <tr> of its stat table container.
soup = BeautifulSoup(combine_scrapes[1][1], 'html.parser')
table = soup.find('div', attrs={'class': 'nba-stat-table__overflow'})
header_row = table.find('tr')
headers = [th.text for th in header_row.find_all('th')]

Clean combine datapoint headers

In [12]:
# Normalise each raw label: non-breaking spaces become plain spaces, newlines
# are removed, surrounding whitespace is trimmed, and the result is
# title-cased. 'Year' goes first to label the season value that each parsed
# combine row starts with.
cleaned_headers = ['Year'] + [
    raw_label.replace('\xa0', ' ').replace('\n', '').strip().title()
    for raw_label in headers
]

Create df from list of combine measurement datapoints

In [13]:
# Assemble the parsed combine rows into a DataFrame labelled with the cleaned headers.
combine_df = pd.DataFrame(data=combine_data, columns=cleaned_headers)
# Report the (rows, columns) shape, then display the last five rows.
print('combine_df shape: ', combine_df.shape)
combine_df.tail()

combine_df shape:  (1273, 11)


Unnamed: 0,Year,Player,Pos,Body Fat %,Hand Length (Inches),Hand Width (Inches),Height W/O Shoes,Height W/ Shoes,Standing Reach,Weight (Lbs),Wingspan
1268,2018-19,Trae Young,PG,5.35%,8.0,9.25,6' 0.5'',6' 1.75'',7' 11.5'',177.8,6' 3''
1269,2018-19,Jevon Carter,PG,4.15%,9.25,8.5,6' 0.25'',6' 1.5'',7' 11'',196.2,6' 4.25''
1270,2018-19,Devonte Graham,PG,4.80%,8.5,9.5,6' 0.25'',6' 1.5'',8' 0'',186.4,6' 6.25''
1271,2018-19,Aaron Holiday,PG,6.40%,8.75,8.5,5' 11.75'',6' 0.75'',8' 1'',187.0,6' 7.5''
1272,2018-19,Carsen Edwards,PG,6.30%,8.5,9.25,5' 10.75'',6' 0'',7' 10.5'',195.6,6' 6.25''


Save combine_df to csv

In [14]:
# Make sure the output folder exists so the save does not fail with an
# OSError on a fresh checkout; exist_ok makes this a no-op when it is there.
os.makedirs('Data', exist_ok=True)
# Write the combine measurements without the DataFrame index column.
combine_df.to_csv('Data/combine_measurements.csv', index=False)