# Import Libraries

In [3]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from fake_useragent import UserAgent
import time

# Scrape NBA Stats

Get url with 2018-2019 nba 3 point stats from Basketball Reference

In [4]:
get_headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}
url  = 'https://www.basketball-reference.com/leagues/NBA_2019_totals.html#totals_stats::fg3'
response = requests.get(url, headers = get_headers, timeout=5)

Parse stats from Basketball Reference html into stats_list

In [5]:
stats_list = []
soup = BeautifulSoup(response.content, 'html.parser')
table = soup.find('table', {'id' : 'totals_stats'})
rows = table.find_all('tr', {'class': 'full_table'})
for row in rows:
    row_stats = []
    tds = row.find_all('td')
    for td in tds:
        if len(td.find_all('a')) > 0:
            row_stats.append(td.find('a').get_text())
        else:
            row_stats.append(td.get_text())
    stats_list.append(row_stats)

Parse column headers from Basketball Reference html into headers_list

In [6]:
columns_list = []
header = table.find('tr')
labels = header.find_all('th')
for label in labels:
    columns_list.append(label.get_text())

Create df from stats_list

In [7]:
columns_list.remove('Rk')
stats_df = pd.DataFrame(stats_list, columns=columns_list)
print('stats_df shape: ', stats_df.shape)
stats_df.head()

stats_df shape:  (530, 29)


Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Álex Abrines,SG,25,OKC,31,2,588,56,157,0.357,...,0.923,5,43,48,20,17,6,14,53,165
1,Quincy Acy,PF,28,PHO,10,0,123,4,18,0.222,...,0.7,3,22,25,8,1,4,4,24,17
2,Jaylen Adams,PG,22,ATL,34,1,428,38,110,0.345,...,0.778,11,49,60,65,14,5,28,45,108
3,Steven Adams,C,25,OKC,80,80,2669,481,809,0.595,...,0.5,391,369,760,124,117,76,135,204,1108
4,Bam Adebayo,C,21,MIA,82,28,1913,280,486,0.576,...,0.735,165,432,597,184,71,65,121,203,729


Save df with player stats to csv

In [8]:
stats_df.to_csv('Data/player_stats.csv', index=False)

# Scrape NBA Combine Data

Create Selenium webdriver

In [11]:
driver_path = '/Users/jesseblant/Documents/chromedriver/chromedriver'
options = Options()
options.add_argument("--window-size=1920,1200")
driver = webdriver.Chrome(options=options, executable_path=driver_path)

Create list of nba.com draft combine stat webpage urls to scrape

In [12]:
urls_list = []
for num in range(0,19):
    num2 = f'{num+1:02}'
    num = f'{num:02}'
    urls_list.append(
        f'https://www.nba.com/stats/draft/combine-anthro/?sort=HEIGHT_WO_SHOES&dir=1&SeasonYear=20{num}-{num2}')

Use selenium webdriver to get each nba draft combine url (This opens a window for each url, which allows nba.com to display its ad and the get to work)

In [13]:
combine_scrapes = []
for url in urls_list:
    driver.get(url)
    combine_scrapes.append([url, driver.page_source])

Parse draft combine measurements from combine_scrapes

In [14]:
combine_data = []
for combine_scrape in combine_scrapes:
    url = combine_scrape[0]
    soup = BeautifulSoup(combine_scrape[1], 'html.parser')
    table = soup.find('div', {'class': 'nba-stat-table__overflow'})
    rows = table.find_all('tr')
    for row in rows[1:]:
        datapoints = row.find_all('td')
        row_data = [url.split('=')[-1]]
        for datapoint in datapoints:
            row_data.append(datapoint.text)
        combine_data.append(row_data)
            

AttributeError: 'NoneType' object has no attribute 'find_all'

Parse draft combine datapoint headers from combine_scrapes

In [None]:
soup = BeautifulSoup(combine_scrapes[1][1], 'html.parser')
table = soup.find('div', {'class': 'nba-stat-table__overflow'})
row = table.find('tr')
fields = row.find_all('th')
headers = []
for field in fields:
    headers.append(field.text)

Clean combine datapoint headers

In [None]:
cleaned_headers = ['Year']
for header in headers:
    header = header.replace('\xa0', ' ')
    header = header.replace('\n', '')
    header = header.strip()
    header = header.title()
    cleaned_headers.append(header)

Create df from list of combine measurement datapoints

In [None]:
combine_df = pd.DataFrame(combine_data, columns=cleaned_headers)
print('combine_df shape: ', combine_df.shape)
combine_df.tail()

Save combine_df to csv

In [None]:
combine_df.to_csv('Data/combine_measurements.csv', index=False)

# Scrape NBA Salary Data

In [30]:
salary_url = 'https://hoopshype.com/salaries/players/2018-2019/'
salary_response = requests.get(salary_url, headers = get_headers, timeout=5)

In [47]:
salary_soup = BeautifulSoup(salary_response.content, 'html.parser')
salary_table = salary_soup.find('tbody')
salary_rows = salary_table.find_all('tr')
salary_data = []
for row in salary_rows:
    salary_row_data = []
    for datapoint in row.find_all('td'):
        row_data.append(datapoint.text.replace('\t', '').replace('\n', ''))
    data.append(row_data)

In [51]:
salary_df = pd.DataFrame(data, columns=['Index', 'Player', 'Salary', 'Inflation Adjusted Salary'])
salary_df.drop(columns=['Index'], inplace=True)
salary_df.head()

Unnamed: 0,Player,Salary,Inflation Adjusted Salary
0,Stephen Curry,"$37,457,154","$38,320,489"
1,Russell Westbrook,"$35,665,000","$36,487,029"
2,Chris Paul,"$35,654,150","$36,475,929"
3,LeBron James,"$35,654,150","$36,475,929"
4,Kyle Lowry,"$32,700,000","$33,453,690"
