In [1]:
import os
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import time
from tqdm import tqdm
try:
    from curl_cffi import requests as crequests
except:
    ! pip install curl_cffi
    from curl_cffi import requests as crequests

#### Functions

In [2]:
def scrape_draft(str_url):
    # get request
    r = crequests.get(
        str_url,
        headers=BROWSER_HEADERS,
        impersonate="chrome",
        timeout=60,
    )
    r.raise_for_status()
    # decode
    if not r.encoding:
        r.encoding = r.apparent_encoding
    # get soup
    soup = BeautifulSoup(r.text, 'html.parser')
    
    # get the table
    table = soup.find("table", id="stats")
    # get the body
    tbody = table.find("tbody")
    # get rows
    rows = tbody.find_all("tr")
    # loop through rows
    list_dict_row = []
    for row in rows:
        # get columns
        tds = row.find_all(['td'])
        # create a row
        dict_row = {}
        for td in tds:
            # get stat
            str_stat = td.get('data-stat')
            # get value
            str_value = td.get_text(strip=True)
            # assign
            dict_row[str_stat] = str_value
        # append
        list_dict_row.append(dict_row)
    
    # make df
    df = pd.DataFrame(list_dict_row)
    return df

#### Constants

In [3]:
str_dirname_output = './output'

int_time_sleep = 2

# headers
BROWSER_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://www.basketball-reference.com/",
    "Upgrade-Insecure-Requests": "1",
}

#### Output

In [4]:
try:
    os.mkdir(str_dirname_output)
except FileExistsError:
    pass

#### Scrape NBA Drafts

In [5]:
list_df = []
for a in tqdm(range(1980, 2016)):
    # url
    str_url = f'https://www.basketball-reference.com/draft/NBA_{a}.html'
    # scrape draft
    df = scrape_draft(str_url=str_url)
    # make column for year
    df['year'] = a
    # make copy
    df = df.copy()
    # append
    list_df.append(df)
    # pause
    time.sleep(int_time_sleep)

100%|██████████| 36/36 [01:21<00:00,  2.27s/it]


#### Concatenate

In [6]:
df = pd.concat(list_df)

# show
df

Unnamed: 0,pick_overall,team_id,player,college_name,seasons,g,mp,pts,trb,ast,...,mp_per_g,pts_per_g,trb_per_g,ast_per_g,ws,ws_per_48,bpm,vorp,year,skip
0,1,GSW,Joe Barry Carroll,Purdue,10,705,22838,12455,5404,1264,...,32.4,17.7,7.7,1.8,35.6,.075,-0.6,8.2,1980,
1,2,UTA,Darrell Griffith,Louisville,10,765,21403,12391,2519,1627,...,28.0,16.2,3.3,2.1,22.0,.049,-1.4,3.4,1980,
2,3,BOS,Kevin McHale,Minnesota,13,971,30118,17335,7122,1670,...,31.0,17.9,7.3,1.7,113.0,.180,2.5,34.3,1980,
3,4,CHI,Kelvin Ransey,Ohio State,6,474,11586,5380,901,2480,...,24.4,11.4,1.9,5.2,17.9,.074,-0.9,3.3,1980,
4,5,DEN,James Ray,Jacksonville University,3,103,843,334,228,76,...,8.2,3.2,2.2,0.7,-0.3,-.017,-4.5,-0.6,1980,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,56,NOP,Branden Dawson,Michigan State,1,6,29,5,4,0,...,4.8,0.8,0.7,0.0,0.0,.069,-5.7,0.0,2015,
58,57,DEN,Nikola Radičević,,,,,,,,...,,,,,,,,,2015,
59,58,PHI,J.P. Tokoto,UNC,,,,,,,...,,,,,,,,,2015,
60,59,ATL,Dimitrios Agravanis,,,,,,,,...,,,,,,,,,2015,


#### Save

In [7]:
str_filename = 'df.csv'
str_filepath = f'{str_dirname_output}/{str_filename}'
df.to_csv(str_filepath, index=False)