In [1]:
import os
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import time
from tqdm import tqdm
try:
    from curl_cffi import requests as crequests
except:
    ! pip install curl_cffi
    from curl_cffi import requests as crequests

#### Functions

In [2]:
def scrape_all_stars(str_url):
    """
    Download one page and collect the names found in its tables.

    The first <table> of the page is ignored. For every remaining table, the
    text of the first <th> of each body row is taken as a name; empty cells
    and the 'Reserves' separator are skipped.

    Parameters
    ----------
    str_url : str
        URL of the page to scrape.

    Returns
    -------
    pandas.DataFrame
        A single 'player' column with one row per name. Index labels restart
        at 0 for every source table (the per-table frames are concatenated
        without resetting the index).

    Raises
    ------
    ValueError
        If the page has no table with a <tbody> after the first table.
    Exception
        Whatever `raise_for_status()` raises for an unsuccessful HTTP status.
    """
    # get request
    r = crequests.get(
        str_url,
        headers=BROWSER_HEADERS,
        impersonate="chrome",
        timeout=60,
    )
    r.raise_for_status()
    # decode: fall back to the detected encoding when none is set
    if not r.encoding:
        r.encoding = r.apparent_encoding
    # get soup
    soup = BeautifulSoup(r.text, 'html.parser')

    # get tables, dropping the first one
    # NOTE(review): presumably the first table is not a roster - confirm against the page layout
    tables = soup.find_all("table")[1:]

    # loop through tables
    list_df = []
    for table in tables:
        # get body; a table without one has no rows to read
        tbody = table.find("tbody")
        if tbody is None:
            continue
        # get names
        list_str_name = []
        for row in tbody.find_all("tr"):
            # the name sits in the first header cell; skip rows that have none
            th = row.find('th')
            if th is None:
                continue
            str_name = th.get_text(strip=True)
            # skip separator rows and empty cells
            if str_name not in ['Reserves', '']:
                list_str_name.append(str_name)
        # one frame per table
        list_df.append(pd.DataFrame({'player': list_str_name}))

    # fail with a clear message instead of pandas' "No objects to concatenate"
    if not list_df:
        raise ValueError(f'No player tables found at {str_url}')

    # concat
    df = pd.concat(list_df)
    return df

#### Constants

In [3]:
# directory the resulting csv is written to
str_dirname_output = './output'

# seconds to wait after each request
int_time_sleep = 2

# headers sent with every request (long values are split over several literals)
BROWSER_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36"
    ),
    "Accept": (
        "text/html,application/xhtml+xml,"
        "application/xml;q=0.9,*/*;q=0.8"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://www.basketball-reference.com/",
    "Upgrade-Insecure-Requests": "1",
}

#### Output

In [4]:
# create the output directory, including missing parents; an existing directory
# is fine, but a regular file sitting at that path still raises FileExistsError
os.makedirs(str_dirname_output, exist_ok=True)

#### Scrape NBA All-Stars

In [5]:
# one frame per successfully scraped year
list_df = []
for a in tqdm(range(1980, 2025)):
    # url of the all-star page for year a
    str_url = f'https://www.basketball-reference.com/allstar/NBA_{a}.html'
    try:
        # scrape all-stars and tag every row with the year (assign returns a new frame)
        df = scrape_all_stars(str_url=str_url).assign(year=a)
    except Exception as e:
        # best effort: report the failing year and carry on with the next one
        str_error = f'Error for year {a}: {e}'
        print(str_error)
    else:
        # keep the frame only when the scrape succeeded
        list_df.append(df)
    # pause after every request, successful or not
    time.sleep(int_time_sleep)

 42%|████▏     | 19/45 [00:40<00:55,  2.14s/it]

Error for year 1999: HTTP Error 404: 


100%|██████████| 45/45 [01:36<00:00,  2.16s/it]


#### Concatenate

In [6]:
# stack the yearly frames row-wise; index labels are kept as they are
df = pd.concat(objs=list_df, axis=0)

# display the combined frame
df

Unnamed: 0,player,year
0,George Gervin,1980
1,Eddie Johnson,1980
2,Moses Malone,1980
3,Julius Erving,1980
4,John Drew,1980
...,...,...
7,Anthony Davis,2024
8,Devin Booker,2024
9,Anthony Edwards,2024
10,Kawhi Leonard,2024


#### Save

In [7]:
# destination: <output directory>/df.csv
str_filename = 'df.csv'
str_filepath = '/'.join((str_dirname_output, str_filename))
# write the frame without its index column
df.to_csv(path_or_buf=str_filepath, index=False)