In [1]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from bs4 import BeautifulSoup
import pandas as pd
from multiprocessing import Pool
import re
import pickle
import numpy as np

In [8]:
def fix_asterisk(string):
    return string.replace('*', '')

def scrape_fa(year):
    session = requests.Session()
    retry = Retry(connect=3, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    
    my_headers = {
        'authority': 'en.wikipedia.org',
        'cache-control': 'max-age=0',
        'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="96", "Microsoft Edge";v="96"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55 Safari/537.36 Edg/96.0.1054.43',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '?1',
        'sec-fetch-dest': 'document',
        'accept-language': 'en,zh-CN;q=0.9,zh;q=0.8,en-US;q=0.7',
        'cookie': 'WMF-Last-Access=09-Dec-2021; WMF-Last-Access-Global=09-Dec-2021; GeoIP=US:::37.75:-97.82:v4; enwikimwuser-sessionId=c57e79277dd10e90044c; enwikiel-sessionId=361cd58612199630ad70; enwikiwmE-sessionTickLastTickTime=1639063998485; enwikiwmE-sessionTickTickCount=10',
    }
    
    url = 'https://en.wikipedia.org/wiki/List_of_{0}%E2%80%93{1}_NBA_season_transactions'.format(year, str(year+1)[2:])

    response = session.get(url, headers=my_headers)
    print(response)
    soup = BeautifulSoup(response.text)
    
    if year == 2010:
        signed = soup.find_all('table', {'class': 'wikitable'})[5]
        signed_players = list(pd.read_html(str(signed))[0]['Signed in the off-season'].values)
        
        waived = soup.find_all('table', {'class': 'wikitable'})[7]
        waived_players = list(pd.read_html(str(waived))[0]['Player'].values)
    elif year == 2009:
        signed = soup.find_all('table', {'class': 'wikitable'})[5]
        signed_players = list(pd.read_html(str(signed))[0]['Signed in the off-season'].values)
        
        waived = soup.find_all('table', {'class': 'wikitable'})[7]
        waived_players = list(pd.read_html(str(waived))[0]['Player'].values)
    else:
        try:
            for t in soup.find_all('table', {'class': 'wikitable sortable'}):
                ths = str(t.find_all('th'))
                if ('Date signed' in ths) and ('New team' in ths) and ('Former team' in ths):
                    signed_players = np.unique(list(pd.read_html(str(t))[0]['Player'].values))
                if ('Waived' in ths) or ('waived' in ths):
                    waived_players = np.unique(list(pd.read_html(str(t))[0]['Player'].values))
        except KeyError:
            print(url)
            return None
    try:
        signed_players, waived_players = [fix_asterisk(signed_player) for signed_player in signed_players], [fix_asterisk(waived_player) for waived_player in waived_players]
        players = [signed_player for signed_player in signed_players if signed_player not in waived_players]
        if year == 2012:
            print('Elton Brand' in waived_players)
        print(url + " " + str(len(signed_players)) + " " + str(len(waived_players)) + " " + str(len(players)))
        df = pd.DataFrame({'Player': players})
        df['previous season'] = str(year-1) + "-" + str(year)[2:]
    except (UnboundLocalError, AttributeError, NameError) as err:
        print(url)
        print(err)
        return None

    return df

In [9]:
with Pool(12) as p:
    free_agents = p.map(scrape_fa, list(range(2005, 2022)))

<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
https://en.wikipedia.org/wiki/List_of_2006%E2%80%9307_NBA_season_transactions 76 141 61
https://en.wikipedia.org/wiki/List_of_2008%E2%80%9309_NBA_season_transactions 35 28 35
<Response [200]><Response [200]>

<Response [200]>
https://en.wikipedia.org/wiki/List_of_2007%E2%80%9308_NBA_season_transactions 192 70 150
https://en.wikipedia.org/wiki/List_of_2005%E2%80%9306_NBA_season_transactions 167 180 96
https://en.wikipedia.org/wiki/List_of_2009%E2%80%9310_NBA_season_transactions 123 66 86
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
https://en.wikipedia.org/wiki/List_of_2010%E2%80%9311_NBA_season_transactions 163 64 116
<Response [200]>
True
https://en.wikipedia.org/wiki/List_of_2012%E2%80%9313_NBA_season_transactions 300 77 252
https://en.wikipedia.org/wiki/List_of_2015%E2%80%9316_NBA_season_transactions 320 67 275
https://en.wikipedia.org/wiki/List_of_2016%E2%80%9317_NBA_season_t

In [10]:
free_agents = pd.concat(free_agents, axis=0)

In [16]:
free_agents

Unnamed: 0,Player,previous season
0,Alan Henderson,2004-05
1,Alejandro Sánchez,2004-05
2,Alex Garcia,2004-05
3,Anthony Carter,2004-05
4,Antonio Daniels,2004-05
...,...,...
289,Yves Pons,2020-21
290,Zach Collins,2020-21
291,Zavier Simpson,2020-21
292,Zylan Cheatham,2020-21


In [15]:
with open('free_agents.pickle', 'wb') as handle:
    pickle.dump(free_agents, handle, protocol=pickle.HIGHEST_PROTOCOL)