In [2]:
from bs4 import BeautifulSoup
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

In [63]:
import pickle
import re
import time
import unicodedata
from io import StringIO
from multiprocessing import Pool
from multiprocessing.pool import ThreadPool

import pandas as pd

## Define elements for scraping

In [34]:
# Browser-like request headers sent with every request made by the scraper.
my_headers = {
    'Proxy-Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55 Safari/537.36 Edg/96.0.1054.43',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'en,zh-CN;q=0.9,zh;q=0.8,en-US;q=0.7',
}

# Season start years to scrape: 2005 through 2021 inclusive.
seasons = list(range(2005, 2022))

In [53]:
def _page_count(soup):
    """Return the trailing integer of the 'page-numbers' div text, or 1 if it cannot be read."""
    pager = soup.find('div', {'class': 'page-numbers'})
    if pager is None:
        return 1
    # Presumably the pager reads like "1 of 14"; take the trailing integer
    # whatever its width instead of slicing a fixed two characters.
    match = re.search(r'(\d+)\s*$', pager.text)
    return int(match.group(1)) if match else 1


def _clean_salary_table(raw, season):
    """Drop repeated 'RK' header rows, parse salaries to int and label the season."""
    # .copy() so the column assignments below act on an independent frame
    # rather than on a filtered view of ``raw``.
    rows = raw.loc[raw[0] != 'RK'].copy()
    rows[3] = rows[3].apply(lambda x: int(re.sub(r'[^\d.]', '', x)))
    rows['season'] = str(season) + "-" + str(season + 1)[2:]
    rows = rows.rename(columns={1: 'Player', 3: 'Salary'})
    return rows[['Player', 'Salary', 'season']]


def all_player_salaries(season, timeout=30):
    """Scrape every page of ESPN's NBA salary listing for one season.

    Parameters
    ----------
    season : int
        Starting year of the season. The site is queried with ``season + 1``
        and rows are labelled like ``"2005-06"``.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up, so a stalled
        connection cannot block the worker indefinitely.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``Player``, ``Salary`` (int) and ``season``. ``None`` when the
        first request does not return HTTP 200 or when no page yielded a table.

    Raises
    ------
    requests.exceptions.RequestException
        Connection errors and timeouts are not caught here.
    """
    url = 'http://www.espn.com/nba/salaries/_/year/{0}'.format(season + 1)

    retry = Retry(connect=3, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)

    # The context manager closes the session's pooled connections on exit,
    # including on the early-return and exception paths.
    with requests.Session() as session:
        session.mount('http://', adapter)
        session.mount('https://', adapter)

        response = session.get(url, headers=my_headers, timeout=timeout)
        if response.status_code != 200:  # handle fail case from response
            print(response.status_code)
            return None

        html_soup = BeautifulSoup(response.text, 'html.parser')
        page_numbers = _page_count(html_soup)

        page_tables = []
        for page in range(1, page_numbers + 1):
            url_page = '{0}/page/{1}'.format(url, page)
            response_page = session.get(url_page, headers=my_headers, timeout=timeout)
            if response_page.status_code != 200:  # handle fail case from response
                print(response_page.status_code)
                print(url_page)  # report the page that failed, not the base URL
                continue

            soup_page = BeautifulSoup(response_page.text, 'html.parser')
            html_tables = soup_page.find_all('table')
            if not html_tables:
                print('no table found: ' + url_page)
                continue
            # read_html is given a file-like object; passing literal HTML
            # strings is deprecated in recent pandas releases.
            page_tables.append(pd.read_html(StringIO(str(html_tables[0])))[0])

    if not page_tables:
        return None

    # Single concat instead of growing a DataFrame page by page.
    return _clean_salary_table(pd.concat(page_tables), season)

In [54]:
# The scrape is network-bound, so worker threads are enough. A thread pool
# also avoids pickling the notebook-defined function for child processes,
# which does not work when multiprocessing uses the "spawn" start method.
# map() returns results in the same order as `seasons`.
with ThreadPool(12) as pool:
    player_salary = pool.map(all_player_salaries, seasons)

In [56]:
# Combine the per-season frames with a single concat call. Seasons whose
# scrape returned None are skipped, and an empty frame is the fallback so the
# cell still yields a DataFrame when nothing was scraped.
season_frames = [frame for frame in player_salary if frame is not None]
player_salaries = pd.concat(season_frames) if season_frames else pd.DataFrame()

In [61]:
def fix_encoding(string):
    """Return an ASCII-only version of ``string``.

    The text is decomposed with Unicode NFKD normalization, after which every
    character that cannot be encoded as ASCII is dropped.
    """
    decomposed = unicodedata.normalize('NFKD', string)
    ascii_bytes = decomposed.encode("ascii", "ignore")
    return ascii_bytes.decode()
# A token that starts with one or more "<letter>." pairs, e.g. "J.J." or
# "J.J.Redick"; group 1 is the dotted initials, group 2 whatever follows.
_LEADING_INITIALS = re.compile(r'((?:[^\W\d_]\.)+)(.*)')


def fix_dot(string):
    """Collapse dotted initials, e.g. ``"J.J. Redick"`` -> ``"JJ Redick"``.

    Strings containing at most one dot are returned unchanged. Otherwise each
    whitespace-separated token made of dotted initials has its dots removed,
    and neighbouring initials tokens are merged (``"J. J."`` -> ``"JJ"``).
    Every other token is kept verbatim, so suffixes, apostrophes and hyphens
    survive (``"P.J. Tucker Jr."`` -> ``"PJ Tucker Jr."``).

    Parameters
    ----------
    string : str
        Name to clean.

    Returns
    -------
    str
        The cleaned name, with tokens joined by single spaces when rewritten.
    """
    if string.count('.') <= 1:
        return string

    words = []
    merging = False  # True while the previous token was pure initials
    for token in string.split():
        match = _LEADING_INITIALS.fullmatch(token)
        if match is None:
            words.append(token)
            merging = False
            continue

        initials = match.group(1).replace('.', '')
        remainder = match.group(2)
        if merging:
            words[-1] += initials
        else:
            words.append(initials)
        # Initials glued to a following word ("J.J.Redick") end the run.
        merging = not remainder
        if remainder:
            words.append(remainder)
    return " ".join(words)

In [64]:
# Clean the Player column in three element-wise steps:
#   1. drop everything from the first comma onward (presumably a position
#      suffix appended by the source site -- confirm against the raw data),
#   2. fold the name to plain ASCII,
#   3. tidy dotted initials.
player_salaries['Player'] = (
    player_salaries['Player']
    .map(lambda raw_name: re.sub(r',.+', '', raw_name))
    .map(fix_encoding)
    .map(fix_dot)
)

In [67]:
# Persist the cleaned salary table to disk with the newest pickle protocol
# available to this interpreter.
with open('player_salaries.pickle', mode='wb') as pickle_file:
    pickle.dump(
        player_salaries,
        pickle_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )