# Scraping hoopshype.com to gather player salary data

In this notebook, we use Beautiful Soup to gather player salary data, including the percentage of the overall team salary that each player's contract represents.

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from time import sleep

import requests

In [2]:
# Maps each team abbreviation to the franchise's full name. The full name is
# later turned into the URL slug; the abbreviation is stored in the output.
Teams = {
    'BOS': 'Boston Celtics',
    'ATL': 'Atlanta Hawks',
    'BRK': 'Brooklyn Nets',
    'CHI': 'Chicago Bulls',
    'CHO': 'Charlotte Hornets',
    'CLE': 'Cleveland Cavaliers',
    'DAL': 'Dallas Mavericks',
    'DEN': 'Denver Nuggets',
    'DET': 'Detroit Pistons',
    'GSW': 'Golden State Warriors',
    'HOU': 'Houston Rockets',
    'IND': 'Indiana Pacers',
    'LAC': 'Los Angeles Clippers',
    'LAL': 'Los Angeles Lakers',
    'MEM': 'Memphis Grizzlies',
    'MIA': 'Miami Heat',
    'MIL': 'Milwaukee Bucks',
    'MIN': 'Minnesota Timberwolves',
    'NOP': 'New Orleans Pelicans',
    'NYK': 'New York Knicks',
    'OKC': 'Oklahoma City Thunder',
    'ORL': 'Orlando Magic',
    'PHI': 'Philadelphia 76ers',
    'PHO': 'Phoenix Suns',
    'POR': 'Portland Trail Blazers',
    'SAC': 'Sacramento Kings',
    'SAS': 'San Antonio Spurs',
    'TOR': 'Toronto Raptors',
    'UTA': 'Utah Jazz',
    'WAS': 'Washington Wizards',
}

In [9]:
def team_name_edit(x):
    """Return ``x`` lower-cased with every space replaced by an underscore.

    Used to turn a full team name such as ``'Boston Celtics'`` into the
    ``'boston_celtics'`` form that is placed in the salary-page URL.
    """
    return x.lower().replace(" ", "_")

def util_func(x):
    """Convert ``x`` to a float, or return ``None`` if it cannot be converted.

    Parameters
    ----------
    x : object
        Value to convert, typically a numeric string such as ``'30351780'``.

    Returns
    -------
    float or None
        ``float(x)`` on success; ``None`` when ``x`` is not convertible
        (wrong type, non-numeric text, or an integer too large for a float).
    """
    try:
        return float(x)
    # Only conversion failures are treated as "no value"; anything else
    # (e.g. KeyboardInterrupt) is allowed to propagate.
    except (TypeError, ValueError, OverflowError):
        return None

    
def scrape_salaries(team_name,year,key):
    """Scrape one team's payroll for one season from hoopshype.com.

    Parameters
    ----------
    team_name : str
        Team name in the form used in the page URL (e.g. ``'boston_celtics'``).
    year : str or int
        First year of the season; the page requested is ``year``-``year + 1``.
    key : str
        Team label written to the ``Team`` column of the result.

    Returns
    -------
    pandas.DataFrame
        One row per player name found, with columns ``Year``, ``Team``,
        ``Player``, ``Salary`` and ``Percent_team_salary`` (the player's
        salary divided by the sum of the returned salaries). An empty frame
        with the same columns is returned when the page does not load with
        status 200 or contains no ``payroll-team`` block.

    Notes
    -----
    Network errors raised by ``requests`` (including a timeout) propagate to
    the caller.
    """
    columns = ['Year', 'Team', 'Player', 'Salary', 'Percent_team_salary']

    # Pause before every request so the site is not hit in rapid succession.
    sleep(1)

    print('Working on the ' + str(team_name) + ' in the ' + str(year) + ' season')
    first_year = int(year)

    url = "https://hoopshype.com/salaries/{}/{}-{}/".format(
        team_name, first_year, first_year + 1)

    # A timeout keeps one unresponsive request from hanging the whole run.
    response = requests.get(url=url, timeout=30)

    if response.status_code != 200:
        # Nothing useful to parse; skip this team/season instead of failing
        # later with an IndexError on the missing payroll block.
        print('Failed to load webpage')
        return pd.DataFrame(columns=columns)
    print('')

    ## create the soup object, parsing the html code
    soup = BeautifulSoup(response.text, 'html.parser')

    # The payroll table lives in the first <div class="payroll-team">.
    payroll = soup.find("div", class_="payroll-team")
    if payroll is None:
        print('No payroll table found for the ' + str(team_name)
              + ' in the ' + str(year) + ' season')
        return pd.DataFrame(columns=columns)

    # <td class="name"> cells hold the player names. The first and last
    # matches are skipped, as in the original loop bounds.
    name_cells = payroll.find_all('td', attrs={'class': "name"})

    name_list = []
    for cell in name_cells[1:-1]:
        try:
            # Usual case: the name is the first text node of the cell's <a>.
            name = cell.a.contents[0].strip()
        except (AttributeError, IndexError, TypeError):
            # Some names aren't in an <a> tag: read the cell's own text and
            # ignore cells that turn out to be blank or to hold no text.
            try:
                name = cell.contents[0].strip()
            except (AttributeError, IndexError, TypeError):
                continue
            if name == '':
                continue
        name_list.append(name)
    num_names = len(name_list)

    # Salary cells look like:
    # <td style="color:black" class="hh-salaries-sorted" data-value="30351780">
    salary_cells = payroll.find_all('td', style="color:black", class_="")
    salaries = [cell['data-value'] for cell in salary_cells
                if cell.has_attr('data-value')]

    # The site lists each salary twice (as paid, then adjusted for
    # inflation), so every second value starting from the first is kept.
    # Striding over the untrimmed list keeps values in their original
    # positions; empty or unparsable values become 0.0.
    salaries_updated = []
    for raw in salaries[::2]:
        amount = util_func(raw)
        salaries_updated.append(0.0 if amount is None else amount)

    # Keep exactly one salary per name: drop extras, and give 0.0 to names
    # that have no salary value at all.
    salaries_updated = salaries_updated[:num_names]
    salaries_updated += [0.0] * (num_names - len(salaries_updated))

    # Share of the team total for each player. When the total is zero the
    # share is reported as 0.0 rather than dividing by zero.
    total_salary = sum(salaries_updated)
    if total_salary:
        percent_salary = [amount / total_salary for amount in salaries_updated]
    else:
        percent_salary = [0.0] * num_names

    season = [float(year)] * num_names
    team = [key] * num_names

    sleep(.5)

    return pd.DataFrame({'Year': season, 'Team': team, 'Player': name_list,
                         'Salary': salaries_updated,
                         'Percent_team_salary': percent_salary},
                        columns=columns)

# Below, we scrape the data five years at a time, so as not to overload the website with requests. At the end, we export the entire compiled dataframe.

In [10]:
# Scrape the five seasons starting in 1991 for every team.
n = 5
years = [str(1991 + i) for i in range(n)]

# Gather the per-team, per-season frames in a list and concatenate once;
# calling pd.concat inside the loop re-copies every accumulated row each time.
salary_frames = []
for key, full_name in Teams.items():
    team_name = team_name_edit(full_name)
    for year in years:
        df = scrape_salaries(team_name, year, key)

        sleep(.5)

        salary_frames.append(df)

salary_df = pd.concat(salary_frames)

Working on the boston_celtics in the 1991 season

Working on the boston_celtics in the 1992 season

Working on the boston_celtics in the 1993 season

Working on the boston_celtics in the 1994 season

Working on the boston_celtics in the 1995 season

Working on the atlanta_hawks in the 1991 season

Working on the atlanta_hawks in the 1992 season

Working on the atlanta_hawks in the 1993 season

Working on the atlanta_hawks in the 1994 season

Working on the atlanta_hawks in the 1995 season

Working on the brooklyn_nets in the 1991 season

Working on the brooklyn_nets in the 1992 season

Working on the brooklyn_nets in the 1993 season

Working on the brooklyn_nets in the 1994 season

Working on the brooklyn_nets in the 1995 season

Working on the chicago_bulls in the 1991 season

Working on the chicago_bulls in the 1992 season

Working on the chicago_bulls in the 1993 season

Working on the chicago_bulls in the 1994 season

Working on the chicago_bulls in the 1995 season

Working on the 

In [15]:
# Keep an independent copy of the current batch before salary_df is reused.
salary_91_95 = salary_df.copy(deep=True)

In [16]:
# Scrape the five seasons starting in 1996 for every team.
n = 5
years = [str(1996 + i) for i in range(n)]

# Gather the per-team, per-season frames in a list and concatenate once;
# calling pd.concat inside the loop re-copies every accumulated row each time.
salary_frames = []
for key, full_name in Teams.items():
    team_name = team_name_edit(full_name)
    for year in years:
        df = scrape_salaries(team_name, year, key)

        sleep(.5)

        salary_frames.append(df)

salary_df = pd.concat(salary_frames)

Working on the boston_celtics in the 1996 season

Working on the boston_celtics in the 1997 season

Working on the boston_celtics in the 1998 season

Working on the boston_celtics in the 1999 season

Working on the boston_celtics in the 2000 season

Working on the atlanta_hawks in the 1996 season

Working on the atlanta_hawks in the 1997 season

Working on the atlanta_hawks in the 1998 season

Working on the atlanta_hawks in the 1999 season

Working on the atlanta_hawks in the 2000 season

Working on the brooklyn_nets in the 1996 season

Working on the brooklyn_nets in the 1997 season

Working on the brooklyn_nets in the 1998 season

Working on the brooklyn_nets in the 1999 season

Working on the brooklyn_nets in the 2000 season

Working on the chicago_bulls in the 1996 season

Working on the chicago_bulls in the 1997 season

Working on the chicago_bulls in the 1998 season

Working on the chicago_bulls in the 1999 season

Working on the chicago_bulls in the 2000 season

Working on the 

In [19]:
# Stack the saved 1991-1995 batch on top of the batch currently in salary_df.
salary_df_updated = pd.concat(objs=[salary_91_95, salary_df])

Unnamed: 0,Year,Team,Player,Salary,Percent_team_salary
0,1991.0,BOS,Larry Bird,7070000.0,0.278972
1,1991.0,BOS,Kevin McHale,3500000.0,0.138105
2,1991.0,BOS,Reggie Lewis,3340000.0,0.131792
3,1991.0,BOS,Robert Parish,3000000.0,0.118376


In [22]:
# Spot-check ten randomly chosen rows of the combined data.
salary_df_updated.sample(n=10)

Unnamed: 0,Year,Team,Player,Salary,Percent_team_salary
1,1994.0,POR,Chris Dudley,3500000.0,0.131293
4,1992.0,DET,Terry Mills,1337000.0,0.095815
14,1996.0,MIA,James Scott,220000.0,0.009812
1,1999.0,PHI,Theo Ratliff,7031250.0,0.160199
3,1995.0,PHI,Jeff Malone,2380000.0,0.094696
10,1999.0,POR,Greg Anthony,1100000.0,0.014885
0,1997.0,MIN,Tom Gugliotta,5500000.0,0.197929
3,1996.0,LAC,Lamond Murray,2700000.0,0.115699
15,1992.0,DAL,Walter Palmer,140000.0,0.012893
3,1999.0,BOS,Dana Barros,3500000.0,0.075835


In [23]:
# Scrape the five seasons starting in 2001 for every team.
n = 5
years = [str(2001 + i) for i in range(n)]

# Gather the per-team, per-season frames in a list and concatenate once;
# calling pd.concat inside the loop re-copies every accumulated row each time.
salary_frames = []
for key, full_name in Teams.items():
    team_name = team_name_edit(full_name)
    for year in years:
        df = scrape_salaries(team_name, year, key)

        sleep(.5)

        salary_frames.append(df)

salary_df = pd.concat(salary_frames)

# NOTE: this appends to salary_df_updated, so re-running the cell adds the
# same seasons a second time.
salary_df_updated = pd.concat([salary_df_updated, salary_df])

Working on the boston_celtics in the 2001 season

Working on the boston_celtics in the 2002 season

Working on the boston_celtics in the 2003 season

Working on the boston_celtics in the 2004 season

Working on the boston_celtics in the 2005 season

Working on the atlanta_hawks in the 2001 season

Working on the atlanta_hawks in the 2002 season

Working on the atlanta_hawks in the 2003 season

Working on the atlanta_hawks in the 2004 season

Working on the atlanta_hawks in the 2005 season

Working on the brooklyn_nets in the 2001 season

Working on the brooklyn_nets in the 2002 season

Working on the brooklyn_nets in the 2003 season

Working on the brooklyn_nets in the 2004 season

Working on the brooklyn_nets in the 2005 season

Working on the chicago_bulls in the 2001 season

Working on the chicago_bulls in the 2002 season

Working on the chicago_bulls in the 2003 season

Working on the chicago_bulls in the 2004 season

Working on the chicago_bulls in the 2005 season

Working on the 

In [30]:
# List the distinct seasons present in the combined data so far.
salary_df_updated['Year'].unique()

array([1991., 1992., 1993., 1994., 1995., 1996., 1997., 1998., 1999.,
       2000., 2001., 2002., 2003., 2004., 2005.])

In [31]:
# Scrape the five seasons starting in 2006 for every team.
n = 5
years = [str(2006 + i) for i in range(n)]

# Gather the per-team, per-season frames in a list and concatenate once;
# calling pd.concat inside the loop re-copies every accumulated row each time.
salary_frames = []
for key, full_name in Teams.items():
    team_name = team_name_edit(full_name)
    for year in years:
        df = scrape_salaries(team_name, year, key)

        sleep(.5)

        salary_frames.append(df)

salary_df = pd.concat(salary_frames)

# NOTE: this appends to salary_df_updated, so re-running the cell adds the
# same seasons a second time.
salary_df_updated = pd.concat([salary_df_updated, salary_df])

Working on the boston_celtics in the 2006 season

Working on the boston_celtics in the 2007 season

Working on the boston_celtics in the 2008 season

Working on the boston_celtics in the 2009 season

Working on the boston_celtics in the 2010 season

Working on the atlanta_hawks in the 2006 season

Working on the atlanta_hawks in the 2007 season

Working on the atlanta_hawks in the 2008 season

Working on the atlanta_hawks in the 2009 season

Working on the atlanta_hawks in the 2010 season

Working on the brooklyn_nets in the 2006 season

Working on the brooklyn_nets in the 2007 season

Working on the brooklyn_nets in the 2008 season

Working on the brooklyn_nets in the 2009 season

Working on the brooklyn_nets in the 2010 season

Working on the chicago_bulls in the 2006 season

Working on the chicago_bulls in the 2007 season

Working on the chicago_bulls in the 2008 season

Working on the chicago_bulls in the 2009 season

Working on the chicago_bulls in the 2010 season

Working on the 

In [32]:
# List the distinct seasons present in the combined data so far.
salary_df_updated['Year'].unique()

array([1991., 1992., 1993., 1994., 1995., 1996., 1997., 1998., 1999.,
       2000., 2001., 2002., 2003., 2004., 2005., 2006., 2007., 2008.,
       2009., 2010.])

In [33]:
# Scrape the five seasons starting in 2011 for every team.
n = 5
years = [str(2011 + i) for i in range(n)]

# Gather the per-team, per-season frames in a list and concatenate once;
# calling pd.concat inside the loop re-copies every accumulated row each time.
salary_frames = []
for key, full_name in Teams.items():
    team_name = team_name_edit(full_name)
    for year in years:
        df = scrape_salaries(team_name, year, key)

        sleep(.5)

        salary_frames.append(df)

salary_df = pd.concat(salary_frames)

# NOTE: this appends to salary_df_updated, so re-running the cell adds the
# same seasons a second time.
salary_df_updated = pd.concat([salary_df_updated, salary_df])

Working on the boston_celtics in the 2011 season

Working on the boston_celtics in the 2012 season

Working on the boston_celtics in the 2013 season

Working on the boston_celtics in the 2014 season

Working on the boston_celtics in the 2015 season

Working on the atlanta_hawks in the 2011 season

Working on the atlanta_hawks in the 2012 season

Working on the atlanta_hawks in the 2013 season

Working on the atlanta_hawks in the 2014 season

Working on the atlanta_hawks in the 2015 season

Working on the brooklyn_nets in the 2011 season

Working on the brooklyn_nets in the 2012 season

Working on the brooklyn_nets in the 2013 season

Working on the brooklyn_nets in the 2014 season

Working on the brooklyn_nets in the 2015 season

Working on the chicago_bulls in the 2011 season

Working on the chicago_bulls in the 2012 season

Working on the chicago_bulls in the 2013 season

Working on the chicago_bulls in the 2014 season

Working on the chicago_bulls in the 2015 season

Working on the 

In [34]:
# List the distinct seasons present in the combined data so far.
salary_df_updated['Year'].unique()

array([1991., 1992., 1993., 1994., 1995., 1996., 1997., 1998., 1999.,
       2000., 2001., 2002., 2003., 2004., 2005., 2006., 2007., 2008.,
       2009., 2010., 2011., 2012., 2013., 2014., 2015.])

In [35]:
# Scrape the five seasons starting in 2016 for every team.
n = 5
years = [str(2016 + i) for i in range(n)]

# Gather the per-team, per-season frames in a list and concatenate once;
# calling pd.concat inside the loop re-copies every accumulated row each time.
salary_frames = []
for key, full_name in Teams.items():
    team_name = team_name_edit(full_name)
    for year in years:
        df = scrape_salaries(team_name, year, key)

        sleep(.5)

        salary_frames.append(df)

salary_df = pd.concat(salary_frames)

# NOTE: this appends to salary_df_updated, so re-running the cell adds the
# same seasons a second time.
salary_df_updated = pd.concat([salary_df_updated, salary_df])

Working on the boston_celtics in the 2016 season

Working on the boston_celtics in the 2017 season

Working on the boston_celtics in the 2018 season

Working on the boston_celtics in the 2019 season

Working on the boston_celtics in the 2020 season

Working on the atlanta_hawks in the 2016 season

Working on the atlanta_hawks in the 2017 season

Working on the atlanta_hawks in the 2018 season

Working on the atlanta_hawks in the 2019 season

Working on the atlanta_hawks in the 2020 season

Working on the brooklyn_nets in the 2016 season

Working on the brooklyn_nets in the 2017 season

Working on the brooklyn_nets in the 2018 season

Working on the brooklyn_nets in the 2019 season

Working on the brooklyn_nets in the 2020 season

Working on the chicago_bulls in the 2016 season

Working on the chicago_bulls in the 2017 season

Working on the chicago_bulls in the 2018 season

Working on the chicago_bulls in the 2019 season

Working on the chicago_bulls in the 2020 season

Working on the 

In [36]:
# List the distinct seasons present in the combined data so far.
salary_df_updated['Year'].unique()

array([1991., 1992., 1993., 1994., 1995., 1996., 1997., 1998., 1999.,
       2000., 2001., 2002., 2003., 2004., 2005., 2006., 2007., 2008.,
       2009., 2010., 2011., 2012., 2013., 2014., 2015., 2016., 2017.,
       2018., 2019., 2020.])

In [37]:
# Scrape the four seasons starting in 2021 for every team.
n = 4
years = [str(2021 + i) for i in range(n)]

# Gather the per-team, per-season frames in a list and concatenate once;
# calling pd.concat inside the loop re-copies every accumulated row each time.
salary_frames = []
for key, full_name in Teams.items():
    team_name = team_name_edit(full_name)
    for year in years:
        df = scrape_salaries(team_name, year, key)

        sleep(.5)

        salary_frames.append(df)

salary_df = pd.concat(salary_frames)

# NOTE: this appends to salary_df_updated, so re-running the cell adds the
# same seasons a second time.
salary_df_updated = pd.concat([salary_df_updated, salary_df])

Working on the boston_celtics in the 2021 season

Working on the boston_celtics in the 2022 season

Working on the boston_celtics in the 2023 season

Working on the boston_celtics in the 2024 season

Working on the atlanta_hawks in the 2021 season

Working on the atlanta_hawks in the 2022 season

Working on the atlanta_hawks in the 2023 season

Working on the atlanta_hawks in the 2024 season

Working on the brooklyn_nets in the 2021 season

Working on the brooklyn_nets in the 2022 season

Working on the brooklyn_nets in the 2023 season

Working on the brooklyn_nets in the 2024 season

Working on the chicago_bulls in the 2021 season

Working on the chicago_bulls in the 2022 season

Working on the chicago_bulls in the 2023 season

Working on the chicago_bulls in the 2024 season

Working on the charlotte_hornets in the 2021 season

Working on the charlotte_hornets in the 2022 season

Working on the charlotte_hornets in the 2023 season

Working on the charlotte_hornets in the 2024 season



In [38]:
# List the distinct seasons present in the final combined data.
salary_df_updated['Year'].unique()

array([1991., 1992., 1993., 1994., 1995., 1996., 1997., 1998., 1999.,
       2000., 2001., 2002., 2003., 2004., 2005., 2006., 2007., 2008.,
       2009., 2010., 2011., 2012., 2013., 2014., 2015., 2016., 2017.,
       2018., 2019., 2020., 2021., 2022., 2023.])

In [39]:
# Write the combined salary table to disk without the row index.
salary_df_updated.to_csv(path_or_buf='player_salaries.csv', index=False)