# Scraping hoophype.com to gather player salary data

In this notebook, we utilize beautiful soup to gather player salary data, including the percentage the players contract is relative to the overall team salary. 

In [5]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from time import sleep

import requests

In [6]:
Teams = {'BOS':'Boston Celtics','ATL': 'Atlanta Hawks','BRK':'Brooklyn Nets','CHI': 'Chicago Bulls',
        'CHO':'Charlotte Hornets','CLE':'Cleveland Cavaliers','DAL':'Dallas Mavericks','DEN':'Denver Nuggets',
        'DET':'Detroit Pistons','GSW':'Golden State Warriors','HOU':'Houston Rockets','IND':'Indiana Pacers',
        'LAC':'Los Angeles Clippers','LAL':'Los Angeles Lakers','MEM':'Memphis Grizzlies','MIA':'Miami Heat',
        'MIL':'Milwaukee Bucks','MIN':'Minnesota Timberwolves','NOP':'New Orleans Pelicans','NYK':'New York Knicks',
        'OKC':'Oklahoma City Thunder','ORL':'Orlando Magic','PHI':'Philadelphia 76ers','PHO':'Phoenix Suns',
        'POR':'Portland Trail Blazers','SAC':'Sacramento Kings','SAS':'San Antonio Spurs','TOR':'Toronto Raptors',
        'UTA':'Utah Jazz','WAS':'Washington Wizards'}

In [9]:
def team_name_edit(x):
    x = x.lower()
    x = x.replace(" ", "_")
    return(x) 

def util_func(x):
    try: return(float(x))
    except: pass

    
def scrape_salaries(team_name,year,key):

    sleep(1)

    print('Working on the ' + str(team_name) + ' in the ' + str(year) + ' season')
    year_list = [int(year),int(year)+1]
        
    url = "https://hoopshype.com/salaries/"+ str(team_name)+ "/"+ str(year_list[0]) + "-" + str(year_list[1])+ "/"

    response = requests.get(url= url)
            
    if response.status_code == 200:
        print('')
    else:
        print('Failed to load webpage')

    ## create the soup object, parsing the html code
    soup = BeautifulSoup(response.text, 'html.parser')

    # data = soup.find_all(id = 'content-container') 
    ## this gets me in the right spot. I now need to pull relevent information
    data = soup.find_all("div", class_= "payroll-team")[0] #this returns a list since it 
    # finds all of them, so [0] on the end for the first instance (can use .find())

    # in this first line, I go to the <td -- > tag, which houses 
    # the salaries of the team. data_2 will be a list of times it find the tag <td>
    # in the html code.

    data_2 = data.find_all('td', attrs={'class':"name"})

    # create a list for names
    name_list = list()

    for i in range(1,len(data_2)-1):
        # data_2[i] grabs the ith item from the list 
        # data_2[i].a grabs the <a> tag from that <td> parent
        # data_2[i].a.contents grabs the text contents from that <a> tag in a list
        # data_2[i].a.contents[0].strip() takes the first element of the list
        # and strips the \n text, leaving only the player name.
        try:
            name = data_2[i].a.contents[0].strip() 
            name_list.append(name)
        except:
            name = data_2[i].contents[0].strip() ## some names aren't in an <a> tag.
            if name != '':
                name_list.append(name) 
    num_names = len(name_list)

    data_3 = data.find_all('td', style="color:black", class_ = "")
    # <td style="color:black" class="hh-salaries-sorted" data-value="30351780">
    # 					$30,351,780				</td>

    salaries = list()
    for value in data_3:
        try:
            salaries.append(value['data-value'])
        except:
            continue

    ## we want even indices in the salaries list since the website
    ## lists the salaries as year, and adjusted for inflation. 
    t_s = 2*len(name_list)
    even_indices = [i for i in range(0,t_s) if i%2 == 0]
    salaries_updated = [util_func(salaries[i]) for i in even_indices]
        
    total_salary = sum(salaries_updated)
    percent_salary = [util_func(salaries_updated[i]) / total_salary for i in range(num_names)]
    ## the salaries are paired, so we can just have it pull every other salary!

    
    year = [float(year) for i in range(num_names)]
    team = [key for i in range(num_names)]

    sleep(.5)
    
    df = pd.DataFrame({'Year':year,'Team':team,'Player':name_list,'Salary':salaries_updated,
                        'Percent_team_salary':percent_salary})
    
    return(df)

In [10]:
salary_df = pd.DataFrame()
years = ['1991']

for key in Teams.keys():
    for year in years:
        team_name = Teams[key]
        team_name = team_name_edit(team_name)
        
        df = scrape_salaries(team_name,year,key)

        salary_df = pd.concat([salary_df,df])

Working on the boston_celtics in the 1991 season

Working on the atlanta_hawks in the 1991 season

Working on the brooklyn_nets in the 1991 season

Working on the chicago_bulls in the 1991 season

Working on the charlotte_hornets in the 1991 season

Working on the cleveland_cavaliers in the 1991 season

Working on the dallas_mavericks in the 1991 season

Working on the denver_nuggets in the 1991 season

Working on the detroit_pistons in the 1991 season

Working on the golden_state_warriors in the 1991 season

Working on the houston_rockets in the 1991 season

Working on the indiana_pacers in the 1991 season

Working on the los_angeles_clippers in the 1991 season

Working on the los_angeles_lakers in the 1991 season

Working on the memphis_grizzlies in the 1991 season

Working on the miami_heat in the 1991 season

Working on the milwaukee_bucks in the 1991 season

Working on the minnesota_timberwolves in the 1991 season

Working on the new_orleans_pelicans in the 1991 season

Working on

In [12]:
salary_df.sample(10)

Unnamed: 0,Year,Team,Player,Salary,Percent_team_salary
2,1991.0,LAC,Doc Rivers,1195000.0,0.092093
5,1991.0,SAC,Duane Causwell,820000.0,0.065642
10,1991.0,BRK,Rafael Addison,311000.0,0.024686
13,1991.0,NYK,Kennard Winchester,120000.0,0.009678
12,1991.0,OKC,Bart Kofoed,130000.0,0.009769
3,1991.0,MIL,Dale Ellis,1300000.0,0.104409
13,1991.0,LAL,Keith Owens,130000.0,0.008295
3,1991.0,IND,Rik Smits,1350000.0,0.089463
1,1991.0,POR,Terry Porter,1787000.0,0.140245
13,1991.0,IND,Sean Green,130000.0,0.008615


In [None]:
salary_df.to_csv('player_salaries.csv', index = False)