# Scraping hoophype.com to gather player salary data

In this notebook, we utilize beautiful soup to gather player salary data, including the percentage the players contract is relative to the overall team salary. 

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from time import sleep

import requests

In [2]:
Teams = {'BOS':'Boston Celtics','ATL': 'Atlanta Hawks','BRK':'Brooklyn Nets','CHI': 'Chicago Bulls',
        'CHO':'Charlotte Hornets','CLE':'Cleveland Cavaliers','DAL':'Dallas Mavericks','DEN':'Denver Nuggets',
        'DET':'Detroit Pistons','GSW':'Golden State Warriors','HOU':'Houston Rockets','IND':'Indiana Pacers',
        'LAC':'Los Angeles Clippers','LAL':'Los Angeles Lakers','MEM':'Memphis Grizzlies','MIA':'Miami Heat',
        'MIL':'Milwaukee Bucks','MIN':'Minnesota Timberwolves','NOP':'New Orleans Pelicans','NYK':'New York Knicks',
        'OKC':'Oklahoma City Thunder','ORL':'Orlando Magic','PHI':'Philadelphia 76ers','PHO':'Phoenix Suns',
        'POR':'Portland Trail Blazers','SAC':'Sacramento Kings','SAS':'San Antonio Spurs','TOR':'Toronto Raptors',
        'UTA':'Utah Jazz','WAS':'Washington Wizards'}

In [3]:
salary_df = pd.DataFrame()
years = ['2022','2023']

for key in Teams.keys():
    for year in years:
        year_list = [int(year),int(year)+1]
        team_name = Teams[key]
        team_name = team_name.lower()
        team_name = team_name.replace(" ", "_")

        print('Working on the ', Teams[key])
        
        url = "https://hoopshype.com/salaries/"+ str(team_name)+ "/"+ str(year_list[0]) + "-" + str(year_list[1])+ "/"

        response = requests.get(url= url)
            
        if response.status_code == 200:
            print('Success')
        else:
            print('Failed to load webpage')

        ## create the soup object, parsing the html code
        soup = BeautifulSoup(response.text, 'html.parser')

        # data = soup.find_all(id = 'content-container') 
        ## this gets me in the right spot. I now need to pull relevent information
        data = soup.find_all("div", class_= "payroll-team")[0] #this returns a list since it 
        # finds all of them, so [0] on the end for the first instance (can use .find())

        # in this first line, I go to the <td -- > tag, which houses 
        # the salaries of the team. data_2 will be a list of times it find the tag <td>
        # in the html code.

        data_2 = data.find_all('td', attrs={'class':"name"})

        # create a list for names
        name_list = list()

        for i in range(1,len(data_2)-1):
            # data_2[i] grabs the ith item from the list 
            # data_2[i].a grabs the <a> tag from that <td> parent
            # data_2[i].a.contents grabs the text contents from that <a> tag in a list
            # data_2[i].a.contents[0].strip() takes the first element of the list
            # and strips the \n text, leaving only the player name.
            try:
                name = data_2[i].a.contents[0].strip() 
                name_list.append(name)
            except:
                name = data_2[i].contents[0].strip()
                name_list.append(name)
        num_names = len(name_list)

        data_3 = data.find_all('td', style="color:black", class_ = "")
        # <td style="color:black" class="hh-salaries-sorted" data-value="30351780">
        # 					$30,351,780				</td>

        salaries = list()
        for value in data_3:
            try:
                salaries.append(value['data-value'])
            except:
                continue

        ## we want even indices in the salaries list since the website
        ## lists the salaries as year, and adjusted for inflation. 
        t_s = 2*len(name_list)
        even_indices = [i for i in range(0,t_s) if i%2 == 0]
        salaries_updated = [float(salaries[i]) for i in even_indices]
        total_salary = sum(salaries_updated)
        percent_salary = [ float(salaries_updated[i] / total_salary) for i in range(num_names)]
        ## the salaries are paired, so we can just have it pull every other salary!

        year = [float(year) for i in range(num_names)]
        team = [key for i in range(num_names)]
        df = pd.DataFrame({'Year':year,'Team':team,'Player':name_list,'Salary':salaries_updated,
                            'Percent_team_salary':percent_salary})
        
        salary_df = pd.concat([salary_df,df])

        sleep(1)

Working on the  Boston Celtics
Success
Working on the  Boston Celtics
Success
Working on the  Atlanta Hawks
Success
Working on the  Atlanta Hawks
Success
Working on the  Brooklyn Nets
Success
Working on the  Brooklyn Nets
Success
Working on the  Chicago Bulls
Success
Working on the  Chicago Bulls
Success
Working on the  Charlotte Hornets
Success
Working on the  Charlotte Hornets
Success
Working on the  Cleveland Cavaliers
Success
Working on the  Cleveland Cavaliers
Success
Working on the  Dallas Mavericks
Success
Working on the  Dallas Mavericks
Success
Working on the  Denver Nuggets
Success
Working on the  Denver Nuggets
Success
Working on the  Detroit Pistons
Success
Working on the  Detroit Pistons
Success
Working on the  Golden State Warriors
Success
Working on the  Golden State Warriors
Success
Working on the  Houston Rockets
Success
Working on the  Houston Rockets
Success
Working on the  Indiana Pacers
Success
Working on the  Indiana Pacers
Success
Working on the  Los Angeles Clip

In [4]:
salary_df.sample(5)

Unnamed: 0,Year,Team,Player,Salary,Percent_team_salary
13,2022.0,CHI,Marko Simonovic,1563518.0,0.010289
0,2023.0,MIL,Giannis Antetokounmpo,45640084.0,0.243613
15,2023.0,GSW,Jerome Robinson,559782.0,0.002674
16,2023.0,MIL,Lindell Wigginton,244502.0,0.001305
5,2023.0,LAL,Gabe Vincent,10500000.0,0.061809


In [5]:
salary_df.to_csv('player_salaries.csv', index = False)

OSError: [Errno 30] Read-only file system: 'player_salaries.csv'