# How is NBA Salary linked to Player Performance?

### The goals of this project:

#### 1. Explore and analyze how NBA players' salaries are linked to performance. 
#### 2. Identify players who are overpaid/underpaid by position
#### 3. Drive better decisions to identify quality or above-average players at a low cost or below-average pay (in per-minute terms)

In [1]:
# Import Libraries

import time
from io import StringIO
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

### Scraping Player Season Totals from 2000 to 2022 from Basketball Reference

#### https://www.basketball-reference.com/leagues/NBA_2020_totals.html

In [2]:
# Scrape Basketball Reference to obtain each season's player totals page

# range() excludes its stop value, so 2023 is required to include the 2022 season
years = list(range(2000, 2023))

url_totals = "https://www.basketball-reference.com/leagues/NBA_{}_totals.html"

# Pause between requests so a burst of calls does not trip the site's rate limiting
REQUEST_DELAY_SECONDS = 4

# Make sure the output folder exists before any file is written into it
Path("totals").mkdir(parents=True, exist_ok=True)

# Request every season from 2000 to 2022 and store the raw HTML on disk
for year in years:
    url = url_totals.format(year)
    data = requests.get(url, timeout=30)
    # Fail loudly instead of saving an error page (e.g. HTTP 429) as season data
    data.raise_for_status()

    with open("totals/{}.html".format(year), "w", encoding="utf-8") as f:
        f.write(data.text)

    time.sleep(REQUEST_DELAY_SECONDS)

In [3]:
# Load the saved 2000 season page from disk into a string

with open("totals/2000.html", encoding="utf-8") as html_file:
    page = html_file.read()

In [4]:
# Build a BeautifulSoup tree from the raw HTML using Python's built-in parser

soup = BeautifulSoup(markup=page, features="html.parser")


In [5]:
# Locate the element whose id is "all_totals_stats" (it holds the stats table)
totals_2000_season = soup.find(attrs={"id": "all_totals_stats"})

In [6]:
# Parse the extracted HTML into DataFrames and display the first one to verify.
# Passing markup to read_html as a bare string is deprecated in recent pandas,
# so the string is wrapped in a file-like StringIO object.
totals_2000_szn = pd.read_html(StringIO(str(totals_2000_season)))

totals_2000_szn[0]

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1,Tariq Abdul-Wahad,SG,25,TOT,61,56,1578,274,646,...,.756,101,190,291,98,59,28,106,147,697
1,1,Tariq Abdul-Wahad,SG,25,ORL,46,46,1205,223,515,...,.762,77,162,239,72,53,16,87,116,563
2,1,Tariq Abdul-Wahad,SG,25,DEN,15,10,373,51,131,...,.738,24,28,52,26,6,12,19,31,134
3,2,Shareef Abdur-Rahim,SF,23,VAN,82,82,3223,594,1277,...,.809,218,607,825,271,89,87,249,244,1663
4,3,Cory Alexander,PG,26,DEN,29,2,329,28,98,...,.773,8,34,42,58,24,2,28,39,82
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,436,Haywoode Workman,PG,34,MIL,23,1,248,23,62,...,.692,1,16,17,44,11,0,14,23,66
513,436,Haywoode Workman,PG,34,TOR,13,1,102,8,28,...,.500,0,9,9,17,9,0,4,14,20
514,437,Metta World Peace,SF,20,CHI,72,63,2238,309,759,...,.674,62,246,308,202,119,39,166,159,866
515,438,Lorenzen Wright,C,24,ATL,75,0,1205,180,361,...,.644,117,188,305,21,29,40,66,203,448


In [None]:
# Parse every saved season page into a DataFrame and collect them in a list

szn_ttl_00_22 = []

for year in years:
    # Only the read needs the file handle; parsing happens after it is closed
    with open("totals/{}.html".format(year), encoding="utf-8") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")
    total_table = soup.find(id="all_totals_stats")
    if total_table is None:
        # Name the offending file rather than failing later on the text "None"
        raise ValueError(
            "No 'all_totals_stats' element found in totals/{}.html".format(year)
        )

    # Wrap the markup in StringIO: literal HTML strings are deprecated in read_html
    total = pd.read_html(StringIO(str(total_table)))[0]
    # Tag each row with its season so seasons stay distinguishable once combined
    total["Year"] = year

    szn_ttl_00_22.append(total)

In [None]:
# # Using selenium to parse JS Pages
# from selenium import webdriver
# import time

# driver = webdriver.Chrome("/Users/alexc/Desktop/NBA Scrape Project/NBA_Salaries_Performance/NBA_Salaries_Performance/chromedriver")

In [None]:
# player_szn_stats = "https://www.basketball-reference.com/leagues/NBA_{}_totals.html"

# for year in years:
#     url = player_szn_stats.format(year)

#     driver.get(url)
#     driver.execute_script("window.scrollTo(1,10000)")
#     time.sleep(2)

#     html = driver.page_source
    
#     with open("Totals/{}.html".format(year), "w+", encoding = 'utf-8') as f:
#         f.write(html)

In [None]:
# # For loop to combine all the htmls into a list of dfs

# player_szn_totals = []

# for year in years:
#     with open("Totals/{}.html".format(year), encoding= "utf-8") as f:
#         page = f.read()
#     soup = BeautifulSoup(page, "html.parser")
#     total_table = soup.find(id= "all_totals_stats")
#     total_stats = pd.read_html(str(total_table))[0]
#     total_stats["Year"] = year
    
#     player_szn_totals.append(total_stats)

In [None]:
# player_szn_totals_df = pd.concat(player_szn_totals)
# player_szn_totals_df.to_csv("./Data Files/player_szn_totals.csv")

In [None]:
# player_szn_totals_df.shape

In [None]:
# Stack the per-season tables row-wise into one DataFrame and show (rows, columns)
szn_ttl_df = pd.concat(szn_ttl_00_22, axis=0)
szn_ttl_df.shape

In [None]:
# Save the combined, not-yet-cleaned table to CSV
szn_ttl_df.to_csv("./Data Files/szn_ttl_df.csv")

In [None]:
# Exclude rows where Rk is the literal text "Rk"
# (presumably header rows repeated inside the scraped tables - verify).
# .copy() makes the result an independent DataFrame, so later column
# assignments on it do not raise pandas' SettingWithCopyWarning.

szn_ttl_df_cleaned = szn_ttl_df[szn_ttl_df["Rk"] != "Rk"].copy()

In [None]:
# Save the table with the "Rk" rows removed to its own CSV
szn_ttl_df_cleaned.to_csv("./Data Files/szn_ttl_df_cleaned.csv")

In [None]:
# Some players have an asterisk next to their names; strip it so the
# names match when merging with the salary dataset

player_names = szn_ttl_df_cleaned["Player"]
szn_ttl_df_cleaned["Player"] = player_names.str.replace("*", "", regex=False)

szn_ttl_df_cleaned.head(20)

In [None]:
# Show all rows recorded for A.J. Price in the 2015 season
szn_ttl_df_cleaned.groupby(["Player", "Year"]).get_group(("A.J. Price", 2015))


In [None]:
# def get_current_team(df):
#     # check if the dataframe has more than one row
#     if df.shape[0] > 1:
#         # get the row with the "TOT" team value
#         tot_row = df[df['Tm'] == 'TOT']
#         tot_row == df.iloc[-1,:]["Tm"]
#         # if there is a "TOT" row, return it
#         if tot_row.shape[0] == 0:
#             df = df.sort_values(by='Year', ascending=False).iloc[0]
#     # if the dataframe has only one row, return it
#     else:
#         return df
# szn_ttl_df_cleaned = szn_ttl_df_cleaned.groupby(["Player", "Year"]).apply(get_current_team)

In [None]:
def single_record(df):
    """Collapse the rows of one player-season into a single record.

    Parameters
    ----------
    df : pandas.DataFrame
        The rows belonging to one (Player, Year) group. Must contain a
        "Tm" column.

    Returns
    -------
    pandas.DataFrame
        * ``df`` itself when it has exactly one row.
        * Otherwise a copy of the "TOT" (season total) row with "Tm"
          replaced by the team from the group's last row, i.e. the
          player's most current team.
        * ``df`` unchanged when it has several rows but none is "TOT",
          so those rows are kept rather than silently dropped.
    """
    if df.shape[0] == 1:
        return df

    # Copy the filtered row so the assignment below never writes into a
    # slice of the caller's frame (avoids SettingWithCopyWarning).
    row = df[df["Tm"] == "TOT"].copy()
    if row.empty:
        # Nothing to relabel; returning the empty selection would drop the group
        return df

    # NOTE(review): assumes the per-team rows follow the TOT row in the order
    # the player joined each team, so the last row is the most current team -
    # confirm against the scraped tables.
    row["Tm"] = df.iloc[-1]["Tm"]
    return row

# Run single_record on every (Player, Year) group and keep the combined result
player_seasons = szn_ttl_df_cleaned.groupby(["Player", "Year"])
szn_ttl_df_cleaned = player_seasons.apply(single_record)

In [None]:
# Preview the first 20 rows of the current table
szn_ttl_df_cleaned.head(20)

In [None]:
# Remove the outermost index level (presumably a group-key level added by
# the groupby/apply step - check szn_ttl_df_cleaned.index.names to confirm)
szn_ttl_df_cleaned.index = szn_ttl_df_cleaned.index.droplevel(0)

In [None]:
# Remove the outermost index level once more (presumably the remaining
# group-key level - check szn_ttl_df_cleaned.index.names to confirm)
szn_ttl_df_cleaned.index = szn_ttl_df_cleaned.index.droplevel(0)

In [None]:
# Show every row in the current table whose Player is A.J. Price
szn_ttl_df_cleaned[szn_ttl_df_cleaned["Player"] == "A.J. Price"]