In [431]:
import requests
from bs4 import BeautifulSoup 
import pandas as pd
import datetime as dt

seasons_stats={}

for year in range(1980,2021):
    url = 'https://www.basketball-reference.com/leagues/NBA_{}_totals.html'.format(year)
    r = requests.get(url)
    r_html = r.text
    soup = BeautifulSoup(r_html,'html.parser')
    table=soup.find_all(class_="full_table")
    ##Extracting List of column names
    head=soup.find(class_="thead")
    column_names_raw=[head.text for item in head][0] 
    column_names_polished=column_names_raw.replace("\n",",").split(",")[2:-1] 
    
    
    ##Extracting full list of player_data
    players=[]
    
    for i in range(len(table)): 
        player_=[]

        for td in table[i].find_all("td"):
            player_.append(td.text)
        players.append(player_)
            
    df=pd.DataFrame(players, columns=column_names_polished).set_index("Player") 
    
    #cleaning the player's name from occasional special characters
    df.index=df.index.str.replace('*', '')
    #For later calculations players that bounced around between teams during the season will not qualify
    df = df[df.Tm != "TOT"]
    seasons_stats[year-1]=df
    

  df.index=df.index.str.replace('*', '')


We have collected the data from all seasons 1980-2019 and organized it into a dictionary separated by season. Using this data we will be able to perform analysis from different approaches to determine the Greatest of All Time (GOAT) basketball player from different lenses

Although, many players throughout the NBA's history may have played for several years, or even decades. There have been countless stars who have had their career's cut short due to unfortunate injuries. In order to take these playerss into consideration, I will use the data collected above separated by NBA season to identify the GOAT season by a single player.

To compare the caliber of a player's specific season to another, one useful metric is a player's Player Efficiency Rating (PER). This metric takes into accounts a player's offensive and defensive stats and sums up positive accomplishments while subtracting negative accomplishments, returning a per-minute rating of a player's performance. More information on PER can be found here https://www.espn.com/nba/columns/story?columnist=hollinger_john&id=2850240 . 

The pace of a team is taken into account of PER so that players on teams with a pace slower than league average are not punished unfairly. As a result, we need to calculate a pace adjustment for each team. 
(Pace_adjustment = league_pace / team_pace)

In [432]:
#dictionary that will hold team stats
team_stats_per_year = {}

for year in range(1980,2021):
    url = 'https://www.basketball-reference.com/leagues/NBA_{}.html'.format(year)
    r = requests.get(url)
    r_html = r.text
    soup = BeautifulSoup(r_html,'html.parser')

    #Narrows down to specific table on page
    totals_table=soup.find('table', id='totals-team')
    head = totals_table.find("thead")

    #Specific row has no class attribute
    labels = head.find("tr", attrs={'class': None})

    #will be column names for dataframe
    column_names_ = ([labels.text for item in head][0]).split()
    # dont neeed rank columns
    column_names_.pop(0)
    #contins data for dataframes
    tbody = totals_table.find('tbody')
    rows = tbody.find_all("tr")

    teams = []
    team_paces = []
    # Grabs all data from each team row by row
    for i in range(len(rows)):
        stat = []
        for td in rows[i].find_all("td"):
            stat.append(td.text)
        teams.append(stat)
    
    # Gets league averages for that year and adds them to the data set
    #average = totals_table.find("tfoot")
    #avg_row = average.find("tr")
    #avg_stats = avg_row.find([avg_row.text for item in average][0]).split()
    #teams.append(avg_stats)
    
    #Add team stats to dataframe for the year (index off team)
    df = pd.DataFrame(teams, columns=column_names_).set_index("Team")
    df.index=df.index.str.replace('*', '')
    
    # Reading in advanced table from same URL to get each team's pace
    advanced_table = soup.find('table', id='advanced-team')
    columns_pace = ["Team", "Pace"]
    tbody = advanced_table.find('tbody')
    rows = tbody.find_all("tr")
    
    for i in range(len(rows)):
        team_pace = []
        for td in rows[i].find_all("td"):
            #We only need the pace statistic and corresponding team
            if ((td['data-stat']) == "team" or (td['data-stat']) == "pace"):
                team_pace.append(td.text)
        team_paces.append(team_pace)
    
    pace_frame = pd.DataFrame(team_paces, columns = columns_pace).set_index("Team")
    pace_frame.index=df.index.str.replace('*', '')
    
    #merge the two dataframes
    result = pd.merge(df, pace_frame, left_index= True, right_index = True)
    
    #Add to dictionary of team stats by year
    team_stats_per_year[year-1] = result

  df.index=df.index.str.replace('*', '')
  pace_frame.index=df.index.str.replace('*', '')


The code above extracts all of the team and league data from the 1979-1980 season through the 2019-2020 season necessary to calculate the PER of a player.

Creating columns in seasons stats to hold uPer, aPer and lg_aPer

In [433]:
for year in range(1979, 2020):
    (seasons_stats[year])["uPer"] = 0
    (seasons_stats[year])["aPer"] = 0
    (seasons_stats[year])["lg_aPer"] = 0

A problem with our data is that the team names are represented as three letter abbreviations for the player data but are stored as the full team name for the team data. This creates problems with accessing data of a respective player's team. We need to convert every abbrevaition into a full team name.

In [436]:
def change_name(abr, year):
    if abr == "ATL":
        return "Atlanta Hawks"
    elif abr == "BOS":
        return "Boston Celtics"
    elif abr == "BKN":
        return "Brooklyn Nets"
    elif abr == "BRK":
        return "Brooklyn Nets"
    elif abr == "CHA":
        return "Charlotte Bobcats"
    elif abr == "CHH":
        return "Charlotte Hornets"
    elif abr == "CHO":
        return "Charlotte Hornets"
    elif abr == "CHI":
        return "Chicago Bulls"
    elif abr == "CLE":
        return "Cleveland Cavaliers"
    elif abr == "DAL":
        return "Dallas Mavericks"
    elif abr == "DEN":
        return "Denver Nuggets"
    elif abr == "DET":
        return "Detroit Pistons"
    elif abr == "GSW":
        return "Golden State Warriors"
    elif abr == "HOU":
        return "Houston Rockets"
    elif abr == "IND":
        return "Indiana Pacers"
    elif abr == "KCK":
        return "Kansas City Kings"
    elif abr == "LAL":
        return "Los Angeles Lakers"
    elif abr == "LAC":
        return "Los Angeles Clippers"
    elif abr == "MEM":
        return "Memphis Grizzlies"
    elif abr == "MIA":
        return "Miami Heat"
    elif abr == "MIL":
        return "Milwaukee Bucks"
    elif abr == "MIN":
        return "Minnesota Timberwolves"
    elif abr == "NJN":
        return "New Jersey Nets"
    elif abr == "NOH":
        if (year == 2005 or year == 2006):
            # Played games in oklahoma city these two years
            return "New Orleans/Oklahoma City Hornets"
        else:
            return "New Orleans Hornets"
    elif abr == "NOP":
        return "New Orleans Pelicans"
    elif abr == "NOK":
        if (year == 2005 or year == 2006):
            # Played games in oklahoma city these two years
            return "New Orleans/Oklahoma City Hornets"
        else:
            return "New Orleans Hornets"
    elif abr == "NYK":
        return "New York Knicks"
    elif abr == "ORL":
        return "Orlando Magic"
    elif abr == "OKC":
        return "Oklahoma City Thunder"
    elif abr == "PHI":
        return "Philadelphia 76ers"
    elif abr == "PHO":
        return "Phoenix Suns"
    elif abr == "POR":
        return "Portland Trail Blazers"
    elif abr == "SAS":
        return "San Antonio Spurs"
    elif abr == "SAC":
        return "Sacramento Kings"
    elif abr == "SDC":
        return "San Diego Clippers"
    elif abr == "SEA":
        return "Seattle SuperSonics"
    elif abr == "TOR":
        return "Toronto Raptors"
    elif abr == "UTA":
        return "Utah Jazz"
    elif abr == "VAN":
        return "Vancouver Grizzlies"
    elif abr == "WAS":
        return "Washington Wizards"
    elif abr == "WSB":
        return "Washington Bullets"
    else:
        print("YOU MISSED ME MY ABR is " + abr)
        return abr
    


In [437]:
for year in range(1979, 2020):
     for player in (seasons_stats[year]).index:
        #Check is necessary because we can attempt to access an index that doesn't exist (index already been removed)
        if(player in seasons_stats[year].index):
            #checks for bad data
            if(type((seasons_stats[year]).loc[player, "Tm"]) != str):
                (seasons_stats[year]).drop(player, inplace=True)
            else:
                (seasons_stats[year]).at[player, "Tm"] = change_name((seasons_stats[year]).at[player, "Tm"], year)
        

Now we will be able to accurately gather team data for the corresponding team while iterating through players.

Another problem with our data is that everything was read in as text (strings instead of ints) for the columns that hold quantitaive stats, I will need to convert them.

In [438]:
#Converts all player data needed for later calculations into floats (from objects)
for year in range(1979, 2020):
    for column in seasons_stats[year]:
        #These columns either are meant to be left as strings, or have missing data aren't needed
        if (column != "Player" and column != "Pos" and column != "Tm" and column != "3P%"
           and column != "GS" and column != "FT%" and column != "FG%" and column != "2P%"
           and column != "eFG%"):
            (seasons_stats[year])[column] = ((seasons_stats[year])[column]).astype(float)

In [439]:
#Converts all team data needed for later calculation into floats (from objects)
for year in range(1979, 2020):
    for column in team_stats_per_year[year]:
        #These columns either are meant to be left as strings, or have missing data aren't needed
        if (column != "Team"):
            (team_stats_per_year[year])[column] = ((team_stats_per_year[year])[column]).astype(float)

The function below will calculate PER unadjusted for team pace (uPer) using Bleacher Report's linear weights


In [440]:
def determine_uPer(minsPlayed, t_points, AST, team_ast, team_fg, FG, FGA, FT, FTA, STL, ORB, TOV, TRB, BLK, 
                   PF):
    
    per_min = float(1/minsPlayed)
    #factor = float(2/3) - ((0.5 * float(lg_AST / lg_FG)) / (2 * (float(lg_FG / lg_FT))))
    #VOP = float(lg_PTS/ (lg_FGA - lg_ORB + lg_TOV + (0.44 * lg_FTA)))
    #DRB = float((lg_TRB - lg_ORB) / lg_TRB)

    result = per_min * (
    (FG * 85.910)
    + (STL * 53.897)
    + (t_points * 51.757)
    + (FT * 46.845)
    + (BLK * 39.190)
    + (ORB * 39.190)
    + (AST * 34.677)
    + ((TRB-ORB) * 14.707)
    - (PF * 17.174)
    - ((FTA-FT) * 20.091)
    - ((FG-FGA) * 39.190)
    - (TOV * 53.897)
    )
    return result    

The function above calculates an individual player's uPER

In [441]:
def determine_aPer(uPer, t_pace, lg_pace):
    adjustment = float(lg_pace / t_pace)
    return (adjustment * uPer)

The function above adjusted the individuals PER to account for the pace of their team compared to the leahue average.

In [442]:
def standardize_Per(aPer, lg_aPer):
    return (aPer * float(15/lg_aPer))

The function above standardizes adjusted PER (sets league average to 15)

Because PER takes into account the per-minute performance of a player, it is not an appropriate metric if the minutes played by a player is not a large enough sample size. To prevent players with low minutes played from skewing our standardized scores we will drop off any players that played under 200 minutes.

In [443]:
for year in range(1979, 2020):
    for player in seasons_stats[year].itertuples():
        if (float(player.MP) < 200):
            if (player.Index in (seasons_stats[year]).index):
                (seasons_stats[year]).drop(player.Index, inplace= True)

In [445]:
for year in range(1979, 2020):
    #we will need to keep track of all player's per's to make league averages
    (seasons_stats[year])["uPer"] = 0
    (seasons_stats[year])["aPer"] = 0
    (seasons_stats[year])["lg_aPer"] = 0
    (seasons_stats[year])["PER"] = 0
    # grabs league data from that year
    lg = team_stats_per_year[year]
    for player in seasons_stats[year].itertuples():
        #grabs player team
        p_team = player.Tm
        uPer = determine_uPer(player.MP, player._10, player.AST, lg.loc[p_team, 'AST'], lg.loc[p_team, 'FG'], player.FG, player.FGA, player.FT, player.FTA,
                         player.STL, player.ORB, player.TOV, player.TRB, player.BLK, player.PF)
        #print(uPer)
        # add uPer to the data frame
        (seasons_stats[year]).loc[player.Index, ['uPer']] = uPer
    
        aPer = determine_aPer(uPer, lg.loc[p_team, 'Pace'], lg['Pace'].mean())
        #add aPer to data frame (we will need to calculate mean later)
        (seasons_stats[year]).loc[player.Index, ['aPer']] = aPer
    for player in seasons_stats[year].itertuples():
        # Final PER
        PER = standardize_Per(player.aPer, (seasons_stats[year])['aPer'].mean())
        (seasons_stats[year]).loc[player.Index, ['PER']] = PER

The code above first calls the determine_uPer function to calculate an unadjusted PER rating for reach player and then adjusts the player's uPER to aPer accounting for their teams pace when compared to the league average. Finally when every player's aPer is calculated for a season, the standardized PER can be found (keeping in mind that the league average should be 15).

To find the players with the most dominant season, we will gather the players that had PER scores over 28.0

In [465]:
top_players = pd.DataFrame(columns = ['Name', 'Year', 'PER'])

In [466]:
for year in range (1979, 2020):
    for player in seasons_stats[year].itertuples():
        if player[32] > 28.0:
            new_row = {'Name': player.Index, 'Year': year, 'PER': player[32]}
            top_players = top_players.append(new_row, ignore_index = True)

In [470]:
top_players.sort_values(by=['PER'], ascending= False)

Unnamed: 0,Name,Year,PER
19,Russell Westbrook,2016,32.169604
12,Kobe Bryant,2005,31.795259
17,Russell Westbrook,2014,30.973235
11,Tracy McGrady,2002,30.692457
6,Michael Jordan,1990,30.248882
18,Stephen Curry,2015,30.215062
21,James Harden,2018,29.811516
1,Michael Jordan,1985,29.770069
2,Michael Jordan,1986,29.684472
15,LeBron James,2008,29.481304


The 23 most dominant single seasons (according to our PER calculations) are listed above. Russel Westbrook's 2016-2017 season scored the highest on our scale than any other season. Although you can see that players such as Lebron James and Michael Jordan have cracked the list far more times than Westbrook, suggesting they both racked up consistent dominant seasons. However, according to our metrics Westbrook's season still lands him the spot of GOAT season.