# Notebook 4:  Preparing Data for Web App

In this final notebook we prepare the 2021-22 NBA player data (together with our model's market value predictions) for easy ingestion into the Python script for our web app.
1.  **Scrape Missing Data:** We scrape some additional current-season player data (height, weight, and missing salaries) from www.basketball-reference.com 
2.  **Final Touches:** We calculate each player's surplus value (the difference between market value and salary) and take care of a couple of other minor things.

## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json, requests
from bs4 import BeautifulSoup, Comment

# Import data
# Load the current-season player table from the project's relative data folder.
# Later cells read its 'ID', 'Team', 'Pos', 'Sal_class_predict' and
# 'Sal_class_predict_proba' columns, so the file must already contain them
# (presumably written by an earlier notebook -- TODO confirm).
df2021 = pd.read_csv('./data/model_currentyear.csv')

# (1) Scrape Missing Data

### Get player heights and weights for dataframe

In [2]:
def scrape_height_weight(playerurl):
    '''Scrape a player page on basketball-reference.com.

    Parameters
    ----------
    playerurl : str
        Full URL of the player's page.

    Returns
    -------
    tuple
        (height, weight) taken from the page's JSON-LD metadata block, with the
        'lbs' suffix stripped from the weight.  Both values are np.nan when
        that block is missing or cannot be parsed.
    '''
    playerresponse = requests.get(playerurl)
    playersoup = BeautifulSoup(playerresponse.text, "lxml")

    ## Get Height and Weight:

    try:
        script_text = playersoup.find('script', {'type': 'application/ld+json'}).getText()
        biodata = json.loads(script_text)  # a dictionary!
        weight = biodata['weight']['value'].replace('lbs', '').strip()
        height = biodata['height']['value']
    except Exception:
        # Best-effort on purpose: any parsing problem yields missing values.
        # `except Exception` (instead of a bare `except:`) still lets
        # KeyboardInterrupt / SystemExit through, so a long scraping loop can
        # be interrupted from the notebook.
        weight = np.nan
        height = np.nan

    return height, weight

# Visit each player's basketball-reference page (site root + the row's 'ID'
# value) and record the scraped height and weight on that row.
for idx in df2021.index:
    player_url = "https://www.basketball-reference.com" + df2021.loc[idx, 'ID']
    player_height, player_weight = scrape_height_weight(player_url)
    df2021.loc[idx, 'Height'] = player_height
    df2021.loc[idx, 'Weight'] = player_weight

### Fill in missing salary values (this mostly affects minimum-salary players)

In [3]:
def scrape_salary(playerurl, year):
    '''Scrape a player's salary for one season from basketball-reference.com.

    Parameters
    ----------
    playerurl : str
        Full URL of the player's page.
    year : int
        Calendar year in which the season starts (2021 -> season "2021-22").

    Returns
    -------
    The player's total compensation for that season as listed in the page's
    salary table.  If he was paid by multiple teams that season, the salaries
    are concatenated (eg: "$27,957,238$794,536"); a single team gives a string
    that looks like "$27,093,019".  If no row matches the season, the result
    is whatever pandas returns for the sum of an empty selection (normally 0).
    np.nan is returned when the salary table is missing or cannot be parsed.
    '''
    from io import StringIO  # local import: only this function needs it

    # Season label as used in the 'Season' column.  The end year is zero-padded
    # and wrapped at the century so 2008 -> "2008-09" and 1999 -> "1999-00".
    season = '%d-%02d' % (year, (year + 1) % 100)

    playerresponse = requests.get(playerurl)
    playersoup = BeautifulSoup(playerresponse.text, "lxml")

    try:
        # The salary table is not in the live DOM: it is stored in an HTML
        # comment following the placeholder node, so the comment text has to
        # be parsed as a document of its own.
        placeholder = playersoup.select_one('#all_all_salaries .placeholder')
        comment = next(elem for elem in placeholder.next_siblings if isinstance(elem, Comment))
        table = BeautifulSoup(comment, 'lxml')

        # Wrap the markup in StringIO: passing literal HTML to read_html is
        # deprecated in recent pandas versions.
        dfsalaries = pd.read_html(StringIO(str(table.find(id="all_salaries"))))[0]

        # Summing a column of strings concatenates them (see Returns).
        salary = dfsalaries[dfsalaries['Season'] == season]['Salary'].sum()
    except Exception:
        # Covers a missing placeholder/comment/table as well as a missing
        # column; KeyboardInterrupt is deliberately not caught.
        salary = np.nan

    return salary

def clean_salary(sal, year):
    ''' Adds all salaries earned per year into one salary (in millions of dollars).

        Parameters
        ----------
        sal : str or number
            Salary text such as "$27,093,019" or "$27,957,238$794,536".
            Thousands separators, "<" and the "(TW)" marker are ignored and
            "< $Minimum" is treated as $0.  A non-string value is read as a
            raw dollar amount; None / NaN (no salary found) gives NaN.
        year : int
            Not used by the computation; kept so existing calls keep working.

        Returns
        -------
        float
            Total salary in millions of dollars, or NaN when `sal` is missing.
    '''
    if not isinstance(sal, str):
        # The scraper can hand back np.nan or a plain number instead of text;
        # calling .split() on those would raise AttributeError.
        try:
            return float(sal) / 1000000
        except (TypeError, ValueError):
            return float('nan')

    total = 0
    for elem in sal.split('$'):
        elem = elem.replace('<','').replace(',','').replace('(TW)','').replace('Minimum','0')
        try:
            total += float(elem)
        except ValueError:
            # Fragments that are not numbers (e.g. the empty text before the
            # first '$') contribute nothing.
            continue

    return total / 1000000

# Scrape each player's 2021-22 salary text, keep it in 'Salary', and store the
# cleaned total (millions of dollars) in 'SalClean'.
for idx in df2021.index:
    player_url = "https://www.basketball-reference.com" + df2021.loc[idx, 'ID']
    raw_salary = scrape_salary(player_url, 2021)
    df2021.loc[idx, 'Salary'] = raw_salary
    df2021.loc[idx, 'SalClean'] = clean_salary(raw_salary, 2021)
    
def sal_bucket(salary):
    '''Map a salary to the lower bound of its 5-wide salary class.

    Parameters
    ----------
    salary : float
        Salary in the same unit as the class bounds (the notebook passes
        'SalClean', i.e. millions of dollars).

    Returns
    -------
    int or None
        0 for anything below 5, then 5, 10, 15, 20, 25, and 30 for every
        salary of 30 or more (the top class is open-ended).  None is returned
        for a missing (NaN / None) salary.
    '''
    if pd.isna(salary):
        return None

    for upper in (5, 10, 15, 20, 25, 30):
        if salary < upper:
            return upper - 5

    # Open-ended top class: previously salaries >= 50 fell through every
    # branch and silently produced None.
    return 30
    
# Assign every player the salary class that matches his cleaned salary.
df2021['Sal_class'] = df2021['SalClean'].apply(sal_bucket)

# Remove the columns that are no longer needed, then list what is left.
unneeded_cols = ['Unnamed: 0','NameYear', 'TeamOrTot', 'Salary', 'SalNorm']
df2021 = df2021.drop(columns=unneeded_cols)
print(df2021.columns)

Index(['Name', 'Pos', 'Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA',
       '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB',
       'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', '3PAr', 'AST%', 'BLK%',
       'BPM', 'DBPM', 'DRB%', 'DWS', 'FTr', 'OBPM', 'ORB%', 'OWS', 'PER',
       'STL%', 'TOV%', 'TRB%', 'TS%', 'USG%', 'VORP', 'WS', 'WS/48', 'ID',
       'Team', 'SalClean', 'FG%_', 'FT%_', 'TS%_', '3P%_', 'G_pct', 'GS_pct',
       'GS/G', 'Sal_class', 'Sal_class_predict', 'Sal_class_predict_proba',
       'Height', 'Weight'],
      dtype='object')


# (2) Final Touches

### Calculate surplus value for each player

In [4]:
def surplus_value(salary, predicted_sal_class):
    '''Distance between a salary and the predicted salary class.

    The salary is first raised to a floor of 2 ($2M minimum).  For class 30
    the result is 0 when the salary exceeds 30, otherwise 30 minus the salary.
    For any other class the range is [class, class + 5]: a salary inside the
    range gives 0, one below it gives the (positive) gap to the lower bound,
    and one above it gives the (negative) gap to the upper bound.
    '''
    paid = max(salary, 2)     #Set the minimum at $2M

    if predicted_sal_class == 30:
        return 0 if paid > 30 else 30 - paid

    lower = predicted_sal_class
    upper = predicted_sal_class + 5
    if paid < lower:
        return lower - paid
    if paid > upper:
        return upper - paid
    if lower <= paid <= upper:
        return 0
        
# Fill 'Surplus_value' one row at a time from the cleaned salary and the
# predicted salary class.
for idx in df2021.index:
    df2021.loc[idx, 'Surplus_value'] = surplus_value(
        df2021.loc[idx, 'SalClean'],
        df2021.loc[idx, 'Sal_class_predict'],
    )

df2021['SalFinal'] = df2021['SalClean']
# df2021 = df2021.round({'SalFinal': 1, 'Surplus_value': 1})
# df2021.loc[df2021.SalFinal < 2, 'SalFinal'] = '<2'

# Display the last ten rows after an ascending sort on surplus value.
cols = ['Name','Surplus_value','SalFinal','Sal_class_predict']
df2021[cols].sort_values('Surplus_value').tail(10)

Unnamed: 0,Name,Surplus_value,SalFinal,Sal_class_predict
32,RJ Barrett,21.37608,8.62392,30
601,Trae Young,21.673529,8.326471,30
24,LaMelo Ball,21.76824,8.23176,30
183,Darius Garland,22.95912,7.04088,30
27,Desmond Bane,22.96684,2.03316,25
65,Mikal Bridges,24.442275,5.557725,30
191,Shai Gilgeous-Alexander,24.504468,5.495532,30
66,Miles Bridges,24.578507,5.421493,30
213,Tyrese Haliburton,25.9764,4.0236,30
357,Tyrese Maxey,27.39708,2.60292,30


### Calculate net surplus value by team

In [5]:
# Team abbreviation (as found in the 'Team' column) -> full team name.
# Spelling fixed for 'Philadelphia 76ers' and 'Portland Trail Blazers'; these
# strings end up in the exported 'Team_Name' column.
tm_to_team =  {
 'TOR': 'Toronto Raptors',         'MEM': 'Memphis Grizzlies',
 'MIA': 'Miami Heat',              'BRK': 'Brooklyn Nets',
 'NOP': 'New Orleans Pelicans',    'MIL': 'Milwaukee Bucks',
 'CLE': 'Cleveland Cavaliers' ,    'LAL': 'Los Angeles Lakers',
 'ORL': 'Orlando Magic',           'HOU': 'Houston Rockets' ,
 'WAS': 'Washington Wizards' ,     'PHO': 'Phoenix Suns',
 'UTA': 'Utah Jazz',               'SAC': 'Sacramento Kings',
 'CHO': 'Charlotte Hornets',       'CHI': 'Chicago Bulls' ,
 'NYK': 'New York Knicks',         'DEN': 'Denver Nuggets' ,
 'PHI': 'Philadelphia 76ers' ,     'SAS': 'San Antonio Spurs' ,
 'LAC': 'Los Angeles Clippers',    'OKC': 'Oklahoma City Thunder' ,
 'MIN': 'Minnesota Timberwolves',  'DET': 'Detroit Pistons' ,
 'IND': 'Indiana Pacers',          'GSW': 'Golden State Warriors' ,
 'POR': 'Portland Trail Blazers',  'ATL': 'Atlanta Hawks',
 'BOS': 'Boston Celtics',          'DAL':'Dallas Mavericks',
 }

# team_to_tm =  {v: k for k, v in tm_to_team.items()}
# tm_surplusvalue = df2021.groupby('Team')['Surplus_value'].sum().sort_index().to_dict()
# Net surplus value per team: sum the players' surplus values within each
# team, turn the result into a two-column frame, and attach the full name.
team_surplus = df2021.groupby('Team')['Surplus_value'].sum()
dfteams = team_surplus.reset_index()
dfteams['Team_Name'] = dfteams['Team'].apply(lambda abbr: tm_to_team[abbr])
dfteams

Unnamed: 0,Team,Surplus_value,Team_Name
0,ATL,32.69403,Atlanta Hawks
1,BOS,33.666681,Boston Celtics
2,BRK,-14.098715,Brooklyn Nets
3,CHI,12.192863,Chicago Bulls
4,CHO,67.442964,Charlotte Hornets
5,CLE,47.364404,Cleveland Cavaliers
6,DAL,32.152668,Dallas Mavericks
7,DEN,16.158103,Denver Nuggets
8,DET,22.258564,Detroit Pistons
9,GSW,-18.604263,Golden State Warriors


### Odds and Ends

In [6]:
# Get the prediction probability for maximum class (to be used for player similarity ranking)
def get_max(probas):
    '''Return the largest number in a stringified list such as "[0.1, 0.7, 0.2]".

    Surrounding square brackets are stripped, the text is split on commas and
    every piece is converted with float() before taking the maximum.
    '''
    values = (float(token) for token in probas.strip('[]').split(','))
    return max(values)

# Highest class probability per player, parsed from the stored text.
df2021['Max_proba'] = df2021['Sal_class_predict_proba'].map(get_max)

# Keep primary playing position for easier bookkeeping (only affects 18 players)
df2021['Pos'] = df2021['Pos'].map(lambda pos: pos.partition('-')[0])

# Save final data for web app

In [7]:
# Write the two tables consumed by the web app.  The files go to the same
# relative './data/' folder the notebook reads its input from, rather than a
# machine-specific absolute path, so the notebook also runs on other machines.
# The index is written as well (pandas default).
df2021.to_csv('./data/app_dfplayers.csv')
dfteams.to_csv('./data/app_dfteams.csv')