In [8]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import re
from unidecode import unidecode
# Goal: first build a CSV of the year each player signed their contract, to be used by the web scraper

In [9]:
# Location of the raw contracts export, relative to the notebook's working
# directory. Forward slashes are used because they resolve on Windows, macOS
# and Linux alike, whereas a backslash-separated path only works on Windows.
filename = "raw_data/NBA_current_contracts.csv"
# The file is decoded as ISO-8859-1 -- presumably because it contains accented
# player names (they are transliterated with unidecode in a later cell).
df = pd.read_csv(filename, parse_dates=True, encoding="ISO-8859-1")

In [10]:
# Columns that play no part in building the scraper input; once they are
# dropped only the player name, age, contract length and free-agency year remain.
columns_not_needed = ['POS', 'TEAM', 'GUARANTEED', '% GTD', 'DOLLARS','AVG. SALARY']

# Drop the unused columns and give the survivors descriptive names in one pass:
#   FREE AGENT -> CONTRACT_END_YEAR
#   YRS        -> CONTRACT_LENGTH_YEAR
#   PLAYER     -> PLAYER_NAME
df = (
    df
    .drop(labels=columns_not_needed, axis=1)
    .rename(columns={'PLAYER': 'PLAYER_NAME',
                     'YRS': 'CONTRACT_LENGTH_YEAR',
                     'FREE AGENT': 'CONTRACT_END_YEAR'})
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 560 entries, 0 to 559
Data columns (total 4 columns):
PLAYER_NAME             560 non-null object
AGE                     560 non-null int64
CONTRACT_LENGTH_YEAR    560 non-null int64
CONTRACT_END_YEAR       560 non-null int64
dtypes: int64(3), object(1)
memory usage: 17.6+ KB


In [11]:
# Clean up PLAYER_NAME: keep only the text before the first '(' and trim the
# surrounding whitespace. Without the strip every name keeps the space that
# preceded the parenthesis (e.g. "Russell Westbrook "), which would break any
# later exact-match lookup or merge on the player name.
df.PLAYER_NAME = df.PLAYER_NAME.str.split('(').str[0].str.strip()
df.PLAYER_NAME.head()

0    Russell Westbrook 
1        Stephen Curry 
2        Blake Griffin 
3         James Harden 
4            John Wall 
Name: PLAYER_NAME, dtype: object

# Start on the cell below to eliminate unicode text and change it to ASCII

In [12]:
# In order to web scrape basketball-reference.com we need to derive the name
# fragments that are substituted into the player-page URL.

# Remove the periods from initials and transliterate any non-ASCII characters
# to ASCII for scraping. Python's own str.replace is used so that '.' is always
# a literal period; pandas' Series.str.replace has changed its default for the
# `regex` argument between versions, and under regex matching '.' is a wildcard.
df.PLAYER_NAME = df.PLAYER_NAME.map(lambda name: unidecode(name.replace('.', '')))

# Tokenise every name once. Splitting on any run of whitespace (rather than on
# a single ' ') means repeated or trailing spaces never yield empty tokens.
# NOTE: the second token is treated as the last name, so a single-token name
# raises IndexError below, exactly as it did before.
name_tokens = df.PLAYER_NAME.str.split()

# PLAYER_LAST_INITIAL representing the player's last initial
df['PLAYER_LAST_INITIAL'] = [tokens[1][0].lower() for tokens in name_tokens]

# PLAYER_LAST_NAME_FIRST_FIVE representing the first 5 letters of a player's
# last name (shorter last names are kept whole)
df['PLAYER_LAST_NAME_FIRST_FIVE'] = [tokens[1][:5].lower() for tokens in name_tokens]

# PLAYER_FIRST_NAME_FIRST_TWO representing the first 2 letters of a player's
# first name
df['PLAYER_FIRST_NAME_FIRST_TWO'] = [tokens[0][:2].lower() for tokens in name_tokens]
df.head()

Unnamed: 0,PLAYER_NAME,AGE,CONTRACT_LENGTH_YEAR,CONTRACT_END_YEAR,PLAYER_LAST_INITIAL,PLAYER_LAST_NAME_FIRST_FIVE,PLAYER_FIRST_NAME_FIRST_TWO
0,Russell Westbrook,28,5,2023,w,westb,ru
1,Stephen Curry,29,5,2022,c,curry,st
2,Blake Griffin,28,5,2022,g,griff,bl
3,James Harden,28,4,2023,h,harde,ja
4,John Wall,27,4,2023,w,wall,jo


In [13]:
# CONTRACT_SIGNED_YEAR: the year the contract ends minus its length in years.
#
# The data used in this notebook was gathered in 2017, so any computed signing
# year of 2017 or later is capped at 2017. This can happen when a player who is
# already under contract signs an extension: the extension only begins after
# the current contract ends, which may be in the future.
df['CONTRACT_SIGNED_YEAR'] = (
    df['CONTRACT_END_YEAR'] - df['CONTRACT_LENGTH_YEAR']
).clip(upper=2017)

# BEFORE_SIGNED_YEAR: the year preceding CONTRACT_SIGNED_YEAR, used to look up
# the season stats from before the contract was signed.
df['BEFORE_SIGNED_YEAR'] = df['CONTRACT_SIGNED_YEAR'] - 1

In [16]:
# URL template for a player's basketball-reference.com page. The three
# placeholders are filled from the name-fragment columns of each row.
url_template = 'http://www.basketball-reference.com/players/{PLAYER_LAST_INITIAL}/{PLAYER_LAST_NAME_FIRST_FIVE}{PLAYER_FIRST_NAME_FIRST_TWO}01.html'

# TODO: temporary development slice -- replace with the full `df` once the
# scraper has been validated.
df2 = df[51:75]

# Per-player frames are collected in a list and concatenated once after the
# loop; growing a DataFrame with .append inside the loop is quadratic and
# DataFrame.append no longer exists in pandas 2.x.
player_frames = []

# Iterate over the players, build each URL from the name fragments and scrape
# the stats table from the page.
for _, player in df2.iterrows():
    url = url_template.format(PLAYER_LAST_INITIAL=player.PLAYER_LAST_INITIAL,
                              PLAYER_LAST_NAME_FIRST_FIVE=player.PLAYER_LAST_NAME_FIRST_FIVE,
                              PLAYER_FIRST_NAME_FIRST_TWO=player.PLAYER_FIRST_NAME_FIRST_TWO)
    try:
        # The context manager guarantees the HTTP response is closed.
        with urlopen(url) as response:
            html = response.read()
    except OSError:
        # urllib's URLError/HTTPError and socket errors all derive from OSError.
        # Report the failing URL and skip this player; falling through would
        # parse the previous player's page (or hit an undefined `html`) and
        # file those stats under the wrong name.
        print(url)
        continue

    soup = BeautifulSoup(html, 'html5lib')

    # Get player data and column headers: the first <tr> holds the headers,
    # every following <tr> is treated as a data row.
    table_rows = soup.findAll('tr')
    if not table_rows:
        # Page contained no table at all -- nothing to extract for this player.
        print(url)
        continue
    column_headers = [th.getText() for th in table_rows[0].findAll('th')]
    player_data = [[td.getText() for td in row.findAll('td')]
                   for row in table_rows[1:]]

    # The first header is skipped because the data rows are built from <td>
    # cells only -- presumably the leading cell of each row is a <th>.
    stats_df = pd.DataFrame(player_data, columns=column_headers[1:])

    # Keep only the rows whose Age matches the player's age in the contracts
    # data; non-numeric Age cells become NaN and are therefore dropped.
    stats_df['Age'] = pd.to_numeric(stats_df['Age'], errors='coerce')
    stats_df = stats_df.loc[stats_df['Age'] == player.AGE].copy()

    # Tag the rows with the player's name (.copy() above avoids assigning
    # into a view of the unfiltered frame).
    stats_df['PLAYER_NAME'] = player.PLAYER_NAME

    player_frames.append(stats_df)

# Build the main player stats dataframe in one step; pd.concat raises on an
# empty list, so fall back to an empty frame when nothing was scraped.
player_stats_df = (pd.concat(player_frames, ignore_index=True)
                   if player_frames else pd.DataFrame())

player_stats_df.tail()

Unnamed: 0,Age,Tm,Lg,Pos,G,GS,MP,FG,FGA,FG%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,PLAYER_NAME
19,32.0,DEN,NBA,PF,7,7,31.7,5.3,12.7,0.416,...,1.9,6.0,7.9,1.9,0.6,0.9,1.9,2.6,14.1,Paul Millsap
20,33.0,WAS,NBA,C,6,6,31.0,5.3,9.3,0.571,...,2.3,7.3,9.7,1.8,0.5,1.0,1.2,3.2,12.3,Marcin Gortat
21,31.0,BRK,NBA,SF,6,6,27.5,4.5,10.0,0.45,...,1.2,5.2,6.3,1.5,1.7,0.5,1.0,2.3,14.2,DeMarre Carroll
22,32.0,CLE,NBA,SG,7,4,25.3,2.0,7.6,0.264,...,0.6,2.4,3.0,1.9,0.7,0.1,1.0,2.6,5.4,JR Smith
23,31.0,SAC,NBA,PG,5,5,28.6,3.6,7.4,0.486,...,0.6,2.2,2.8,3.0,0.8,0.0,2.6,1.6,9.8,George Hill


In [None]:
# Export is currently disabled. If it is re-enabled the file name must be a
# quoted string, otherwise Python evaluates `player_stats.csv` as an expression:
# player_stats_df.to_csv('player_stats.csv', index=False)
player_stats_df['PLAYER_NAME']