In [82]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import re
from unidecode import unidecode
# Goal is to 1st create CSV of year player signed contract to be used for Web Scraper

In [83]:
filename = "raw_data\\NBA_current_contracts.csv"
df = pd.read_csv(filename, parse_dates=True, encoding = "ISO-8859-1")

In [84]:
# Remove unneeded columns from df
columns_not_needed = ['POS', 'TEAM', 'GUARANTEED', '% GTD', 'DOLLARS','AVG. SALARY']
df = df.drop(columns_not_needed, axis='columns')
# Tidy FREE AGENT column to CONTRACT_END_YEAR
# Tidy YRS to CONTRACT_LENGTH_YEAR
# Tidy PLAYER to PLAYER_NAME
df.rename(columns={'FREE AGENT':'CONTRACT_END_YEAR',
                   'YRS':'CONTRACT_LENGTH_YEAR',
                   'PLAYER':'PLAYER_NAME'}, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 560 entries, 0 to 559
Data columns (total 4 columns):
PLAYER_NAME             560 non-null object
AGE                     560 non-null int64
CONTRACT_LENGTH_YEAR    560 non-null int64
CONTRACT_END_YEAR       560 non-null int64
dtypes: int64(3), object(1)
memory usage: 17.6+ KB


In [85]:
# Clean up PLAYER string
df.PLAYER_NAME = [name[0] for name in df.PLAYER_NAME.str.split('(')]
df.PLAYER_NAME.head()

0    Russell Westbrook 
1        Stephen Curry 
2        Blake Griffin 
3         James Harden 
4            John Wall 
Name: PLAYER_NAME, dtype: object

# Start on cell below to eliminate unicode text and chance to ascii

In [86]:
# In order to web scrape basketball-reference.com we need to create the below names

# Clean PLAYER_NAME to remove initials (J.J. Reddick), 
# dashes (Kidd-Gilchrist) and apostrophes (E'Twaun Moore)
df.PLAYER_NAME = df.PLAYER_NAME.str.replace('.', '')
df.PLAYER_NAME = df.PLAYER_NAME.str.replace('-', '')
df.PLAYER_NAME = df.PLAYER_NAME.str.replace("'", '')

# Remove unicode characters and change to ASCII for scraping
df.PLAYER_NAME = [unidecode(df.PLAYER_NAME[i]) for i in df.index]

# PLAYER_LAST_INITIAL representing the players last initial
df['PLAYER_LAST_INITIAL'] = [name[1][0].lower() for name in df.PLAYER_NAME.str.split(' ')]

# PLAYER_LAST_NAME_FIRST_FIVE representing the first 5 letters of a player's
# last name
df['PLAYER_LAST_NAME_FIRST_FIVE'] = [name[1][0:5].lower().rstrip() for name in df.PLAYER_NAME.str.split(' ')]

# PLAYER_FIRST_NAME_FIRST_TWO representing the first 2 letters of a player's
# first name
df['PLAYER_FIRST_NAME_FIRST_TWO'] = [name[0][0:2].lower() for name in df.PLAYER_NAME.str.split(' ')]
df.head()

Unnamed: 0,PLAYER_NAME,AGE,CONTRACT_LENGTH_YEAR,CONTRACT_END_YEAR,PLAYER_LAST_INITIAL,PLAYER_LAST_NAME_FIRST_FIVE,PLAYER_FIRST_NAME_FIRST_TWO
0,Russell Westbrook,28,5,2023,w,westb,ru
1,Stephen Curry,29,5,2022,c,curry,st
2,Blake Griffin,28,5,2022,g,griff,bl
3,James Harden,28,4,2023,h,harde,ja
4,John Wall,27,4,2023,w,wall,jo


In [87]:
# Calculate the year contract was signed as CONTRACT_SIGNED_YEAR
df['CONTRACT_SIGNED_YEAR'] = df.CONTRACT_END_YEAR - df.CONTRACT_LENGTH_YEAR

# All BEFORE_SIGNED_YEAR >= 2017 is set to 2017 as the data used in this
# notebook was gathered in 2017 so the year the contract signed is 2017
# This may occur as players already under contract may sign an extension
# The extension begins after the currenct contract ends which may be in the 
# future. 
df.loc[df.CONTRACT_SIGNED_YEAR >= 2017, 'CONTRACT_SIGNED_YEAR']= 2017

# Subtract 1 from CONTRACT_SIGNED_YEAR as BEFORE_SIGNED_YEAR 
# BEFORE_SIGNED_YEAR used to look up season stats before contract was signed
df['BEFORE_SIGNED_YEAR'] = df.CONTRACT_SIGNED_YEAR - 1

In [None]:
# Set the URL Template
url_template = 'http://www.basketball-reference.com/players/{PLAYER_LAST_INITIAL}/{PLAYER_LAST_NAME_FIRST_FIVE}{PLAYER_FIRST_NAME_FIRST_TWO}01.html'

# SLICE DF JUST FOR WORK. DONT FORGET TO DELETE ME
df2 = df#[0:100]

# Create empty main player stats dataframe
player_stats_df = pd.DataFrame()

# Create empty missing player stats dataframe
missing_stats_df = pd.DataFrame()

# Iterate over all rows in df pulling out player name in order to input into url_template to scrape basketball-reference.com
for index, column in df2.iterrows():
    url = url_template.format(PLAYER_LAST_INITIAL=column.PLAYER_LAST_INITIAL,
                              PLAYER_LAST_NAME_FIRST_FIVE=column.PLAYER_LAST_NAME_FIRST_FIVE,
                              PLAYER_FIRST_NAME_FIRST_TWO=column.PLAYER_FIRST_NAME_FIRST_TWO)
    try:
        html = urlopen(url)
    except Exception as e:
        missing_stats_df['PLAYER_NAME'].append(column.PLAYER_NAME)
    soup = BeautifulSoup(html, 'html5lib')
    
    # Get player data and column headers
    data_rows = soup.findAll('tr')[1:] 
    player_data = [[td.getText() for td in data_rows[i].findAll('td')]
                for i in range(len(data_rows))]
    
    # Capture players that do not have webpages in missing player stats dataframe
    try:
        column_headers = [th.getText() for th in 
                      soup.findAll('tr', limit=1)[0].findAll('th')]
    except Exception as e:
        missing_stats_df['PLAYER_NAME'].append(column.PLAYER_NAME)
        missing_stats_df['e'].append(e)
        missing_stats_df['url'].append(url)
    
    stats_df = pd.DataFrame(player_data, columns=column_headers[1:])
    
    # Eliminate non-age entries in Age column by removing any rows with more 
    # then 2 characters as age is 2 character entry (25, 28...)
    # note these entries use the season instead of player age as the player 
    # did not play in NBA due to injury or not being under contract
    try:
        non_age = [key for key, value in stats_df.Age.iteritems() if len(value)==2]
        stats_df = stats_df.iloc[non_age]
    except Exception as e:
            print(type(stats_df.Age))
            print(url)
            print(column.AGE)
            print(e)  
    stats_df.Age = pd.to_numeric(stats_df.Age)
    
    # If no data exists for the age in which a player signed his contract that 
    # means he was not playing in the NBA that year (injury or no contract) 
    # Go back one year to look up previous years stats
    
    
    
    
    if stats_df.Age == column.AGE:
        stats_df = stats_df.loc[stats_df.Age == column.AGE]
    else:
        
        
        
    #player_stats_df = player_stats_df.append(stats_df, ignore_index=True)
#player_stats_df

http://www.basketball-reference.com/players/o/osmance01.html
http://www.basketball-reference.com/players/n/ntilifr01.html
http://www.basketball-reference.com/players/c/capelcl01.html
http://www.basketball-reference.com/players/a/adebaed01.html
http://www.basketball-reference.com/players/l/labissk01.html


Unnamed: 0,2P,2P%,2PA,3P,3P%,3PA,AST,Age,BLK,DRB,...,MP,ORB,PF,PTS,Pos,STL,TOV,TRB,Tm,eFG%
0,4.9,.415,11.8,0.4,.271,1.6,5.3,20,0.2,2.7,...,32.5,2.2,2.3,15.3,PG,1.3,3.3,4.9,OKC,.414
1,5.6,.438,12.9,0.3,.221,1.3,8.0,21,0.4,3.1,...,34.3,1.7,2.5,16.1,PG,1.3,3.3,4.9,OKC,.428
2,7.1,.451,15.7,0.4,.330,1.3,8.2,22,0.4,3.1,...,34.7,1.5,2.5,21.9,PG,1.9,3.9,4.6,OKC,.454
3,7.8,.482,16.2,0.9,.316,3.0,5.5,23,0.3,3.1,...,35.3,1.5,2.2,23.6,PG,1.7,3.6,4.6,OKC,.481
4,7.0,.466,15.1,1.2,.323,3.7,7.4,24,0.3,3.9,...,34.9,1.4,2.3,23.2,PG,1.8,3.3,5.2,OKC,.470
5,6.0,.482,12.5,1.5,.318,4.7,6.9,25,0.2,4.5,...,30.7,1.2,2.3,21.8,PG,1.9,3.8,5.7,OKC,.480
6,8.1,.457,17.7,1.3,.299,4.3,8.6,26,0.2,5.4,...,34.4,1.9,2.7,28.1,PG,2.1,4.4,7.3,OKC,.455
7,6.9,.503,13.8,1.3,.296,4.3,10.4,27,0.3,6.0,...,34.4,1.8,2.5,23.5,PG,2.0,4.3,7.8,OKC,.489
8,7.7,.459,16.8,2.5,.343,7.2,10.4,28,0.4,9.0,...,34.6,1.7,2.3,31.6,PG,1.6,5.4,10.7,OKC,.476
9,5.8,.479,12.0,1.5,.375,4.0,11.8,29,0.3,8.1,...,33.9,1.3,2.9,19.5,PG,1.3,5.4,9.4,OKC,.500


# REAL CODE BELOW

In [25]:
# Set the URL Template
url_template = 'http://www.basketball-reference.com/players/{PLAYER_LAST_INITIAL}/{PLAYER_LAST_NAME_FIRST_FIVE}{PLAYER_FIRST_NAME_FIRST_TWO}01.html'

# SLICE DF JUST FOR WORK. DONT FORGET TO DELETE ME
df2 = df[0:100]

# Create empty main player stats dataframe
player_stats_df = pd.DataFrame()

# Iterate over all rows in df pulling out player name in order to input into url_template to scrape basketball-reference.com
for index, column in df2.iterrows():
    url = url_template.format(PLAYER_LAST_INITIAL=column.PLAYER_LAST_INITIAL,
                              PLAYER_LAST_NAME_FIRST_FIVE=column.PLAYER_LAST_NAME_FIRST_FIVE,
                              PLAYER_FIRST_NAME_FIRST_TWO=column.PLAYER_FIRST_NAME_FIRST_TWO)
    try:
        html = urlopen(url)
    except:
        print(url)
    soup = BeautifulSoup(html, 'html5lib')
    
    # Get player data and column headers
    data_rows = soup.findAll('tr')[1:] 
    player_data = [[td.getText() for td in data_rows[i].findAll('td')]
                for i in range(len(data_rows))]
    column_headers = [th.getText() for th in 
                      soup.findAll('tr', limit=1)[0].findAll('th')]

    stats_df = pd.DataFrame(player_data, columns=column_headers[1:])
    
    # Eliminate non-age entries in Age column by removing any rows with more 
    # then 2 characters as age is 2 character entry (25, 28...)
    # note these entries use the season instead of player age as the player 
    # did not play in NBA due to injury or not being under contract
    try:
        non_age = [key for key, value in stats_df.Age.iteritems() if len(value)==2]
        stats_df = stats_df.iloc[non_age]
    except Exception as e:
            print(type(stats_df.Age))
            print(url)
            print(column.AGE)
            print(e)            
    stats_df.Age = pd.to_numeric(stats_df.Age) #errors='coerce')
          
    # If no data exists for the age in which a player signed his contract that 
    # means he was not playing in the NBA that year (injury or no contract) 
    # Go back one year to look up previous years stats
 
    # Insert Player Age
    try:
        stats_df = stats_df.loc[stats_df.Age == column.AGE]
    except Exception as e:
    
    
    
    
    
    
    # Insert Player Name
    stats_df['PLAYER_NAME'] = column.PLAYER_NAME
    
    # Append to main dataframe
    player_stats_df = player_stats_df.append(stats_df, ignore_index=True)

player_stats_df.tail()

0          21
1          22
2     2012-13
3     2013-14
4          25
5          26
6          27
7          28
8            
9            
10           
11           
Name: Age, dtype: object
http://www.basketball-reference.com/players/w/whiteha01.html
28
0          23
1          24
2          25
3          26
4     1993-94
5          28
6          29
7          29
8          29
9          30
10         31
11         32
12         33
13         34
14         35
15         35
16         35
17         36
18           
19           
20           
21           
22           
23           
24           
Name: Age, dtype: object
http://www.basketball-reference.com/players/h/hardati01.html
25
0          21
1          21
2          21
3          22
4          23
5          24
6          24
7          24
8          25
9          26
10         27
11         28
12    1988-89
13         30
14           
15           
16           
17           
18           
19           
20           
21        

Unnamed: 0,2P,2P%,2PA,3P,3P%,3PA,AST,Age,BLK,DRB,...,ORB,PF,PLAYER_NAME,PTS,Pos,STL,TOV,TRB,Tm,eFG%
84,3.6,0.46,7.9,1.3,0.372,3.4,3.2,24.0,0.6,3.3,...,0.7,2.4,Tyler Johnson,13.7,PG,1.2,1.2,4.0,MIA,0.49
85,5.0,0.513,9.8,1.1,0.356,3.0,3.1,29.0,0.9,5.9,...,1.9,2.8,Patrick Mills,16.7,PF,1.8,2.3,7.8,ATL,0.518
86,2.8,0.49,5.6,1.4,0.401,3.5,2.3,31.0,0.3,2.7,...,0.7,1.8,Courtney Lee,10.8,SG,1.1,0.9,3.4,NYK,0.533
87,3.3,0.536,6.2,0.4,0.286,1.6,3.4,37.0,1.6,6.2,...,1.9,1.3,Pau Gasol,9.7,C,0.2,1.7,8.1,SAS,0.514
88,2.0,0.651,3.1,0.8,0.362,2.3,3.4,33.0,0.5,3.3,...,0.7,1.3,Andre Iguodala,7.6,SF,1.0,0.8,4.0,GSW,0.605


In [23]:
# player_stats_df.to_csv(player_stats.csv, index=False)
player_stats_df.PLAYER_NAME

0         Russell Westbrook 
1             Stephen Curry 
2             Blake Griffin 
3              James Harden 
4                 John Wall 
5               Mike Conley 
6            Damian Lillard 
7             DeMar DeRozan 
8              Jrue Holiday 
9            Gordon Hayward 
10             Bradley Beal 
11           Andre Drummond 
12          Carmelo Anthony 
13            Nicolas Batum 
14             James Harden 
15               Al Horford 
16               Marc Gasol 
17               Kevin Love 
18               Chris Paul 
19              CJ McCollum 
20           Otto Porter Jr 
21              Rudy Gobert 
22               Kyle Lowry 
23             Steven Adams 
24    Giannis Antetokounmpo 
25             LeBron James 
26         Hassan Whiteside 
27         Chandler Parsons 
28             Kyrie Irving 
29             Jimmy Butler 
               ...          
64              Brook Lopez 
65             Gorgui Dieng 
66             Paul Millsap 
67            