In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import re
from unidecode import unidecode
# Goal is to 1st create CSV of year player signed contract to be used for Web Scraper

In [2]:
filename = "raw_data\\NBA_current_contracts.csv"
df = pd.read_csv(filename, parse_dates=True, encoding = "ISO-8859-1")

In [3]:
# Remove unneeded columns from df
columns_not_needed = ['POS', 'TEAM', 'GUARANTEED', '% GTD', 'DOLLARS','AVG. SALARY']
df = df.drop(columns_not_needed, axis='columns')
# Tidy FREE AGENT column to CONTRACT_END_YEAR
# Tidy YRS to CONTRACT_LENGTH_YEAR
# Tidy PLAYER to PLAYER_NAME
df.rename(columns={'FREE AGENT':'CONTRACT_END_YEAR',
                   'YRS':'CONTRACT_LENGTH_YEAR',
                   'PLAYER':'PLAYER_NAME'}, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 560 entries, 0 to 559
Data columns (total 4 columns):
PLAYER_NAME             560 non-null object
AGE                     560 non-null int64
CONTRACT_LENGTH_YEAR    560 non-null int64
CONTRACT_END_YEAR       560 non-null int64
dtypes: int64(3), object(1)
memory usage: 17.6+ KB


In [4]:
# Clean up PLAYER string
df.PLAYER_NAME = [name[0] for name in df.PLAYER_NAME.str.split('(')]
df.PLAYER_NAME.head()

0    Russell Westbrook 
1        Stephen Curry 
2        Blake Griffin 
3         James Harden 
4            John Wall 
Name: PLAYER_NAME, dtype: object

# Start on cell below to eliminate unicode text and chance to ascii

In [5]:
# In order to web scrape basketball-reference.com we need to create the below names

# Clean PLAYER_NAME to remove initials (J.J. Reddick), 
# dashes (Kidd-Gilchrist) and apostrophes (E'Twaun Moore)
df.PLAYER_NAME = df.PLAYER_NAME.str.replace('.', '')
df.PLAYER_NAME = df.PLAYER_NAME.str.replace('-', '')
df.PLAYER_NAME = df.PLAYER_NAME.str.replace("'", '')

# Remove unicode characters and change to ASCII for scraping
df.PLAYER_NAME = [unidecode(df.PLAYER_NAME[i]) for i in df.index]

# PLAYER_LAST_INITIAL representing the players last initial
df['PLAYER_LAST_INITIAL'] = [name[1][0].lower() for name in df.PLAYER_NAME.str.split(' ')]

# PLAYER_LAST_NAME_FIRST_FIVE representing the first 5 letters of a player's
# last name
df['PLAYER_LAST_NAME_FIRST_FIVE'] = [name[1][0:5].lower().rstrip() for name in df.PLAYER_NAME.str.split(' ')]

# PLAYER_FIRST_NAME_FIRST_TWO representing the first 2 letters of a player's
# first name
df['PLAYER_FIRST_NAME_FIRST_TWO'] = [name[0][0:2].lower() for name in df.PLAYER_NAME.str.split(' ')]
df.head()

Unnamed: 0,PLAYER_NAME,AGE,CONTRACT_LENGTH_YEAR,CONTRACT_END_YEAR,PLAYER_LAST_INITIAL,PLAYER_LAST_NAME_FIRST_FIVE,PLAYER_FIRST_NAME_FIRST_TWO
0,Russell Westbrook,28,5,2023,w,westb,ru
1,Stephen Curry,29,5,2022,c,curry,st
2,Blake Griffin,28,5,2022,g,griff,bl
3,James Harden,28,4,2023,h,harde,ja
4,John Wall,27,4,2023,w,wall,jo


In [6]:
# Calculate the year contract was signed as CONTRACT_SIGNED_YEAR
df['CONTRACT_SIGNED_YEAR'] = df.CONTRACT_END_YEAR - df.CONTRACT_LENGTH_YEAR

# All BEFORE_SIGNED_YEAR >= 2017 is set to 2017 as the data used in this
# notebook was gathered in 2017 so the year the contract signed is 2017
# This may occur as players already under contract may sign an extension
# The extension begins after the currenct contract ends which may be in the 
# future. 
df.loc[df.CONTRACT_SIGNED_YEAR >= 2017, 'CONTRACT_SIGNED_YEAR']= 2017

# Subtract 1 from CONTRACT_SIGNED_YEAR as BEFORE_SIGNED_YEAR 
# BEFORE_SIGNED_YEAR used to look up season stats before contract was signed
df['BEFORE_SIGNED_YEAR'] = df.CONTRACT_SIGNED_YEAR - 1

In [36]:
# Set the URL Template
url_template = 'http://www.basketball-reference.com/players/{PLAYER_LAST_INITIAL}/{PLAYER_LAST_NAME_FIRST_FIVE}{PLAYER_FIRST_NAME_FIRST_TWO}01.html'

# SLICE DF JUST FOR WORK. DONT FORGET TO DELETE ME
df2 = df #[50:100]

# Create empty main player stats dataframe
player_stats_df = pd.DataFrame()

# Create empty missing player stats dataframe
missing_players = []

# Iterate over all rows in df pulling out player name in order to input into url_template to scrape basketball-reference.com
for index, column in df2.iterrows():
    url = url_template.format(PLAYER_LAST_INITIAL=column.PLAYER_LAST_INITIAL,
                              PLAYER_LAST_NAME_FIRST_FIVE=column.PLAYER_LAST_NAME_FIRST_FIVE,
                              PLAYER_FIRST_NAME_FIRST_TWO=column.PLAYER_FIRST_NAME_FIRST_TWO)
    try:
        html = urlopen(url)
    except Exception as e:
        missing_players.append(column.PLAYER_NAME)
        print(column.PLAYER_NAME)
        print(url)

Cedi Osman 
http://www.basketball-reference.com/players/o/osmance01.html
Frank Ntilinkina 
http://www.basketball-reference.com/players/n/ntilifr01.html
Clint Capela 
http://www.basketball-reference.com/players/c/capelcl01.html
Edrice Adebayo 
http://www.basketball-reference.com/players/a/adebaed01.html
Luc Richard Mbah a Moute 
http://www.basketball-reference.com/players/r/richalu01.html
Metta World Peace 
http://www.basketball-reference.com/players/w/worldme01.html
Sheldon McLellan 
http://www.basketball-reference.com/players/m/mclelsh01.html
George De Paula 
http://www.basketball-reference.com/players/d/dege01.html
Maxi Kleber 
http://www.basketball-reference.com/players/k/klebema01.html
Naz Long 
http://www.basketball-reference.com/players/l/longna01.html


# List is short so for sake of time I will delete from df and manually add by hand later

In [37]:
print(missing_players)
for name in missing_players:
    df = df[df.PLAYER_NAME != name]
    print(name)
missing_players_df = pd.DataFrame(missing_players)
missing_players_df.to_csv('missing_players.csv')

['Cedi Osman ', 'Frank Ntilinkina ', 'Clint Capela ', 'Edrice Adebayo ', 'Luc Richard Mbah a Moute ', 'Metta World Peace ', 'Sheldon McLellan ', 'George De Paula ', 'Maxi Kleber ', 'Naz Long ']
Cedi Osman 
Frank Ntilinkina 
Clint Capela 
Edrice Adebayo 
Luc Richard Mbah a Moute 
Metta World Peace 
Sheldon McLellan 
George De Paula 
Maxi Kleber 
Naz Long 


In [46]:
df2 = df
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 550 entries, 0 to 559
Data columns (total 9 columns):
PLAYER_NAME                    550 non-null object
AGE                            550 non-null int64
CONTRACT_LENGTH_YEAR           550 non-null int64
CONTRACT_END_YEAR              550 non-null int64
PLAYER_LAST_INITIAL            550 non-null object
PLAYER_LAST_NAME_FIRST_FIVE    550 non-null object
PLAYER_FIRST_NAME_FIRST_TWO    550 non-null object
CONTRACT_SIGNED_YEAR           550 non-null int64
BEFORE_SIGNED_YEAR             550 non-null int64
dtypes: int64(5), object(4)
memory usage: 63.0+ KB


In [50]:
# Set the URL Template
url_template = 'http://www.basketball-reference.com/players/{PLAYER_LAST_INITIAL}/{PLAYER_LAST_NAME_FIRST_FIVE}{PLAYER_FIRST_NAME_FIRST_TWO}01.html'

# SLICE DF JUST FOR WORK. DONT FORGET TO DELETE ME
df2 = df #[50:100]

# Create empty main player stats dataframe
player_stats_df = pd.DataFrame()

# Create empty missing player stats dataframe
missing_player_data = []

# Iterate over all rows in df pulling out player name in order to input into url_template to scrape basketball-reference.com
for index, column in df2.iterrows():
    url = url_template.format(PLAYER_LAST_INITIAL=column.PLAYER_LAST_INITIAL,
                              PLAYER_LAST_NAME_FIRST_FIVE=column.PLAYER_LAST_NAME_FIRST_FIVE,
                              PLAYER_FIRST_NAME_FIRST_TWO=column.PLAYER_FIRST_NAME_FIRST_TWO)
    html = urlopen(url)
    soup = BeautifulSoup(html, 'html5lib')
        
    # Get player data and column headers
    data_rows = soup.findAll('tr')[1:] 
    player_data = [[td.getText() for td in data_rows[i].findAll('td')]
                       for i in range(len(data_rows))]
    try:
        column_headers = [th.getText() for th in 
                      soup.findAll('tr', limit=1)[0].findAll('th')]
    except Exception as e:
        print(column.PLAYER_NAME)
        print(e)
        print(url)
        missing_player_data.append(column.PLAYER_NAME)

Josh Jackson 
list index out of range
http://www.basketball-reference.com/players/j/jacksjo01.html
Davon Reed 
list index out of range
http://www.basketball-reference.com/players/r/reedda01.html
Justin Patton 
list index out of range
http://www.basketball-reference.com/players/p/pattoju01.html
Harry Giles 
list index out of range
http://www.basketball-reference.com/players/g/gilesha01.html
Frank Jackson 
list index out of range
http://www.basketball-reference.com/players/j/jacksfr01.html
Tyler Lydon 
list index out of range
http://www.basketball-reference.com/players/l/lydonty01.html
Rade Zagorac 
list index out of range
http://www.basketball-reference.com/players/z/zagorra01.html
Thomas Bryant 
list index out of range
http://www.basketball-reference.com/players/b/bryanth01.html
Cameron Oliver 
list index out of range
http://www.basketball-reference.com/players/o/oliveca01.html
Kennedy Meeks 
list index out of range
http://www.basketball-reference.com/players/m/meekske01.html
LJ Peak 


In [51]:
missing_player_data

['Josh Jackson ',
 'Davon Reed ',
 'Justin Patton ',
 'Harry Giles ',
 'Frank Jackson ',
 'Tyler Lydon ',
 'Rade Zagorac ',
 'Thomas Bryant ',
 'Cameron Oliver ',
 'Kennedy Meeks ',
 'LJ Peak ',
 'Jeremy Morgan ',
 'Luke Petrasek ',
 'Erik McCree ',
 'Landry Nnoko ',
 'Jeremy Senglin ',
 'Milton Doyle ',
 'Jordan Mathews ',
 'Alex Hamilton ',
 'Jacob Pullen ',
 'Akil Mitchell ',
 'Bryce Alford ',
 'Rashawn Thomas ',
 'Amida Brimah ',
 'Taylor Braun ',
 'Bronson Koenig ',
 'TJ Williams ',
 'Trey McKinneyJones ',
 'Isaiah Hicks ',
 'Tyrone Wallace ',
 'Nigel Hayes ',
 'Ben Moore ',
 'Jamel Artis ',
 'Antonius Cleveland ',
 'Xavier RathanMayes ',
 'Andrew White ',
 'Daniel Dixon ',
 'Troy Caupain ',
 'LaDontae Henton ',
 'Peter Jok ',
 'London Perrantes ',
 'Amile Jefferson ',
 'Isaiah Briscoe ',
 'Melo Trimble ',
 'Terry Henderson ',
 'VJ Beachem ',
 'Luke Kornet ']

In [None]:
    stats_df = pd.DataFrame(player_data, columns=column_headers[1:])

# REAL CODE BELOW

In [16]:
# Set the URL Template
url_template = 'http://www.basketball-reference.com/players/{PLAYER_LAST_INITIAL}/{PLAYER_LAST_NAME_FIRST_FIVE}{PLAYER_FIRST_NAME_FIRST_TWO}01.html'

# SLICE DF JUST FOR WORK. DONT FORGET TO DELETE ME
df2 = df #[50:100]

# Create empty main player stats dataframe
player_stats_df = pd.DataFrame()

# Create empty missing player stats dataframe
missing_stats_df = pd.DataFrame()
missing_stats_df['PLAYER_NAME'] = ''

# Iterate over all rows in df pulling out player name in order to input into url_template to scrape basketball-reference.com
for index, column in df2.iterrows():
    url = url_template.format(PLAYER_LAST_INITIAL=column.PLAYER_LAST_INITIAL,
                              PLAYER_LAST_NAME_FIRST_FIVE=column.PLAYER_LAST_NAME_FIRST_FIVE,
                              PLAYER_FIRST_NAME_FIRST_TWO=column.PLAYER_FIRST_NAME_FIRST_TWO)
    try:
        html = urlopen(url)
    except Exception as e:
        missing_stats_df['PLAYER_NAME'].append(column.PLAYER_NAME)
    soup = BeautifulSoup(html, 'html5lib')
    
    # Get player data and column headers
    data_rows = soup.findAll('tr')[1:] 
    player_data = [[td.getText() for td in data_rows[i].findAll('td')]
                for i in range(len(data_rows))]
    
    # Capture players that do not have webpages in missing player stats dataframe
    try:
        column_headers = [th.getText() for th in 
                      soup.findAll('tr', limit=1)[0].findAll('th')]
    except Exception as e:
        missing_stats_df['PLAYER_NAME'].append(column.PLAYER_NAME)
        missing_stats_df['e'].append(e)
        missing_stats_df['url'].append(url)
    
    stats_df = pd.DataFrame(player_data, columns=column_headers[1:])
    
    # Eliminate non-age entries in Age column by removing any rows with more 
    # then 2 characters as age is 2 character entry (25, 28...)
    # note these entries use the season instead of player age as the player 
    # did not play in NBA due to injury or not being under contract
    try:
        non_age = [key for key, value in stats_df.Age.iteritems() if len(value)==2]
        stats_df = stats_df.iloc[non_age]
    except Exception as e:
            print(type(stats_df.Age))
            print(url)
            print(column.AGE)
            print(e)  
    stats_df.Age = pd.to_numeric(stats_df.Age)
    
    # If no data exists for the age in which a player signed his contract that 
    # means he was not playing in the NBA that year (injury or no contract) 
    # Go back one year to look up previous years stats
    #contract_age = column.AGE    
    
    #for age in stats_df.Age:
    #    try:
    #        stats_df = stats_df.loc[age == contract_age]
    #    except Exception as e:
    #        print(url)
    #        print(contract_age)
    #        contract_age -= 1
        

   # Insert Player Name
#    stats_df['PLAYER_NAME'] = column.PLAYER_NAME
    
    # Append to main dataframe
#    player_stats_df = player_stats_df.append(stats_df, ignore_index=True)

#player_stats_df.tail()

TypeError: cannot concatenate a non-NDFrame object

In [None]:
# player_stats_df.to_csv(player_stats.csv, index=False)
player_stats_df.PLAYER_NAME