## Web Scraping Basketball-Reference.com

### Load Packages

In [1]:
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

### Web Scraping

In [2]:
# Chrome webdriver
driver = webdriver.Chrome('/Users/alexcheng/Downloads/chromedriver')

In [3]:
url = "http://www.basketball-reference.com/players/a/adamsst01.html"

driver.get(url)

assert "Basketball" in driver.title

### Player Name

In [4]:
name = driver.find_elements_by_xpath("""//*[@id="meta"]/div[2]/h1""") 

### Per Game Table

In [7]:
# Share & More
driver.find_element_by_xpath("""//*[@id="all_per_game"]/div[1]/div/ul/li[1]/span""").click()

In [8]:
# Get Table as CSV (for Excel)
driver.find_element_by_xpath("""//*[@id="all_per_game"]/div[1]/div/ul/li[1]/div/ul/li[3]/button""").click()

In [9]:
# Table
driver.find_element_by_class_name("""table_outer_container""")

<selenium.webdriver.remote.webelement.WebElement (session="ef78ad5b4257f34d34047aa2d5329013", element="0.9074882648578315-4")>

In [10]:
# Capture CSV Text
per_game = driver.find_element_by_id("csv_per_game")

### Shooting Table

In [11]:
# Share & More
driver.find_element_by_xpath("""//*[@id="all_shooting"]/div[1]/div/ul/li[2]/span""").click()

In [12]:
# Get Table as CSV (for Excel)
driver.find_element_by_xpath("""//*[@id="all_shooting"]/div[1]/div/ul/li[2]/div/ul/li[3]/button""").click()

In [13]:
# Table
driver.find_element_by_class_name("""table_outer_container""")

<selenium.webdriver.remote.webelement.WebElement (session="ef78ad5b4257f34d34047aa2d5329013", element="0.9074882648578315-4")>

In [14]:
# Capture CSV Text
shooting = driver.find_element_by_id("csv_shooting")

### Advanced Stats Table

In [15]:
# Share & More
driver.find_element_by_xpath("""//*[@id="all_advanced"]/div[1]/div/ul/li[1]/span""").click()

In [16]:
# Get Table as CSV (for Excel)
driver.find_element_by_xpath("""//*[@id="all_advanced"]/div[1]/div/ul/li[1]/div/ul/li[3]/button""").click()

In [17]:
# Table
driver.find_element_by_class_name("""table_outer_container""")

<selenium.webdriver.remote.webelement.WebElement (session="ef78ad5b4257f34d34047aa2d5329013", element="0.9074882648578315-4")>

In [27]:
# Capture CSV Text
advanced = driver.find_element_by_id("csv_advanced")

### Data Cleaning

#### Player Name

In [19]:
# Cleaning the player name
for value in name:
    name = value.text
    name = str(name.encode('ascii'))

#### Per Game Table

In [20]:
# Clean the per_game table
per_game = per_game.text.encode('ascii').split()[-1][14:]
per_game

'282,218,23.2,2.9,5.2,.568,0.0,0.0,.000,2.9,5.1,.570,.568,1.4,2.5,.582,2.6,3.8,6.3,0.8,0.6,1.0,1.3,2.7,7.3'

In [21]:
per_game_list = []
for i in per_game.split(','):
    i = float(i)
    per_game_list.append(i)

print per_game_list

[282.0, 218.0, 23.2, 2.9, 5.2, 0.568, 0.0, 0.0, 0.0, 2.9, 5.1, 0.57, 0.568, 1.4, 2.5, 0.582, 2.6, 3.8, 6.3, 0.8, 0.6, 1.0, 1.3, 2.7, 7.3]


#### Shooting Table

In [22]:
# Clean the per_game table
shooting = shooting.text.encode('ascii').split()[-1][14:]
print shooting

282,6553,.568,3.1,.997,.635,.325,.034,.003,.003,.570,.649,.441,.380,.200,.000,.692,.224,278,,.000,,2,0


In [23]:
shooting_list = []

for i in shooting.split(','):
    if i == '':
        shooting_list.append(0.0)
    else:
        i = float(i)
        shooting_list.append(i)
        
print shooting_list

[282.0, 6553.0, 0.568, 3.1, 0.997, 0.635, 0.325, 0.034, 0.003, 0.003, 0.57, 0.649, 0.441, 0.38, 0.2, 0.0, 0.692, 0.224, 278.0, 0.0, 0.0, 0.0, 2.0, 0.0]


#### Advanced Stats Table

In [28]:
advanced = advanced.text.encode('ascii').split()[-1][14:]
print advanced

282,6553,14.7,.585,.003,.479,12.6,17.1,14.9,5.1,1.3,3.4,16.8,13.9,,9.3,8.7,18.0,.132,,-0.7,1.6,0.9,4.8


In [29]:
advanced_list = []

for i in advanced.split(','):
    if i == '':
        advanced_list.append(0.0)
    else:
        i = float(i)
        advanced_list.append(i)

del advanced_list[14]
del advanced_list[19]

print advanced_list

[282.0, 6553.0, 14.7, 0.585, 0.003, 0.479, 12.6, 17.1, 14.9, 5.1, 1.3, 3.4, 16.8, 13.9, 9.3, 8.7, 18.0, 0.132, 0.0, 1.6, 0.9, 4.8]


### Combine the Dataframes

In [30]:
# Combine and flatten dataframe
df = [per_game_list, shooting_list, advanced_list]

# Create player stats rows to be joined to dataframe later
player_stats = [name]
for sublist in df:
    for val in sublist:
        player_stats.append(val)
        
print player_stats

['Steven Adams', 282.0, 218.0, 23.2, 2.9, 5.2, 0.568, 0.0, 0.0, 0.0, 2.9, 5.1, 0.57, 0.568, 1.4, 2.5, 0.582, 2.6, 3.8, 6.3, 0.8, 0.6, 1.0, 1.3, 2.7, 7.3, 282.0, 6553.0, 0.568, 3.1, 0.997, 0.635, 0.325, 0.034, 0.003, 0.003, 0.57, 0.649, 0.441, 0.38, 0.2, 0.0, 0.692, 0.224, 278.0, 0.0, 0.0, 0.0, 2.0, 0.0, 282.0, 6553.0, 14.7, 0.585, 0.003, 0.479, 12.6, 17.1, 14.9, 5.1, 1.3, 3.4, 16.8, 13.9, 9.3, 8.7, 18.0, 0.132, 0.0, 1.6, 0.9, 4.8]


#### Columns

In [32]:
# Column names for the three tables
per_game_cols = ['Player', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%',
                 '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 
                 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']

shooting_cols = ['G', 'MP', 'FG%', 'AVG_DIST_FGA', '%FGA_2P', '%FGA_0-3ft', 
                 '%FGA_3-10ft','%FGA_10-16ft', '%FGA_16ft<3', '%FGA_3P', '2P%',
                 '0-3_FG%', '3-10_FG%', '10-16_FG%', '16<3_FG%', '3P%', '%ASTd_2P', 
                 '%FGA_DUNK', 'DUNKS', '%ASTd_3P', '%_CORNER3PA', '3P%_CORNER3', 
                 'HEAVE_ATT', 'HEAVE_MD']

advanced_cols = ['G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%',
                 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS',
                 'WS/48', 'OBPM', 'DPM', 'BPM', 'VORP']

# Join lists together
cols = per_game_cols + shooting_cols + advanced_cols

In [34]:
# Temporary List
tmp_list = []

In [35]:
# Append to tmp_list
tmp_list.append(player_stats)

# Create dataframe
nba_player_stats = pd.DataFrame(tmp_list, columns=cols)

### Final Dataframe

In [36]:
nba_player_stats

Unnamed: 0,Player,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,...,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DPM,BPM,VORP
0,Steven Adams,282.0,218.0,23.2,2.9,5.2,0.568,0.0,0.0,0.0,...,16.8,13.9,9.3,8.7,18.0,0.132,0.0,1.6,0.9,4.8
