# Web Scraping Pipeline

https://pypi.python.org/pypi/selenium

### Load Packages

In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas as pd
import numpy as np

#### Columns

In [2]:
# Column names for the three tables
stat_cols = ['Player', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', 
             '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 
             'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 
             'PF', 'PTS']

shoot_cols = ['AVG_DIST_FGA', '%FGA_2P', '%FGA_0-3ft', '%FGA_3-10ft', '%FGA_10-16ft',
              '%FGA_16ft<3', '%FGA_3P', '2P%', '0-3_FG%', '3-10_FG%', '10-16_FG%',
              '16<3_FG%', '3P%', '%ASTd_2P', '%FGA_DUNK', 'DUNKS', '%ASTd_3P',
              '%_CORNER3PA', '3P%_CORNER3']

adv_cols = ['MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%',
            'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS',
            'WS/48', 'OBPM', 'DPM', 'BPM', 'VORP']

# Join lists together
cols = stat_cols + shoot_cols + adv_cols

### Selenium

In [3]:
# Chrome webdriver
driver = webdriver.Chrome('/Users/alexcheng/Downloads/chromedriver')

# Website URL
url = "http://www.basketball-reference.com/players/a/adamsst01.html"

# Navigate to URL
driver.get(url)

# Assert page title has "Basketball" in it
assert "Basketball" in driver.title

#### Web Scraping

In [11]:
name = driver.find_elements_by_xpath("""//*[@id="meta"]/div[2]/h1""") 
stats = driver.find_elements_by_xpath("""//*[@id="per_game"]/tfoot/tr[1]""")
advanced = driver.find_elements_by_xpath("""//*[@id="advanced"]/tfoot/tr[1]""")
shooting = driver.find_elements_by_xpath("""//*[@id="shooting"]/tfoot/tr[1]""")

#### Data Cleaning

In [12]:
# Instantiate values and clean values
for value in name:
    name = value.text
    name = str(name.encode('ascii'))

for value in stats:
    stats = value.text
    # stats = stats[11:].encode('ascii').split()

for value in shooting:
    shoot_stats = value.text
    shoot_stats = shoot_stats.encode('ascii').split()[5:-2]

for value in advanced:
    adv_stats = value.text
    adv_stats = adv_stats.encode('ascii').split()[3:]

In [13]:
stats

u'Career NBA 282 218 23.2 2.9 5.2 .568 0.0 0.0 .000 2.9 5.1 .570 .568 1.4 2.5 .582 2.6 3.8 6.3 0.8 0.6 1.0 1.3 2.7 7.3'

In [6]:
# Combine and flatten dataframe
df = [stats, shoot_stats, adv_stats]

# Create player stats rows to be joined to dataframe later
player_stats = [name]
for sublist in df:
    for val in sublist:
        player_stats.append(val)
        
print player_stats

['Steven Adams', '282', '218', '23.2', '2.9', '5.2', '.568', '0.0', '0.0', '.000', '2.9', '5.1', '.570', '.568', '1.4', '2.5', '.582', '2.6', '3.8', '6.3', '0.8', '0.6', '1.0', '1.3', '2.7', '7.3', '3.1', '.997', '.635', '.325', '.034', '.003', '.003', '.570', '.649', '.441', '.380', '.200', '.000', '.692', '.224', '278', '.000', '6553', '14.7', '.585', '.003', '.479', '12.6', '17.1', '14.9', '5.1', '1.3', '3.4', '16.8', '13.9', '9.3', '8.7', '18.0', '.132', '-0.7', '1.6', '0.9', '4.8']


In [7]:
len(player_stats)

64

In [8]:
tmp_list = []

tmp_list.append(player_stats)
df = pd.DataFrame(tmp_list, columns=cols)

AssertionError: 66 columns passed, passed data had 64 columns

### Final Dataframe

In [None]:
df