**The purpose of this notebook is to crawl through a list of pitchers and grab all seasons with a listed salary for the seasons between 1985-2017 (inclusive)**

In [12]:
import sys
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from selenium import webdriver
import os
import pickle
import string

from datetime import datetime

sns.set()
%matplotlib inline

delay_speed = 3 # seconds

In [13]:
chromedriver = "/Applications/chromedriver" # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver


In [14]:
# Data cells
root_url = 'https://www.baseball-reference.com' #player url starts with a /

lower_bound_year = 1985
upper_bound_year = 2017

team_codes = ['ARI', 'ATL', 'BAL', 'BOS', 'CHC', 'CHW', 'CIN', 'CLE', 'COL', 'DET', 'FLA', 'HOU', 'KCR',
             'ANA', 'LAD', 'MIL', 'MIN', 'NYM', 'NYY', 'OAK', 'PHI', 'PIT', 'SD', 'SF', 'STL', 'SEA', 'TBD',
             'TEX', 'TOR', 'WSN']



In [15]:
with open ('allpitcherurls', 'rb') as fp:
    all_pitches_urls = pickle.load(fp)

In [16]:
#Partition the players by alphabet so we can save in chunks in case something goes wrong (or we run out of memory)
all_pitches_urls = sorted(list(all_pitches_urls))
alphabet_strings = {url.split('/')[2] for url in all_pitches_urls}
partitioned_pitchers_by_alphabet = {alpha_char: [] for alpha_char in alphabet_strings}
for url in all_pitches_urls:
    partitioned_pitchers_by_alphabet[url.split('/')[2]].append(url)


In [17]:
def get_text_from_BR_url(url):
    driver.get(url)
    #if response.status_code == 200:
    return driver.page_source

def br_wait():
    time.sleep(delay_speed)
    
def get_deepest_node(soup_element):
    descendents = soup_element.descendents
    if descendents:
        return descendents[-1]
    else:
        return soup_element
    
def get_player_page_soup(player_url):
    full_url = root_url + player_url
    page_text = get_text_from_BR_url(full_url)
    if not page_text:
        return None
    
    soup = BeautifulSoup(page_text, "lxml")
    return soup

def is_valid_stat_row(row_soup):
    row_classes = set(row_soup.get("class"))
    # hidden rows are minor league rows, spacer rows are for years out of the league (injury, suspension, etc)
    invalid_row_classes = {'hidden','spacer'}
    # If these sets have an intersection, that means row_classes is not a stat row
    return len(invalid_row_classes.intersection(row_classes)) == 0

def id_from_url(url):
    #'/players/m/maddugr01.shtml'
    return url.split('/')[-1].split('.')[0]

In [55]:
#header scraping code
id_player_header = 'info'

def get_player_header(soup):
    return soup.find('div', id=id_player_header)

def get_name(soup):
    return soup.text
    
def get_throwing_hand(soup):
    throws_elem = soup.find('strong', text='Throws: ')
    return throws_elem.next_sibling.split()[0]

def get_weight_height(soup):
    height_weight = soup.find_all('span')
    #0 = height, 1 = weight
    return height_weight[0].text, height_weight[1].text

def get_weight_height_index(children):
    regular_index = 3
    index_if_misc_note = 4
    
    potential_height_weight = children[regular_index].find_all('span')
    if len(potential_height_weight) == 0:
        return index_if_misc_note
    else:
        return regular_index

def scrape_header(soup):
    header_soup = get_player_header(soup)
    
    anchor_element = None
    head_children = header_soup.findChildren(recursive=False)[0].findChildren(recursive=False)
    # Need to offset our indices for players without pictures
    first_child_classes = head_children[0].get("class")
    if first_child_classes is not None and 'media-item' in first_child_classes:
        anchor_element = head_children[1]
    else:
        anchor_element = head_children[0]
    
    children = anchor_element.findChildren(recursive=False)
    
    name_index = 0
    bats_throws_index = 2
    hw_index = get_weight_height_index(children)
    
    
    name_node = children[name_index]
    name = get_name(name_node)
    
    bats_throws_node = children[bats_throws_index]
    throwing_hand = get_throwing_hand(bats_throws_node)
    
    weight_height_node = children[hw_index]
    height, weight = get_weight_height(weight_height_node)
    return [name, throwing_hand, height, weight]


In [46]:
# Collecting Pitchers

id_pitching_table = 'pitching_standard'
id_pitching_value = 'pitching_value'

def get_pitching_standard_table(soup):
    return soup.find('table', id=id_pitching_table)

def get_pitching_value_table(soup):
    return soup.find('table', id=id_pitching_value)

def scrape_salaries(soup):
    year_salary_dict = {}
    value_table_soup = get_pitching_value_table(soup)
    
    if value_table_soup:
        for row in value_table_soup.find('tbody').findChildren(recursive=False): # tr elements
            if is_valid_stat_row(row):
                cells = row.findChildren(recursive=False)
                year = int(cells[0].text)
                # First check if this is a row for 1985-2017
                if year >= lower_bound_year and year <= upper_bound_year:
                    # last cell is awards, second to last is salary
                    salary = cells[-2].text
                    if salary:
                        stripped_slary = salary.strip()
                        if len(stripped_slary) > 0:
                            year_salary_dict[year] = stripped_slary
                    
    return year_salary_dict

def scrape_yearly_pitching_stats(soup):
    found_salaries = scrape_salaries(soup)
    # we only want to bother with the stat table if they have at least one season with a listed salary
    if found_salaries:
        standard_pitching_stats = scrape_standard_pitching_stats(soup, found_salaries)
        return standard_pitching_stats
    else:
        return None

def scrape_standard_pitching_stats(soup, salaries):
    table = get_pitching_standard_table(soup)
    if table:
        rows_of_stats = []
        for row in table.find('tbody').findChildren(recursive=False): # tr elements
            if is_valid_stat_row(row):
                cells = row.findChildren(recursive=False)
                year = int(cells[0].text)
                # don't bother scraping the row unless we have a salary for that year
                if year in salaries:
                    stats_for_year = [get_deepest_node(stat).text for stat in cells[1:]]
                    #add the year as the first element
                    stats_for_year.insert(0, year)
                    #add the salary as the last element
                    stats_for_year.append(salaries[year])
                    
                    rows_of_stats.append(stats_for_year)
                    
        return rows_of_stats
    return None
                    
    
def scrape_pitcher_career_stats(player_url):
    soup = get_player_page_soup(player_url)
    yearly_stats = scrape_yearly_pitching_stats(soup)
    # yearly_stats will be None if no salary data exists (cases such as players who only ever got a midseason callup)
    if yearly_stats:
        player_id = id_from_url(player_url)
        header = scrape_header(soup)
        header.insert(0,player_id)
        # now we merge the pitcher_header level stats (height, weight, etc) with the yearly stats for pandas
        full_list = [header + stat_list for stat_list in yearly_stats]
        return full_list
    else:
        return None

In [20]:
def scrape_pitcher_standard_headers():
    soup = get_player_page_soup('/players/m/maddugr01.shtml')
    table = get_pitching_standard_table(soup)
    if table:
        children = table.find('thead').find('tr').findChildren()
        return [child.text for child in children]
    else:
        return None

In [21]:
driver = webdriver.Chrome(chromedriver)
stat_column_headers = scrape_pitcher_standard_headers()

player_main_information_headers = ['ID', 'NAME','THROWS','HEIGHT','WEIGHT']
full_list_of_columns = player_main_information_headers + stat_column_headers + ['SALARY']
#make them all caps
full_list_of_columns = [x.upper() for x in full_list_of_columns]


time_benchmarking = ['START: {0}'.format(str(datetime.now()))]
for letter, urls in partitioned_pitchers_by_alphabet.items():
    
    full_stats = []
    for url in urls:
        print(url)
        new_stats = scrape_pitcher_career_stats(url)
        if new_stats is not None and new_stats:
            full_stats = full_stats + new_stats
        br_wait()
    
    pitchers_df = pd.DataFrame(full_stats, columns=full_list_of_columns)
    pitchers_df = pitchers_df.apply(pd.to_numeric, errors='ignore')
    csv_name = 'pitchers/pitchers_{0}.csv'.format(letter)
    pitchers_df.to_csv(csv_name)
    time_benchmarking.append('{0} COMPLETE: {1}'.format(letter, str(datetime.now())))

time_benchmarking.append('SCRAPE COMPLETE: {0}'.format(str(datetime.now())))


/players/l/labouja01.shtml
/players/l/lackejo01.shtml
/players/l/lacosmi01.shtml
/players/l/lacyke01.shtml
/players/l/laddpe01.shtml
/players/l/ladenty01.shtml
/players/l/laffeaa01.shtml
/players/l/lafrobo01.shtml
/players/l/lahtije01.shtml
/players/l/lakerti01.shtml
/players/l/lamarry01.shtml
/players/l/lambech01.shtml
/players/l/lambjo02.shtml
/players/l/lametdi01.shtml
/players/l/lampde01.shtml
/players/l/lancale01.shtml
/players/l/landrbi01.shtml
/players/l/laneja01.shtml
/players/l/langfri01.shtml
/players/l/langsma01.shtml
/players/l/langwma01.shtml
/players/l/lankffr01.shtml
/players/l/lannajo01.shtml
/players/l/lapoida01.shtml
/players/l/laraju01.shtml
/players/l/larayo01.shtml
/players/l/larkian01.shtml
/players/l/larocad01.shtml
/players/l/laskebi01.shtml
/players/l/lathabi01.shtml
/players/l/latosma01.shtml
/players/l/lavelga01.shtml
/players/l/lawde01.shtml
/players/l/lawrebr02.shtml
/players/l/lawreca01.shtml
/players/l/lawrese01.shtml
/players/l/lawva01.shtml
/players/l/l

/players/c/chafian01.shtml
/players/c/chambjo03.shtml
/players/c/chapida01.shtml
/players/c/chapmar01.shtml
/players/c/chapmja02.shtml
/players/c/chapmke02.shtml
/players/c/chargjt01.shtml
/players/c/charlno01.shtml
/players/c/chatwty01.shtml
/players/c/chavean01.shtml
/players/c/chaveje01.shtml
/players/c/checoro01.shtml
/players/c/chenbr01.shtml
/players/c/chenwe02.shtml
/players/c/cherrro01.shtml
/players/c/chiamsc01.shtml
/players/c/chiassc01.shtml
/players/c/chicktr01.shtml
/players/c/chicoma01.shtml
/players/c/childja01.shtml
/players/c/childma01.shtml
/players/c/childro01.shtml
/players/c/chitrst01.shtml
/players/c/choatra01.shtml
/players/c/choji01.shtml
/players/c/chouibo01.shtml
/players/c/chrisja01.shtml
/players/c/chrismi02.shtml
/players/c/chrisni01.shtml
/players/c/christi01.shtml
/players/c/chulkvi01.shtml
/players/c/ciardma01.shtml
/players/c/cimorfr01.shtml
/players/c/cingrto01.shtml
/players/c/cirilje01.shtml
/players/c/cishest01.shtml
/players/c/cisnejo01.shtml
/play

/players/f/frankwa01.shtml
/players/f/frascjo01.shtml
/players/f/frasewi01.shtml
/players/f/frasoja01.shtml
/players/f/frazige01.shtml
/players/f/fredeke01.shtml
/players/f/fredrsc01.shtml
/players/f/freelky01.shtml
/players/f/freemju02.shtml
/players/f/freemma02.shtml
/players/f/freemmi01.shtml
/players/f/freemsa01.shtml
/players/f/frenclu01.shtml
/players/f/freyst01.shtml
/players/f/friasca01.shtml
/players/f/friedch01.shtml
/players/f/friedma01.shtml
/players/f/frierer01.shtml
/players/f/frohwto01.shtml
/players/f/frutoem01.shtml
/players/f/fryja01.shtml
/players/f/fuentbr01.shtml
/players/f/fujikky01.shtml
/players/f/fukumka01.shtml
/players/f/fulchje01.shtml
/players/f/fuldsa01.shtml
/players/f/fulmeca01.shtml
/players/f/fulmemi01.shtml
/players/f/fultobi01.shtml
/players/f/fultzaa01.shtml
/players/f/funkto01.shtml
/players/f/furbuch01.shtml
/players/f/fussech01.shtml
/players/f/fyhrimi01.shtml
/players/d/d'amije01.shtml
/players/d/d'amije02.shtml
/players/d/daalom01.shtml
/player

/players/o/oswalro01.shtml
/players/o/oteroda01.shtml
/players/o/otsukak01.shtml
/players/o/ottavad01.shtml
/players/o/ottoda01.shtml
/players/o/outmajo01.shtml
/players/o/overbly01.shtml
/players/o/overtdi01.shtml
/players/o/owchibo01.shtml
/players/o/owenshe01.shtml
/players/o/owenshe02.shtml
/players/o/owensru01.shtml
/players/o/owingmi01.shtml
/players/o/ownberi01.shtml
/players/o/oxsprch01.shtml
/players/z/zachrpa01.shtml
/players/z/zagurmi01.shtml
/players/z/zahnge01.shtml
/players/z/zambrca01.shtml
/players/z/zambrvi01.shtml
/players/z/zaratma01.shtml
/players/z/zastrro01.shtml
/players/z/zavadcl01.shtml
/players/z/zavarcl01.shtml
/players/z/zeidjo01.shtml
/players/z/zeileto01.shtml
/players/z/zerbech01.shtml
/players/z/zieglbr01.shtml
/players/z/ziemst01.shtml
/players/z/zimmeje02.shtml
/players/z/zimmejo01.shtml
/players/z/zimmejo02.shtml
/players/z/zinkch01.shtml
/players/z/zitoba01.shtml
/players/z/zumayjo01.shtml
/players/z/zychto01.shtml
/players/k/kahnlto01.shtml
/players

/players/p/pintori01.shtml
/players/p/piscima01.shtml
/players/p/pittsji01.shtml
/players/p/pivetni01.shtml
/players/p/planter01.shtml
/players/p/plaweke01.shtml
/players/p/plesada01.shtml
/players/p/plunker01.shtml
/players/p/plutkad01.shtml
/players/p/plympje01.shtml
/players/p/politcl01.shtml
/players/p/polleda01.shtml
/players/p/pomerdr01.shtml
/players/p/pomerst01.shtml
/players/p/ponsosi01.shtml
/players/p/pooleji02.shtml
/players/p/porceri01.shtml
/players/p/poredaa01.shtml
/players/p/portech01.shtml
/players/p/portuma01.shtml
/players/p/porzimi01.shtml
/players/p/potelo01.shtml
/players/p/pottsmi01.shtml
/players/p/poundbr01.shtml
/players/p/povsema01.shtml
/players/p/powelbr01.shtml
/players/p/powelde01.shtml
/players/p/powelja04.shtml
/players/p/powelje01.shtml
/players/p/powelro01.shtml
/players/p/powerte01.shtml
/players/p/prattan01.shtml
/players/p/pressry01.shtml
/players/p/pricebr10.shtml
/players/p/priceda01.shtml
/players/p/pricejo02.shtml
/players/p/priesed01.shtml
/p

/players/g/garrist01.shtml
/players/g/gartory01.shtml
/players/g/garzama01.shtml
/players/g/gassnda01.shtml
/players/g/gastjo01.shtml
/players/g/gaubjo01.shtml
/players/g/gaudich01.shtml
/players/g/gausmke01.shtml
/players/g/gavigsa01.shtml
/players/g/gearrco01.shtml
/players/g/gearyge01.shtml
/players/g/geedi01.shtml
/players/g/geerjo01.shtml
/players/g/geiseda01.shtml
/players/g/geltzst01.shtml
/players/g/gennesc01.shtml
/players/g/gentrcr01.shtml
/players/g/georgch01.shtml
/players/g/georgch02.shtml
/players/g/germado01.shtml
/players/g/germafr01.shtml
/players/g/germaju01.shtml
/players/g/germego01.shtml
/players/g/gervasa01.shtml
/players/g/gibsobo02.shtml
/players/g/gibsoky01.shtml
/players/g/gibsopa01.shtml
/players/g/gideobr01.shtml
/players/g/gieseda01.shtml
/players/g/gileske01.shtml
/players/g/gilfija01.shtml
/players/g/gilleto02.shtml
/players/g/gilmase01.shtml
/players/g/gimench01.shtml
/players/g/gintema01.shtml
/players/g/giolilu01.shtml
/players/g/giovaed01.shtml
/playe

/players/m/manaese01.shtml
/players/m/manesse01.shtml
/players/m/mannich01.shtml
/players/m/mannida01.shtml
/players/m/mannji01.shtml
/players/m/manonju01.shtml
/players/m/manonra01.shtml
/players/m/manshje01.shtml
/players/m/mantema01.shtml
/players/m/mantijo01.shtml
/players/m/manueba01.shtml
/players/m/manuero01.shtml
/players/m/manzajo01.shtml
/players/m/manzara01.shtml
/players/m/mapledi01.shtml
/players/m/marakpa01.shtml
/players/m/marcush01.shtml
/players/m/marimsu01.shtml
/players/m/marinjh01.shtml
/players/m/mariomi01.shtml
/players/m/marksju01.shtml
/players/m/marksma01.shtml
/players/m/marmoca01.shtml
/players/m/maronni01.shtml
/players/m/marotmi01.shtml
/players/m/marquge01.shtml
/players/m/marquis01.shtml
/players/m/marquja01.shtml
/players/m/marquje01.shtml
/players/m/marshbr01.shtml
/players/m/marshev01.shtml
/players/m/marshja01.shtml
/players/m/marshse01.shtml
/players/m/marsosa01.shtml
/players/m/martean01.shtml
/players/m/marteda01.shtml
/players/m/martefr01.shtml
/p

IndexError: list index out of range

In [22]:
time_benchmarking

['START: 2018-01-27 16:30:09.737425',
 'l COMPLETE: 2018-01-27 16:45:40.151534',
 'q COMPLETE: 2018-01-27 16:46:14.155163',
 'c COMPLETE: 2018-01-27 17:10:25.139594',
 'f COMPLETE: 2018-01-27 17:22:44.151287',
 'd COMPLETE: 2018-01-27 17:40:39.274916',
 'o COMPLETE: 2018-01-27 17:47:02.371573',
 'z COMPLETE: 2018-01-27 17:48:50.569941',
 'k COMPLETE: 2018-01-27 17:58:59.252835',
 'p COMPLETE: 2018-01-27 18:15:13.136324',
 'n COMPLETE: 2018-01-27 18:21:30.267155',
 'y COMPLETE: 2018-01-27 18:23:23.810476',
 'i COMPLETE: 2018-01-27 18:24:41.853260',
 'v COMPLETE: 2018-01-27 18:31:01.405120',
 'g COMPLETE: 2018-01-27 18:49:09.941607',
 'e COMPLETE: 2018-01-27 18:55:34.985330']

In [56]:
driver = webdriver.Chrome(chromedriver)
time_benchmarking3 = ['START: {0}'.format(str(datetime.now()))]
remaining = ['h', 'm']
for letter, urls in partitioned_pitchers_by_alphabet.items():
    if letter in remaining:
        full_stats = []
        for url in urls:
            print(url)
            new_stats = scrape_pitcher_career_stats(url)
            if new_stats is not None and new_stats:
                full_stats = full_stats + new_stats
            br_wait()
    
        pitchers_df = pd.DataFrame(full_stats, columns=full_list_of_columns)
        pitchers_df = pitchers_df.apply(pd.to_numeric, errors='ignore')
        csv_name = 'pitchers/pitchers_{0}.csv'.format(letter)
        pitchers_df.to_csv(csv_name)
        time_benchmarking3.append('{0} COMPLETE: {1}'.format(letter, str(datetime.now())))

time_benchmarking3.append('SCRAPE COMPLETE: {0}'.format(str(datetime.now())))

/players/m/mabeuch01.shtml
/players/m/mabryjo01.shtml
/players/m/macdomi01.shtml
/players/m/macdoro01.shtml
/players/m/machaan02.shtml
/players/m/machaju01.shtml
/players/m/machije01.shtml
/players/m/mackto01.shtml
/players/m/maclaev01.shtml
/players/m/macrasc01.shtml
/players/m/maddemi01.shtml
/players/m/maddemo01.shtml
/players/m/maddoau01.shtml
/players/m/maddugr01.shtml
/players/m/maddumi01.shtml
/players/m/madrial01.shtml
/players/m/madribo01.shtml
/players/m/madriwa01.shtml
/players/m/madsory01.shtml
/players/m/madurca01.shtml
/players/m/maedake01.shtml
/players/m/magilma01.shtml
/players/m/magnami01.shtml
/players/m/magnida01.shtml
/players/m/magnutr01.shtml
/players/m/magrajo01.shtml
/players/m/mahayro01.shtml
/players/m/mahlegr01.shtml
/players/m/mahlemi01.shtml
/players/m/mahleri01.shtml
/players/m/mahlety01.shtml
/players/m/maholpa01.shtml
/players/m/mahompa01.shtml
/players/m/maiermi01.shtml
/players/m/mainejo01.shtml
/players/m/mainesc01.shtml
/players/m/maireos01.shtml
/p

/players/m/mitchbr01.shtml
/players/m/mitchch01.shtml
/players/m/mitchdj01.shtml
/players/m/mitchjo02.shtml
/players/m/mitchla01.shtml
/players/m/mitrese01.shtml
/players/m/mlickda01.shtml
/players/m/mmahake01.shtml
/players/m/mockga01.shtml
/players/m/moehlbr01.shtml
/players/m/moellde01.shtml
/players/m/mohlemi01.shtml
/players/m/mohorda01.shtml
/players/m/molinga01.shtml
/players/m/molledu01.shtml
/players/m/mollsa01.shtml
/players/m/monasca01.shtml
/players/m/montafr02.shtml
/players/m/montara01.shtml
/players/m/monteag01.shtml
/players/m/montejo01.shtml
/players/m/montemi01.shtml
/players/m/montera01.shtml
/players/m/monteri01.shtml
/players/m/montgje01.shtml
/players/m/montgjo01.shtml
/players/m/montgmi01.shtml
/players/m/montgst01.shtml
/players/m/moodyer01.shtml
/players/m/moonebi01.shtml
/players/m/moorean02.shtml
/players/m/moorebo01.shtml
/players/m/moorebr01.shtml
/players/m/mooredo01.shtml
/players/m/moorema01.shtml
/players/m/moorema02.shtml
/players/m/mooremi01.shtml
/pl

/players/h/hirshja01.shtml
/players/h/hitchst01.shtml
/players/h/hochelu01.shtml
/players/h/hodgeke01.shtml
/players/h/hodgetr01.shtml
/players/h/hoeyja02.shtml
/players/h/hoffmgu01.shtml
/players/h/hoffmje02.shtml
/players/h/hoffmtr01.shtml
/players/h/holadbr01.shtml
/players/h/holdejo02.shtml
/players/h/holdrda01.shtml
/players/h/holdzjo01.shtml
/players/h/hollaal01.shtml
/players/h/hollade01.shtml
/players/h/hollagr01.shtml
/players/h/hollama01.shtml
/players/h/hollije01.shtml
/players/h/holmabr01.shtml
/players/h/holmabr02.shtml
/players/h/holmash01.shtml
/players/h/holmbda01.shtml
/players/h/holmeda01.shtml
/players/h/holtch01.shtml
/players/h/holtobr01.shtml
/players/h/holtty01.shtml
/players/h/holtzmi01.shtml
/players/h/holzema01.shtml
/players/h/honeyri01.shtml
/players/h/hookch01.shtml
/players/h/hootobu01.shtml
/players/h/hoovejj01.shtml
/players/h/hoovejo02.shtml
/players/h/hopejo01.shtml
/players/h/horgajo01.shtml
/players/h/horsmvi01.shtml
/players/h/horstje01.shtml
/playe

In [57]:
time_benchmarking3

['START: 2018-01-27 22:37:52.970948',
 'm COMPLETE: 2018-01-27 23:12:39.665802',
 'h COMPLETE: 2018-01-27 23:34:38.905376',
 'SCRAPE COMPLETE: 2018-01-27 23:34:38.905560']