**The purpose of this notebook is to crawl the baseball-reference standard pitching/batting pages for 1985-2017 to obtain a list of links to all active pitchers and batters in that timeframe**

In [75]:
import sys
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from selenium import webdriver
import os
import pickle
# Apply seaborn's default styling to subsequent matplotlib figures.
sns.set()
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Pause between consecutive page fetches; br_wait() sleeps for this long.
delay_speed = 3 # seconds

In [51]:
# Path to the chromedriver executable. The CHROMEDRIVER_PATH environment
# variable overrides it so the notebook is not tied to one machine's layout;
# the original macOS location is kept as the default.
chromedriver = os.environ.get("CHROMEDRIVER_PATH", "/Applications/chromedriver")
# Publish the chosen path under the key the original cell wrote to.
os.environ["webdriver.chrome.driver"] = chromedriver


In [52]:
# Scrape targets: per-season URL templates, the seasons to cover, and team codes.
# '{0}' in each template is replaced by the four-digit season year.
batting_url = 'https://www.baseball-reference.com/leagues/MLB/{0}-standard-batting.shtml'
pitching_url = 'https://www.baseball-reference.com/leagues/MLB/{0}-standard-pitching.shtml'

# Seasons 1985 through 2017 inclusive (range() excludes its upper bound).
years_to_collect_players_from = range(1985, 2017 + 1)

# One fully-formed URL per season, in chronological order.
batting_urls = list(map(batting_url.format, years_to_collect_players_from))
pitching_urls = list(map(pitching_url.format, years_to_collect_players_from))

# Team abbreviations, kept in the original order.
team_codes = (
    'ARI ATL BAL BOS CHC CHW CIN CLE COL DET FLA HOU KCR '
    'ANA LAD MIL MIN NYM NYY OAK PHI PIT SD SF STL SEA TBD '
    'TEX TOR WSN'
).split()



In [53]:
def get_text_from_BR_url(url):
    """Load ``url`` in the shared browser and return the resulting page source.

    Relies on the module-level ``driver`` object, which must already exist
    when this function is called.
    """
    driver.get(url)
    html = driver.page_source
    return html

def br_wait():
    """Block for ``delay_speed`` seconds (the module-level pause setting)."""
    pause_seconds = delay_speed
    time.sleep(pause_seconds)
    
def get_deepest_node(soup_element):
    """Return the last descendant of ``soup_element``, or the element itself.

    The element's ``descendants`` attribute is an iterable (a generator in
    BeautifulSoup), so it is materialised into a list before the final entry
    is taken. For a chain of singly nested tags the last descendant is the
    innermost node.

    Args:
        soup_element: an object that may expose a ``descendants`` iterable.

    Returns:
        The final item yielded by ``soup_element.descendants``; if the
        attribute is missing or yields nothing, ``soup_element`` unchanged.
    """
    # getattr with a default keeps leaf nodes that have no ``descendants``
    # attribute from raising.
    nodes = list(getattr(soup_element, 'descendants', None) or ())
    if nodes:
        return nodes[-1]
    return soup_element

In [66]:
# Collecting Pitchers
# HTML id attribute of the per-player pitching table on a season page.
id_pitching_table = 'players_standard_pitching'

def get_player_pitching_table(soup):
    """Look up the <table> whose id equals ``id_pitching_table`` in ``soup``.

    Returns whatever ``soup.find`` returns for that query.
    """
    table = soup.find('table', id=id_pitching_table)
    return table

def get_year_pitching_page_soup(year_url):
    """Fetch ``year_url`` and parse it with the lxml parser.

    Returns:
        A BeautifulSoup object for the fetched page, or None when the fetch
        produced no text.
    """
    page_text = get_text_from_BR_url(year_url)
    return BeautifulSoup(page_text, "lxml") if page_text else None

def scrape_year_pitching_table(year_url):
    """Collect the player-page hrefs listed in one season's pitching table.

    Args:
        year_url: URL of a season's standard-pitching page.

    Returns:
        A set of href strings taken from the link in the second cell of each
        table row, or None when the page could not be fetched or the pitching
        table is not present in it.
    """
    soup = get_year_pitching_page_soup(year_url)
    if soup is None:
        # The page helper returns None for an empty fetch; stop here rather
        # than fail on ``None.find`` further down.
        return None

    pitching_table = get_player_pitching_table(soup)
    if not pitching_table:
        return None

    pitcher_urls = set()
    table_body = pitching_table.find('tbody')
    if table_body is None:
        # Table present but without a body: nothing to collect.
        return pitcher_urls

    for row in table_body.findChildren(recursive=False):
        cells = row.findChildren(recursive=False)
        if len(cells) < 2:
            # Row too short to hold a player cell.
            continue
        link = cells[1].find('a')  # Doesn't exist if it's a 'league average' row
        href = link.get('href') if link else None
        if href:
            pitcher_urls.add(href)
    return pitcher_urls

In [67]:
# Crawl every season's pitching page and accumulate the distinct player links.
driver = webdriver.Chrome(chromedriver)
all_pitches_urls = set()
try:
    for pitching_year in pitching_urls:
        pitchers = scrape_year_pitching_table(pitching_year)
        if pitchers is None:
            # The scraper returns None when the page or table is missing;
            # report it and carry on instead of failing on ``set | None``.
            print('No pitching table found for', pitching_year)
        else:
            all_pitches_urls |= pitchers
        br_wait()
finally:
    # Always close the browser so a failed or finished crawl does not leave
    # a Chrome process behind. A later crawl must create a fresh driver.
    driver.quit()

all_pitches_urls

{'/players/p/pulidca01.shtml',
 '/players/s/smoltjo01.shtml',
 '/players/s/sturtta01.shtml',
 '/players/f/finlech01.shtml',
 '/players/r/ramirer01.shtml',
 '/players/l/lambech01.shtml',
 '/players/s/salasma02.shtml',
 '/players/g/grahajr01.shtml',
 '/players/c/carrahe01.shtml',
 '/players/m/millesh01.shtml',
 '/players/g/gaettga01.shtml',
 '/players/h/heredwi01.shtml',
 '/players/s/sanchal02.shtml',
 '/players/s/santama01.shtml',
 '/players/o/overbly01.shtml',
 '/players/r/roberke02.shtml',
 '/players/g/greench03.shtml',
 '/players/o/olsensc01.shtml',
 '/players/r/rossco01.shtml',
 '/players/b/bogusbr01.shtml',
 '/players/g/gurala01.shtml',
 '/players/k/kimbaco01.shtml',
 '/players/d/donnebr01.shtml',
 '/players/p/paulida01.shtml',
 '/players/m/morrima01.shtml',
 '/players/e/escobed01.shtml',
 '/players/k/kimbrcr01.shtml',
 '/players/n/navarja01.shtml',
 '/players/m/mendolu01.shtml',
 '/players/h/hynesco01.shtml',
 '/players/b/bedrost01.shtml',
 '/players/g/gibsopa01.shtml',
 '/players

In [68]:
# Number of distinct pitcher links held in all_pitches_urls.
len(all_pitches_urls)

4163

In [76]:

#with open('allpitcherurls', 'wb') as fp:
 #   pickle.dump(all_pitches_urls, fp)

In [None]:
#with open ('allpitcherurls', 'rb') as fp:
 #   all_pitches_urls = pickle.load(fp)

In [73]:
# Collecting Batters
# HTML id attribute of the per-player batting table on a season page.
id_batting_table = 'players_standard_batting'

def get_player_batting_table(soup):
    """Look up the <table> whose id equals ``id_batting_table`` in ``soup``.

    Returns whatever ``soup.find`` returns for that query.
    """
    table = soup.find('table', id=id_batting_table)
    return table

def get_year_batting_page_soup(year_url):
    """Fetch ``year_url`` and parse it with the lxml parser.

    Returns:
        A BeautifulSoup object for the fetched page, or None when the fetch
        produced no text.
    """
    page_text = get_text_from_BR_url(year_url)
    if page_text:
        return BeautifulSoup(page_text, "lxml")
    return None

def scrape_year_batting_table(year_url):
    """Collect the player-page hrefs of non-pitchers from one season's batting table.

    Args:
        year_url: URL of a season's standard-batting page.

    Returns:
        A set of href strings taken from the link in the second cell of each
        kept row, or None when the page could not be fetched or the batting
        table is not present in it.
    """
    soup = get_year_batting_page_soup(year_url)
    if soup is None:
        # The page helper returns None for an empty fetch; stop here rather
        # than fail on ``None.find`` further down.
        return None

    batting_table = get_player_batting_table(soup)
    if not batting_table:
        return None

    # Local names deliberately differ from the module-level ``batting_urls``
    # list and ``batting_url`` template so they are not shadowed in here.
    player_urls = set()
    table_body = batting_table.find('tbody')
    if table_body is None:
        # Table present but without a body: nothing to collect.
        return player_urls

    for row in table_body.findChildren(recursive=False):
        cells = row.findChildren(recursive=False)
        if len(cells) < 2:
            # Row too short to hold a player cell.
            continue
        position_cell = cells[-1]
        # 1 is the position number for pitchers; don't add pitchers to this.
        # NOTE(review): this is a substring test, so any row whose last cell
        # contains the character '1' anywhere is skipped — confirm intended.
        if '1' in position_cell.text:
            continue
        link = cells[1].find('a')  # Doesn't exist if it's a 'league average' row
        href = link.get('href') if link else None
        if href:
            player_urls.add(href)
    return player_urls

In [74]:
# Crawl every season's batting page and accumulate the distinct player links.
driver = webdriver.Chrome(chromedriver)
all_batting_urls = set()
try:
    for batting_year in batting_urls:
        batters = scrape_year_batting_table(batting_year)
        if batters is None:
            # The scraper returns None when the page or table is missing;
            # report it and carry on instead of failing on ``set | None``.
            print('No batting table found for', batting_year)
        else:
            all_batting_urls |= batters
        br_wait()
finally:
    # Always close the browser so a failed or finished crawl does not leave
    # a Chrome process behind.
    driver.quit()

all_batting_urls

{'/players/r/rowdowa01.shtml',
 '/players/a/atkinga01.shtml',
 '/players/w/walketo03.shtml',
 '/players/g/gardnbr01.shtml',
 '/players/r/russead02.shtml',
 '/players/u/unroeti01.shtml',
 '/players/f/figuelu01.shtml',
 '/players/m/mcgrifr01.shtml',
 '/players/b/barrema02.shtml',
 '/players/g/gaettga01.shtml',
 '/players/s/smithoz01.shtml',
 '/players/o/overbly01.shtml',
 '/players/c/cabreas01.shtml',
 '/players/f/fuentre01.shtml',
 '/players/c/clarkbo04.shtml',
 '/players/r/rossco01.shtml',
 '/players/b/bogusbr01.shtml',
 '/players/r/reddijo01.shtml',
 '/players/s/simmsmi01.shtml',
 '/players/j/johnsla03.shtml',
 '/players/s/stpiema01.shtml',
 '/players/e/escobed01.shtml',
 '/players/g/giambja01.shtml',
 '/players/c/colbecr01.shtml',
 '/players/g/girarjo01.shtml',
 '/players/r/rodrilu01.shtml',
 '/players/h/hardtja01.shtml',
 '/players/b/bordepa01.shtml',
 '/players/m/mooreke02.shtml',
 '/players/h/hechaad01.shtml',
 '/players/c/chrisju01.shtml',
 '/players/d/dwyerji01.shtml',
 '/player

In [78]:
# len(all_batting_urls) == 3513 (value recorded from an earlier run)

#with open('all_batting_urls', 'wb') as fp:
 #   pickle.dump(all_batting_urls, fp)

In [None]:
#with open ('all_batting_urls', 'rb') as fp:
 #   all_batting_urls = pickle.load(fp)

In [297]:
def make_all_columns_caps(df):
    """Upper-case every column label of ``df`` in place.

    Args:
        df: DataFrame whose column labels are strings.

    Returns:
        None; the caller's frame is modified directly.
    """
    # Assign to ``df.columns``. Rebinding the local name ``df`` would leave
    # the caller's frame untouched.
    df.columns = df.columns.map(str.upper)


# Upper-case every column label on both frames.
# NOTE(review): offense_df and pitching_df are not created in any cell visible
# in this notebook — presumably they come from another session; confirm.
offense_df.columns = offense_df.columns.map(str.upper)
pitching_df.columns = pitching_df.columns.map(str.upper)

In [302]:
def set_team_year_index(df):
    """Replace ``df``'s index, in place, with its TEAM and YEAR columns.

    Returns None; the frame passed in is modified directly.
    """
    index_columns = ['TEAM', 'YEAR']
    df.set_index(index_columns, inplace=True)

#set_team_year_index(offense_df)
#set_team_year_index(pitching_df)

In [303]:
def convert_number_columns(df):
    """Convert, in place, every column of ``df`` that parses as numeric.

    Columns that ``pd.to_numeric`` cannot parse are left exactly as they
    were. This matches the leave-unchanged-on-failure intent of the original
    ``errors='ignore'`` call without depending on that option.

    Args:
        df: DataFrame to convert.

    Returns:
        None; the caller's frame is modified directly.
    """
    for column in df.columns:
        try:
            converted = pd.to_numeric(df[column])
        except (ValueError, TypeError):
            # Not a numeric column (or not a single column): keep it as is.
            continue
        df[column] = converted

# Re-bind each frame to the result of applying pd.to_numeric column by column;
# errors='ignore' asks pandas to hand back columns it cannot parse unchanged.
# NOTE(review): offense_df and pitching_df are not created in any cell visible
# in this notebook — presumably they come from another session; confirm.
offense_df = offense_df.apply(pd.to_numeric, errors='ignore')
pitching_df = pitching_df.apply(pd.to_numeric, errors='ignore')