## Data Scraping Notebook

### Importing required libraries

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import numpy as np
from collections import Counter 
from selenium import webdriver
import time
from datetime import datetime
import os

### Defining date ranges to collect data for each season
<b><u>Note:</u></b> Creating a master list of `pd.date_range` objects to loop over when collecting data from the different websites

In [None]:
# One daily date range per season; the bounds are the dates scraped for that year.
dates2019 = pd.date_range(start="2019-04-13", end="2019-10-20")
dates2018 = pd.date_range(start="2018-03-30", end="2018-10-28")
dates2017 = pd.date_range(start="2017-04-02", end="2017-11-01")
dates2016 = pd.date_range(start="2016-04-03", end="2016-11-02")
dates2015 = pd.date_range(start="2015-04-04", end="2015-11-01")

# Master list looped over by the scraping cells, newest season first.
dateRanges = [dates2019, dates2018, dates2017, dates2016, dates2015]

## Data Scraping for Savant Website

### Function to extract the table from the webpage
<b><u>Note:</u></b> 
- The source code contains the player's inning-by-inning data in a table format
- Some players had empty cells for certain fields; these were replaced with NA
- Some fields, like BA, contain SVGs; the presence of an SVG is used to decide whether or not to fill in NA

In [11]:
def getTableFromPage(table, headerRow):
    """Convert a parsed HTML table into a DataFrame of its text cells.

    Args:
        table: Parsed ``<table>`` element; the rows of its ``<tbody>`` supply
            the data.
        headerRow: Parsed header ``<tr>``; the non-blank, stripped text of its
            ``<th>`` cells becomes the column names.

    Returns:
        pandas.DataFrame with one row per body row that produced at least one
        cell. Blank ``<td>`` cells become the string "NA" unless they contain
        an ``<svg>`` or ``<img>``, in which case they are dropped. When the
        header yields no names the frame gets default integer columns, and a
        table without rows (or without a ``<tbody>``) yields an empty frame.
    """
    header_cells = []
    for th in headerRow.find_all("th"):
        header_text = th.getText().strip()
        if header_text:
            header_cells.append(header_text)

    body = table.find("tbody")
    body_rows = body.find_all("tr") if body is not None else []

    data_rows = []
    for tr in body_rows:
        row_cells = []
        for td in tr.find_all("td"):
            cell_text = td.getText().strip()
            if cell_text:
                row_cells.append(cell_text)
            elif td.find("svg") is None and td.find("img") is None:
                # A truly empty cell: keep the column position with a marker.
                row_cells.append("NA")
        if row_cells:
            data_rows.append(row_cells)

    if not header_cells:
        # Previously the first data row was silently promoted to column names
        # (or an IndexError was raised when there were no rows at all).
        return pd.DataFrame(data_rows)
    return pd.DataFrame(data_rows, columns=header_cells)

### Function to get the URLs of all games played on the same day from the page source

In [11]:
def getURLList (dateRange):
    """Return the game-feed URLs linked from the Savant page for one day.

    Args:
        dateRange: A single timestamp (anything with a ``.date()`` method);
            despite the name this is one day, not a range.

    Returns:
        List of absolute ``https://baseballsavant.mlb.com`` URLs, one per
        anchor with an ``href`` inside the ``flex-container`` div. The list is
        empty when that div is not present on the page.
    """
    # The original concatenated the date directly after "game_date", which
    # produced a malformed parameter ("game_date2019-04-13"); the "=" is added
    # here. NOTE(review): confirm the parameter name the site actually expects.
    url = "https://baseballsavant.mlb.com/gamefeed?game_pk=567511&game_date=" + str(dateRange.date())
    browser = webdriver.Chrome(r"C:\Users\sarit\Downloads\chromedriver_win32\chromedriver", chrome_options=options)
    try:
        browser.get(url)
        time.sleep(3)  # give the page's scripts time to render the game list
        html = browser.page_source
    finally:
        # Always end the driver session, even if loading the page failed.
        browser.quit()

    soup = BeautifulSoup(html, "lxml")
    getAllGames = soup.find("div", {"class": "flex-container"})
    if getAllGames is None:
        return []

    # href=True skips anchors that carry no link target.
    return [
        "https://baseballsavant.mlb.com" + link["href"]
        for link in getAllGames.find_all("a", href=True)
    ]


### Function to create the csv file for each game on each day

In [99]:
def getCSVForDateUrl (url, dateVal, number):
    """Scrape the exit-velocity table of one game page and save it as CSV.

    Args:
        url: Address of the game page to load.
        dateVal: Timestamp of the game day (must provide ``.date()``); used for
            the output folder and file name.
        number: Index of the game within its day; makes the file name unique.

    Raises:
        ValueError: If the page has no ``div`` with id ``exitVelocity``.

    The file is written to
    ``C:\\Users\\Public\\Savant\\<date>\\Savant_<date>_<number>.csv``.
    """
    browser = webdriver.Chrome(r"C:\Users\sarit\Downloads\chromedriver_win32\chromedriver", chrome_options=options)
    try:
        browser.get(url)
        time.sleep(8)  # wait for the page's scripts to render the table
        html = browser.page_source
    finally:
        # Release the browser before parsing so a parsing error cannot leak it.
        browser.quit()

    soup = BeautifulSoup(html, "lxml")
    div = soup.find("div", {"id": "exitVelocity"})
    if div is None:
        raise ValueError("exitVelocity table not found on page")
    table = div.find("table")
    header = table.find("thead")
    headerRow = header.find("tr", attrs={"class":"tr-component-row"})

    df = getTableFromPage(table, headerRow)

    newpath = r'C:\Users\Public\Savant\%s' %str(dateVal.date())
    os.makedirs(newpath, exist_ok=True)

    uniqueId = str(dateVal.date()) +"_"+ str(number)
    filePath = newpath + r'\Savant_%s.csv' %uniqueId
    df.to_csv(filePath)


### Code to loop over every day

In [100]:
# Scrape every game of every day in every season; failures are collected and
# written to a single error log at the end instead of stopping the run.
exceptionArray = []

for dateRange in dateRanges:
    for d in dateRange:
        try:
            urlList = getURLList(d)
        except Exception as ex:
            # A day whose game list cannot be loaded is logged and skipped so
            # the remaining days (and the error log) are still produced.
            exceptionArray.append(str(ex) + "    Url list could not be loaded  for date: " + str(d))
            continue

        # enumerate gives each game its position directly; list.index() rescans
        # the list for every URL and repeats a number for duplicate URLs.
        for number, url in enumerate(urlList):
            try:
                getCSVForDateUrl(url, d, number)
            except Exception as ex:
                exceptionArray.append(str(ex)+ "    Url: " + url + "  for date: " + str(d))

dfTest = pd.DataFrame(exceptionArray)
# The folder only exists if at least one game was saved; create it if needed.
os.makedirs(r'C:\Users\Public\Savant', exist_ok=True)
dfTest.to_csv(r'C:\Users\Public\Savant\SavantError.csv')
            

  browser = webdriver.Chrome(r"C:\Users\sarit\Downloads\chromedriver_win32\chromedriver", chrome_options=options)
  browser = webdriver.Chrome(r"C:\Users\sarit\Downloads\chromedriver_win32\chromedriver", chrome_options=options)


## Data Scraping for Baseball-Reference Website

### Getting All Links for Batter Box Office Data

In [2]:
def GetAllBoxOfficeLinksForYear(year):
    """Collect the absolute URLs linked from the MLB schedule page of a year.

    Args:
        year: Season to look up; substituted into the schedule page address.

    Returns:
        List of ``https://www.baseball-reference.com`` URLs, one for every
        ``<em>`` element on the page that contains an anchor.
    """
    scheduleUrl = 'https://www.baseball-reference.com/leagues/MLB/%s-schedule.shtml'%year
    response = requests.get(scheduleUrl)
    parsed = BeautifulSoup(response.content, 'html.parser')

    # First anchor of every <em>, or None when the <em> holds no link.
    anchors = (em.find("a") for em in parsed.find_all("em"))
    return ["https://www.baseball-reference.com" + anchor["href"] for anchor in anchors if anchor]

### Getting Pitcher ERA Data

In [None]:
# For every season: scrape the pitcher ERA rows of each linked game page, then
# write one combined CSV plus one error log for that season.
years = ["2015", "2016", "2017", "2018", "2019"]
for y in years:
    exceptionPitcherEraArray = []
    urlList = GetAllBoxOfficeLinksForYear(y)

    if not urlList:
        continue

    pitcherERA = []
    for url in urlList:
        try:
            pitcherERA.append(GetCSVForYearData(url))
        except Exception as ex:
            print(ex)
            print(url)
            print(y)
            exceptionPitcherEraArray.append(str(ex) + " Url: " + url + "  for year: " + y)

    newpath = r'C:\Users\Public\PitcherBoxOffice'
    os.makedirs(newpath, exist_ok=True)

    # pd.concat raises on an empty list; when every game failed, skip the data
    # file so the error log below is still written.
    if pitcherERA:
        masterDf = pd.concat(pitcherERA)
        filePath = newpath + r'\PictherERA_%s.csv' %y
        masterDf.to_csv(filePath)

    dfExceptBO = pd.DataFrame(exceptionPitcherEraArray)
    dfExceptBO.to_csv(r'C:\Users\Public\PitcherBoxOffice\PitcherEraError_%s.csv' %y)

In [3]:
def getPitcherEraDataBoxOfficePage(table, date):
    """Build a one-row DataFrame with the first pitcher listed in a table.

    Args:
        table: Parsed ``<table>`` element with a ``<thead>`` and a ``<tbody>``.
        date: Value stored in the "Date" column of the returned row.

    Returns:
        pandas.DataFrame with three columns: the header text of the ``player``
        and ``earned_run_avg`` columns, plus "Date". Its single row comes from
        the first ``<tr>`` of the body only.
    """
    head = table.find("thead")
    columns = [
        head.find("th", attrs={"data-stat":"player" }).getText(),
        head.find("th", attrs={"data-stat":"earned_run_avg" }).getText(),
        "Date",
    ]

    # Only the first body row is read; later rows of the table are ignored.
    firstRow = table.find("tbody").find_all("tr")[0]
    record = [
        firstRow.find("th").find("a").getText(),
        firstRow.find("td", attrs={"data-stat":"earned_run_avg"}).getText(),
        date,
    ]

    return pd.DataFrame([record], columns=columns)

In [4]:
def GetCSVForYearData(url):
    """Scrape the first-listed pitcher of each pitching table on a game page.

    Args:
        url: Address of the game page. The first run of digits in it is read
            as ``YYYYMMDD`` followed by one extra digit that is discarded.

    Returns:
        pandas.DataFrame concatenating one row per stats table after the first
        two matching tables on the page.

    Raises:
        ValueError: If the date cannot be parsed from ``url`` or there are no
            tables left to concatenate.
    """
    browser = webdriver.Chrome(r"C:\Users\sarit\Downloads\chromedriver_win32\chromedriver", chrome_options=options)
    try:
        browser.get(url)
        time.sleep(8)  # wait for the page's scripts to finish building the tables
        html = browser.page_source
    finally:
        # End the driver session even when loading or parsing fails.
        browser.quit()

    soup = BeautifulSoup(html, "lxml")
    tablesList = soup.find_all("table", attrs={"class": "sortable stats_table min_width shade_zero now_sortable sticky_table re1 le1"})

    # Skip the first two matching tables.
    # NOTE(review): presumably these are the two batting tables - confirm.
    pitcherTables = tablesList[2:]

    dateString = re.findall(r'\d+',url)
    dateStringCleaned = dateString[0][:-1]
    date = datetime.strptime(dateStringCleaned, '%Y%m%d')

    picherTableList = [
        getPitcherEraDataBoxOfficePage(table, str(date.date()))
        for table in pitcherTables
    ]
    return pd.concat(picherTableList)
    

In [None]:
# Same season loop as the earlier pitcher ERA cell: scrape each linked game
# page, then write one combined CSV and one error log per season.
years = ["2015", "2016", "2017", "2018", "2019"]
for y in years:
    exceptionPitcherEraArray = []
    urlList = GetAllBoxOfficeLinksForYear(y)

    if not urlList:
        continue

    pitcherERA = []
    for url in urlList:
        try:
            pitcherERA.append(GetCSVForYearData(url))
        except Exception as ex:
            print(ex)
            print(url)
            print(y)
            exceptionPitcherEraArray.append(str(ex) + " Url: " + url + "  for year: " + y)

    newpath = r'C:\Users\Public\PitcherBoxOffice'
    os.makedirs(newpath, exist_ok=True)

    # pd.concat raises on an empty list; when every game failed, skip the data
    # file so the error log below is still written.
    if pitcherERA:
        masterDf = pd.concat(pitcherERA)
        filePath = newpath + r'\PictherERA_%s.csv' %y
        masterDf.to_csv(filePath)

    dfExceptBO = pd.DataFrame(exceptionPitcherEraArray)
    dfExceptBO.to_csv(r'C:\Users\Public\PitcherBoxOffice\PitcherEraError_%s.csv' %y)

### Getting Box Office Data for Batter

In [3]:
def getTableForBoxOfficePage(table):
    """Convert one team's stats table into a DataFrame tagged with the team.

    Args:
        table: Parsed ``<table>`` element with a ``<caption>``, a ``<thead>``
            and a ``<tbody>``.

    Returns:
        pandas.DataFrame whose columns are the non-blank header texts plus a
        trailing "Team" column holding the caption text. Each body row
        contributes the text of the link in its ``<th>`` (or the ``<th>`` text
        itself when there is no link), then its ``<td>`` texts. Blank cells
        become "NA" unless they contain an ``<svg>`` or ``<img>``.
    """
    teamName = table.find("caption").getText()
    headerRow = table.find("thead").find("tr")
    bodyRows = table.find("tbody").find_all("tr")

    header_cells = []
    for th in headerRow.find_all("th"):
        header_text = th.getText().strip()
        if header_text:
            header_cells.append(header_text)
    header_cells.append("Team")

    data_rows = []
    for tr in bodyRows:
        row_cells = []

        rowHeader = tr.find("th")
        # Rows without a <th>, or whose <th> has no link, used to raise
        # AttributeError and abort the whole table.
        if rowHeader is not None and rowHeader.getText():
            nameLink = rowHeader.find("a")
            if nameLink is not None:
                row_cells.append(nameLink.getText())
            else:
                row_cells.append(rowHeader.getText().strip())

        for td in tr.find_all("td"):
            cell_text = td.getText().strip()
            if cell_text:
                row_cells.append(cell_text)
            elif td.find("svg") is None and td.find("img") is None:
                row_cells.append("NA")

        if row_cells:
            row_cells.append(teamName)
            data_rows.append(row_cells)

    return pd.DataFrame(data_rows, columns=header_cells)

In [4]:
def GetCSVForMLBBoxOfficeGame(url, year):
    """Scrape the first two stats tables of a game page and save them as CSV.

    Args:
        url: Address of the game page to load.
        year: Season; used as the name of the output sub-folder.

    Raises:
        ValueError: If fewer than two matching stats tables are on the page.

    The file is written to
    ``C:\\Users\\Public\\BoxOffice\\<year>\\BoxOffice_<page h1 text>.csv``.
    """
    browser = webdriver.Chrome(r"C:\Users\sarit\Downloads\chromedriver_win32\chromedriver", chrome_options=options)
    try:
        browser.get(url)
        time.sleep(8)  # wait for the page's scripts to finish building the tables
        html = browser.page_source
    finally:
        # End the driver session even when loading or parsing fails.
        browser.quit()

    soup = BeautifulSoup(html, "lxml")
    uniqueId = soup.find("h1").getText()
    tablesList = soup.find_all("table", attrs = {"class": "sortable stats_table min_width shade_zero now_sortable sticky_table re1 le1"})
    if len(tablesList) < 2:
        raise ValueError("expected at least two stats tables, found %d" % len(tablesList))

    # Only the first two tables are saved, so only those are parsed; a problem
    # in a later table can no longer fail the whole game.
    df = pd.concat([getTableForBoxOfficePage(table) for table in tablesList[:2]])

    newpath = r'C:\Users\Public\BoxOffice\%s' %year
    os.makedirs(newpath, exist_ok=True)

    filePath = newpath + r'\BoxOffice_%s.csv' %uniqueId
    df.to_csv(filePath)
   

In [2]:
def getCSVForEachDate(dateVal, url, year):
    """Scrape the line-up blocks of one day's page and save them as one CSV.

    Args:
        dateVal: Day being scraped; passed to the per-game parser and used in
            the output file name.
        url: Address of the line-ups page for that day.
        year: Season; used as the name of the output sub-folder.

    Returns:
        None. When the page contains no line-up blocks a "nolen" message is
        printed and nothing is written.

    NOTE(review): relies on ``getTablePitcherBatterMatchUpForGame``, which is
    not defined in this part of the notebook - confirm it exists.
    """
    browser = webdriver.Chrome(r"C:\Users\sarit\Downloads\chromedriver_win32\chromedriver", chrome_options=options)
    try:
        browser.get(url)
        time.sleep(8)  # wait for the page's scripts to render the line-ups
        html = browser.page_source
    finally:
        # The early return below used to leave the browser running.
        browser.quit()

    soup = BeautifulSoup(html, "lxml")
    gamesDivList = soup.find_all("div", attrs={"class":"starting-lineups__matchup"})
    gamesDivListDraftKing = soup.find_all("div", attrs={"class":"gamesDivList"})
    gameList = gamesDivList + gamesDivListDraftKing
    if not gameList:
        print("nolen" + str(dateVal))
        return None

    masterdf = pd.concat(
        [getTablePitcherBatterMatchUpForGame(gameDiv, dateVal) for gameDiv in gameList]
    )

    newpath = r'C:\Users\Public\MatchUp\%s' %year
    os.makedirs(newpath, exist_ok=True)

    filePath = newpath + r'\PitcherBatterMatchUp_%s.csv' %dateVal
    masterdf.to_csv(filePath)

In [None]:
# Scrape the per-game tables for the selected seasons and write an error log
# per season.
exceptionBoxOfficeArray = []
# Only 2015 is processed here; the full set of seasons used elsewhere in the
# notebook is 2015 through 2019.
years = ["2015"]

for y in years:
    # Skip the first 316 links of the season. Slicing also copes with a season
    # that has fewer links, where repeated pop(0) raised IndexError.
    # NOTE(review): 316 looks like a resume point from an earlier run - confirm.
    urlList = GetAllBoxOfficeLinksForYear(y)[316:]

    for url in urlList:
        try:
            GetCSVForMLBBoxOfficeGame(url, y)
        except Exception as ex:
            exceptionBoxOfficeArray.append(str(ex) + " Url: " + url + "  for year: " + y)

    os.makedirs(r'C:\Users\Public\BoxOffice', exist_ok=True)
    dfExceptBO = pd.DataFrame(exceptionBoxOfficeArray)
    # The year must be formatted into the name with %; passing it as a second
    # argument handed it to to_csv as the field separator instead.
    dfExceptBO.to_csv(r'C:\Users\Public\BoxOffice\BoxOfficeError_%s.csv' % y)

## Data Scraping for SwishAnalytics Website

In [12]:
def getTablePitcherBatterMatchUpForGameV2(div, dateVal):
    """Build the batter-versus-opposing-pitcher table for one game block.

    Args:
        div: Parsed element holding one game: an ``<h4>`` with both team
            names, the two pitcher ``div`` blocks and the two roster tables.
        dateVal: Value stored in the "Date" column of every row.

    Returns:
        pandas.DataFrame with one row per ``<tr>`` of the away roster followed
        by one row per ``<tr>`` of the home roster. Each row pairs the batter
        with the pitcher of the other team. The "Standium" column always holds
        the second team name found in the heading.
    """
    teamNamesInSingleString = div.find("h4" , attrs={ "class": "lato inline vert-mid bold"}).getText()
    teamNames = re.findall(r'\w+', teamNamesInSingleString)
    teamMatchUp = re.sub(r"[^@ a-zA-Z0-9]+", ' ', teamNamesInSingleString)

    awayPitcherDiv = div.find("div" , attrs={ "class": "mar-right-10"}).getText()
    homePictherDiv = div.find("div" , attrs={ "class": "mar-left-10"}).getText()

    # The away block is split on ")": the part before it gives the hand, the
    # part after it the name.
    awayPitcherInfo = awayPitcherDiv.split(')')
    PitcherNameAway = awayPitcherInfo[1].strip()
    PitcherHandAway = re.sub('[()]', '', awayPitcherInfo[0])

    # The home block is split on "(": the name comes first, then the hand.
    homePitcherInfo = homePictherDiv.split('(')
    PitcherNameHome = homePitcherInfo[0].strip()
    PitcherHandHome = re.sub('[()]', '', homePitcherInfo[1])

    header_cells=["Team" , "LineUpPosition", "Batter", "BatterHand", "PitcherFromTheOtherTeam", "PitcherHand", "Date", "Home/Away","Standium","TeamMatchUp"]

    # The class filter is now passed as a dict for both tables; the away
    # lookup previously passed a set literal, unlike the home lookup.
    battersDivAway = div.find("table", attrs={"class": "table table-condensed text-left mar-top-0 mar-bottom-0 pad-bottom-0 table-roster"})
    battersListAway = battersDivAway.find_all("tr")

    batterDivHome = div.find("table", attrs={"class": "table table-condensed text-right mar-top-0 mar-bottom-0 pad-bottom-0 table-roster"})
    battersListHome = batterDivHome.find_all("tr")

    rows = []

    for lineUpPosition, batter in enumerate(battersListAway, start=1):
        rows.append([
            teamNames[0],
            lineUpPosition,
            re.sub(r"[^. a-zA-Z0-9]+", ' ', batter.find("b").next_sibling),
            re.sub(r"[^a-zA-Z0-9]+", ' ',batter.find("small").getText().strip()),
            PitcherNameHome,
            PitcherHandHome,
            dateVal,
            "Away",
            teamNames[1],
            teamMatchUp,
        ])

    for lineUpPosition, batter in enumerate(battersListHome, start=1):
        mutedTextTag = batter.select("small[class=text-muted]")
        # With a single muted tag it is used directly; otherwise the second one
        # is used.
        handTag = mutedTextTag[0] if len(mutedTextTag) == 1 else mutedTextTag[1]
        rows.append([
            teamNames[1],
            lineUpPosition,
            re.sub(r"[^. a-zA-Z0-9]+", ' ', handTag.next_sibling),
            re.sub(r"[^a-zA-Z0-9]+", ' ', handTag.getText().strip()),
            PitcherNameAway,
            PitcherHandAway,
            dateVal,
            "Home",
            teamNames[1],
            teamMatchUp,
        ])

    return pd.DataFrame(rows, columns=header_cells)

In [13]:
def getCSVForEachDateV2(dateVal, url, year):
    """Scrape every game block of one day's line-ups page into one CSV.

    Args:
        dateVal: Day being scraped; stored in each row and used in the output
            file name.
        url: Address of the line-ups page for that day.
        year: Season; used as the name of the output sub-folder.

    Returns:
        None. When the page has no ``col-md-6`` blocks a "nolen" message is
        printed and nothing is written.

    The file is written to
    ``C:\\Users\\Public\\MatchUp\\<year>\\PitcherBatterMatchUp_<dateVal>.csv``.
    """
    browser = webdriver.Chrome(r"C:\Users\sarit\Downloads\chromedriver_win32\chromedriver", chrome_options=options)
    try:
        browser.get(url)
        time.sleep(8)  # wait for the page's scripts to render the line-ups
        html = browser.page_source
    finally:
        # The early return below used to leave the browser running.
        browser.quit()

    soup = BeautifulSoup(html, "lxml")
    gameDivs = soup.find_all("div", attrs = {"class": "col-md-6"})

    if not gameDivs:
        print("nolen" + str(dateVal))
        return None

    masterdf = pd.concat(
        [getTablePitcherBatterMatchUpForGameV2(div, dateVal) for div in gameDivs]
    )

    newpath = r'C:\Users\Public\MatchUp\%s' %year
    os.makedirs(newpath, exist_ok=True)

    filePath = newpath + r'\PitcherBatterMatchUp_%s.csv' %dateVal
    masterdf.to_csv(filePath)

In [14]:
# Scrape the line-ups page of every day in every season. Failures are logged
# per season, so each MatchUpError_<year>.csv holds that season's errors only.
# The file name used to come from the loop variable left over after the loops
# finished: one file named after the last year, and a NameError with no dates.
exceptionArray = []

for dateRange in dateRanges:
    seasonErrors = []
    for d in dateRange:
        url = "https://swishanalytics.com/optimus/mlb/lineups?date=" + str(d.date())
        try:
            getCSVForEachDateV2(str(d.date()),url, str(d.year))
        except Exception as ex:
            print(str(ex) + url)
            seasonErrors.append(str(ex)+ "    Url: " + url + "  for date: " + str(d.date()))
    exceptionArray.extend(seasonErrors)

    if len(dateRange) == 0:
        continue

    # The folder only exists if at least one day was saved; create it if needed.
    os.makedirs(r'C:\Users\Public\MatchUp', exist_ok=True)
    dfTest = pd.DataFrame(seasonErrors)
    errorFilePath =r'C:\Users\Public\MatchUp\MatchUpError_%s.csv' %str(dateRange[0].year)
    dfTest.to_csv(errorFilePath)
            

  browser = webdriver.Chrome(r"C:\Users\sarit\Downloads\chromedriver_win32\chromedriver", chrome_options=options)


## Data Scraping for Baseball-Reference Website (Only Player Birthday)

In [None]:
def GetCSVForMLBBirthday(url, month, day):
    """Scrape the birthday table of one calendar day and save it as CSV.

    Args:
        url: Address of the birthdays page for that day.
        month: Month as a string; names the output sub-folder and is part of
            the file name.
        day: Day of the month as a string; part of the file name.

    Raises:
        ValueError: If the page has no table with id ``birthday_stats``.

    The file is written to
    ``C:\\Users\\Public\\Birthday\\<month>\\BirthdayOn_<month>-<day>.csv``.
    """
    browser = webdriver.Chrome(r"C:\Users\sarit\Downloads\chromedriver_win32\chromedriver", chrome_options=options)
    try:
        browser.get(url)
        time.sleep(13)  # wait for the page's scripts to finish building the table
        html = browser.page_source
    finally:
        # End the driver session even when loading or parsing fails.
        browser.quit()

    soup = BeautifulSoup(html, "lxml")
    table = soup.find("table", attrs = {"id":"birthday_stats"})
    if table is None:
        raise ValueError("birthday_stats table not found on page")

    df = getMLBPlayerBirthdayTable(table)

    newpath = r'C:\Users\Public\Birthday\%s' %month
    os.makedirs(newpath, exist_ok=True)

    date= month+"-"+day
    filePath = newpath + r'\BirthdayOn_%s.csv' %date
    df.to_csv(filePath)
   

In [None]:
def getMLBPlayerBirthdayTable(table):
    """Extract five columns of the birthday table for recently active players.

    Args:
        table: Parsed ``<table>`` element with a ``<thead>`` and a ``<tbody>``.

    Returns:
        pandas.DataFrame holding the ``player``, ``birth_year``,
        ``experience``, ``year_min`` and ``year_max`` cells (stripped text) of
        every body row whose ``year_max`` is 2015 or later. Rows without a
        ``year_max`` cell are skipped. Column names are the stripped header
        texts of the same five columns.
    """
    headerRow = table.find("thead").find("tr")
    bodyRows = table.find("tbody").find_all("tr")

    statNames = ("player", "birth_year", "experience", "year_min", "year_max")
    columns = [
        headerRow.find("th", attrs={"data-stat": stat}).getText().strip()
        for stat in statNames
    ]

    records = []
    for tr in bodyRows:
        lastyear = tr.find("td", attrs={"data-stat":"year_max"})
        # Rows without the cell are skipped, as are players whose last season
        # was before 2015.
        if lastyear is None or int(lastyear.getText().strip()) < 2015:
            continue
        records.append(
            [tr.find("td", attrs={"data-stat": stat}).getText().strip() for stat in statNames]
        )

    return pd.DataFrame(records, columns=columns)

In [None]:
# Scrape the birthday table of every calendar day and log failures.
exceptionArray = []
# Month numbers "1".."12" (range(1, 12) stopped at November).
months = list(map(str,range(1,13)))
# 2020 is used for the calendar, so 29 February is included.
daysOfYear = pd.date_range(start="2020-01-1",end="2020-12-31")

for day in daysOfYear:
    url = "https://www.baseball-reference.com/friv/birthdays.cgi?month=" + str(day.month) + "&day=" + str(day.day)
    try:
        GetCSVForMLBBirthday(url, str(day.month), str(day.day))
    except Exception as ex:
        print(str(ex) + url)
        exceptionArray.append(str(ex)+ "    Url: " + url + "  for date: " + str(day))

# The folder only exists if at least one day was saved; create it if needed.
os.makedirs(r'C:\Users\Public\Birthday', exist_ok=True)
dfError = pd.DataFrame(exceptionArray)
errorFilePath =r'C:\Users\Public\Birthday\Birhdays.csv'
dfError.to_csv(errorFilePath)
            