# Web Scraper

The purpose of this Jupyter Notebook is to scrape data from baseball-reference.com.
We are interested in the **Team Standard Batting**, **Team Standard Pitching**, and **Team Fielding** tables for the 2006 through 2017 seasons, along with the **Standings** tables. Each table is found by following the URLs used in the cells below and editing the year in the URL.

## Import Modules

In [58]:
from bs4 import BeautifulSoup, Comment
import urllib.request
import pandas as pd

## Standings from 2006 to 2017

In [2]:
# Fetch and parse the 2017 standings page. The context manager closes the
# HTTP response once the body has been read, even if parsing raises.
with urllib.request.urlopen('https://www.baseball-reference.com/leagues/MLB/2017-standings.shtml') as response:
    soup = BeautifulSoup(response.read(), "lxml")

# Locate the wrapper element, take the first HTML comment inside it, and
# re-parse that comment as markup to reach the standings table itself.
tableStats = soup.find(id="all_expanded_standings_overall")
tableStats = tableStats.find_all(text=lambda text: isinstance(text, Comment))[0]
tableStats = BeautifulSoup(tableStats, "lxml").find(class_="sortable stats_table")

# Uncomment to inspect the parsed table:
#print(tableStats)

In [10]:
# Do it once, with no header line: dump the standings table currently held in
# `tableStats` to a scratch text file, one comma-separated line per team row.
# `with` guarantees the file is closed even if a cell lookup below raises.
with open('/Users/areevesman/Documents/br_scraping/output/output.txt', 'w') as f:
    # Row numbers are 1-based and count every <tr>, including rows that have
    # no <td> cells.
    for i, row in enumerate(tableStats.find_all('tr'), start=1):
        col = row.find_all('td')

        # Skip rows without data cells, and skip row 32 by position
        # (presumably a summary/average row -- confirm against the page).
        if len(col) == 0 or i == 32:
            continue

        # The first cell holds the team name inside a link; the next 21 cells
        # are plain text.
        fields = [col[0].a.string.strip()]
        fields.extend(col[j].string.strip() for j in range(1, 22))
        f.write(','.join(fields))

        # Row 31 is treated as the last data row, so the file ends without a
        # trailing newline.
        if i != 31:
            f.write('\n')

In [5]:
# Scrape the expanded standings table for each season from 2006 through 2017
# and write one CSV file per season.
for year in range(2006, 2018):

    # Download and parse the season's standings page; the context manager
    # closes the HTTP response even if parsing raises.
    url = 'https://www.baseball-reference.com/leagues/MLB/'+str(year)+'-standings.shtml'
    with urllib.request.urlopen(url) as response:
        soup = BeautifulSoup(response.read(), "lxml")

    # Locate the wrapper element, take the first HTML comment inside it, and
    # re-parse that comment as markup to reach the standings table.
    tableStats = soup.find(id="all_expanded_standings_overall")
    tableStats = tableStats.find_all(text=lambda text: isinstance(text, Comment))[0]
    tableStats = BeautifulSoup(tableStats, "lxml").find(class_="sortable stats_table")

    # One output file per season; `with` closes it even if a row fails.
    with open('/Users/areevesman/Documents/br_scraping/output/standings'+str(year)+'.csv', 'w') as f:
        # Header line: 22 column names matching the 22 cells written per row.
        f.write('team,league,games_played,wins,losses,win_loss_pct,r_per_game,ra_per_game,avg_run_diff,')
        f.write('sos,srs,pythWL,luck,inter,home,road,exInn,one_run,vRHP,vLHP,over_500,under_500'+'\n')

        # Row numbers are 1-based and count every <tr>, including rows that
        # have no <td> cells.
        for i, row in enumerate(tableStats.find_all('tr'), start=1):
            col = row.find_all('td')

            # Skip rows without data cells, and skip row 32 by position
            # (presumably a summary/average row -- confirm against the page).
            if len(col) == 0 or i == 32:
                continue

            # The first cell holds the team name inside a link; the next 21
            # cells are plain text.
            fields = [col[0].a.string.strip()]
            fields.extend(col[j].string.strip() for j in range(1, 22))
            f.write(','.join(fields))

            # Row 31 is treated as the last data row, so the file ends
            # without a trailing newline.
            if i != 31:
                f.write('\n')

## Team Fielding

In [12]:
# Preview the Team Fielding table for a single season before looping over all
# of them. The season is set explicitly so this cell does not depend on the
# value an earlier loop happened to leave in `year` (every loop in this
# notebook ends on 2017, so the result is unchanged).
year = 2017
with urllib.request.urlopen('https://www.baseball-reference.com/leagues/MLB/'+str(year)+'.shtml') as response:
    soup = BeautifulSoup(response.read(), "lxml")

# Locate the fielding wrapper element, take the first HTML comment inside it,
# and re-parse that comment as markup to reach the fielding table.
tableStats = soup.find(id="all_teams_standard_fielding")
tableStats = tableStats.find_all(text=lambda text: isinstance(text, Comment))[0]
tableStats = BeautifulSoup(tableStats, "lxml").find(class_="sortable stats_table")

# Uncomment to inspect the parsed table:
#print(tableStats)

In [13]:
# Scrape the Team Fielding table for each season from 2006 through 2017 and
# write one CSV file per season.
for year in range(2006, 2018):

    # Download and parse the season's league page; the context manager closes
    # the HTTP response even if parsing raises.
    url = 'https://www.baseball-reference.com/leagues/MLB/'+str(year)+'.shtml'
    with urllib.request.urlopen(url) as response:
        soup = BeautifulSoup(response.read(), "lxml")

    # Locate the fielding wrapper element, take the first HTML comment inside
    # it, and re-parse that comment as markup to reach the fielding table.
    tableStats = soup.find(id="all_teams_standard_fielding")
    tableStats = tableStats.find_all(text=lambda text: isinstance(text, Comment))[0]
    tableStats = BeautifulSoup(tableStats, "lxml").find(class_="sortable stats_table")

    # One output file per season; `with` closes it even if a row fails.
    with open('/Users/areevesman/Documents/br_scraping/output/fielding'+str(year)+'.csv', 'w') as f:
        # Header line: 18 column names (team plus the 17 cells written per row).
        f.write('team,num_fielders,field_ra_per_game,defEff,field_games_played,field_games_started,field_games_completed,')
        f.write('field_innings_played,defensive_chances,putouts,assists,field_errors,double_plays,')
        f.write('fielding_pct,field_rtot,field_rtot_per_year,field_rdrs,field_rdrs_per_year'+'\n')

        # Skip the first <tr> (presumably the header row) and keep at most the
        # 30 rows after it; any later rows are ignored.
        for i, row in enumerate(tableStats.find_all('tr')[1:31], start=1):

            # The team name is the link text inside the row's scope="row" cell.
            team = row.find(scope="row").a.string.strip()
            f.write(team+',')

            col = row.find_all('td')
            if len(col) > 0:
                # The first 17 <td> cells are written as plain text.
                f.write(','.join(col[j].string.strip() for j in range(17)))

                # No newline after the 30th row, so the file ends without a
                # trailing newline.
                if i != 30:
                    f.write('\n')

## Team Standard Pitching

In [14]:
# Scrape the Team Standard Pitching table for each season from 2006 through
# 2017 and write one CSV file per season.
for year in range(2006, 2018):

    # Download and parse the season's league page; the context manager closes
    # the HTTP response even if parsing raises.
    url = 'https://www.baseball-reference.com/leagues/MLB/'+str(year)+'.shtml'
    with urllib.request.urlopen(url) as response:
        soup = BeautifulSoup(response.read(), "lxml")

    # Locate the pitching wrapper element, take the first HTML comment inside
    # it, and re-parse that comment as markup to reach the pitching table.
    tableStats = soup.find(id="all_teams_standard_pitching")
    tableStats = tableStats.find_all(text=lambda text: isinstance(text, Comment))[0]
    tableStats = BeautifulSoup(tableStats, "lxml").find(class_="sortable stats_table")

    # One output file per season; `with` closes it even if a row fails.
    with open('/Users/areevesman/Documents/br_scraping/output/pitching'+str(year)+'.csv', 'w') as f:
        # Header line: 36 column names (team plus the 35 cells written per row).
        f.write('team,num_pitchers,pitching_avg_age,pitching_ra_per_game,pitching_wins,pitching_losses,')
        f.write('pitching_win_loss_pct,ERA,pitching_games_played,pitching_games_started,pitching_games_finished,')
        f.write('pitching_complete_game,team_shutouts,complete_game_shutouts,saves,innings_pitched,')
        f.write('pitching_hits_allowed,pitching_ra,pitching_earned_ra,pitching_hr_allowed,pitching_bases_on_walks,')
        f.write('pitching_intentional_bow,strikeouts,times_hit_by_pitch,balks,wild_pitches,batters_faced,')
        f.write('ERAp,fielding_independent_pitching,whip,h9,hr9,bb9,so9,so_per_walk,pitching_runners_lob'+'\n')

        # Skip the first <tr> (presumably the header row) and keep at most the
        # 30 rows after it; any later rows are ignored.
        for i, row in enumerate(tableStats.find_all('tr')[1:31], start=1):

            # The team name is the link text inside the row's scope="row" cell.
            team = row.find(scope="row").a.string.strip()
            f.write(team+',')

            col = row.find_all('td')
            if len(col) > 0:
                # The first 35 <td> cells are written as plain text.
                f.write(','.join(col[j].string.strip() for j in range(35)))

                # No newline after the 30th row, so the file ends without a
                # trailing newline.
                if i != 30:
                    f.write('\n')

## Team Standard Batting

In [6]:
# Scrape the Team Standard Batting table for each season from 2006 through
# 2017 and write one CSV file per season.
for year in range(2006, 2018):

    # Download and parse the season's league page; the context manager closes
    # the HTTP response even if parsing raises.
    url = 'https://www.baseball-reference.com/leagues/MLB/'+str(year)+'.shtml'
    with urllib.request.urlopen(url) as response:
        soup = BeautifulSoup(response.read(), "lxml")

    # The batting table is looked up directly by its id.
    tableStats = soup.find("table", id="teams_standard_batting")

    # One output file per season; `with` closes it even if a row fails.
    with open('/Users/areevesman/Documents/br_scraping/output/batting'+str(year)+'.csv', 'w') as f:
        # Header line: 29 column names (team plus the 28 cells written per row).
        f.write('team,num_batters,batting_avg_age,batting_runs_per_game,batting_games_played,batting_plate_appearances,')
        f.write('batting_at_bats,batting_runs,batting_hits,batting_doubles,batting_triples,')
        f.write('batting_home_runs,batting_RBIs,batting_stolen_bases,batting_caught_stealing,')
        f.write('batting_bases_on_walks,batting_strikeouts,batting_ave,OBP,SLG,OBS_plus_SLG,OPS_plus,')
        f.write('batting_total_bases,batting_double_plays,batting_hit_by_pitch,sac_bunts,sac_flies,')
        f.write('batting_intentional_bow,batting_runners_lob'+'\n')

        # Skip the first <tr> (presumably the header row) and keep at most the
        # 30 rows after it; any later rows are ignored.
        for i, row in enumerate(tableStats.find_all('tr')[1:31], start=1):

            # The team name is the link text inside the row's scope="row" cell.
            team = row.find(scope="row").a.string.strip()
            f.write(team+',')

            col = row.find_all('td')
            if len(col) > 0:
                # The first 28 <td> cells are written as plain text.
                f.write(','.join(col[j].string.strip() for j in range(28)))

                # No newline after the 30th row, so the file ends without a
                # trailing newline.
                if i != 30:
                    f.write('\n')

## Active Franchises

In [41]:
# Scrape the active-franchises table and write team name plus the second and
# third cells of each row (labelled from/to in the output) to a CSV file.

# Download and parse the teams index page; the context manager closes the
# HTTP response even if parsing raises.
with urllib.request.urlopen('https://www.baseball-reference.com/teams/').read.__self__ if False else urllib.request.urlopen('https://www.baseball-reference.com/teams/') as response:
    soup = BeautifulSoup(response.read(), "lxml")

# Work on the body of the table with id "teams_active".
tableStats = soup.find("table", id="teams_active").tbody

# Collect the output lines first so the file is only written once every row
# has parsed successfully. The first <tr> of the body is skipped.
lines = []
for row in tableStats.find_all('tr')[1:]:
    cols = row.find_all('td')

    # Keep only rows whose first cell contains a link.
    if cols[0].a is None:
        continue

    team_col = cols[0].a.string.strip()
    from_col = cols[1].string.strip()
    to_col = cols[2].string.strip()
    lines.append(team_col+','+from_col+','+to_col)

with open('/Users/areevesman/Documents/br_scraping/output/active_franchises.csv', 'w') as f:
    f.write('team,from,to'+'\n')
    # Joining with newlines leaves no trailing newline regardless of how many
    # rows qualified (previously this relied on there being exactly 30).
    f.write('\n'.join(lines))

## Team IDs

In [57]:
# Scrape the team ID listing and write the rows whose fifth cell reads
# "Present" to a CSV file.

# Download and parse the team IDs page; the context manager closes the HTTP
# response even if parsing raises.
with urllib.request.urlopen('https://www.baseball-reference.com/about/team_IDs.shtml') as response:
    soup = BeautifulSoup(response.read(), "lxml")

# Use the first <table> element on the page.
tableStats = soup.find("table")

# Collect the output lines first so the file is only written once every row
# has parsed successfully. The first <tr> is skipped (presumably the header
# row).
lines = []
for row in tableStats.find_all('tr')[1:]:
    cols = row.find_all('td')

    # Keep only rows whose fifth cell is "Present".
    if cols[4].string.strip() != "Present":
        continue

    franchise_id = cols[0].string.strip()
    team_id = cols[1].string.strip()
    full_name = cols[2].string.strip()
    first_year = cols[3].string.strip()
    lines.append(franchise_id+','+team_id+','+full_name+','+first_year)

with open('/Users/areevesman/Documents/br_scraping/output/teamIDs.csv', 'w') as f:
    f.write('Franchise_ID,Team_ID,Full_Team_Name,First_Year'+'\n')
    # Joining with newlines leaves no trailing newline whichever franchise
    # comes last (previously this relied on "WSN" being the final row).
    f.write('\n'.join(lines))

## Team Game by Game Schedules

In [None]:
# NOTE(review): this cell looks like an unfinished draft. pd.read_csv is
# called without a file path, so this line raises TypeError as written; the
# intended input file is not shown here -- TODO supply it.
t = pd.read_csv()


# NOTE(review): everything below repeats the Team IDs cell and rewrites
# teamIDs.csv; presumably a placeholder for the schedule scraping -- confirm.

# Download and parse the team IDs page.
soup = BeautifulSoup(urllib.request.urlopen('https://www.baseball-reference.com/about/team_IDs.shtml').read(),"lxml")

# Use the first <table> element on the page.
tableStats = soup.find("table")

f = open('/Users/areevesman/Documents/br_scraping/output/teamIDs.csv', 'w')
f.write('Franchise_ID,Team_ID,Full_Team_Name,First_Year'+'\n')

# The first <tr> is skipped (presumably the header row).
for row in tableStats.find_all('tr')[1:]:
    
    cols = row.find_all('td')
    
    # Keep only rows whose fifth cell is "Present".
    if cols[4].string.strip() == "Present":
        
        franchise_id = cols[0].string.strip()
        team_id = cols[1].string.strip()
        full_name = cols[2].string.strip()
        first_year = cols[3].string.strip()
        # last_year is read but not used below.
        last_year = cols[4].string.strip()

        f.write(franchise_id+','+team_id+','+full_name+',')
        # No newline after the "WSN" row (presumably expected to be the last
        # row written -- confirm).
        if franchise_id != "WSN":
            f.write(first_year+'\n')
        else:
            f.write(first_year)    
        
f.close()