In [41]:
import pandas, numpy
from matplotlib import pyplot
import scipy.stats
from collections import Counter
from ipywidgets import interact, interactive, fixed, interact_manual
from sklearn.linear_model import LinearRegression, LogisticRegression
from bs4 import BeautifulSoup
import requests
import csv

TEAM CODE:

Los Angeles Angels - 108

Houston Astros - 117

Oakland Athletics - 133

Toronto Blue Jays - 141

Atlanta Braves - 144

Milwaukee Brewers - 158

St. Louis Cardinals - 138

Chicago Cubs - 112

Arizona Diamondbacks - 109

Los Angeles Dodgers - 119

San Francisco Giants - 137

Cleveland Indians - 114

Seattle Mariners - 136

Miami Marlins - 146

New York Mets - 121

Washington Nationals - 120

Baltimore Orioles - 110

San Diego Padres - 135

Philadelphia Phillies - 143

Pittsburgh Pirates - 134

Texas Rangers - 140

Tampa Bay Rays - 139

Cincinnati Reds - 113

Boston Red Sox - 111

Colorado Rockies - 115

Kansas City Royals - 118

Detroit Tigers - 116

Minnesota Twins - 142

Chicago White Sox - 145

New York Yankees - 147

In [57]:
# Format for sample website:
# https://baseballsavant.mlb.com/team/147?view=gamelogs&nav=hitting&season=2016
# New York Yankees

# teamDict is a dictionary with our own assigned codes to each team
# This is based off alphabetical order of the team name, as opposed to city name

teamDict = {
    0: "Los Angeles Angels",
    1: "Houston Astros",
    2: "Oakland Athletics",
    3: "Toronto Blue Jays",
    4: "Atlanta Braves",
    5: "Milwaukee Brewers",
    6: "St. Louis Cardinals",
    7: "Chicago Cubs",
    8: "Arizona Diamondbacks",
    9: "Los Angeles Dodgers",
    10: "San Francisco Giants",
    11: "Cleveland Indians",
    12: "Seattle Mariners",
    13: "Miami Marlins",
    14: "New York Mets",
    15: "Washington Nationals",
    16: "Baltimore Orioles",
    17: "San Diego Padres",
    18: "Philadelphia Phillies",
    19: "Pittsburgh Pirates",
    20: "Texas Rangers",
    21: "Tampa Bay Rays",
    22: "Cincinnati Reds",
    23: "Boston Red Sox",
    24: "Colorado Rockies",
    25: "Kansas City Royals",
    26: "Detroit Tigers",
    27: "Minnesota Twins",
    28: "Chicago White Sox",
    29: "New York Yankees"
}

# linkDict is a dictionary linking each team's assigned code (from above)
# to their website code on baseballsavant.mlb.com
linkDict = {
    0: "https://baseballsavant.mlb.com/team/108?view=gamelogs&nav=hitting&season=2016",
    1: "https://baseballsavant.mlb.com/team/117?view=gamelogs&nav=hitting&season=2016",
    2: "https://baseballsavant.mlb.com/team/133?view=gamelogs&nav=hitting&season=2016",
    3: "https://baseballsavant.mlb.com/team/141?view=gamelogs&nav=hitting&season=2016",
    4: "https://baseballsavant.mlb.com/team/144?view=gamelogs&nav=hitting&season=2016",
    5: "https://baseballsavant.mlb.com/team/158?view=gamelogs&nav=hitting&season=2016",
    6: "https://baseballsavant.mlb.com/team/138?view=gamelogs&nav=hitting&season=2016",
    7: "https://baseballsavant.mlb.com/team/112?view=gamelogs&nav=hitting&season=2016",
    8: "https://baseballsavant.mlb.com/team/109?view=gamelogs&nav=hitting&season=2016",
    9: "https://baseballsavant.mlb.com/team/119?view=gamelogs&nav=hitting&season=2016",
    10: "https://baseballsavant.mlb.com/team/137?view=gamelogs&nav=hitting&season=2016",
    11: "https://baseballsavant.mlb.com/team/114?view=gamelogs&nav=hitting&season=2016",
    12: "https://baseballsavant.mlb.com/team/136?view=gamelogs&nav=hitting&season=2016",
    13: "https://baseballsavant.mlb.com/team/146?view=gamelogs&nav=hitting&season=2016",
    14: "https://baseballsavant.mlb.com/team/121?view=gamelogs&nav=hitting&season=2016",
    15: "https://baseballsavant.mlb.com/team/120?view=gamelogs&nav=hitting&season=2016",
    16: "https://baseballsavant.mlb.com/team/110?view=gamelogs&nav=hitting&season=2016",
    17: "https://baseballsavant.mlb.com/team/135?view=gamelogs&nav=hitting&season=2016",
    18: "https://baseballsavant.mlb.com/team/143?view=gamelogs&nav=hitting&season=2016",
    19: "https://baseballsavant.mlb.com/team/134?view=gamelogs&nav=hitting&season=2016",
    20: "https://baseballsavant.mlb.com/team/140?view=gamelogs&nav=hitting&season=2016",
    21: "https://baseballsavant.mlb.com/team/139?view=gamelogs&nav=hitting&season=2016",
    22: "https://baseballsavant.mlb.com/team/113?view=gamelogs&nav=hitting&season=2016",
    23: "https://baseballsavant.mlb.com/team/111?view=gamelogs&nav=hitting&season=2016",
    24: "https://baseballsavant.mlb.com/team/115?view=gamelogs&nav=hitting&season=2016",
    25: "https://baseballsavant.mlb.com/team/118?view=gamelogs&nav=hitting&season=2016",
    26: "https://baseballsavant.mlb.com/team/116?view=gamelogs&nav=hitting&season=2016",
    27: "https://baseballsavant.mlb.com/team/142?view=gamelogs&nav=hitting&season=2016",
    28: "https://baseballsavant.mlb.com/team/145?view=gamelogs&nav=hitting&season=2016",
    29: "https://baseballsavant.mlb.com/team/147?view=gamelogs&nav=hitting&season=2016"
}

# This list will be filled in with data of each team.
# The index of this array will correspond to the team listed in teamDict.
# Using teamDict is how this will be decoded.

In [52]:
# At this point, print(allTeamData[0][0]) prints:
# [<tr class="default-table-row" style="background-color:  #e6f9e6;">
# <td class="tr-data align-left "> <span><a href="/gamefeed?game_pk=449287">2016-10-02</a></span></td>
# <td class="tr-data align-left "> <span>Astros</span></td>
# <td>0</td>
# <td>0.0</td>
# <td>108.9</td>
# <td>86.8</td>
# <td>24.1</td>
# <td>0.254</td>
# <td>0.358</td>
# <td>0.326</td>
# <td>0.366</td>
# <td>32.1</td>
# This is accurate for the first game of team 0 (Angels) season vs the Astros.
# To do: Iterate through all the teams.
# While iterating through all the teams, iterate through all the games.
# During this, the first <td class="tr-data align-left "> <span> holds the date
# and the next holds the opposing team.
# All the following td's hold data in the following order:
# Barrels; Barrel %; Max EV; Avg EV; Launch Angle; XBA; XSLG; XWOBA; WOBA; Hard Hit %

# Initializing lists
date = []
opponent = []
barrels = []
barrelPercent = []
maxEV = []
avgEV = []
launchAngle = []
xBA = []
xSLG = []
xWOBA = []
WOBA = []
hardHitPercent = []

master = []

for i in range(30): # For all 30 teams
    # Initialize soup and data
    soup = BeautifulSoup(requests.get(linkDict[i]).content, 'lxml')
    print(i)
    
    data_div = soup.find('div', attrs={'class':'gamelogs-group', 'id':'gamelogsHitting'})
    data_class = data_div.contents
    data_class = str(data_class[7])
    
    # Initialize/reset the value arrays for each team
    team_date = []
    team_opponent = []
    team_barrels = []
    team_barrelPercent = []
    team_maxEV = []
    team_avgEV = []
    team_launchAngle = []
    team_xBA = []
    team_xSLG = []
    team_xWOBA = []
    team_WOBA = []
    team_hardHitPercent = []

    data_table = data_class.split('<td class="tr-data align-left "> <span><a href="/gamefeed?game_pk=')[1:]
    for j in range(len(data_table)):
        data_table[j] = data_table[j].split('</td>')[:-1]
        # Find date
        data_table[j][0] = data_table[j][0].split('</a>')[0][-10:]
        team_date.append(data_table[j][0])
        # Find opponent
        data_table[j][1] = data_table[j][1].split('<span>')[1].split('</span>')[0]
        team_opponent.append(data_table[j][1])
        # Find barrels
        data_table[j][2] = data_table[j][2].split('<td>')[1]
        team_barrels.append(data_table[j][2])
        # Find barrel percent
        data_table[j][3] = data_table[j][3].split('<td>')[1]
        team_barrelPercent.append(data_table[j][3])
        # Find max EV
        data_table[j][4] = data_table[j][4].split('<td>')[1]
        team_maxEV.append(data_table[j][4])
        # Find avg EV
        data_table[j][5] = data_table[j][5].split('<td>')[1]
        team_avgEV.append(data_table[j][5])
        # Find launch angle
        data_table[j][6] = data_table[j][6].split('<td>')[1]
        team_launchAngle.append(data_table[j][6])
        # Find xBA
        data_table[j][7] = data_table[j][7].split('<td>')[1]
        team_xBA.append(data_table[j][7])
        # Find xSLG
        data_table[j][8] = data_table[j][8].split('<td>')[1]
        team_xSLG.append(data_table[j][8])
        # Find xWOBA
        data_table[j][9] = data_table[j][9].split('<td>')[1]
        team_xWOBA.append(data_table[j][9])
        # Find WOBA
        data_table[j][10] = data_table[j][10].split('<td>')[1]
        team_WOBA.append(data_table[j][10])
        # Find Hard Hit %
        data_table[j][11] = data_table[j][11].split('<td>')[1]
        team_hardHitPercent.append(data_table[j][11])
        
    # Append each team value array onto the master array
    # Each index of each master array corresponds to its team index as seen 
    # in teamDict
    date.append(team_date)
    opponent.append(team_opponent)
    barrels.append(team_barrels)
    barrelPercent.append(team_barrelPercent)
    maxEV.append(team_maxEV)
    avgEV.append(team_avgEV)
    launchAngle.append(team_launchAngle)
    xBA.append(team_xBA)
    xSLG.append(team_xSLG)
    xWOBA.append(team_xWOBA)
    WOBA.append(team_WOBA)
    hardHitPercent.append(team_hardHitPercent)
    
    master.append(data_table)

print("DONE")

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
DONE


In [53]:
print(master)

[[['2016-10-02', 'Astros', '0', '0.0', '108.9', '86.8', '24.1', '0.254', '0.358', '0.326', '0.366', '32.1'], ['2016-10-01', 'Astros', '1', '4.0', '107.0', '81.1', '10.8', '0.192', '0.315', '0.258', '0.187', '36.0'], ['2016-09-30', 'Astros', '1', '4.3', '110.3', '82.6', '22.0', '0.222', '0.353', '0.298', '0.280', '39.1'], ['2016-09-28', 'Athletics', '1', '3.3', '106.5', '87.2', '10.3', '0.252', '0.403', '0.332', '0.371', '43.3'], ['2016-09-27', 'Athletics', '2', '6.5', '104.8', '85.1', '7.8', '0.297', '0.499', '0.350', '0.425', '32.3'], ['2016-09-26', 'Athletics', '1', '4.0', '110.1', '86.9', '12.1', '0.265', '0.410', '0.318', '0.264', '52.0'], ['2016-09-25', 'Astros', '0', '0.0', '106.7', '82.7', '17.4', '0.209', '0.265', '0.235', '0.264', '30.8'], ['2016-09-24', 'Astros', '1', '3.7', '108.0', '80.7', '21.5', '0.234', '0.321', '0.282', '0.344', '29.6'], ['2016-09-23', 'Astros', '0', '0.0', '106.0', '85.3', '8.9', '0.297', '0.383', '0.351', '0.401', '37.9'], ['2016-09-22', 'Astros', '0'

In [56]:
with open('baseball_savant_2016.csv', 'w', newline='') as resultFile:
    wr = csv.writer(resultFile, dialect='excel')
    wr.writerow(['Date', 'Opponent', 'Barrels', 'Barrel %', 'Max EV',
                'Avg EV', 'Launch Angle', 'xBA', 'xSLG', 'xWOBA', 'WOBA',
                'Hard Hit %'])
    for k in range(30):
        for m in range(len(master[k])):
            wr.writerow([date[k][m], opponent[k][m], barrels[k][m],
                          barrelPercent[k][m], maxEV[k][m], avgEV[k][m],
                          launchAngle[k][m], xBA[k][m], xSLG[k][m], 
                          xWOBA[k][m], WOBA[k][m], hardHitPercent[k][m]])