In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import requests
from bs4 import BeautifulSoup
import regex as re
import pandas as pd
import itertools

## This code collects links to team pages and then builds a game-log link for every player listed on each team page

In [3]:
# Record links to every MLB team page from the homepage of baseball-reference.com.
BASE_URL = 'https://www.baseball-reference.com'
# Season string expected inside the homepage team links. The replace() in
# team_links_for_season() leaves a link unchanged when this string is absent from it,
# so this value has to match whatever season the homepage currently links to.
HOMEPAGE_SEASON = '2023'

# The timeout keeps the cell from hanging indefinitely, and raise_for_status() turns an
# HTTP error response into an immediate exception instead of a confusing parse failure.
response = requests.get(BASE_URL + '/', timeout=30)
response.raise_for_status()
# Name the parser explicitly so the parse does not depend on which optional parsers
# happen to be installed on this machine.
soup = BeautifulSoup(response.content, 'html.parser')

team_grid = soup.find('div', class_='data_grid section_wrapper')
if team_grid is None:
    raise RuntimeError('Could not find the team link grid on the baseball-reference.com homepage')
team_links_raw = team_grid.find_all('a', {'href': True})


def team_links_for_season(season):
    """Return absolute team-page URLs with the homepage season swapped for ``season``.

    Previous-season team links are identical to the homepage links other than the year,
    so each href has HOMEPAGE_SEASON replaced by ``season`` and BASE_URL prepended.
    """
    season = str(season)
    return [BASE_URL + link_raw['href'].replace(HOMEPAGE_SEASON, season) for link_raw in team_links_raw]


team_links_2022 = team_links_for_season(2022)
team_links_2021 = team_links_for_season(2021)

In [15]:
# Game-log tables look different for batters and pitchers, so alongside every player's
# game-log link I also record whether the player is a batter.
GAME_LOG_URL = 'https://www.baseball-reference.com/players/gl.fcgi?id={player_id}&t={log_type}&year={year}'


def collect_player_links(driver, team_links, year):
    """Load each team page and build one game-log link per listed player.

    Parameters
    ----------
    driver : selenium WebDriver used to load the pages.
    team_links : iterable of team-page URLs.
    year : season written into the ``year=`` parameter of every game-log link.

    Returns
    -------
    (links_by_team, is_batter_by_team) : two lists holding one inner list per team
        page, aligned element by element. A player counts as a batter when the
        'G_batting' cell of their row is non-zero.

    Raises
    ------
    RuntimeError if a team page has no element with id 'all_appearances'.
    """
    links_by_team = []
    is_batter_by_team = []
    for team_link in team_links:
        driver.get(team_link)
        # Name the parser explicitly so the parse does not depend on which optional
        # parsers happen to be installed.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        appearances = soup.find('div', id='all_appearances')
        if appearances is None:
            raise RuntimeError(f'No all_appearances section found on {team_link}')

        team_player_links = []
        team_is_batter = []
        for tr in appearances.find_all('tr', {'data-row': True}):
            anchor = tr.find('a', {'href': True})
            if anchor is None:
                # Rows without a link carry no player id, so they are skipped.
                continue
            # The player id is the part of the href between the last '/' and '.shtml'.
            player_id = re.search(r'(?<=\/)[a-z0-9]*(?=\.shtml)', anchor['href']).group()
            batter = int(tr.find('td', {'data-stat': 'G_batting'}).text) != 0
            team_player_links.append(
                GAME_LOG_URL.format(player_id=player_id, log_type='b' if batter else 'p', year=year)
            )
            team_is_batter.append(batter)

        links_by_team.append(team_player_links)
        is_batter_by_team.append(team_is_batter)
    return links_by_team, is_batter_by_team


options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)
try:
    player_links_2022, is_batter_2022 = collect_player_links(driver, team_links_2022, 2022)
    # The 2021 season can be collected the same way:
    # player_links_2021, is_batter_2021 = collect_player_links(driver, team_links_2021, 2021)
finally:
    # quit() ends the whole browser session; running it in a finally block means a
    # failure part-way through the loop does not leave a headless Chrome behind.
    driver.quit()

In [19]:
def _flatten_per_team(values):
    """Flatten a list of per-team lists into one list; return an already-flat list as is.

    The guard makes this cell safe to re-run. Without it, a second run would iterate
    over the already-flattened items: link strings would be split into single
    characters and boolean flags would raise a TypeError.
    """
    if any(not isinstance(item, list) for item in values):
        return list(values)
    return list(itertools.chain.from_iterable(values))


# The same two calls apply to player_links_2021 / is_batter_2021 if that season is collected.
player_links_2022 = _flatten_per_team(player_links_2022)
is_batter_2022 = _flatten_per_team(is_batter_2022)

In [20]:
# Pair every game-log link with its batter flag, one row per player, and write the table
# to disk. zip() stops at the shorter of the two lists.
link_rows = list(zip(player_links_2022, is_batter_2022))
links_2022 = pd.DataFrame(link_rows, columns=['Link', 'Is_Batter'])
links_2022.to_csv(path_or_buf='player_links_2022.csv')

## This code downloads each player's game-by-game data from the saved game-log links

In [2]:
# Load the saved link table from disk; the cells below read its 'Link' and 'Is_Batter' columns.
links_csv_path = 'player_links_2022.csv'
links_2022 = pd.read_csv(links_csv_path)

In [8]:
current_index = 0  # not read by any visible cell; kept in case other code relies on it
players = {}  # running row number -> [name, is_batter, date, team, performance]
counter = itertools.count(0)
player_count = len(links_2022['Link'])

# (table id, pattern matching the id of each game row, data-stat holding the performance value)
# for the batting and the pitching game-log table. Compiled once instead of once per player.
BATTING_LOG = ('batting_gamelogs', re.compile(r'batting_gamelogs.\d+'), 'onbase_plus_slugging')
PITCHING_LOG = ('pitching_gamelogs', re.compile(r'pitching_gamelogs.\d+'), 'fip')


def _cell_text(row, stat):
    """Return the text of the <td> in ``row`` whose data-stat attribute equals ``stat``."""
    return row.find('td', {'data-stat': stat}).text


options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)

try:
    for i, (link, is_batter) in enumerate(zip(links_2022['Link'], links_2022['Is_Batter'])):
        print(f'On player {i} of {player_count}')
        driver.get(link)
        # Name the parser explicitly so the parse does not depend on which optional
        # parsers happen to be installed.
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        table_id, row_id_pattern, perf_stat = BATTING_LOG if is_batter else PITCHING_LOG
        meta = soup.find('div', id='meta')
        table = soup.find('table', id=table_id)
        if meta is None or table is None:
            # Fail with the offending link rather than an AttributeError on None.
            raise RuntimeError(f'Player header or {table_id} table not found at {link}')
        name = meta.find('span').text

        # Build every row for this player before touching `players`, so a malformed
        # row cannot leave a half-recorded player behind.
        player_rows = [
            [
                name,
                is_batter,
                # Keep only the text before ' (' and turn non-breaking spaces into plain spaces.
                _cell_text(row, 'date_game').split(' (', 1)[0].replace('\xa0', ' '),
                _cell_text(row, 'team_ID'),
                _cell_text(row, perf_stat),
            ]
            for row in table.find_all('tr', id=row_id_pattern)
        ]
        for player_row in player_rows:
            players[next(counter)] = player_row
finally:
    # quit() ends the whole browser session; running it in a finally block means an
    # error on any one page does not leave a headless Chrome behind.
    driver.quit()

On player 0 of 1683
On player 1 of 1683
On player 2 of 1683
On player 3 of 1683
On player 4 of 1683
On player 5 of 1683
On player 6 of 1683
On player 7 of 1683
On player 8 of 1683
On player 9 of 1683
On player 10 of 1683
On player 11 of 1683
On player 12 of 1683
On player 13 of 1683
On player 14 of 1683
On player 15 of 1683
On player 16 of 1683
On player 17 of 1683
On player 18 of 1683
On player 19 of 1683
On player 20 of 1683
On player 21 of 1683
On player 22 of 1683
On player 23 of 1683
On player 24 of 1683
On player 25 of 1683
On player 26 of 1683
On player 27 of 1683
On player 28 of 1683
On player 29 of 1683
On player 30 of 1683
On player 31 of 1683
On player 32 of 1683
On player 33 of 1683
On player 34 of 1683
On player 35 of 1683
On player 36 of 1683
On player 37 of 1683
On player 38 of 1683
On player 39 of 1683
On player 40 of 1683
On player 41 of 1683
On player 42 of 1683
On player 43 of 1683
On player 44 of 1683
On player 45 of 1683
On player 46 of 1683
On player 47 of 1683
On

On player 378 of 1683
On player 379 of 1683
On player 380 of 1683
On player 381 of 1683
On player 382 of 1683
On player 383 of 1683
On player 384 of 1683
On player 385 of 1683
On player 386 of 1683
On player 387 of 1683
On player 388 of 1683
On player 389 of 1683
On player 390 of 1683
On player 391 of 1683
On player 392 of 1683
On player 393 of 1683
On player 394 of 1683
On player 395 of 1683
On player 396 of 1683
On player 397 of 1683
On player 398 of 1683
On player 399 of 1683
On player 400 of 1683
On player 401 of 1683
On player 402 of 1683
On player 403 of 1683
On player 404 of 1683
On player 405 of 1683
On player 406 of 1683
On player 407 of 1683
On player 408 of 1683
On player 409 of 1683
On player 410 of 1683
On player 411 of 1683
On player 412 of 1683
On player 413 of 1683
On player 414 of 1683
On player 415 of 1683
On player 416 of 1683
On player 417 of 1683
On player 418 of 1683
On player 419 of 1683
On player 420 of 1683
On player 421 of 1683
On player 422 of 1683
On player 

On player 751 of 1683
On player 752 of 1683
On player 753 of 1683
On player 754 of 1683
On player 755 of 1683
On player 756 of 1683
On player 757 of 1683
On player 758 of 1683
On player 759 of 1683
On player 760 of 1683
On player 761 of 1683
On player 762 of 1683
On player 763 of 1683
On player 764 of 1683
On player 765 of 1683
On player 766 of 1683
On player 767 of 1683
On player 768 of 1683
On player 769 of 1683
On player 770 of 1683
On player 771 of 1683
On player 772 of 1683
On player 773 of 1683
On player 774 of 1683
On player 775 of 1683
On player 776 of 1683
On player 777 of 1683
On player 778 of 1683
On player 779 of 1683
On player 780 of 1683
On player 781 of 1683
On player 782 of 1683
On player 783 of 1683
On player 784 of 1683
On player 785 of 1683
On player 786 of 1683
On player 787 of 1683
On player 788 of 1683
On player 789 of 1683
On player 790 of 1683
On player 791 of 1683
On player 792 of 1683
On player 793 of 1683
On player 794 of 1683
On player 795 of 1683
On player 

On player 1119 of 1683
On player 1120 of 1683
On player 1121 of 1683
On player 1122 of 1683
On player 1123 of 1683
On player 1124 of 1683
On player 1125 of 1683
On player 1126 of 1683
On player 1127 of 1683
On player 1128 of 1683
On player 1129 of 1683
On player 1130 of 1683
On player 1131 of 1683
On player 1132 of 1683
On player 1133 of 1683
On player 1134 of 1683
On player 1135 of 1683
On player 1136 of 1683
On player 1137 of 1683
On player 1138 of 1683
On player 1139 of 1683
On player 1140 of 1683
On player 1141 of 1683
On player 1142 of 1683
On player 1143 of 1683
On player 1144 of 1683
On player 1145 of 1683
On player 1146 of 1683
On player 1147 of 1683
On player 1148 of 1683
On player 1149 of 1683
On player 1150 of 1683
On player 1151 of 1683
On player 1152 of 1683
On player 1153 of 1683
On player 1154 of 1683
On player 1155 of 1683
On player 1156 of 1683
On player 1157 of 1683
On player 1158 of 1683
On player 1159 of 1683
On player 1160 of 1683
On player 1161 of 1683
On player 1

On player 1476 of 1683
On player 1477 of 1683
On player 1478 of 1683
On player 1479 of 1683
On player 1480 of 1683
On player 1481 of 1683
On player 1482 of 1683
On player 1483 of 1683
On player 1484 of 1683
On player 1485 of 1683
On player 1486 of 1683
On player 1487 of 1683
On player 1488 of 1683
On player 1489 of 1683
On player 1490 of 1683
On player 1491 of 1683
On player 1492 of 1683
On player 1493 of 1683
On player 1494 of 1683
On player 1495 of 1683
On player 1496 of 1683
On player 1497 of 1683
On player 1498 of 1683
On player 1499 of 1683
On player 1500 of 1683
On player 1501 of 1683
On player 1502 of 1683
On player 1503 of 1683
On player 1504 of 1683
On player 1505 of 1683
On player 1506 of 1683
On player 1507 of 1683
On player 1508 of 1683
On player 1509 of 1683
On player 1510 of 1683
On player 1511 of 1683
On player 1512 of 1683
On player 1513 of 1683
On player 1514 of 1683
On player 1515 of 1683
On player 1516 of 1683
On player 1517 of 1683
On player 1518 of 1683
On player 1

In [9]:
# Turn the scraped rows into a table: each key of `players` becomes an index label and
# each five-item list becomes one row under the column names below.
game_log_columns = ['Name', 'Is_Batter', 'Date', 'Team', 'Performance']
player_data_2022 = pd.DataFrame.from_dict(data=players, orient='index', columns=game_log_columns)

In [10]:
# Write the game-log table, including its index, to a CSV file in the working directory.
player_logs_path = '2022_player_logs.csv'
player_data_2022.to_csv(path_or_buf=player_logs_path)