In [21]:
# import the required packages
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

In [2]:
# load the webpage
# An explicit timeout is passed because requests applies none by default; without
# it a stalled connection would block this cell (and the kernel) indefinitely.
url = 'https://www.baseball-reference.com/teams/COL/2025-schedule-scores.shtml'
response = requests.get(url, timeout=30)

In [3]:
# parse the HTML only when the server answered 200; otherwise report the status code
# NOTE: `soup` is left undefined on failure, so the cells below need a successful load
if response.status_code != 200:
    print(f'Failed to load page: {response.status_code}')
else:
    soup = BeautifulSoup(response.text, 'html.parser')
    print('Page loaded and parsed!')

Page loaded and parsed!


In [4]:
# locate the first <table> whose class attribute is 'sortable stats_table'
table = soup.find('table', attrs={'class': 'sortable stats_table'})

# report whether the lookup succeeded (find() gives None when nothing matches)
print('Table found!' if table else 'Table not found.')

Table found!


In [5]:
# collect every <tr> in the table; the header row is included as rows[0]
rows = table.find_all('tr')

# preview the raw HTML of the first 5 rows
preview_rows = rows[:5]
for preview_row in preview_rows:
    print(preview_row)

<tr>
<th aria-label="Gm#" class="poptip sort_default_asc center" data-stat="team_game" data-tip="&lt;strong&gt;Game Number&lt;/strong&gt;&lt;br&gt;Which game out of all played by this team." scope="col">Gm#</th>
<th aria-label="Date" class="poptip sort_default_asc center" data-stat="date_game" data-tip="A number in parentheses indicates which game of a doubleheader.&lt;br&gt;Click dates for box scores of games or standings on this day." scope="col">Date</th>
<th aria-label="" class="poptip sort_default_asc center" data-stat="boxscore" scope="col"></th>
<th aria-label="Tm" class="poptip sort_default_asc show_partial_when_sorting center" data-stat="team_ID" scope="col">Tm</th>
<th aria-label=" " class="poptip sort_default_asc show_partial_when_sorting left" data-stat="homeORvis" data-tip="&lt;strong&gt;Home or Away Game&lt;/strong&gt;&lt;br&gt;@ means it is an away game, blank means home." scope="col"> </th>
<th aria-label="Opp" class="poptip sort_default_asc show_partial_when_sorting ce

In [8]:
# pull the <td> cells out of the first row after the header (rows[0] is the header)
first_game_row = rows[1]
columns = first_game_row.find_all('td')

# show each cell's text, trimmed of surrounding whitespace, one per line
for cell in columns:
    print(cell.get_text().strip())

Friday, Mar 28
boxscore
COL
@
TBR
L-wo
2
3

0-1
5
2.5
Fairbanks
Vodnik

2:19
D
10,046
.86
-



In [9]:
# gather the text of every <th> in the table
# (this is all <th> cells, not just the top header row, so per-row game numbers
# and repeated header rows show up in the list as well)
headers = table.find_all('th')
header_text = []
for header_cell in headers:
    header_text.append(header_cell.get_text(strip=True))
print('Headers:', header_text)

Headers: ['Gm#', 'Date', '', 'Tm', '', 'Opp', 'W/L', 'R', 'RA', 'Inn', 'W-L', 'Rank', 'GB', 'Win', 'Loss', 'Save', 'Time', 'D/N', 'Attendance', 'cLI', 'Streak', 'Orig. Scheduled', '1', '2', '3', '4', 'Gm#', 'April', '', 'Tm', '', 'Opp', 'W/L', 'R', 'RA', 'Inn', 'W-L', 'Rank', 'GB', 'Win', 'Loss', 'Save', 'Time', 'D/N', 'Attendance', 'cLI', 'Streak', 'Orig. Scheduled', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', 'Gm#', 'Date', '', 'Tm', '', 'Opp', '', '', 'D/N', 'cLI', 'Orig. Scheduled', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97',

In [10]:
# spot-check the header labels at the positions of interest
for position in (0, 6, 7, 8):
    print(header_text[position])

Gm#
W/L
R
RA


In [18]:
# build one record per completed game from the schedule table
# NOTE: `table` comes from the earlier "find the table" cell. The page is not
# re-parsed here: nothing in this cell would read a fresh parse.

# positions within a row's <td> cells; the Gm# cell is a <th>, so these are one
# less than the matching positions in the full header list (W/L=6, R=7, RA=8)
RESULT_COL = 5        # W/L
RUNS_SCORED_COL = 6   # R
RUNS_ALLOWED_COL = 7  # RA

games_data = []
gp = 1  # running count of the games kept so far
for row in table.find_all('tr')[1:]:  # skip the header row
    columns = row.find_all('td')

    # ensure there are enough columns; shorter rows are passed over silently
    if len(columns) <= 8:
        continue

    result = columns[RESULT_COL].get_text(strip=True)
    runs_scored = columns[RUNS_SCORED_COL].get_text(strip=True)
    runs_allowed = columns[RUNS_ALLOWED_COL].get_text(strip=True)

    # skip non-numeric R or RA (e.g. preview rows for games without a score yet)
    if not (runs_scored.isdigit() and runs_allowed.isdigit()):
        print(f'Skipping row. Non-numeric R or RA. {row.get_text(strip=True)}')
        continue

    games_data.append({
        'Games Played': gp,
        'Result': result,
        'R': runs_scored,
        'RA': runs_allowed
    })

    gp += 1

Skipping row. Non-numeric R or RA. 30Wednesday, Apr 30previewCOLATL3:10 pmGame Preview, and Matchups
Skipping row. Non-numeric R or RA. 31Thursday, May 1previewCOL@SFG9:45 pmGame Preview, and Matchups
Skipping row. Non-numeric R or RA. 32Friday, May 2previewCOL@SFG10:15 pmGame Preview, and Matchups
Skipping row. Non-numeric R or RA. 33Saturday, May 3previewCOL@SFG4:05 pmGame Preview, and Matchups
Skipping row. Non-numeric R or RA. 34Sunday, May 4previewCOL@SFG4:05 pmGame Preview, and Matchups
Skipping row. Non-numeric R or RA. 35Tuesday, May 6previewCOLDET8:40 pmGame Preview, and Matchups
Skipping row. Non-numeric R or RA. 36Wednesday, May 7previewCOLDET8:40 pmGame Preview, and Matchups
Skipping row. Non-numeric R or RA. 37Thursday, May 8previewCOLDET3:10 pmGame Preview, and Matchups
Skipping row. Non-numeric R or RA. 38Friday, May 9previewCOLSDP8:40 pmGame Preview, and Matchups
Skipping row. Non-numeric R or RA. 39Saturday, May 10previewCOLSDP8:10 pmGame Preview, and Matchups
Skipping

In [19]:
# list every collected game record, one dict per line
for game_record in games_data:
    print(game_record)

{'Games Played': 1, 'Result': 'L-wo', 'R': '2', 'RA': '3'}
{'Games Played': 2, 'Result': 'W', 'R': '2', 'RA': '1'}
{'Games Played': 3, 'Result': 'L', 'R': '4', 'RA': '6'}
{'Games Played': 4, 'Result': 'L', 'R': '1', 'RA': '6'}
{'Games Played': 5, 'Result': 'L', 'R': '1', 'RA': '5'}
{'Games Played': 6, 'Result': 'L', 'R': '1', 'RA': '3'}
{'Games Played': 7, 'Result': 'L', 'R': '3', 'RA': '6'}
{'Games Played': 8, 'Result': 'L', 'R': '4', 'RA': '7'}
{'Games Played': 9, 'Result': 'W', 'R': '12', 'RA': '5'}
{'Games Played': 10, 'Result': 'L', 'R': '1', 'RA': '7'}
{'Games Played': 11, 'Result': 'L', 'R': '2', 'RA': '17'}
{'Games Played': 12, 'Result': 'W', 'R': '7', 'RA': '2'}
{'Games Played': 13, 'Result': 'L', 'R': '0', 'RA': '8'}
{'Games Played': 14, 'Result': 'L', 'R': '0', 'RA': '2'}
{'Games Played': 15, 'Result': 'L', 'R': '0', 'RA': '6'}
{'Games Played': 16, 'Result': 'L', 'R': '3', 'RA': '5'}
{'Games Played': 17, 'Result': 'L', 'R': '2', 'RA': '6'}
{'Games Played': 18, 'Result': 'L',

In [22]:
# turn the collected records into a DataFrame and write it out as CSV,
# leaving the DataFrame index out of the file
df = pd.DataFrame(data=games_data)
output_path = '../Data/Raw/COL_2025_schedule.csv'
df.to_csv(output_path, index=False)