In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
from datetime import datetime
import re

In [9]:
# Entry-point schedule page; its season dropdown is read further down to
# discover the schedule page of every other season.
initial_url = (
    "https://crossroadsleague.com"
    "/sports/bsb/2006-07/schedule?jsRendering=true"
)

In [10]:
# HTTP headers sent with the request below; scrape() reuses the same dict.
headers = {
    "User-Agent": "Chrome/120.0.0.0",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://crossroadsleague.com/"
}

# Fetch the entry-point schedule page. The timeout keeps the notebook from
# hanging forever on an unresponsive server, and raise_for_status() stops here
# on a 4xx/5xx answer instead of parsing an error page as if it were a schedule.
response = requests.get(initial_url, headers=headers, timeout=30)
response.raise_for_status()

page = response.text
soup = BeautifulSoup(page, 'html.parser')

In [11]:
# Column layout of the results table: one row per game.
column_headers = [
    'date',
    'team1',
    'team2',
    'score1',
    'score2',
]

# Empty table that scrape() fills in row by row.
df = pd.DataFrame(data=None, columns=column_headers)

# Scraping Data Across Every Game

In [12]:
# Every <option> of the season dropdown carries the site-relative path of that
# season's schedule page in its "value" attribute; collect them in page order.
urls = list(
    map(
        lambda season_option: season_option.get('value'),
        soup.find('select', id='filter-by-season').find_all('option'),
    )
)

In [2]:
def scrape(url, timeout=30):
    """Scrape one season's schedule page and append each game result to ``df``.

    Parameters
    ----------
    url : str
        Site-relative path of the season's schedule page; it is appended to
        ``https://crossroadsleague.com``.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    None
        Rows ``[date, team1, team2, score1, score2]`` are appended to the
        module-level ``df``. Scores are stored as the stripped text found on
        the page, not as numbers.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    response = requests.get(
        "https://crossroadsleague.com" + url,
        headers=headers,
        timeout=timeout,
    )
    # Stop on an error response instead of parsing it as a schedule page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    # The first word of the page header is the season label: either a range
    # such as "2006-07" or a single year.
    season = soup.find('div', class_='page-content-header').text.strip().split(' ')[0]

    # Reduce both label shapes to the calendar year in which the season starts.
    # A single-year label is taken to be the year in which the season ends.
    if '-' in season:
        season_start = int(season.split('-')[0])
    else:
        season_start = int(season) - 1

    for result in soup.find_all('div', class_='result'):
        # The enclosing date section carries the day in its data-date attribute
        # (parsed below with "%a. %d"); the enclosing month section carries the
        # full month name as its second CSS class.
        day = result.find_parent('div', class_='section-event-date').get('data-date')
        month = result.find_parent('div', class_='section-event-month').get('class')[1]

        month_num = datetime.strptime(month, "%B").month

        # August onwards falls in the season's first calendar year; earlier
        # months fall in the following one.
        year = season_start if month_num >= 8 else season_start + 1
        date = datetime.strptime(f"{month} {day} {year}", "%B %a. %d %Y")

        # Query each element list once; index 0 is the first listed team and
        # index 1 its opponent, for names and scores alike.
        team_names = result.find_all('span', class_='team-name')
        scores = result.find_all('div', class_='flex-shrink-1')

        df.loc[len(df)] = [
            date,
            team_names[0].text.strip(),
            team_names[1].text.strip(),
            scores[0].text.strip(),
            scores[1].text.strip(),
        ]

In [14]:
# Start from an empty table: scrape() appends to the global df, so without this
# reset re-running the cell would add every game a second time.
df = pd.DataFrame(columns=column_headers)

# Scrape each season found in the dropdown; every call appends that season's games.
for url in urls:
    scrape(url)
df

Unnamed: 0,date,team1,team2,score1,score2
0,2024-09-20,Michigan-Dearborn,Spring Arbor (MI),5,6
1,2024-09-20,Michigan-Dearborn,Spring Arbor (MI),6,15
2,2024-09-21,Michigan-Dearborn,Spring Arbor (MI),3,4
3,2024-09-21,Michigan-Dearborn,Spring Arbor (MI),0,9
4,2024-09-27,Bethel (IN),Michigan-Dearborn,16,3
...,...,...,...,...,...
4673,2007-05-16,Mount Vernon Nazarene,Trinity International,3,6
4674,2007-05-17,St. Ambrose,Spring Arbor,3,11
4675,2007-05-17,Mount Vernon Nazarene,Trinity Christian,11,3
4676,2007-05-18,St. Ambrose,Spring Arbor,1,6


# Clean Dataset

In [4]:
# Load the saved game table and count the missing values in each column
# (inspect `missing_values` to review the counts).
df_clean = pd.read_csv('../data/baseball_games.csv')
missing_values = df_clean.isna().sum()

# Cast both score columns to integers.
score_dtypes = dict.fromkeys(['score1', 'score2'], int)
df_clean = df_clean.astype(score_dtypes)


# Standardize text data: lower-case both team columns so that names differing
# only in capitalisation compare equal.
for team_col in ('team1', 'team2'):
    df_clean[team_col] = df_clean[team_col].str.lower()

# Maps a lower-cased team name that appears without a state suffix to the
# suffixed spelling of the same team, so both variants collapse into one.
same_team = {
    'grace' : 'grace (in)',
    'goshen' : 'goshen (in)',
    'huntington' : 'huntington (in)',
    'indiana wesleyan' : 'indiana wesleyan (in)',
    'marian' : 'marian (in)',
    'mount vernon nazarene' : 'mount vernon nazarene (oh)',
    'saint francis (ind.)' : 'saint francis (in)',
    # The scraped data spells the suffixed form "Spring Arbor (MI)"; mapping to
    # "(in)" would create a third spelling instead of merging the two.
    'spring arbor' : 'spring arbor (mi)',
    'taylor' : 'taylor (in)',
    'aquinas' : 'aquinas (mi)',
    'faulkner' : 'faulkner (al)',
    'bryan' : 'bryan (tn)',
    'union' : 'union (ky)'
}

# Rewrite every name listed in same_team to its canonical spelling, in both
# team columns; names not listed are left untouched.
for side in ('team1', 'team2'):
    df_clean[side] = df_clean[side].replace(same_team)

#df_clean['team2'].value_counts()

In [5]:
# Order the games chronologically (oldest first), then write the cleaned table
# back to the same CSV it was loaded from, without the index column.
df_clean = df_clean.sort_values('date')
df_clean.to_csv('../data/baseball_games.csv', index=False)