In [5]:
import pandas as pd
import numpy as np

import requests
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

from tqdm import tqdm


# Scrape Code

## Page Scrape Function

In [171]:
def get_page_data(table_rows):
    """Turn parsed result rows into a DataFrame with one game per row.

    Parameters
    ----------
    table_rows : iterable
        Row elements exposing ``find`` and ``find_all`` (the caller passes
        the BeautifulSoup tags matched by ``class_='eventRow'``).

    Returns
    -------
    pandas.DataFrame
        Columns ``Date``, ``Home``, ``Away``, ``Home_Odds``, ``Away_Odds``,
        ``Home_Score``, ``Away_Score``. Values are the stripped text of the
        matched elements. Scores are NaN when a row does not carry two score
        elements, and ``Date`` is NaN for rows that appear before any date
        header has been seen.

    Raises
    ------
    IndexError
        If a row has fewer than two team-name or odds elements.
    """
    game_dates = []
    home_teams = []
    away_teams = []
    home_odds = []
    away_odds = []
    home_scores = []
    away_scores = []

    # Only some rows carry a date header; the others inherit the most recent
    # one. Start with NaN so a leading row without a header is recorded as
    # "missing" instead of hitting an unbound local variable.
    last_game_date = np.nan

    for table_row in table_rows:

        ## Get the date
        date_tag = table_row.find('div', class_='text-black-main font-main w-full truncate text-xs font-normal leading-5')
        if date_tag is not None:
            last_game_date = date_tag.get_text(strip=True)
        game_dates.append(last_game_date)

        ## Get the team names (first match is stored as home, second as away)
        team_names = table_row.find_all('p', class_='truncate participant-name')
        home_teams.append(team_names[0].get_text(strip=True))
        away_teams.append(team_names[1].get_text(strip=True))

        ## Get the scores
        team_scores = table_row.find_all('div', class_='min-mt:!flex')
        if len(team_scores) >= 2:
            home_scores.append(team_scores[0].get_text(strip=True))
            away_scores.append(team_scores[1].get_text(strip=True))
        else:
            # No (or incomplete) score markup: keep the game, mark scores missing.
            home_scores.append(np.nan)
            away_scores.append(np.nan)

        ## Get the odds
        odds = table_row.find_all('p', class_='height-content')
        home_odds.append(odds[0].get_text(strip=True))
        away_odds.append(odds[1].get_text(strip=True))

    odds_df = pd.DataFrame({
        'Date': game_dates,
        'Home': home_teams,
        'Away': away_teams,
        'Home_Odds': home_odds,
        'Away_Odds': away_odds,
        'Home_Score': home_scores,
        'Away_Score': away_scores
    })

    return odds_df


## Scrape Page and Create Dataframe Function

In [172]:
def scrape_html_get_data(year, page_num, scrolls=2, scroll_pause=1):
    """Render one OddsPortal MLB results page in Chrome and parse its games.

    Parameters
    ----------
    year : int or str
        Season interpolated into the results URL.
    page_num : int or str
        Results page number interpolated into the URL fragment.
    scrolls : int, optional
        How many times to scroll to the bottom of the page so that lazily
        loaded rows get rendered. Defaults to 2.
    scroll_pause : float, optional
        Seconds to wait after each scroll. Defaults to 1.

    Returns
    -------
    pandas.DataFrame
        Output of ``get_page_data`` for the page, or an empty DataFrame when
        parsing raises ``IndexError`` (a message naming the year and page is
        printed in that case).
    """
    oddsportal_link = fr'https://www.oddsportal.com/baseball/usa/mlb-{year}/results/#/page/{page_num}'

    # Initialize the WebDriver (you should specify the path to your WebDriver executable)
    driver = webdriver.Chrome()
    try:
        # Load the webpage
        driver.get(oddsportal_link)

        # Scroll down a few times to load more content
        for _ in range(scrolls):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause)

        # Get the page source (which includes dynamically loaded content)
        page_source = driver.page_source
    finally:
        # Always shut the browser down, even if loading or scrolling failed,
        # so a failed page does not leave a Chrome process behind.
        driver.quit()

    soup = BeautifulSoup(page_source, 'html.parser')

    table_rows = soup.find_all('div', class_='eventRow')
    try:
        odds_df = get_page_data(table_rows)
    except IndexError:
        print(f"Error in {year} on page {page_num}")
        odds_df = pd.DataFrame({})

    return odds_df


## Run Scrape

In [178]:
# Safety switch: the scrape opens a browser for every page, so it only runs
# when this flag is flipped to True.
scrapeBettingOdds = False

# Seasons to scrape and the result pages to request for each season.
year_range = range(2023, 2024)
page_seq = range(1, 53)

if scrapeBettingOdds:
    for year in tqdm(year_range):
        # Collect one DataFrame per results page, then stack them per season.
        odds_dfs = []
        for page in page_seq:
            page_odds = scrape_html_get_data(year, page)
            odds_dfs.append(page_odds)

        year_odds = pd.concat(odds_dfs)
        year_odds.to_csv(
            fr'/Users/willmiraglia/Desktop/Datathon 2024/Odds/{year}_Odds.csv',
            index=False,
        )
        

# Data Cleaning

## Odds Conversion Functions

In [36]:
def american_to_implied(odds):
    """Convert American (moneyline) odds to an implied probability in percent.

    Non-negative odds give ``100 * 100 / (odds + 100)``; negative odds give
    ``100 * |odds| / (|odds| + 100)``.
    """
    if odds >= 0:
        return 100 * (100 / (odds + 100))

    magnitude = abs(odds)
    return 100 * (magnitude / (magnitude + 100))

## Load and Clean Data

In [37]:
# Read every season's scraped odds file, keyed by year.
odds_dict = {}
for year in range(2008, 2024):
    baseball_base_path = fr'/Users/willmiraglia/Desktop/Datathon 2024/Odds/{year}_Odds.csv'
    odds_df = pd.read_csv(baseball_base_path)
    odds_dict[year] = odds_df

# Stack the seasons. Rows with no date are dropped before parsing: the parser
# below calls .split() on every value and would raise on a missing (NaN) date.
# These rows would be removed by the dropna() further down anyway.
all_odds_df = pd.concat(odds_dict).dropna(subset=['Date']).reset_index(drop=True)

# Keep only the first three whitespace-separated tokens of the date text
# (anything after them is ignored) and convert to a datetime.date.
all_odds_df['Date'] = all_odds_df['Date'].apply(lambda x: pd.to_datetime(' '.join(x.split()[:3])).date())

# '-' is the placeholder for unavailable odds; turn it into a missing value so
# the dropna() below removes the game.
all_odds_df['Home_Odds'] = all_odds_df['Home_Odds'].apply(lambda x: float(x) if x != '-' else None)
all_odds_df['Away_Odds'] = all_odds_df['Away_Odds'].apply(lambda x: float(x) if x != '-' else None)
all_odds_df = all_odds_df.sort_values(by='Date', ascending=True).dropna().reset_index(drop=True)

# Remove games involving any of these listed teams.
excluded_teams = ['Campeche', 'National League', 'American League', 'Australia', 'Sacramento River Cats', 'Tabasco', 'Northeastern']
# .copy() makes the filtered frame independent, so the column assignments in
# later cells do not raise SettingWithCopyWarning.
all_odds_df = all_odds_df[~all_odds_df['Home'].isin(excluded_teams) & ~all_odds_df['Away'].isin(excluded_teams)].copy()


## Convert Team Names to Abbreviations

In [38]:
# Full team name as it appears in the odds data -> short team code.
team_name_to_abrev = {
    'Oakland Athletics': 'OAK',
    'Washington Nationals': 'WAS',
    'Chicago Cubs': 'CHC',
    'Philadelphia Phillies': 'PHI',
    'Cleveland Guardians': 'CLE',
    'Baltimore Orioles': 'BAL',
    'Miami Marlins': 'MIA',
    'Seattle Mariners': 'SEA',
    'Atlanta Braves': 'ATL',
    'Cincinnati Reds': 'CIN',
    'Detroit Tigers': 'DET',
    'Los Angeles Dodgers': 'LAD',
    'St.Louis Cardinals': 'STL',
    'Minnesota Twins': 'MIN',
    'New York Yankees': 'NYY',
    'San Diego Padres': 'SD',
    'Arizona Diamondbacks': 'AZ',
    'Colorado Rockies': 'COL',
    'Houston Astros': 'HOU',
    'Chicago White Sox': 'CWS',
    'San Francisco Giants': 'SF',
    'Pittsburgh Pirates': 'PIT',
    'Los Angeles Angels': 'LAA',
    'Tampa Bay Rays': 'TB',
    'New York Mets': 'NYM',
    'Toronto Blue Jays': 'TOR',
    'Kansas City Royals': 'KC',
    'Boston Red Sox': 'BOS',
    'Texas Rangers': 'TEX',
    'Milwaukee Brewers': 'MIL',
}

# Replace the names in both team columns. Series.map yields NaN for any name
# that is not a key of the dictionary.
for team_column in ('Home', 'Away'):
    all_odds_df[team_column] = all_odds_df[team_column].map(team_name_to_abrev)


## Convert Odds

In [42]:
# Raw implied probabilities (in percent) for each side of every game.
home_ip = all_odds_df['Home_Odds'].apply(american_to_implied)
away_ip = all_odds_df['Away_Odds'].apply(american_to_implied)

# Rescale each pair by its sum so the two stored probabilities add up to 1.
total_ip = home_ip + away_ip

all_odds_df['Home_IP'] = home_ip / total_ip
all_odds_df['Away_IP'] = away_ip / total_ip


## Save Final DataFrame

In [61]:
# Source column -> exported column name. The dict order also fixes the column
# order of the output file. Note that the exported 'home_odds' / 'away_odds'
# columns are filled from Home_IP / Away_IP, not from the raw odds columns.
export_columns = {
    'Date': 'game_date',
    'Home': 'home_team',
    'Away': 'away_team',
    'Home_Score': 'home_score',
    'Away_Score': 'away_score',
    'Home_IP': 'home_odds',
    'Away_IP': 'away_odds',
}

all_ip_df = all_odds_df[list(export_columns)]
all_ip_df = all_ip_df.rename(columns=export_columns)

all_ip_df.to_csv(r'/Users/willmiraglia/Desktop/Datathon 2024/Odds/Full_Odds.csv', index=False)