[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/cgrilson7/mlb-weather/blob/master/scrape_odds.ipynb)

In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup, Comment
import pytz
import time
import statistics
import tqdm
import warnings

In [2]:
def _parse_odds_row(row, date):
    '''
    Extracts one game from a single odds-table row.

    row  : the <tr> element for one game
    date : the date whose page is being parsed; supplies the calendar day of
           the start time

    Returns [away_team, home_team, start_datetime, away_score, home_score,
    over_under], or None when the row is missing any expected cell (teams,
    start time, scores), the start time cannot be parsed, or no sportsbook
    posted a usable run total.
    '''
    try:
        # Get the team names
        teams = row.find("td", {"class": "alignLeft"}).find("a").find_all("span")
        away_team = teams[0].text
        home_team = teams[1].text
        # The first centered cell holds the start time, the second the scores
        centered = row.find_all("td", {"class": "alignCenter"})
        start_time = centered[0].find("div").text
        start_datetime = pd.to_datetime(
            date.strftime("%Y-%m-%d") + "T" + start_time
        ).replace(tzinfo=pytz.timezone("US/Eastern"))
        score_divs = centered[1].find_all("div")
        away_score = score_divs[0].find("b").text
        home_score = score_divs[1].find("b").text
    except (AttributeError, IndexError, ValueError):
        # AttributeError: a find() returned None (cell/tag absent)
        # IndexError:     fewer cells/spans/divs than expected
        # ValueError:     start time text could not be parsed as a datetime
        return None

    # Collect the run total line posted by each sportsbook
    totals = []
    for book in row.find_all("td", {"class": "alignRight bookColumn"}):
        try:
            total = float(book.find_all("div", {"class": "oddsAlignMiddleOne"})[0].text)
        except (IndexError, ValueError):
            # This book has no line for the game, or the cell is not numeric
            continue
        # Keep only values strictly between 0 and 25
        if 0 < total < 25:
            totals.append(total)
        # If wanted, we can also add the shades at some point...

    if not totals:
        # No usable line from any book: statistics.median() would raise
        return None

    # Use the median of all lines as the over/under
    over_under = statistics.median(totals)
    return [away_team, home_team, start_datetime, away_score, home_score, over_under]


def scrape_odds(start_date, end_date, timeout=30):
    '''
    Scrapes odds from donbest.com between the two dates

    start_date, end_date : bounds of the date range (anything accepted by
                           pd.date_range); only dates falling in March
                           through November are requested
    timeout              : seconds to wait for each HTTP response before
                           giving up on that date

    Returns a DataFrame with one row per parsed game and the columns
    away, home, start_dt, away_score, home_score, over_under. Scores are the
    text taken from the page; over_under is the median of the sportsbooks'
    run totals. Dates whose page does not return HTTP 200 are skipped
    silently; dates whose request or HTML parsing fails are skipped with a
    RuntimeWarning.
    '''
    columns = ["away", "home", "start_dt", "away_score", "home_score", "over_under"]

    dates = pd.date_range(start=start_date, end=end_date, tz="US/Eastern")
    dates = dates[(dates.month >= 3) & (dates.month <= 11)]

    odds = []
    if len(dates) == 0:
        # Nothing in the March-November window: no progress bar, no requests
        return pd.DataFrame(odds, columns=columns)

    # `tqdm` is imported as a module, so the progress-bar class is tqdm.tqdm
    for date in tqdm.tqdm(dates):
        # Small random pause between requests
        time.sleep(np.random.uniform(0.0, 0.1))
        url = "http://www.donbest.com/mlb/odds/totals/" + date.strftime("%Y%m%d") + ".html"
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as err:
            # One failed request should not abort the rest of the scrape
            warnings.warn("Request failed for {}: {}".format(url, err), RuntimeWarning)
            continue

        if response.status_code != 200:
            # Page unavailable for this date; nothing to parse
            continue

        try:
            soup = BeautifulSoup(response.content, 'html.parser')
        except Exception as err:
            # Parser failures are library-specific; report and move on
            warnings.warn("Could not parse {}: {}".format(url, err), RuntimeWarning)
            continue

        rows = soup.find_all("tr", {"class": "statistics_table_row"})
        alt_rows = soup.find_all("tr", {"class": "statistics_table_alternateRow"})
        # Regular rows first, then the "alternateRows", matching the original order
        for row in list(rows) + list(alt_rows):
            game = _parse_odds_row(row, date)
            if game is not None:
                odds.append(game)

    return pd.DataFrame(odds, columns=columns)


In [3]:
# Scrape every date from 2013-03-01 through 2013-11-29 into a DataFrame
odds_13 = scrape_odds("2013-03-01", "2013-11-29")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for date in tqdm.tqdm_notebook(dates):


HBox(children=(FloatProgress(value=0.0, max=274.0), HTML(value='')))




In [0]:
# Scrape every date from 2014-03-01 through 2014-11-29 into a DataFrame
odds_14 = scrape_odds("2014-03-01", "2014-11-29")

HBox(children=(IntProgress(value=0, max=274), HTML(value='')))




In [0]:
# Scrape every date from 2015-03-01 through 2015-11-29 into a DataFrame
odds_15 = scrape_odds("2015-03-01", "2015-11-29")

HBox(children=(IntProgress(value=0, max=274), HTML(value='')))




In [0]:
# Scrape every date from 2016-03-01 through 2016-11-29 into a DataFrame
odds_16 = scrape_odds("2016-03-01", "2016-11-29")

HBox(children=(IntProgress(value=0, max=274), HTML(value='')))




In [0]:
# Scrape every date from 2017-03-01 through 2017-11-29 into a DataFrame
odds_17 = scrape_odds("2017-03-01", "2017-11-29")

HBox(children=(IntProgress(value=0, max=274), HTML(value='')))

In [None]:
# Scrape every date from 2018-03-01 through 2018-11-29 into a DataFrame
# NOTE(review): odds_18 is not passed to the pd.concat that builds odds_df —
# confirm whether leaving 2018 out is intended.
odds_18 = scrape_odds("2018-03-01", "2018-11-29")

In [None]:
# Scrape every date from 2019-03-01 through 2019-11-29 into a DataFrame
# NOTE(review): odds_19 is not passed to the pd.concat that builds odds_df —
# confirm whether leaving 2019 out is intended.
odds_19 = scrape_odds("2019-03-01", "2019-11-29")

In [0]:
# Stack the 2013-2017 frames row-wise into a single table. Each frame keeps
# its own row labels, so index values repeat across seasons.
# NOTE(review): odds_18 and odds_19 are scraped above but are not stacked
# here — confirm whether that is intentional.
odds_df = pd.concat((odds_13, odds_14, odds_15, odds_16, odds_17))
# odds_df.to_csv("input/odds.csv", index=False)