In [7]:
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup as soup
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
import pandas as pd
import time
from datetime import datetime, timedelta
import time
from IPython.display import clear_output
from config import conn_host, conn_database, conn_user, conn_password
import mysql.connector
from dateutil import tz

In [8]:
months = dict(Jan=1,Feb=2,Mar=3,Apr=4,May=5,Jun=6,Jul=7,Aug=8,Sep=9,Oct=10,Nov=11,Dec=12)

url = 'basketball/usa/nba'

In [57]:
def connect_to_db():
    return mysql.connector.connect(host=conn_host, 
                                     database=conn_database,
                                     user=conn_user,
                                     password=conn_password)

def execute_query(query, read_only = True):
    resp = None
    try:
        db = connect_to_db()
        if read_only:
            resp = pd.read_sql_query(query, db)
        else:
            mycursor = db.cursor()
            mycursor.execute(query)

            db.commit()
    except Exception as e:
        print(e)
    db.close()
    return resp

def execute_multiple_queries(queries):
    try:
        db = connect_to_db()
        mycursor = db.cursor()
        for query in queries:
            mycursor.execute(query)

        db.commit()
        db.close()
    except Exception as e:
        print(e)

In [29]:
execute_query("ALTER TABLE games " +
              "ADD home_odds FLOAT NULL," +
              "ADD away_odds FLOAT NULL;")

'NoneType' object is not iterable




In [11]:
teams = execute_query("SELECT * FROM teams")

In [12]:
print(teams)

            id                    name abbreviation
0   1610612737           Atlanta Hawks          ATL
1   1610612738          Boston Celtics          BOS
2   1610612739     Cleveland Cavaliers          CLE
3   1610612740    New Orleans Pelicans          NOP
4   1610612741           Chicago Bulls          CHI
5   1610612742        Dallas Mavericks          DAL
6   1610612743          Denver Nuggets          DEN
7   1610612744   Golden State Warriors          GSW
8   1610612745         Houston Rockets          HOU
9   1610612746    Los Angeles Clippers          LAC
10  1610612747      Los Angeles Lakers          LAL
11  1610612748              Miami Heat          MIA
12  1610612749         Milwaukee Bucks          MIL
13  1610612750  Minnesota Timberwolves          MIN
14  1610612751           Brooklyn Nets          BKN
15  1610612752         New York Knicks          NYK
16  1610612753           Orlando Magic          ORL
17  1610612754          Indiana Pacers          IND
18  16106127

In [13]:
def get_betting_odds(season):
    from_zone = tz.gettz('UTC')
    to_zone = tz.gettz('America/Sao_Paulo')
    base_url = f"https://www.oddsportal.com/{url}{season}/results/"
    option = Options()
    option.headless = True
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=option)
    driver.get(base_url)
    time.sleep(5)
    
    try:
        button = driver.find_element("xpath", '/html/body/div[3]/div[3]/div/div[1]/div/div[2]/div/button[1]')
        button.click()
    except:
        print('A problem occoured while trying to close the cookies popup')
        
    button = driver.find_element("xpath", '/html/body/div[1]/div/div[2]/div[6]/div[1]/div/div[1]/div[2]/div[1]/div[6]/div/a[last()]')
    button.click()
    time.sleep(2)
    
    element = driver.find_element('id', "tournamentTable")
    htmlContent = element.get_attribute('outerHTML')
    page_soup = soup(htmlContent, "html.parser")
    active_page = int(''.join(page_soup.find('span', {"class": "active-page"}).findAll(text=True)))
    print('{}: Page {}'.format(season[1:], active_page))
    
    games = []
    
    while int(active_page) >= 1:
        dates = page_soup.findAll('tr', {"class": "center nob-border"})
        games_registered = 0
        
        for i in range(len(dates), 0, -1):
            try:
                if len(dates) == i:
                    date_games = dates[i-1].find_next_siblings('tr', {"class": "deactivate"})
                else:
                    date_games = dates[i-1].find_next_siblings('tr', {"class": "deactivate"})[:-1*games_registered]
                games_registered += len(date_games)
                date_info_splitted = dates[i-1].contents[0].text.split('-')
                if len(date_info_splitted) > 2: # Include named stages that are not playoffs
                    continue
                date_text = date_info_splitted[0].strip()
                for g in date_games:
                    try:
                        unix_date = datetime(int(date_text.split(' ')[2]), 
                                                        months[date_text.split(' ')[1]], 
                                                        int(date_text.split(' ')[0]), 
                                                        hour=int(g.contents[0].text.strip().split(':')[0]), 
                                                        minute=int(g.contents[0].text.strip().split(':')[1]), 
                                                        second=0).replace(tzinfo=from_zone).astimezone(to_zone)
                        home_team_string = g.contents[1].text.split(' - ')[0].strip()
                        away_team_string = g.contents[1].text.split(' - ')[1].strip()
                        home_team = teams.loc[teams['name'].str.lower() == home_team_string.lower()].iloc[0]
                        away_team = teams.loc[teams['name'].str.lower() == away_team_string.lower()].iloc[0]
                        games_parsed = [unix_date,
                                          home_team['id'], # Home Team Id
                                          away_team['id'], # Away Team Id
                                          home_team['name'], # Team A Name
                                          away_team['name'], # Team B Name
                                          g.contents[2].text.split(':')[0].strip(), # Team A Score
                                          g.contents[2].text.split(':')[1].replace('OT', '').strip(), # Team B Score
                                          g.contents[3].text,  # Team A Odds
                                          g.contents[4].text]  # Team B Odds
                        games.append(games_parsed)
                    except:
                        continue
                # print('{} Games appended'.format(len(games_parsed)))
            except Exception as e:
                print(e)
                continue
        
        btn_next_page = driver.find_element("xpath", '/html/body/div[1]/div/div[2]/div[6]/div[1]/div/div[1]/div[2]/div[1]/div[6]/div/a[2]')
        btn_next_page.click()
        time.sleep(4)
        
        element = driver.find_element('id', "tournamentTable")
        htmlContent = element.get_attribute('outerHTML')
        page_soup = soup(htmlContent, "html.parser")
        active_page -= 1
        clear_output(wait=True)
        print('{}: Page {}'.format(season[1:], active_page))
    
    driver.quit()
    return games
    

In [14]:
season = 2008
games = []
while season < 2022:
    games.extend(get_betting_odds(f'-{season}-{season+1}'))
    season += 1

# games.extend(get_betting_odds('')) # For the current season

odds_df = pd.DataFrame(games, columns=['GAME_DATE', 'TEAM_A_ID', 'TEAM_B_ID', 'TEAM_A', 'TEAM_B', 'TEAM_A_SCORE', 'TEAM_B_SCORE', 'TEAM_A_ODDS', 'TEAM_B_ODDS'])
print(len(games))

2021-2022: Page 0
17821


In [19]:
odds_df.head()

Unnamed: 0,GAME_DATE,TEAM_A_ID,TEAM_B_ID,TEAM_A,TEAM_B,TEAM_A_SCORE,TEAM_B_SCORE,TEAM_A_ODDS,TEAM_B_ODDS
0,2008-10-28 23:30:00-02:00,1610612741,1610612749,Chicago Bulls,Milwaukee Bucks,108,95,1.36,3.2
1,2008-10-28 23:00:00-02:00,1610612738,1610612739,Boston Celtics,Cleveland Cavaliers,90,85,1.36,3.15
2,2008-10-29 23:30:00-02:00,1610612745,1610612763,Houston Rockets,Memphis Grizzlies,82,71,1.1,7.8
3,2008-10-29 23:00:00-02:00,1610612765,1610612754,Detroit Pistons,Indiana Pacers,100,94,1.13,5.75
4,2008-10-29 23:00:00-02:00,1610612750,1610612758,Minnesota Timberwolves,Sacramento Kings,98,96,1.44,2.8


In [53]:
insert_queries = []
errors = []

query = f"SELECT * from games WHERE season >= 2008 and season <= 2022"
db_game_ids = execute_query(query)

for index, row in odds_df.iterrows():
    try:
        if row['TEAM_A_ODDS'] == '-' or row['TEAM_B_ODDS'] == '-':
            continue

        game_date = row['GAME_DATE'] - timedelta(hours=4)
        date_minus_one_day = game_date - timedelta(days=1)
        date_plus_one_day = game_date + timedelta(days=1)
        game_id = db_game_ids.loc[(db_game_ids['home_id'] == row['TEAM_A_ID']) & (db_game_ids['away_id'] == row['TEAM_B_ID']) & (db_game_ids['date'].dt.date >= date_minus_one_day.date()) & (db_game_ids['date'].dt.date <= date_plus_one_day.date())].iloc[0,:]['id']
        update_query = f"UPDATE games SET home_odds = {row['TEAM_A_ODDS']}, away_odds = {row['TEAM_B_ODDS']} WHERE id = {game_id}"
        insert_queries.append(update_query)
        clear_output(wait=True)
        print(f"{index}/{len(odds_df.index)}")
    except Exception as e:
        errors.append((row, e))
        continue

17820/17821


In [55]:
if not len(errors):
    print('There were no errors while updating the games odds.')
else:
    [print(f"{e[0]['GAME_DATE']} {e[0]['TEAM_A']} x {e[0]['TEAM_B']}: {e[1]}") for e in errors]

2020-08-15 15:30:00-03:00 Portland Trail Blazers x Memphis Grizzlies: single positional indexer is out-of-bounds


In [58]:
try:
    execute_multiple_queries(insert_queries)
except Exception as e:
    print(e)

In [61]:
fixtures_df = execute_query(f"SELECT m.id, m.date, m.season, ht.name as home_team, at.name as away_team, m.home_pts, m.away_pts, m.home_odds, m.away_odds FROM games AS m INNER JOIN teams AS ht ON (m.home_id = ht.id) INNER JOIN teams AS at ON (m.away_id = at.id) WHERE m.season = 2021 ORDER BY m.date ASC")

In [62]:
fixtures_df.tail()

Unnamed: 0,id,date,season,home_team,away_team,home_pts,away_pts,home_odds,away_odds
1312,42100402,2022-06-05,2021,Golden State Warriors,Boston Celtics,107,88,1.5,2.71
1313,42100403,2022-06-08,2021,Boston Celtics,Golden State Warriors,116,100,1.66,2.3
1314,42100404,2022-06-10,2021,Boston Celtics,Golden State Warriors,97,107,1.61,2.41
1315,42100405,2022-06-13,2021,Golden State Warriors,Boston Celtics,104,94,1.6,2.44
1316,42100406,2022-06-16,2021,Boston Celtics,Golden State Warriors,90,103,1.59,2.47
