In [1]:
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup as soup
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
import pandas as pd
import time
from datetime import datetime, timedelta
import time
from IPython.display import clear_output
from config import conn_host, conn_database, conn_user, conn_password
import mysql.connector
from dateutil import tz

In [2]:
months = dict(Jan=1,Feb=2,Mar=3,Apr=4,May=5,Jun=6,Jul=7,Aug=8,Sep=9,Oct=10,Nov=11,Dec=12)

# If you want to change the league, browse through the odds portal website to find the correct url parameters for you
country = "brazil"
league = "serie-a"

In [3]:
def connect_to_db():
    return mysql.connector.connect(host=conn_host, 
                                     database=conn_database,
                                     user=conn_user,
                                     password=conn_password)

def execute_query(query, read_only = True):
    resp = None
    try:
        db = connect_to_db()
        if read_only:
            resp = pd.read_sql_query(query, db)
        else:
            mycursor = db.cursor()
            mycursor.execute(query)

            db.commit()
    except Exception as e:
        print(e)
    db.close()
    return resp

In [5]:
execute_query("ALTER TABLE matches " +
              "ADD home_odds FLOAT NULL," +
              "ADD away_odds FLOAT NULL," +
              "ADD draw_odds FLOAT NULL;")

'NoneType' object is not iterable


In [4]:
execute_query("ALTER TABLE matches " +
              "DROP home_odds," +
              "DROP away_odds," +
              "DROP draw_odds;")

Execution failed on sql 'ALTER TABLE matches DROP home_odds,DROP away_odds,DROP draw_odds;': 1091 (42000): Can't DROP 'home_odds'; check that column/key exists


In [6]:
teams = execute_query("SELECT * FROM teams")

In [7]:
print(teams)

      id            name
0    118           Bahia
1    119   Internacional
2    120     Botafogo RJ
3    121       Palmeiras
4    122          Parana
5    123    Sport Recife
6    124      Fluminense
7    125      America MG
8    126       Sao Paulo
9    127     Flamengo RJ
10   128          Santos
11   129           Ceara
12   130          Gremio
13   131     Corinthians
14   132  Chapecoense-SC
15   133           Vasco
16   134    Athletico-PR
17   135        Cruzeiro
18   136         Vitoria
19   137     Figueirense
20   139     Ponte Preta
21   140        Criciuma
22   144     Atletico GO
23   145            Avai
24   147        Coritiba
25   150             CSA
26   151           Goias
27   152       Juventude
28   154       Fortaleza
29   753      Santa Cruz
30   755         Nautico
31   794      Bragantino
32   795       Joinville
33  1062     Atletico-MG
34  1193          Cuiaba
35  1214      Portuguesa


In [8]:
def get_betting_odds(season):
    from_zone = tz.gettz('UTC')
    to_zone = tz.gettz('America/Sao_Paulo')
    base_url = f"https://www.oddsportal.com/soccer/{country}/{league}{season}/results/"
    option = Options()
#     option.headless = True
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=option)
    driver.get(base_url)
    time.sleep(5)
    
    try:
        button = driver.find_element_by_xpath('/html/body/div[3]/div[3]/div/div[1]/div/div[2]/div/button[1]')
        button.click()
    except:
        print('A problem occoured while trying to close the cookies popup')
        
    button = driver.find_element_by_xpath('/html/body/div[1]/div/div[2]/div[6]/div[1]/div/div[1]/div[2]/div[1]/div[6]/div/a[11]')
    button.click()
    time.sleep(2)
    
    element = driver.find_elements_by_id("tournamentTable")[0]
    htmlContent = element.get_attribute('outerHTML')
    page_soup = soup(htmlContent, "html.parser")
    active_page = int(''.join(page_soup.find('span', {"class": "active-page"}).findAll(text=True)))
    print('{}: Page {}'.format(season[1:], active_page))
    
    games = []
    
    while int(active_page) >= 1:
        dates = page_soup.findAll('tr', {"class": "center nob-border"})
        games_registered = 0
        
        for i in range(len(dates), 0, -1):
            try:
                if len(dates) == i:
                    date_games = dates[i-1].find_next_siblings('tr', {"class": "deactivate"})
                else:
                    date_games = dates[i-1].find_next_siblings('tr', {"class": "deactivate"})[:-1*games_registered]
                games_registered += len(date_games)
                date_info_splitted = dates[i-1].contents[0].text.split('-')
                if len(date_info_splitted) > 1:
                    continue
                date_text = date_info_splitted[0].strip()
                for g in date_games:
                    unix_date = datetime(int(date_text.split(' ')[2]), 
                                                    months[date_text.split(' ')[1]], 
                                                    int(date_text.split(' ')[0]), 
                                                    hour=int(g.contents[0].text.strip().split(':')[0]), 
                                                    minute=int(g.contents[0].text.strip().split(':')[1]), 
                                                    second=0).replace(tzinfo=from_zone).astimezone(to_zone)
                    home_team_string = g.contents[1].text.split(' - ')[0].strip()
                    away_team_string = g.contents[1].text.split(' - ')[1].strip()
                    home_team = teams.loc[teams['name'] == home_team_string].iloc[0]
                    away_team = teams.loc[teams['name'] == away_team_string].iloc[0]
                    games_parsed = [unix_date,
                                      home_team['id'], # Home Team Id
                                      away_team['id'], # Away Team Id
                                      home_team['name'], # Team A Name
                                      away_team['name'], # Team B Name
                                      g.contents[2].text.split(':')[0].strip(), # Team A Score
                                      g.contents[2].text.split(':')[1].replace('OT', '').strip(), # Team B Score
                                      g.contents[3].text,  # Team A Odds
                                      g.contents[4].text,  # Draw Odds
                                      g.contents[5].text]  # Team B Odds
                    games.append(games_parsed)
                # print('{} Games appended'.format(len(games_parsed)))
            except Exception as e:
                print(e)
                continue
        
        btn_next_page = driver.find_element_by_xpath('/html/body/div[1]/div/div[2]/div[6]/div[1]/div/div[1]/div[2]/div[1]/div[6]/div/a[2]')
        btn_next_page.click()
        time.sleep(4)
        
        element = driver.find_elements_by_id("tournamentTable")[0]
        htmlContent = element.get_attribute('outerHTML')
        page_soup = soup(htmlContent, "html.parser")
        active_page -= 1
        clear_output(wait=True)
        print('{}: Page {}'.format(season[1:], active_page))
    
    driver.quit()
    return games
    

In [9]:
season = 2014
games = []
while season < 2022:
    games.extend(get_betting_odds(f'-{season}'))
    season += 1

odds_df = pd.DataFrame(games, columns=['GAME_DATE', 'TEAM_A_ID', 'TEAM_B_ID', 'TEAM_A', 'TEAM_B', 'TEAM_A_SCORE', 'TEAM_B_SCORE', 'TEAM_A_ODDS', 'DRAW_ODDS', 'TEAM_B_ODDS'])
# odds_df.to_csv('../data/odds.csv')
print(len(games))

2021: Page 0
3031


In [10]:
odds_df.head()

Unnamed: 0,GAME_DATE,TEAM_A_ID,TEAM_B_ID,TEAM_A,TEAM_B,TEAM_A_SCORE,TEAM_B_SCORE,TEAM_A_ODDS,DRAW_ODDS,TEAM_B_ODDS
0,2014-04-19 18:30:00-03:00,124,137,Fluminense,Figueirense,3,0,1.53,3.94,6.5
1,2014-04-19 18:30:00-03:00,119,136,Internacional,Vitoria,1,0,1.48,4.15,7.06
2,2014-04-20 18:30:00-03:00,140,121,Criciuma,Palmeiras,1,2,2.95,3.19,2.42
3,2014-04-20 18:30:00-03:00,127,151,Flamengo RJ,Goias,0,0,2.12,3.28,3.54
4,2014-04-20 18:30:00-03:00,128,123,Santos,Sport Recife,1,1,1.39,4.51,8.08


In [11]:
game_errors = []
for index, row in odds_df.iterrows():
    try:
        date_minus_one_day = row['GAME_DATE'] - timedelta(days=1)
        date_plus_one_day = row['GAME_DATE'] + timedelta(days=1)
        game_id = execute_query(f"SELECT id from matches WHERE (DATE_FORMAT(date,'%d/%m/%Y') = DATE_FORMAT('{row['GAME_DATE']}','%d/%m/%Y') OR DATE_FORMAT(date,'%d/%m/%Y') = DATE_FORMAT('{date_minus_one_day}','%d/%m/%Y') OR DATE_FORMAT(date,'%d/%m/%Y') = DATE_FORMAT('{date_plus_one_day}','%d/%m/%Y')) AND home_id = {row['TEAM_A_ID']} AND away_id = {row['TEAM_B_ID']}").iloc[0]['id']
        update_query = f"UPDATE matches SET home_odds = {row['TEAM_A_ODDS']}, away_odds = {row['TEAM_B_ODDS']}, draw_odds = {row['DRAW_ODDS']} WHERE id = {game_id}"
        execute_query(update_query, False)
    except Exception as e:
        new_item = dict()
        new_item['error'] = e
        new_item['match'] = row
        game_errors.append(new_item)
    clear_output(wait=True)
    print(f"{index}/{len(odds_df.index)}")

3030/3031


In [12]:
if len(game_errors) == 0:
    print('There were no errors while reading games from the Odds Portal website.')
else:
    print('The following games had an error while trying to read games from the Odds Portal website:')
    for item in game_errors:
        print(f"\n{item['match']['GAME_DATE']}: {item['match']['TEAM_A']}({item['match']['TEAM_A_ID']}) {item['match']['TEAM_A_SCORE']} x {item['match']['TEAM_B_SCORE']} {item['match']['TEAM_B']}({item['match']['TEAM_B_ID']})")
        print(f"ERROR: {item['error']}")

There were no errors while reading games from the Odds Portal website.


In [15]:
fixtures_df = execute_query(f"SELECT m.id, m.date, m.season, l.name AS league, ht.name as home_team, at.name as away_team, m.home_score, m.away_score, m.home_odds, m.away_odds, m.draw_odds FROM matches AS m INNER JOIN teams AS ht ON (m.home_id = ht.id) INNER JOIN teams AS at ON (m.away_id = at.id) INNER JOIN leagues AS l ON (m.league_id = l.id) WHERE m.season = 2021 ORDER BY m.date ASC")

In [16]:
fixtures_df.head()

Unnamed: 0,id,date,season,league,home_team,away_team,home_score,away_score,home_odds,away_odds,draw_odds
0,688918,2021-05-29 19:00:00,2021,Serie A (Brazil),Cuiaba,Juventude,2,2,2.22,3.52,3.11
1,688915,2021-05-29 20:00:00,2021,Serie A (Brazil),Bahia,Santos,3,0,2.37,3.12,3.2
2,688911,2021-05-29 21:00:00,2021,Serie A (Brazil),Sao Paulo,Fluminense,0,0,1.91,4.62,3.18
3,688912,2021-05-30 11:00:00,2021,Serie A (Brazil),Atletico-MG,Fortaleza,1,2,1.48,6.93,4.19
4,688909,2021-05-30 16:00:00,2021,Serie A (Brazil),Flamengo RJ,Palmeiras,1,0,1.82,4.38,3.57
