In [1]:
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup as soup
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
import pandas as pd
import time
from datetime import datetime, timedelta
import time
from IPython.display import clear_output
from config import conn_host, conn_database, conn_user, conn_password
import mysql.connector
from dateutil import tz

In [2]:
months = dict(Jan=1,Feb=2,Mar=3,Apr=4,May=5,Jun=6,Jul=7,Aug=8,Sep=9,Oct=10,Nov=11,Dec=12)

# If you want to change the league, browse through the odds portal website to find the correct url parameters for you
country = "japan"
league = "j1-league"

In [3]:
def connect_to_db():
    return mysql.connector.connect(host=conn_host, 
                                     database=conn_database,
                                     user=conn_user,
                                     password=conn_password)

def execute_query(query, read_only = True):
    resp = None
    try:
        db = connect_to_db()
        if read_only:
            resp = pd.read_sql_query(query, db)
        else:
            mycursor = db.cursor()
            mycursor.execute(query)

            db.commit()
    except Exception as e:
        print(e)
    db.close()
    return resp

In [4]:
execute_query("ALTER TABLE matches " +
              "ADD home_odds FLOAT NULL," +
              "ADD away_odds FLOAT NULL," +
              "ADD draw_odds FLOAT NULL;")

Execution failed on sql 'ALTER TABLE matches ADD home_odds FLOAT NULL,ADD away_odds FLOAT NULL,ADD draw_odds FLOAT NULL;': 1060 (42S21): Duplicate column name 'home_odds'




In [5]:
teams = execute_query("SELECT * FROM teams")

In [6]:
print(teams)

        id            name
0       33  Manchester Utd
1       34       Newcastle
2       35     Bournemouth
3       36          Fulham
4       37    Huddersfield
..     ...             ...
184  12277        Ipatinga
185  13173      Chivas USA
186  13974      Santa Cruz
187  16489       Austin FC
188  18310       Charlotte

[189 rows x 2 columns]


In [8]:
def get_betting_odds(season):
    from_zone = tz.gettz('UTC')
    to_zone = tz.gettz('America/Sao_Paulo')
    if '2014' in season and league == "j1-league": # Different name for J1 league for the 2014 season
        base_url = f"https://www.oddsportal.com/soccer/{country}/j-league{season}/results/"
    else:
        base_url = f"https://www.oddsportal.com/soccer/{country}/{league}{season}/results/"
    option = Options()
    option.headless = True
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=option)
    driver.get(base_url)
    time.sleep(5)
    
    try:
        button = driver.find_element("xpath", '/html/body/div[3]/div[3]/div/div[1]/div/div[2]/div/button[1]')
        button.click()
    except:
        print('A problem occoured while trying to close the cookies popup')
        
    button = driver.find_element("xpath", '/html/body/div[1]/div/div[2]/div[6]/div[1]/div/div[1]/div[2]/div[1]/div[6]/div/a[last()]')
    button.click()
    time.sleep(2)
    
    element = driver.find_element('id', "tournamentTable")
    htmlContent = element.get_attribute('outerHTML')
    page_soup = soup(htmlContent, "html.parser")
    active_page = int(''.join(page_soup.find('span', {"class": "active-page"}).findAll(text=True)))
    print('{}: Page {}'.format(season[1:], active_page))
    
    games = []
    
    while int(active_page) >= 1:
        dates = page_soup.findAll('tr', {"class": "center nob-border"})
        games_registered = 0
        
        for i in range(len(dates), 0, -1):
            try:
                if len(dates) == i:
                    date_games = dates[i-1].find_next_siblings('tr', {"class": "deactivate"})
                else:
                    date_games = dates[i-1].find_next_siblings('tr', {"class": "deactivate"})[:-1*games_registered]
                games_registered += len(date_games)
                date_info_splitted = dates[i-1].contents[0].text.split('-')
                if len(date_info_splitted) > 1 and 'stage' not in date_info_splitted[1]: # Include named stages that are not playoffs
                    continue
                date_text = date_info_splitted[0].strip()
                for g in date_games:
                    try:
                        unix_date = datetime(int(date_text.split(' ')[2]), 
                                                        months[date_text.split(' ')[1]], 
                                                        int(date_text.split(' ')[0]), 
                                                        hour=int(g.contents[0].text.strip().split(':')[0]), 
                                                        minute=int(g.contents[0].text.strip().split(':')[1]), 
                                                        second=0).replace(tzinfo=from_zone).astimezone(to_zone)
                        home_team_string = g.contents[1].text.split(' - ')[0].strip()
                        away_team_string = g.contents[1].text.split(' - ')[1].strip()
                        home_team = teams.loc[teams['name'] == home_team_string].iloc[0]
                        away_team = teams.loc[teams['name'] == away_team_string].iloc[0]
                        games_parsed = [unix_date,
                                          home_team['id'], # Home Team Id
                                          away_team['id'], # Away Team Id
                                          home_team['name'], # Team A Name
                                          away_team['name'], # Team B Name
                                          g.contents[2].text.split(':')[0].strip(), # Team A Score
                                          g.contents[2].text.split(':')[1].replace('OT', '').strip(), # Team B Score
                                          g.contents[3].text,  # Team A Odds
                                          g.contents[4].text,  # Draw Odds
                                          g.contents[5].text]  # Team B Odds
                        games.append(games_parsed)
                    except:
                        continue
                # print('{} Games appended'.format(len(games_parsed)))
            except Exception as e:
                print(e)
                continue
        
        btn_next_page = driver.find_element("xpath", '/html/body/div[1]/div/div[2]/div[6]/div[1]/div/div[1]/div[2]/div[1]/div[6]/div/a[2]')
        btn_next_page.click()
        time.sleep(4)
        
        element = driver.find_element('id', "tournamentTable")
        htmlContent = element.get_attribute('outerHTML')
        page_soup = soup(htmlContent, "html.parser")
        active_page -= 1
        clear_output(wait=True)
        print('{}: Page {}'.format(season[1:], active_page))
    
    driver.quit()
    return games
    

In [9]:
season = 2014
games = []
while season < 2022:
    games.extend(get_betting_odds(f'-{season}')) # Brazil/Same year leagues
#     games.extend(get_betting_odds(f'-{season}-{season+1}')) # European/Two year leagues leagues
    season += 1

games.extend(get_betting_odds('')) # For the current season

odds_df = pd.DataFrame(games, columns=['GAME_DATE', 'TEAM_A_ID', 'TEAM_B_ID', 'TEAM_A', 'TEAM_B', 'TEAM_A_SCORE', 'TEAM_B_SCORE', 'TEAM_A_ODDS', 'DRAW_ODDS', 'TEAM_B_ODDS'])
print(len(games))

: Page 0
2746


In [10]:
odds_df.head()

Unnamed: 0,GAME_DATE,TEAM_A_ID,TEAM_B_ID,TEAM_A,TEAM_B,TEAM_A_SCORE,TEAM_B_SCORE,TEAM_A_ODDS,DRAW_ODDS,TEAM_B_ODDS
0,2014-03-01 03:00:00-03:00,281,292,Kashiwa Reysol,FC Tokyo,1,1,2.26,3.35,3.01
1,2014-03-01 02:00:00-03:00,291,282,Cerezo Osaka,Sanfrecce Hiroshima,0,1,2.29,3.24,3.06
2,2014-03-01 02:00:00-03:00,308,290,Kofu,Kashima Antlers,0,4,3.38,3.33,2.1
3,2014-03-01 02:00:00-03:00,288,283,Nagoya Grampus,Shimizu S-Pulse,2,3,2.22,3.41,3.04
4,2014-03-01 02:00:00-03:00,295,299,Sagan Tosu,Tokushima,5,0,1.58,3.93,5.35


In [11]:
game_errors = []
for index, row in odds_df.iterrows():
    try:
        date_minus_one_day = row['GAME_DATE'] - timedelta(days=1)
        date_plus_one_day = row['GAME_DATE'] + timedelta(days=1)
        game_id = execute_query(f"SELECT id from matches WHERE (DATE_FORMAT(date,'%d/%m/%Y') = DATE_FORMAT('{row['GAME_DATE']}','%d/%m/%Y') OR DATE_FORMAT(date,'%d/%m/%Y') = DATE_FORMAT('{date_minus_one_day}','%d/%m/%Y') OR DATE_FORMAT(date,'%d/%m/%Y') = DATE_FORMAT('{date_plus_one_day}','%d/%m/%Y')) AND home_id = {row['TEAM_A_ID']} AND away_id = {row['TEAM_B_ID']}").iloc[0]['id']
        update_query = f"UPDATE matches SET home_odds = {row['TEAM_A_ODDS']}, away_odds = {row['TEAM_B_ODDS']}, draw_odds = {row['DRAW_ODDS']} WHERE id = {game_id}"
        execute_query(update_query, False)
    except Exception as e:
        new_item = dict()
        new_item['error'] = e
        new_item['match'] = row
        game_errors.append(new_item)
    clear_output(wait=True)
    print(f"{index}/{len(odds_df.index)}")

2745/2746


In [12]:
if len(game_errors) == 0:
    print('There were no errors while reading games from the Odds Portal website.')
else:
    print('The following games had an error while trying to read games from the Odds Portal website:')
    for item in game_errors:
        print(f"\n{item['match']['GAME_DATE']}: {item['match']['TEAM_A']}({item['match']['TEAM_A_ID']}) {item['match']['TEAM_A_SCORE']} x {item['match']['TEAM_B_SCORE']} {item['match']['TEAM_B']}({item['match']['TEAM_B_ID']})")
        print(f"ERROR: {item['error']}")

The following games had an error while trying to read games from the Odds Portal website:

2014-12-06 05:30:00-02:00: Albirex Niigata(311) 0 x 2 Kashiwa Reysol(281)
ERROR: single positional indexer is out-of-bounds

2022-08-07 07:00:00-03:00: Kawasaki Frontale(294) 2 x 1 Yokohama F. Marinos(296)
ERROR: single positional indexer is out-of-bounds

2022-08-07 07:00:00-03:00: Sagan Tosu(295) 2 x 0 Iwata(280)
ERROR: single positional indexer is out-of-bounds

2022-08-07 07:00:00-03:00: Shonan Bellmare(284) 1 x 5 Hokkaido Consadole Sapporo(279)
ERROR: single positional indexer is out-of-bounds

2022-08-07 06:00:00-03:00: FC Tokyo(292) 0 x 2 Shimizu S-Pulse(283)
ERROR: single positional indexer is out-of-bounds

2022-08-13 06:30:00-03:00: Iwata(280) 0 x 6 Urawa Reds(287)
ERROR: single positional indexer is out-of-bounds

2022-08-13 02:00:00-03:00: Hokkaido Consadole Sapporo(279) 0 x 2 Vissel Kobe(289)
ERROR: single positional indexer is out-of-bounds

2022-08-14 07:00:00-03:00: Gamba Osaka(29

In [13]:
fixtures_df = execute_query(f"SELECT m.id, m.date, m.season, l.id AS league_id, l.name AS league, ht.name as home_team, at.name as away_team, m.home_score, m.away_score, m.home_odds, m.away_odds, m.draw_odds FROM matches AS m INNER JOIN teams AS ht ON (m.home_id = ht.id) INNER JOIN teams AS at ON (m.away_id = at.id) INNER JOIN leagues AS l ON (m.league_id = l.id) WHERE m.season = 2022 and league_id = 72 ORDER BY m.date ASC")



In [14]:
fixtures_df.tail()

Unnamed: 0,id,date,season,league_id,league,home_team,away_team,home_score,away_score,home_odds,away_odds,draw_odds
245,838866,2022-08-19 21:30:00,2022,72,Serie B (Brazil),Nautico,Vila Nova FC,1,2,,,
246,838864,2022-08-20 11:00:00,2022,72,Serie B (Brazil),Ponte Preta,Guarani,1,0,,,
247,838873,2022-08-20 16:30:00,2022,72,Serie B (Brazil),Chapecoense-SC,Brusque,1,0,,,
248,838871,2022-08-20 19:00:00,2022,72,Serie B (Brazil),Sampaio Correa,CRB,1,2,,,
249,838870,2022-08-21 16:00:00,2022,72,Serie B (Brazil),Gremio,Cruzeiro,2,2,,,
