In [31]:
import sys  
sys.path.insert(0, './machine_learning_models')

from logistic_regression import logistic_regression
from stochastic_gradient_descent import stochastic_gradient_descent
from knn import knn
from kernel_svm import kernel_svm
from naive_bayes import naive_bayes
from random_forest import random_forest
from voting_classifier import voting_classifier

import numpy as np
import requests
import json
import time
import pandas as pd
import mysql.connector
from IPython.display import clear_output
from datetime import datetime, timedelta
from config import api_football_key, conn_host, conn_database, conn_user, conn_password
import os
from termcolor import colored
import pickle
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup as soup
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from dateutil import tz
from joblib import load
import warnings
warnings.filterwarnings('ignore')

In [2]:
def connect_to_db():
    return mysql.connector.connect(host=conn_host, 
                                     database=conn_database,
                                     user=conn_user,
                                     password=conn_password)

def execute_query(query, read_only = True):
    resp = None
    try:
        db = connect_to_db()
        if read_only:
            resp = pd.read_sql_query(query, db)
        else:
            mycursor = db.cursor()
            mycursor.execute(query)

            db.commit()
    except Exception as e:
        print(e)
    db.close()
    return resp

In [3]:
def get_winner(home_score, away_score):
    if home_score > away_score:
        return 'H'
    elif away_score > home_score:
        return 'A'
    else:
        return 'D'

In [4]:
leagues = [
    {'league_id': 71, 'country': 'brazil', 'league': 'serie-a'},
    {'league_id': 72, 'country': 'brazil', 'league': 'serie-b'},
    {'league_id': 253, 'country': 'usa', 'league': 'mls'},
    {'league_id': 128, 'country': 'argentina', 'league': 'liga-profesional'},
    {'league_id': 98, 'country': 'japan', 'league': 'j1-league'},
    {'league_id': 40, 'country': 'england', 'league': 'championship'},
    {'league_id': 39, 'country': 'england', 'league': 'premier-league'},
    {'league_id': 78, 'country': 'germany', 'league': 'bundesliga'},
    {'league_id': 140, 'country': 'spain', 'league': 'laliga'},
    {'league_id': 61, 'country': 'france', 'league': 'ligue-1'},
]

In [33]:
current_season = 2022

league_selected_index = -1
league_id, country, league = leagues[league_selected_index]['league_id'], leagues[league_selected_index]['country'], leagues[league_selected_index]['league']

n_last_games = 5
now = time.time()
now_datetime = datetime.today()

min_threshold = 2
max_threshold = 10

draw_threshold = 0.025
min_score_diff_threshold = 0.4

from_zone = tz.gettz('UTC')
to_zone = tz.gettz('America/Sao_Paulo')

In [6]:
teams = execute_query("SELECT * FROM teams")



In [7]:
def get_league_season_fixtures(season):
    headers = {
        'X-RapidAPI-Key': api_football_key,
        'X-RapidAPI-Host': 'api-football-v1.p.rapidapi.com'
    }
    response = requests.get(f"https://api-football-v1.p.rapidapi.com/v3/fixtures?league={league_id}&season={season}", headers=headers)
    response_parsed = json.loads(response.text)
    return response_parsed['response']

In [8]:
def add_match_info_to_db(fixture):
    fixture_id, league_id, league_name, fixture_date, season, home_id, home_name, away_id, away_name, home_score, away_score = fixture['fixture']['id'], fixture['league']['id'], f"{fixture['league']['name']} ({fixture['league']['country']})", datetime.utcfromtimestamp(fixture['fixture']['timestamp']).replace(tzinfo=from_zone).astimezone(to_zone), fixture['league']['season'], fixture['teams']['home']['id'], fixture['teams']['home']['name'], fixture['teams']['away']['id'], fixture['teams']['away']['name'], fixture['goals']['home'], fixture['goals']['away']
    fixture_date_converted = fixture_date.strftime('%Y-%m-%d %H:%M:%S')
    execute_query(f"INSERT IGNORE INTO leagues (id, name) VALUES ({league_id}, '{league_name}')", False)
    execute_query(f"INSERT IGNORE INTO teams (id, name) VALUES ({home_id}, '{home_name}')", False)
    execute_query(f"INSERT IGNORE INTO teams (id, name) VALUES ({away_id}, '{away_name}')", False)
    execute_query(f"INSERT IGNORE INTO matches (id, date, league_id, season, home_id, away_id, home_score, away_score) VALUES ({fixture_id}, '{fixture_date_converted}', {league_id}, {season}, {home_id}, {away_id}, '{home_score}', '{away_score}')", False)

In [9]:
for season in range(current_season, current_season+1):
    fixtures_resp = get_league_season_fixtures(season)
    fixtures = [fixture for fixture in fixtures_resp if fixture['fixture']['timestamp'] < now]
    if season == current_season:
        next_fixtures = [fixture for fixture in fixtures_resp if fixture['fixture']['timestamp'] >= now and fixture['fixture']['timestamp'] <= (now + 24*60*60)]
        next_fixtures = sorted(next_fixtures, key = lambda x: x['fixture']['timestamp'])
        
    for index, fixture in enumerate(fixtures):
        clear_output(wait=True)
        print(f"Loading fixtures for the {season} season: {index}/{len(fixtures)}")
        add_match_info_to_db(fixture)

Loading fixtures for the 2022 season: 260/261


In [15]:
query = f"SELECT m.id, m.date, m.season, l.name AS league, ht.id as home_id, at.id as away_id, ht.name as home_team, at.name as away_team, m.home_score, m.away_score, m.home_odds, m.away_odds, m.draw_odds, " + \
                            "m.home_shots_on_goal, m.home_shots_off_goal, m.home_total_shots, m.home_blocked_shots, m.home_shots_inside_box, m.home_shots_outside_box, m.home_fouls, m.home_corners, m.home_offsides, m.home_possession, m.home_yellow_cards, m.home_red_cards, m.home_saves, m.home_total_passes, m.home_passes_accurate, m.home_passes_pct, " + \
                            "m.away_shots_on_goal, m.away_shots_off_goal, m.away_total_shots, m.away_blocked_shots, m.away_shots_inside_box, m.away_shots_outside_box, m.away_fouls, m.away_corners, m.away_offsides, m.away_possession, m.away_yellow_cards, m.away_red_cards, m.away_saves, m.away_total_passes, m.away_passes_accurate, m.away_passes_pct " + \
                            f"FROM matches AS m INNER JOIN teams AS ht ON (m.home_id = ht.id) INNER JOIN teams AS at ON (m.away_id = at.id) INNER JOIN leagues AS l ON (m.league_id = l.id) WHERE (l.id = {league_id} AND m.season = {current_season}) ORDER BY m.date ASC"
fixtures_df = execute_query(query)
fixtures_df['winner'] = fixtures_df.apply(lambda x: get_winner(x['home_score'], x['away_score']), axis=1)



In [16]:
fixtures_df.head()

Unnamed: 0,id,date,season,league,home_id,away_id,home_team,away_team,home_score,away_score,...,away_corners,away_offsides,away_possession,away_yellow_cards,away_red_cards,away_saves,away_total_passes,away_passes_accurate,away_passes_pct,winner
0,874671,2022-07-29 16:00:00,2022,Championship (England),37,44,Huddersfield,Burnley,0,1,...,5.0,3.0,69.0,2.0,0.0,0.0,599.0,506.0,84.0,A
1,874672,2022-07-30 11:00:00,2022,Championship (England),67,72,Blackburn,QPR,1,0,...,0.0,3.0,50.0,1.0,0.0,3.0,529.0,430.0,81.0,H
2,874673,2022-07-30 11:00:00,2022,Championship (England),1356,53,Blackpool,Reading,1,0,...,10.0,1.0,52.0,0.0,0.0,4.0,422.0,321.0,76.0,H
3,874674,2022-07-30 11:00:00,2022,Championship (England),43,71,Cardiff,Norwich,1,0,...,7.0,2.0,55.0,2.0,1.0,1.0,432.0,336.0,78.0,H
4,874675,2022-07-30 11:00:00,2022,Championship (England),64,56,Hull,Bristol City,2,1,...,3.0,4.0,45.0,0.0,0.0,2.0,382.0,306.0,80.0,H


In [22]:
def get_team_previous_games(team_id, game_date, season):
    home_previous_games = fixtures_df.loc[(fixtures_df['home_id'] == team_id) & (fixtures_df['date'] < game_date)]
    away_previous_games = fixtures_df.loc[(fixtures_df['away_id'] == team_id) & (fixtures_df['date'] < game_date)]
    
    if len(home_previous_games.index) == 0 or len(away_previous_games.index) == 0:
        return None
    
    home_previous_games.rename(columns = {'home_id': 'team_id', 'home_team': 'team_name',
       'home_score': 'team_score', 'home_shots_on_goal': 'team_shots_on_goal', 'home_shots_off_goal': 'team_shots_off_goal', 'home_total_shots': 'team_total_shots', 'home_blocked_shots': 'team_blocked_shots',
       'home_shots_inside_box': 'team_shots_inside_box', 'home_shots_outside_box': 'team_shots_outside_box', 'home_fouls': 'team_fouls', 'home_corners': 'team_corners', 'home_offsides': 'team_offsides',
       'home_possession': 'team_possession', 'home_yellow_cards': 'team_yellow_cards', 'home_red_cards': 'team_red_cards', 'home_saves': 'team_saves', 'home_total_passes': 'team_total_passes',
       'home_passes_accurate': 'team_passes_accurate', 'home_passes_pct': 'team_passes_pct',
                                          
       'away_id': 'opp_id', 'away_team': 'opp_name', 
       'away_score': 'opp_score', 'away_shots_on_goal': 'opp_shots_on_goal', 'away_shots_off_goal': 'opp_shots_off_goal', 'away_total_shots': 'opp_total_shots', 'away_blocked_shots': 'opp_blocked_shots',
       'away_shots_inside_box': 'opp_shots_inside_box', 'away_shots_outside_box': 'opp_shots_outside_box', 'away_fouls': 'opp_fouls', 'away_corners': 'opp_corners', 'away_offsides': 'opp_offsides',
       'away_possession': 'opp_possession', 'away_yellow_cards': 'opp_yellow_cards', 'away_red_cards': 'opp_red_cards', 'away_saves': 'opp_saves', 'away_total_passes': 'opp_total_passes',
       'away_passes_accurate': 'opp_passes_accurate', 'away_passes_pct': 'opp_passes_pct',
                                          
       'home_odds': 'team_odds', 'away_odds': 'opp_odds'}, inplace=True)
    home_previous_games['scenario'] = 'H'
    
    away_previous_games.rename(columns = {'away_id': 'team_id', 'away_team': 'team_name',
       'away_score': 'team_score', 'away_shots_on_goal': 'team_shots_on_goal', 'away_shots_off_goal': 'team_shots_off_goal', 'away_total_shots': 'team_total_shots', 'away_blocked_shots': 'team_blocked_shots',
       'away_shots_inside_box': 'team_shots_inside_box', 'away_shots_outside_box': 'team_shots_outside_box', 'away_fouls': 'team_fouls', 'away_corners': 'team_corners', 'away_offsides': 'team_offsides',
       'away_possession': 'team_possession', 'away_yellow_cards': 'team_yellow_cards', 'away_red_cards': 'team_red_cards', 'away_saves': 'team_saves', 'away_total_passes': 'team_total_passes',
       'away_passes_accurate': 'team_passes_accurate', 'away_passes_pct': 'team_passes_pct',
                                          
       'home_id': 'opp_id', 'home_team': 'opp_name', 
       'home_score': 'opp_score', 'home_shots_on_goal': 'opp_shots_on_goal', 'home_shots_off_goal': 'opp_shots_off_goal', 'home_total_shots': 'opp_total_shots', 'home_blocked_shots': 'opp_blocked_shots',
       'home_shots_inside_box': 'opp_shots_inside_box', 'home_shots_outside_box': 'opp_shots_outside_box', 'home_fouls': 'opp_fouls', 'home_corners': 'opp_corners', 'home_offsides': 'opp_offsides',
       'home_possession': 'opp_possession', 'home_yellow_cards': 'opp_yellow_cards', 'home_red_cards': 'opp_red_cards', 'home_saves': 'opp_saves', 'home_total_passes': 'opp_total_passes',
       'home_passes_accurate': 'opp_passes_accurate', 'home_passes_pct': 'opp_passes_pct',
                                          
       'home_odds': 'opp_odds', 'away_odds': 'team_odds'}, inplace=True)
    away_previous_games['scenario'] = 'A'
    
    previous_games = pd.concat([home_previous_games, away_previous_games], axis=0, ignore_index=True)
    previous_games.sort_values('date', inplace=True)
    
    previous_season_games = previous_games.loc[previous_games['season'] == season]
    home_previous_season_games = home_previous_games.loc[home_previous_games['season'] == season]
    away_previous_season_games = away_previous_games.loc[away_previous_games['season'] == season]
    
    return previous_season_games, home_previous_season_games, away_previous_season_games

def get_games_results(games, scenario):
    loser = 'A' if scenario == 'H' else 'H'
    return len(games.loc[games['winner'] == scenario].index), len(games.loc[games['winner'] == 'D'].index), len(games.loc[games['winner'] == loser].index)

def get_stats_mean(games, team_id, scenario):
    games = games.iloc[-n_last_games:,:]
    
    team_stats = [games['team_score'].mean(), games['opp_score'].mean(), games['team_shots_on_goal'].mean(), games['team_shots_off_goal'].mean(),
                 games['team_total_shots'].mean(), games['team_blocked_shots'].mean(), games['team_shots_inside_box'].mean(),
                 games['team_shots_outside_box'].mean(), games['team_fouls'].mean(), games['team_corners'].mean(),
                 games['team_offsides'].mean(), games['team_possession'].mean(), games['team_yellow_cards'].mean(),
                 games['team_red_cards'].mean(), games['team_saves'].mean(), games['team_total_passes'].mean(),
                 games['team_passes_accurate'].mean(), games['team_passes_pct'].mean()]
#     opp_stats = [games['opp_shots_on_goal'].mean(), games['opp_shots_off_goal'].mean(),
#                  games['opp_total_shots'].mean(), games['opp_blocked_shots'].mean(), games['opp_shots_inside_box'].mean(),
#                  games['opp_shots_outside_box'].mean(), games['opp_fouls'].mean(), games['opp_corners'].mean(),
#                  games['opp_offsides'].mean(), games['opp_possession'].mean(), games['opp_yellow_cards'].mean(),
#                  games['opp_red_cards'].mean(), games['opp_saves'].mean(), games['opp_total_passes'].mean(),
#                  games['opp_passes_accurate'].mean(), games['opp_passes_pct'].mean()]
    
    return team_stats

def get_historical_stats(games, home_games, away_games):
    total_games = len(games.index)
    home_wins, home_draws, home_losses = get_games_results(home_games, 'H')
    away_wins, away_draws, away_losses = get_games_results(away_games, 'A')
    
    total_wins = home_wins + away_wins
    total_draws = home_draws + away_draws
    total_losses = home_losses + away_losses
    
    win_pct = total_wins * 100 / total_games
    draw_pct = total_draws * 100 / total_games
    loss_pct = total_losses * 100 / total_games
    
    points_achieved = total_wins * 3 + total_draws
    points_pct = (points_achieved * 100) / (total_games * 3)
    
    return points_pct, win_pct, draw_pct, loss_pct, home_wins, home_draws, home_losses, away_wins, away_draws, away_losses
    

def get_team_previous_games_stats(team_id, season, game_date, scenario):
    response = get_team_previous_games(team_id, game_date, season)
    if not response: return None
    
    previous_season_games, home_previous_season_games, away_previous_season_games = response
    
    total_games = len(previous_season_games.index)
    if total_games < 10 or (len(home_previous_season_games.index) < 5 and scenario == 'H') or (len(away_previous_season_games.index) < 5 and scenario == 'A'):
        return
    
    points_pct, win_pct, draw_pct, loss_pct, home_wins, home_draws, home_losses, away_wins, away_draws, away_losses = get_historical_stats(previous_season_games, home_previous_season_games, away_previous_season_games)
    
    previous_last_games = previous_season_games.iloc[-n_last_games:,:]
    home_last_games = previous_last_games.loc[previous_last_games['scenario'] == 'H']
    away_last_games = previous_last_games.loc[previous_last_games['scenario'] == 'A']
    
    points_pct_last_games, win_pct_last_games, draw_pct_last_games, loss_pct_last_games, home_wins_last_games, home_draws_last_games, home_losses_last_games, away_wins_last_games, away_draws_last_games, away_losses_last_games = get_historical_stats(previous_last_games, home_last_games, away_last_games)
    
    if scenario == 'H':
        ha_win_pct = home_wins * 100 / len(home_previous_season_games.index)
        ha_draw_pct = home_draws * 100 / len(home_previous_season_games.index)
        ha_loss_pct = home_losses * 100 / len(home_previous_season_games.index)
    else:
        ha_win_pct = away_wins * 100 / len(away_previous_season_games.index)
        ha_draw_pct = away_draws * 100 / len(away_previous_season_games.index)
        ha_loss_pct = away_losses * 100 / len(away_previous_season_games.index)
        
    game_stats = get_stats_mean(previous_season_games, team_id, scenario)
    
    return [points_pct, win_pct, draw_pct, loss_pct, ha_win_pct, ha_draw_pct, ha_loss_pct, win_pct_last_games, draw_pct_last_games, loss_pct_last_games] + game_stats

In [23]:
def get_betting_odds():
    months = dict(Jan=1,Feb=2,Mar=3,Apr=4,May=5,Jun=6,Jul=7,Aug=8,Sep=9,Oct=10,Nov=11,Dec=12)
    year = datetime.now().year
    base_url = f"https://www.oddsportal.com/soccer/{country}/{league}/"
    option = Options()
    option.headless = True
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=option)
    driver.get(base_url)
    time.sleep(5)
    
    element = driver.find_element('id', "tournamentTable")
    htmlContent = element.get_attribute('outerHTML')
    page_soup = soup(htmlContent, "html.parser")
    
    games = []
    
    trs = page_soup.findAll('tr')
    games_registered = 0

    for tr in trs:
        try:
            tr_class = tr.get('class')
            if tr_class == ['center', 'nob-border']:
                date_info_splitted = tr.contents[0].text.split(',')
                if len(date_info_splitted) == 1:
                    break
                date_text = date_info_splitted[1].strip()
            elif not tr_class or tr_class == ['odd']:
                unix_date = datetime(year, 
                                    months[date_text.split(' ')[1]], 
                                    int(date_text.split(' ')[0]), 
                                    hour=int(tr.contents[0].text.strip().split(':')[0]), 
                                    minute=int(tr.contents[0].text.strip().split(':')[1]), 
                                    second=0).replace(tzinfo=from_zone).astimezone(to_zone)
                home_team_string = tr.contents[1].text.split(' - ')[0].strip()
                away_team_string = tr.contents[1].text.split(' - ')[1].strip()
                home_team = teams.loc[teams['name'].str.lower() == home_team_string.lower()].iloc[0]
                away_team = teams.loc[teams['name'].str.lower() == away_team_string.lower()].iloc[0]
                game_parsed = {'date': unix_date,
                                  'home_id': home_team['id'], # Home Team Id
                                  'away_id': away_team['id'], # Away Team Id
                                  'home_name': home_team['name'], # Team A Name
                                  'away_name': away_team['name'], # Team B Name
                                  'home_odds': float(tr.contents[2].text),  # Team A Odds
                                  'draw_odds': float(tr.contents[3].text),  # Draw Odds
                                  'away_odd': float(tr.contents[4].text)}  # Team B Odds
                games.append(game_parsed)
        except Exception as e:
            print(e)
            continue
    
    driver.quit()
    return sorted(games, key=lambda d: d['date'])

In [24]:
games_odds = get_betting_odds()




  driver = webdriver.Chrome(ChromeDriverManager().install(), options=option)


In [25]:
def get_match_with_odds(home_id, away_id):
    match = next(filter(lambda x: x['home_id'] == home_id and x['away_id'] == away_id, games_odds))
    return match['home_odds'], match['draw_odds'], match['away_odd'], match['date']

In [26]:
model = load(f"leagues/{league_id}/model.joblib")

In [47]:
parlay = []
parlay_odds = 1
for fixture in next_fixtures:
    fixture_id, league_id, league_name, fixture_date, season, home_id, home_name, away_id, away_name, home_score, away_score = fixture['fixture']['id'], fixture['league']['id'], f"{fixture['league']['name']} ({fixture['league']['country']})", fixture['fixture']['timestamp'], fixture['league']['season'], fixture['teams']['home']['id'], fixture['teams']['home']['name'], fixture['teams']['away']['id'], fixture['teams']['away']['name'], fixture['goals']['home'], fixture['goals']['away']
    fixture_date_converted = datetime.fromtimestamp(fixture_date)
    
    try:
        home_odds, draw_odds, away_odds, parsed_date = get_match_with_odds(home_id, away_id)
    except:
        continue
    
    home_stats = get_team_previous_games_stats(home_id, season, fixture_date_converted, 'H')
    if home_stats == None:
        continue
        
    away_stats = get_team_previous_games_stats(away_id, season, fixture_date_converted, 'A')
    if away_stats == None:
        continue
    
    game_stats = [home_stats + away_stats]
    
    predictions = model.predict(game_stats)
    
    home_score_pred, away_score_pred = predictions[0]
    
    prediction = 'D'
    selected_odds = draw_odds
    prediction_text = 'Draw'
    score_pred_diff = abs(home_score_pred - away_score_pred)
    if home_score_pred > away_score_pred and score_pred_diff > draw_threshold:
        prediction = 'H'
        selected_odds = home_odds
        prediction_text = home_name
    elif home_score_pred < away_score_pred and score_pred_diff > draw_threshold:
        prediction = 'A'
        selected_odds = away_odds
        prediction_text = away_name
        
    print('-'*10)
    print(f"{home_name} x {away_name}")
    if score_pred_diff > draw_threshold and score_pred_diff < min_score_diff_threshold:
        print(colored(f"BAD BET: {prediction_text} ({round(home_score_pred, 2)} x {round(away_score_pred, 2)})\n", 'red'))
    elif selected_odds < min_threshold or selected_odds > max_threshold:
        ml_good_bet_conditions_text = f"Good bet if {min_threshold} < odds < {max_threshold}"
        print(colored(f"BAD BET: {prediction_text} ({ml_good_bet_conditions_text})\n", 'red'))
    else:
        print(colored(f'GOOD BET: {prediction_text}\n', 'green'))
        

----------
Sunderland x West Brom
[31mBAD BET: Sunderland (1.38 x 1.11)
[0m
