In [1]:
import sys  
sys.path.insert(0, './machine_learning_models')

from logistic_regression import logistic_regression
from stochastic_gradient_descent import stochastic_gradient_descent
from knn import knn
from kernel_svm import kernel_svm
from naive_bayes import naive_bayes
from random_forest import random_forest
from voting_classifier import voting_classifier

import numpy as np
import requests
import json
import time
import pandas as pd
import mysql.connector
from IPython.display import clear_output
from datetime import datetime, timedelta
from config import api_football_key, conn_host, conn_database, conn_user, conn_password
import os
from termcolor import colored
import pickle
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup as soup
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from dateutil import tz

In [2]:
def connect_to_db():
    """Open and return a new MySQL connection.

    The host, database, user and password come from the ``conn_*`` values
    imported from ``config``. The caller is responsible for closing the
    returned connection.
    """
    credentials = {
        'host': conn_host,
        'database': conn_database,
        'user': conn_user,
        'password': conn_password,
    }
    return mysql.connector.connect(**credentials)

def execute_query(query, read_only = True):
    """Run a SQL statement against the project database.

    Parameters
    ----------
    query : str
        The SQL text to execute.
    read_only : bool, default True
        When True the query is run through ``pd.read_sql_query`` and its
        DataFrame is returned. When False the statement is executed on a
        cursor and committed.

    Returns
    -------
    pandas.DataFrame or None
        The query result for read-only calls; ``None`` for write calls or
        when any error occurred (errors are printed, not raised).
    """
    resp = None
    db = None  # stays None if the connection attempt itself fails
    try:
        db = connect_to_db()
        if read_only:
            resp = pd.read_sql_query(query, db)
        else:
            mycursor = db.cursor()
            try:
                mycursor.execute(query)
                db.commit()
            finally:
                mycursor.close()
    except Exception as e:
        # Best-effort by design: report the problem and fall through to return None.
        print(e)
    finally:
        # Only close what was actually opened; previously a failed connect
        # crashed here because `db` was never bound.
        if db is not None:
            db.close()
    return resp

In [3]:
def get_winner(home_score, away_score):
    """Classify a final score as 'H' (home win), 'A' (away win) or 'D' (draw).

    Any pair for which neither side compares greater than the other
    (including equal scores) is reported as a draw.
    """
    if home_score > away_score:
        return 'H'
    return 'A' if away_score > home_score else 'D'

In [4]:
# Season range label for training data.
# NOTE(review): presumably the fallback for league entries without their own
# 'training_season' key -- it is not read by any of the visible cells; confirm.
default_training_season = '2014-2021'
# Supported leagues. For each entry:
#   'league_id'        -> league id sent to the API-Football fixtures endpoint and used to filter the DB query
#   'country'/'league' -> path segments of the oddsportal.com URL scraped for odds
#   'models'           -> keys of the estimators combined by train_voting_model
#   'training_season'  -> optional per-league override (only set for Argentina)
leagues = [
    {'league_id': 71, 'country': 'brazil', 'league': 'serie-a', 'models': ['svm']},
    {'league_id': 72, 'country': 'brazil', 'league': 'serie-b', 'models': ['svm', 'lr']},
    {'league_id': 253, 'country': 'usa', 'league': 'mls', 'models': ['svm', 'knn']},
    {'league_id': 128, 'country': 'argentina', 'league': 'liga-profesional', 'models': ['svm'], 'training_season': '2015-2021'},
    {'league_id': 98, 'country': 'japan', 'league': 'j1-league', 'models': ['rf', 'nb', 'knn']},
    {'league_id': 40, 'country': 'england', 'league': 'championship', 'models': ['svm', 'nb']},
]

In [41]:
# Season whose fixtures are downloaded and predicted; the training query also
# pulls the season before it.
current_season = 2022

# Index into `leagues` of the league this run works on.
league_selected_index = 0
league_id, country, league = leagues[league_selected_index]['league_id'], leagues[league_selected_index]['country'], leagues[league_selected_index]['league']

# Size of the "recent form" window used by the feature helpers.
n_last_games = 5
# Epoch seconds at notebook start; splits fixtures into already played vs upcoming.
now = time.time()
# NOTE(review): not referenced by any of the visible cells.
now_datetime = datetime.today()
# Exclusive lower/upper bounds on the bookmaker odds accepted by check_bet_worth.
min_threshold = 1.75
max_threshold = 10

# Timestamps are treated as UTC and converted to Sao Paulo local time.
from_zone = tz.gettz('UTC')
to_zone = tz.gettz('America/Sao_Paulo')

In [42]:
# Snapshot of the `teams` table; get_betting_odds uses it to map scraped team names to ids.
# NOTE(review): taken before the fixture-loading cell inserts teams, so teams first
# seen in this run are missing from the snapshot until this cell is re-run.
teams = execute_query("SELECT * FROM teams")



In [43]:
def get_league_season_fixtures(season):
    """Download all fixtures of the selected league for one season from API-Football.

    The league is taken from the module-level ``league_id`` and the key from
    ``api_football_key``.

    Parameters
    ----------
    season : int
        Season year passed as the ``season`` query parameter.

    Returns
    -------
    The value stored under the ``'response'`` key of the JSON payload.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status.
    requests.Timeout
        If the API does not answer within the timeout.
    """
    headers = {
        'X-RapidAPI-Key': api_football_key,
        'X-RapidAPI-Host': 'api-football-v1.p.rapidapi.com'
    }
    response = requests.get(
        "https://api-football-v1.p.rapidapi.com/v3/fixtures",
        params={'league': league_id, 'season': season},
        headers=headers,
        timeout=30,  # without a timeout a stalled connection blocks the notebook forever
    )
    # Fail loudly on HTTP errors instead of hitting a confusing KeyError below.
    response.raise_for_status()
    return response.json()['response']

In [44]:
def add_match_info_to_db(fixture):
    """Store one API-Football fixture: its league, both teams and the match row.

    All four statements are ``INSERT IGNORE``. Values are sent as bound
    parameters so names containing quotes (or any other special character)
    cannot break or alter the SQL. Errors are printed rather than raised,
    and a failing statement does not prevent the remaining ones from running.

    Parameters
    ----------
    fixture : dict
        One fixture object; the keys read are ``fixture.id``,
        ``fixture.timestamp``, ``league.id/name/country/season``,
        ``teams.home/away.id/name`` and ``goals.home/away``.
    """
    league_info = fixture['league']
    home_team, away_team = fixture['teams']['home'], fixture['teams']['away']

    league_name = f"{league_info['name']} ({league_info['country']})"
    # An epoch timestamp is timezone-independent, so it can be converted
    # straight to the target zone (replaces the deprecated utcfromtimestamp).
    fixture_date = datetime.fromtimestamp(fixture['fixture']['timestamp'], tz=to_zone)
    fixture_date_converted = fixture_date.strftime('%Y-%m-%d %H:%M:%S')

    # Missing scores are bound as None (SQL NULL) instead of the text 'None'.
    statements = [
        ("INSERT IGNORE INTO leagues (id, name) VALUES (%s, %s)",
         (league_info['id'], league_name)),
        ("INSERT IGNORE INTO teams (id, name) VALUES (%s, %s)",
         (home_team['id'], home_team['name'])),
        ("INSERT IGNORE INTO teams (id, name) VALUES (%s, %s)",
         (away_team['id'], away_team['name'])),
        ("INSERT IGNORE INTO matches (id, date, league_id, season, home_id, away_id, home_score, away_score) "
         "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)",
         (fixture['fixture']['id'], fixture_date_converted, league_info['id'], league_info['season'],
          home_team['id'], away_team['id'], fixture['goals']['home'], fixture['goals']['away'])),
    ]

    db = None
    try:
        # One connection for the whole fixture instead of one per statement.
        db = connect_to_db()
        cursor = db.cursor()
        for sql, params in statements:
            try:
                cursor.execute(sql, params)
            except Exception as e:
                # Keep the best-effort behaviour: report and try the next statement.
                print(e)
        db.commit()
    except Exception as e:
        print(e)
    finally:
        if db is not None:
            db.close()

In [45]:
# Sync fixtures into the database. The range covers only `current_season`.
for season in range(current_season, current_season+1):
    fixtures_resp = get_league_season_fixtures(season)
    # Fixtures whose kickoff is before `now` are the ones stored in the DB.
    fixtures = [fixture for fixture in fixtures_resp if fixture['fixture']['timestamp'] < now]
    if season == current_season:
        # Fixtures kicking off within the next 24 hours, earliest first; these get predicted later.
        next_fixtures = [fixture for fixture in fixtures_resp if fixture['fixture']['timestamp'] >= now and fixture['fixture']['timestamp'] <= (now + 24*60*60)]
        next_fixtures = sorted(next_fixtures, key = lambda x: x['fixture']['timestamp'])
        
    for index, fixture in enumerate(fixtures):
        clear_output(wait=True)
        # `index` is zero-based, so the final progress line reads N-1/N.
        print(f"Loading fixtures for the {season} season: {index}/{len(fixtures)}")
        add_match_info_to_db(fixture)

Loading fixtures for the 2022 season: 279/280


In [46]:
# Load the selected league's matches for the previous and current season, oldest first,
# joined with league and team names.
fixtures_df = execute_query(f"SELECT m.id, m.date, m.season, l.name AS league, ht.id as home_id, at.id as away_id, ht.name as home_team, at.name as away_team, m.home_score, m.away_score, m.home_odds, m.away_odds, m.draw_odds FROM matches AS m INNER JOIN teams AS ht ON (m.home_id = ht.id) INNER JOIN teams AS at ON (m.away_id = at.id) INNER JOIN leagues AS l ON (m.league_id = l.id) WHERE (m.season >= {current_season - 1} AND m.season <= {current_season} AND l.id = {league_id}) ORDER BY m.date ASC")
# Derive the match outcome label ('H', 'A' or 'D') from the two scores.
fixtures_df['winner'] = fixtures_df.apply(lambda x: get_winner(x['home_score'], x['away_score']), axis=1)



In [47]:
# Preview the most recent rows to sanity-check the load and the derived `winner` column.
fixtures_df.tail()

Unnamed: 0,id,date,season,league,home_id,away_id,home_team,away_team,home_score,away_score,home_odds,away_odds,draw_odds,winner
655,838270,2022-09-28 19:00:00,2022,Serie A (Brazil),147,129,Coritiba,Ceara,1,0,,,,H
656,838269,2022-09-28 21:00:00,2022,Serie A (Brazil),1193,125,Cuiaba,America MG,2,1,,,,H
657,838261,2022-09-28 21:45:00,2022,Serie A (Brazil),1062,121,Atletico-MG,Palmeiras,0,1,,,,A
658,838266,2022-09-28 21:45:00,2022,Serie A (Brazil),119,794,Internacional,Bragantino,0,0,,,,D
659,838268,2022-09-28 21:45:00,2022,Serie A (Brazil),151,120,Goias,Botafogo RJ,0,1,,,,A


In [48]:
def get_pred_odds(probs):
    """Convert three outcome probabilities into decimal odds (1 / probability).

    ``probs`` is indexed as [away, draw, home]; the result is returned in the
    same away, draw, home order.
    """
    away_prob, draw_prob, home_prob = probs[0], probs[1], probs[2]
    return 1 / away_prob, 1 / draw_prob, 1 / home_prob

def check_bet_worth(prediction, odds_home, odds_away, odds_draw, pred_odds_home, pred_odds_away, pred_odds_draw):
    """Tell whether the predicted outcome is a value bet.

    For the predicted outcome ('H', 'A' or 'D') the result is truthy when the
    model's odds are lower than the bookmaker's odds and the bookmaker's odds
    lie strictly between the module-level ``min_threshold`` and
    ``max_threshold``. Any other prediction value yields False.
    """
    if prediction == 'H':
        model_odds, market_odds = pred_odds_home, odds_home
    elif prediction == 'A':
        model_odds, market_odds = pred_odds_away, odds_away
    elif prediction == 'D':
        model_odds, market_odds = pred_odds_draw, odds_draw
    else:
        return False

    return model_odds < market_odds and min_threshold < market_odds < max_threshold

def get_games_results(games, cenario):
    """Count wins, draws and losses in ``games`` from one side's point of view.

    ``cenario`` is the ``winner`` label that counts as a win ('H' or 'A');
    the opposite label counts as a loss and 'D' as a draw.

    Returns a ``(wins, draws, losses)`` tuple of ints.
    """
    losing_label = 'H' if cenario != 'H' else 'A'
    outcomes = games['winner']

    def tally(label):
        return int((outcomes == label).sum())

    return tally(cenario), tally('D'), tally(losing_label)

def get_goals_mean(games, team_id, cenario):
    """Average goals scored and conceded by a team over its most recent games.

    Only the last ``n_last_games`` rows of ``games`` are considered.

    Returns a four-element list:
    ``[scored per game, conceded per game, venue scored per game, venue conceded per game]``
    where "venue" means the team's home games when ``cenario == 'H'`` and its
    away games otherwise.

    NOTE(review): a zero game count is not guarded here; with pandas'
    NumPy-backed sums that presumably yields nan (with a warning) rather than
    an exception, which the caller then filters out -- confirm.
    """
    recent = games.iloc[-n_last_games:, :]
    as_home = recent.loc[recent['home_id'] == team_id]
    as_away = recent.loc[recent['away_id'] == team_id]

    n_home = len(as_home.index)
    n_away = len(as_away.index)
    n_total = n_home + n_away

    # Goals for / against, split by venue.
    scored_at_home = as_home['home_score'].sum()
    scored_away = as_away['away_score'].sum()
    conceded_at_home = as_home['away_score'].sum()
    conceded_away = as_away['home_score'].sum()

    overall = [(scored_at_home + scored_away) / n_total,
               (conceded_at_home + conceded_away) / n_total]

    if cenario == 'H':
        venue = [scored_at_home / n_home, conceded_at_home / n_home]
    else:
        venue = [scored_away / n_away, conceded_away / n_away]

    return overall + venue

def get_historical_stats(home_games, away_games):
    """Summarise a team's record over the given home and away games.

    Returns a 10-tuple:
    ``(points_pct, win_pct, draw_pct, loss_pct,
       home_wins, home_draws, home_losses, away_wins, away_draws, away_losses)``.
    Percentages are on a 0-100 scale; ``points_pct`` is points won (3 per win,
    1 per draw) over the maximum obtainable.
    """
    home_wins, home_draws, home_losses = get_games_results(home_games, 'H')
    away_wins, away_draws, away_losses = get_games_results(away_games, 'A')

    played = len(home_games.index) + len(away_games.index)
    wins = home_wins + away_wins
    draws = home_draws + away_draws
    losses = home_losses + away_losses

    def pct(count):
        return count * 100 / played

    points_pct = ((wins * 3 + draws) * 100) / (played * 3)

    return (points_pct, pct(wins), pct(draws), pct(losses),
            home_wins, home_draws, home_losses,
            away_wins, away_draws, away_losses)
    

def get_team_previous_games_stats(fixture_id, team_id, team_name, season, game_date, cenario):
    """Build the feature vector describing a team's form before a given match.

    Uses the rows of the module-level ``fixtures_df`` that involve ``team_id``,
    belong to ``season`` and are dated before ``game_date``.
    ``fixture_id`` and ``team_name`` are accepted but not used.

    Returns ``None`` when the team has fewer than 10 previous games, fewer
    than 5 games at the venue given by ``cenario`` ('H' home / 'A' away), or
    when any computed feature is NaN. Otherwise returns a list of 14 values:
    points/win/draw/loss percentages for the season, win/draw/loss percentages
    at the venue, the four goal averages from ``get_goals_mean`` and the
    win/draw/loss percentages over the last ``n_last_games`` games.
    """
    involves_team = (fixtures_df['home_id'] == team_id) | (fixtures_df['away_id'] == team_id)
    earlier_this_season = (fixtures_df['date'] < game_date) & (fixtures_df['season'] == season)
    previous_games = fixtures_df.loc[involves_team & earlier_this_season]

    home_games = previous_games.loc[previous_games['home_id'] == team_id]
    away_games = previous_games.loc[previous_games['away_id'] == team_id]
    n_home, n_away = len(home_games.index), len(away_games.index)

    # Not enough history to produce meaningful features.
    if n_home + n_away < 10:
        return None
    if cenario == 'H' and n_home < 5:
        return None
    if cenario == 'A' and n_away < 5:
        return None

    season_stats = get_historical_stats(home_games, away_games)
    points_pct, win_pct, draw_pct, loss_pct = season_stats[:4]
    home_wins, home_draws, home_losses, away_wins, away_draws, away_losses = season_stats[4:]

    # Recent form: the same summary restricted to the last n_last_games games.
    recent_games = previous_games.iloc[-n_last_games:, :]
    recent_stats = get_historical_stats(
        recent_games.loc[recent_games['home_id'] == team_id],
        recent_games.loc[recent_games['away_id'] == team_id],
    )
    win_pct_last_games, draw_pct_last_games, loss_pct_last_games = recent_stats[1:4]

    # Record at the venue of the upcoming match.
    if cenario == 'H':
        venue_record, venue_games = (home_wins, home_draws, home_losses), n_home
    else:
        venue_record, venue_games = (away_wins, away_draws, away_losses), n_away
    ha_win_pct, ha_draw_pct, ha_loss_pct = [count * 100 / venue_games for count in venue_record]

    scored_mean, conceded_mean, ha_scored_mean, ha_conceded_mean = get_goals_mean(previous_games, team_id, cenario)

    features = [points_pct, win_pct, draw_pct, loss_pct,
                ha_win_pct, ha_draw_pct, ha_loss_pct,
                scored_mean, conceded_mean, ha_scored_mean, ha_conceded_mean,
                win_pct_last_games, draw_pct_last_games, loss_pct_last_games]

    if np.any(np.isnan(np.asarray(features))):
        return None
    return features
        

In [49]:
def train_lr_model(dataset):
    """Run the project's ``logistic_regression`` helper on ``dataset``.

    Returns element 2 of the helper's result (presumably the fitted
    estimator -- confirm against the helper module).
    """
    return logistic_regression(dataset)[2]

def train_sgd_model(dataset):
    """Run the project's ``stochastic_gradient_descent`` helper on ``dataset``.

    Returns element 2 of the helper's result (presumably the fitted
    estimator -- confirm against the helper module).
    """
    return stochastic_gradient_descent(dataset)[2]

def train_knn_model(dataset):
    """Run the project's ``knn`` helper on ``dataset``.

    Returns element 2 of the helper's result (presumably the fitted
    estimator -- confirm against the helper module).
    """
    return knn(dataset)[2]

def train_svm_model(dataset):
    """Run the project's ``kernel_svm`` helper on ``dataset``.

    Returns element 2 of the helper's result (presumably the fitted
    estimator -- confirm against the helper module).
    """
    return kernel_svm(dataset)[2]

def train_nb_model(dataset):
    """Run the project's ``naive_bayes`` helper on ``dataset``.

    Returns element 2 of the helper's result (presumably the fitted
    estimator -- confirm against the helper module).
    """
    return naive_bayes(dataset)[2]

def train_rf_model(dataset):
    """Run the project's ``random_forest`` helper on ``dataset``.

    Returns element 2 of the helper's result (presumably the fitted
    estimator -- confirm against the helper module).
    """
    return random_forest(dataset)[2]

def train_voting_model(dataset, models):
    """Train the requested base models and combine them with ``voting_classifier``.

    Parameters
    ----------
    dataset
        Training data forwarded unchanged to every trainer.
    models
        Container of model keys; recognised keys are 'lr', 'svm', 'nb',
        'sgd', 'knn' and 'rf'. Unknown keys are ignored.

    Returns element 2 of the ``voting_classifier`` result (presumably the
    fitted ensemble -- confirm against the helper module). Every selected
    estimator gets an equal weight of 1.
    """
    # Order matters: estimators are trained and listed in this fixed sequence.
    trainers = (
        ('lr', train_lr_model),
        ('svm', train_svm_model),
        ('nb', train_nb_model),
        ('sgd', train_sgd_model),
        ('knn', train_knn_model),
        ('rf', train_rf_model),
    )
    estimators = [(key, trainer(dataset)) for key, trainer in trainers if key in models]
    weights = [1 for _ in estimators]

    return voting_classifier(dataset, estimators, weights)[2]

In [50]:
def get_betting_odds():
    """Scrape the upcoming-match odds table of the selected league from oddsportal.com.

    The page is chosen by the module-level ``country`` and ``league`` and is
    rendered in headless Chrome. Team names in the table are matched
    case-insensitively against the ``name`` column of the module-level
    ``teams`` frame; rows that cannot be parsed or matched are reported with
    ``print`` and skipped.

    Returns
    -------
    list of dict
        One dict per parsed match, sorted by ``'date'``, with keys
        ``'date'``, ``'home_id'``, ``'away_id'``, ``'home_name'``,
        ``'away_name'``, ``'home_odds'``, ``'draw_odds'`` and ``'away_odd'``.
        (The singular ``'away_odd'`` key is kept because callers read it.)
    """
    # Local import: only this function needs it.
    from selenium.webdriver.chrome.service import Service

    months = dict(Jan=1,Feb=2,Mar=3,Apr=4,May=5,Jun=6,Jul=7,Aug=8,Sep=9,Oct=10,Nov=11,Dec=12)
    # NOTE(review): the table shows no year, so the current one is assumed;
    # fixtures listed across a New Year boundary would get the wrong year.
    year = datetime.now().year
    base_url = f"https://www.oddsportal.com/soccer/{country}/{league}/"

    option = Options()
    option.add_argument('--headless')
    # Pass the driver path through a Service object; the positional form is deprecated.
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=option)
    try:
        driver.get(base_url)
        time.sleep(5)  # fixed wait, presumably for the page to finish rendering the table
        element = driver.find_element('id', "tournamentTable")
        htmlContent = element.get_attribute('outerHTML')
    finally:
        # Always release the browser, even when loading or locating the table fails.
        driver.quit()

    page_soup = soup(htmlContent, "html.parser")

    games = []
    date_text = None  # set by the most recent date header row

    for tr in page_soup.find_all('tr'):
        try:
            tr_class = tr.get('class')
            if tr_class == ['center', 'nob-border']:
                # Date header row; a header without a comma ends the parse.
                date_info_splitted = tr.contents[0].text.split(',')
                if len(date_info_splitted) == 1:
                    break
                date_text = date_info_splitted[1].strip()
            elif not tr_class or tr_class == ['odd']:
                if date_text is None:
                    # Match row before any date header: nothing to date it with.
                    continue
                date_parts = date_text.split(' ')
                time_parts = tr.contents[0].text.strip().split(':')
                # NOTE(review): the scraped kickoff time is treated as UTC -- confirm.
                unix_date = datetime(year,
                                     months[date_parts[1]],
                                     int(date_parts[0]),
                                     hour=int(time_parts[0]),
                                     minute=int(time_parts[1]),
                                     second=0).replace(tzinfo=from_zone).astimezone(to_zone)
                team_names = tr.contents[1].text.split(' - ')
                home_team_string = team_names[0].strip()
                away_team_string = team_names[1].strip()
                home_team = teams.loc[teams['name'].str.lower() == home_team_string.lower()].iloc[0]
                away_team = teams.loc[teams['name'].str.lower() == away_team_string.lower()].iloc[0]
                game_parsed = {'date': unix_date,
                               'home_id': home_team['id'],
                               'away_id': away_team['id'],
                               'home_name': home_team['name'],
                               'away_name': away_team['name'],
                               'home_odds': float(tr.contents[2].text),
                               'draw_odds': float(tr.contents[3].text),
                               'away_odd': float(tr.contents[4].text)}
                games.append(game_parsed)
        except Exception as e:
            # Best-effort scrape: report the bad row and move on.
            print(e)
            continue

    return sorted(games, key=lambda d: d['date'])

In [51]:
# Scrape current bookmaker odds once; get_match_with_odds looks fixtures up in this list.
games_odds = get_betting_odds()




  driver = webdriver.Chrome(ChromeDriverManager().install(), options=option)


In [52]:
def get_odds_based_on_prediction(prediction, odds_home, odds_away, odds_draw):
    """Return the odds that belong to the predicted outcome, rounded to 2 decimals.

    'H' selects ``odds_home``, 'A' selects ``odds_away`` and any other value
    selects ``odds_draw``.
    """
    selected = odds_home if prediction == 'H' else odds_away if prediction == 'A' else odds_draw
    return round(selected, 2)

In [53]:
def get_match_with_odds(home_id, away_id):
    """Look up the scraped odds for a fixture by its home and away team ids.

    Searches the module-level ``games_odds`` list and returns
    ``(home_odds, draw_odds, away_odds, date)`` for the first match found.

    Raises
    ------
    StopIteration
        If no scraped game has this home/away pairing.
    """
    candidates = (
        game for game in games_odds
        if game['home_id'] == home_id and game['away_id'] == away_id
    )
    found = next(candidates)
    return found['home_odds'], found['draw_odds'], found['away_odd'], found['date']

In [54]:
# Build one training row per stored fixture: metadata, home-team features,
# away-team features and the outcome label.
data_model = []
for index, game in fixtures_df.iterrows():
    clear_output(wait=True)
    
    # Progress indicator; `index` is the zero-based row label of fixtures_df.
    print("{}/{}".format(index, len(fixtures_df.index)))
        
    # Skip fixtures where either team lacks enough history (the helper returns None).
    home_stats = get_team_previous_games_stats(game['id'], game['home_id'], game['home_team'], game['season'], game['date'], 'H')
    if not home_stats:
        continue
        
    away_stats = get_team_previous_games_stats(game['id'], game['away_id'], game['away_team'], game['season'], game['date'], 'A')
    if not away_stats:
        continue
        
    data_model.append([index, game['id'], game['date'], game['season'], game['home_team'], game['away_team'], game['home_odds'], game['away_odds'], game['draw_odds']] + home_stats + away_stats + [game['winner']])

659/660


In [55]:
# Column layout of the training frame: 9 metadata columns, 14 home-team features,
# 14 away-team features (same order as returned by get_team_previous_games_stats),
# then the outcome label.
columns = ['INDEX', 'GAME_ID', 'GAME_DATE', 'SEASON', 'HOME_TEAM', 'AWAY_TEAM', 'HOME_ODDS', 'AWAY_ODDS', 'DRAW_ODDS',
           'HOME_PTS_PCT', 'HOME_WIN_PCT', 'HOME_DRAW_PCT', 'HOME_LOSS_PCT', 'HOME_HOME_WIN_PCT', 'HOME_HOME_DRAW_PCT', 'HOME_HOME_LOSS_PCT', f'HOME_SCORED_LAST_{n_last_games}', f'HOME_CONCEDED_LAST_{n_last_games}', f'HOME_HOME_SCORED_LAST_{n_last_games}', f'HOME_HOME_CONCEDED_LAST_{n_last_games}', f'HOME_WIN_PCT_{n_last_games}', f'HOME_DRAW_PCT_{n_last_games}', f'HOME_LOSS_PCT_{n_last_games}',
           'AWAY_PTS_PCT', 'AWAY_WIN_PCT', 'AWAY_DRAW_PCT', 'AWAY_LOSS_PCT', 'AWAY_AWAY_WIN_PCT', 'AWAY_AWAY_DRAW_PCT', 'AWAY_AWAY_LOSS_PCT', f'AWAY_SCORED_LAST_{n_last_games}', f'AWAY_CONCEDED_LAST_{n_last_games}', f'AWAY_AWAY_SCORED_LAST_{n_last_games}', f'AWAY_AWAY_CONCEDED_LAST_{n_last_games}', f'AWAY_WIN_PCT_{n_last_games}', f'AWAY_DRAW_PCT_{n_last_games}', f'AWAY_LOSS_PCT_{n_last_games}',
           'OUTCOME']
data_df = pd.DataFrame(data_model, columns=columns)

In [56]:
# Preview the latest training rows to check the feature columns line up with the labels.
data_df.tail()

Unnamed: 0,INDEX,GAME_ID,GAME_DATE,SEASON,HOME_TEAM,AWAY_TEAM,HOME_ODDS,AWAY_ODDS,DRAW_ODDS,HOME_PTS_PCT,...,AWAY_AWAY_DRAW_PCT,AWAY_AWAY_LOSS_PCT,AWAY_SCORED_LAST_5,AWAY_CONCEDED_LAST_5,AWAY_AWAY_SCORED_LAST_5,AWAY_AWAY_CONCEDED_LAST_5,AWAY_WIN_PCT_5,AWAY_DRAW_PCT_5,AWAY_LOSS_PCT_5,OUTCOME
435,655,838270,2022-09-28 19:00:00,2022,Coritiba,Ceara,,,,35.802469,...,46.153846,23.076923,0.6,0.6,0.5,0.0,40.0,40.0,20.0,H
436,656,838269,2022-09-28 21:00:00,2022,Cuiaba,America MG,,,,32.098765,...,15.384615,61.538462,0.8,0.4,0.0,0.5,40.0,40.0,20.0,H
437,657,838261,2022-09-28 21:45:00,2022,Atletico-MG,Palmeiras,,,,49.382716,...,53.846154,0.0,1.0,0.8,1.5,1.5,20.0,80.0,0.0,A
438,658,838266,2022-09-28 21:45:00,2022,Internacional,Bragantino,,,,60.493827,...,38.461538,46.153846,0.8,1.0,0.5,1.0,0.0,80.0,20.0,D
439,659,838268,2022-09-28 21:45:00,2022,Goias,Botafogo RJ,,,,43.209877,...,23.076923,38.461538,1.4,0.8,2.5,1.5,40.0,40.0,20.0,A


In [57]:
from sklearn.preprocessing import StandardScaler

# Train the voting ensemble configured for the selected league.
classifier = train_voting_model(data_df, leagues[league_selected_index]['models'])
# Feature matrix: everything after the 9 metadata columns and before OUTCOME.
X = data_df.iloc[:, 9:-1].values
# Fit a scaler on the training features; it is applied to upcoming-fixture
# features before prediction.
# NOTE(review): this only matches the classifier if the training helpers
# standardise features the same way internally -- confirm.
sc = StandardScaler()
sc.fit(X)

StandardScaler()

In [58]:
# Predict every fixture kicking off in the next 24 hours.
# Predictions that pass check_bet_worth are printed as standalone "GOOD BET"s;
# all other predictions are collected into a parlay whose combined odds are
# the product of the bookmaker odds of each predicted outcome.
parlay = []
parlay_odds = 1
for fixture in next_fixtures:
    fixture_id = fixture['fixture']['id']
    league_id = fixture['league']['id']
    league_name = f"{fixture['league']['name']} ({fixture['league']['country']})"
    fixture_date = fixture['fixture']['timestamp']
    season = fixture['league']['season']
    home_id = fixture['teams']['home']['id']
    home_name = fixture['teams']['home']['name']
    away_id = fixture['teams']['away']['id']
    away_name = fixture['teams']['away']['name']
    home_score = fixture['goals']['home']
    away_score = fixture['goals']['away']
    # NOTE(review): this is a naive datetime in the machine's local zone and is
    # compared against the stored match dates -- confirm both use the same zone.
    fixture_date_converted = datetime.fromtimestamp(fixture_date)

    try:
        home_odds, draw_odds, away_odds, parsed_date = get_match_with_odds(home_id, away_id)
    except Exception:
        # No scraped odds for this pairing: nothing to compare against, skip it.
        # (Exception rather than a bare except, so Ctrl-C still interrupts the loop.)
        continue

    home_stats = get_team_previous_games_stats(fixture_id, home_id, home_name, season, fixture_date_converted, 'H')
    if home_stats is None:
        continue

    away_stats = get_team_previous_games_stats(fixture_id, away_id, away_name, season, fixture_date_converted, 'A')
    if away_stats is None:
        continue

    # Same feature order as the training rows: home features then away features.
    game_stats = sc.transform([home_stats + away_stats])

    prediction = classifier.predict(game_stats)[0]
    probability = classifier.predict_proba(game_stats)[0,:]
    # get_pred_odds reads the probabilities as [away, draw, home].
    # NOTE(review): that assumes the classifier's class order is 'A', 'D', 'H' -- confirm.
    pred_odds_away, pred_odds_draw, pred_odds_home = get_pred_odds(probability)
    should_bet = check_bet_worth(prediction, home_odds, away_odds, draw_odds, pred_odds_home, pred_odds_away, pred_odds_draw)

    prediction_text = home_name if prediction == 'H' else away_name if prediction == 'A' else 'Draw'
    odds = home_odds if prediction == 'H' else away_odds if prediction == 'A' else draw_odds
    pred_odds = get_odds_based_on_prediction(prediction, pred_odds_home, pred_odds_away, pred_odds_draw)
    if should_bet:
        print(parsed_date)
        print(f'{home_name}: {home_odds}')
        print(f'Draw: {draw_odds}')
        print(f'{away_name}: {away_odds}')
        print(colored(f'GOOD BET: {prediction_text}\n', 'green'))
    else:
        parlay_odds *= odds
        parlay.append(f"{parsed_date}: {home_name} ({home_odds}) x ({away_odds}) {away_name}\n" +
                     colored(f"BET: {prediction_text}\n", 'green'))

# Only report the parlay when it has at least one leg and pays 1.5 or more.
if len(parlay) > 0 and parlay_odds >= 1.5:
    print('PARLAY: ')
    for p in parlay:
        print(f"{p}")
    print(colored(f"PARLAY VALUE: {round(parlay_odds, 2)}", 'green'))
        

2022-10-01 15:00:00-03:00
Ceara: 2.03
Draw: 3.18
America Mineiro: 3.99
[32mGOOD BET: Ceara
[0m
2022-10-01 19:00:00-03:00
Goias: 3.16
Draw: 3.1
Fortaleza EC: 2.38
[32mGOOD BET: Goias
[0m
2022-10-01 19:00:00-03:00
Avai: 2.39
Draw: 3.09
Atletico Goianiense: 3.16
[32mGOOD BET: Avai
[0m
PARLAY: 
2022-10-01 15:00:00-03:00: Atletico-MG (1.75) x (5.1) Fluminense
[32mBET: Atletico-MG
[0m
2022-10-01 15:00:00-03:00: Internacional (1.71) x (5.37) Santos
[32mBET: Internacional
[0m
2022-10-01 19:00:00-03:00: Flamengo (1.45) x (7.05) RB Bragantino
[32mBET: Flamengo
[0m
2022-10-01 19:00:00-03:00: Atletico Paranaense (1.55) x (6.2) Juventude
[32mBET: Atletico Paranaense
[0m
2022-10-01 21:00:00-03:00: Corinthians (1.66) x (5.77) Cuiaba
[32mBET: Corinthians
[0m
[32mPARLAY VALUE: 11.16[0m
