In [1]:
import requests
import json
import time
import pandas as pd
import mysql.connector
from IPython.display import clear_output
from datetime import datetime
from config import api_football_key, conn_host, conn_database, conn_user, conn_password
import os

In [2]:
def connect_to_db():
    """Open and return a new MySQL connection.

    The host, database, user and password are the values imported from the
    local ``config`` module. Each call creates a fresh connection; the caller
    is responsible for closing it.
    """
    connection_settings = {
        'host': conn_host,
        'database': conn_database,
        'user': conn_user,
        'password': conn_password,
    }
    return mysql.connector.connect(**connection_settings)

def execute_query(query, read_only = True):
    """Run a SQL statement against the database on a short-lived connection.

    Parameters
    ----------
    query : str
        SQL text to execute.
    read_only : bool, default True
        When True the statement is run through ``pd.read_sql_query`` and the
        result is returned as a DataFrame. When False the statement is
        executed on a cursor and committed.

    Returns
    -------
    pandas.DataFrame or None
        The query result for read-only statements; ``None`` for write
        statements or when any error occurred (the error is printed, not
        raised, so callers must check for ``None``).
    """
    resp = None
    db = None  # stays None if connecting fails, so cleanup below is safe
    try:
        db = connect_to_db()
        if read_only:
            resp = pd.read_sql_query(query, db)
        else:
            mycursor = db.cursor()
            try:
                mycursor.execute(query)
                db.commit()
            finally:
                # Release the cursor even when execute/commit fails.
                mycursor.close()
    except Exception as e:
        # Best-effort behaviour kept from the original: report and return None.
        print(e)
    finally:
        # Only close a connection that was actually opened; previously a
        # failed connect led to a NameError here that hid the real error.
        if db is not None:
            db.close()
    return resp

In [3]:
def get_winner(home_score, away_score):
    """Classify a final score as a home win, away win or draw.

    Returns 'H' when the home side scored more, 'A' when the away side
    scored more, and 'D' otherwise.
    """
    if home_score > away_score:
        return 'H'
    return 'A' if away_score > home_score else 'D'

In [4]:
# Query parameters: the league to load and the inclusive range of seasons.
league_id = 39  # Choose the league id
start_season, end_season = 2012, 2022

In [5]:
# Load every match of the selected league/seasons that has home odds recorded,
# oldest first. Adjacent string literals are concatenated into a single SQL
# statement.
fixtures_df = execute_query(
    "SELECT m.id, m.date, m.season, l.name AS league, "
    "ht.id as home_id, at.id as away_id, "
    "ht.name as home_team, at.name as away_team, "
    "m.home_score, m.away_score, "
    "m.home_odds, m.away_odds, m.draw_odds "
    "FROM matches AS m "
    "INNER JOIN teams AS ht ON (m.home_id = ht.id) "
    "INNER JOIN teams AS at ON (m.away_id = at.id) "
    "INNER JOIN leagues AS l ON (m.league_id = l.id) "
    f"WHERE (l.id = {league_id} AND m.home_odds IS NOT NULL "
    f"AND m.season >= {start_season} AND m.season <= {end_season}) "
    "ORDER BY m.date ASC"
)
# Derive the match outcome label ('H', 'A' or 'D') from the final score.
fixtures_df['winner'] = fixtures_df.apply(
    lambda fixture: get_winner(fixture['home_score'], fixture['away_score']),
    axis=1,
)



In [6]:
# Preview the last rows (latest dates, since the query orders by date ascending)
# to sanity-check the loaded fixtures and the derived 'winner' column.
fixtures_df.tail()

Unnamed: 0,id,date,season,league,home_id,away_id,home_team,away_team,home_score,away_score,home_odds,away_odds,draw_odds,winner
3691,710931,2022-05-22 12:00:00,2021,Premier League (England),52,33,Crystal Palace,Manchester Utd,1,0,2.81,2.54,3.53,H
3692,710932,2022-05-22 12:00:00,2021,Premier League (England),46,41,Leicester,Southampton,4,1,1.72,4.56,4.32,H
3693,710933,2022-05-22 12:00:00,2021,Premier League (England),40,39,Liverpool,Wolves,3,1,1.15,18.58,9.3,H
3694,710934,2022-05-22 12:00:00,2021,Premier League (England),50,66,Manchester City,Aston Villa,3,2,1.18,16.41,8.32,H
3695,710935,2022-05-22 12:00:00,2021,Premier League (England),71,47,Norwich,Tottenham,0,5,10.07,1.32,5.64,A


In [7]:
# Size of the recent-form window: get_goals_mean averages goals over this many
# of a team's latest games, and the value is embedded in the feature column names.
n_last_games = 5

In [8]:
def get_games_results(games, cenario):
    """Count wins, draws and losses in ``games`` from one side's point of view.

    ``cenario`` is the 'winner' label that counts as a win ('H' for the home
    side; anything else is treated as the away perspective, whose losses are
    'H' results). Returns a ``(wins, draws, losses)`` tuple of ints.
    """
    outcomes = games['winner']
    losing_label = 'H' if cenario != 'H' else 'A'
    ordered_labels = (cenario, 'D', losing_label)
    return tuple(len(games.loc[outcomes == label].index) for label in ordered_labels)

def get_goals_mean(games, team_id, cenario, n_games=None):
    """Average goals scored and conceded by a team over its latest games.

    Parameters
    ----------
    games : pandas.DataFrame
        Games involving ``team_id``, in chronological order; only the last
        ``n_games`` rows are considered. Must have the columns ``home_id``,
        ``away_id``, ``home_score`` and ``away_score``.
    team_id
        Identifier matched against ``home_id`` / ``away_id``.
    cenario : str
        'H' to report the home-only averages, anything else for away-only.
    n_games : int, optional
        Size of the window. Defaults to the notebook-level ``n_last_games``.

    Returns
    -------
    list
        ``[scored_mean, conceded_mean, ha_scored_mean, ha_conceded_mean]``
        where the ``ha_`` values are restricted to home (or away) games in
        the window. A mean over zero games is NaN.
    """
    if n_games is None:
        n_games = n_last_games
    recent_games = games.iloc[-n_games:, :]

    home_games = recent_games.loc[recent_games['home_id'] == team_id]
    away_games = recent_games.loc[recent_games['away_id'] == team_id]
    n_home = len(home_games.index)
    n_away = len(away_games.index)
    total_games = n_home + n_away

    home_scored_goals = home_games['home_score'].sum()
    away_scored_goals = away_games['away_score'].sum()
    total_scored_goals = home_scored_goals + away_scored_goals

    home_conceded_goals = home_games['away_score'].sum()
    away_conceded_goals = away_games['home_score'].sum()
    total_conceded_goals = home_conceded_goals + away_conceded_goals

    def _mean(goals, n_played):
        # A window can hold no home (or no away) games; return NaN explicitly
        # rather than dividing by zero, which warns or raises depending on dtype.
        return goals / n_played if n_played else float('nan')

    return_list = [_mean(total_scored_goals, total_games), _mean(total_conceded_goals, total_games)]
    if cenario == 'H':
        return_list.extend([_mean(home_scored_goals, n_home), _mean(home_conceded_goals, n_home)])
    else:
        return_list.extend([_mean(away_scored_goals, n_away), _mean(away_conceded_goals, n_away)])

    return return_list
    

def get_team_previous_games_stats(team_id, season, game_date, cenario, min_games=10):
    """Build the pre-match form features of a team for one fixture.

    Only games from ``fixtures_df`` that involve ``team_id``, belong to
    ``season`` and were played strictly before ``game_date`` are used.

    Parameters
    ----------
    team_id
        Team identifier matched against ``home_id`` / ``away_id``.
    season
        Season value matched against the ``season`` column.
    game_date
        Date of the fixture being described; later and same-date games are
        excluded.
    cenario : str
        'H' when the team plays this fixture at home, otherwise away. Selects
        which home/away-specific percentages and goal means are returned.
    min_games : int, default 10
        Minimum number of previous games required to produce features.

    Returns
    -------
    list or None
        ``[points_pct, win_pct, draw_pct, loss_pct, ha_win_pct, ha_draw_pct,
        ha_loss_pct, scored_mean, conceded_mean, ha_scored_mean,
        ha_conceded_mean]``, or ``None`` when there is not enough history
        (fewer than ``min_games`` games, or no previous game at all in the
        requested home/away scenario).
    """
    involves_team = (fixtures_df['home_id'] == team_id) | (fixtures_df['away_id'] == team_id)
    previous_games = fixtures_df.loc[involves_team & (fixtures_df['date'] < game_date) & (fixtures_df['season'] == season)]
    home_games = previous_games.loc[(previous_games['home_id'] == team_id)]
    away_games = previous_games.loc[(previous_games['away_id'] == team_id)]

    n_home_games = len(home_games.index)
    n_away_games = len(away_games.index)
    total_games = n_home_games + n_away_games
    if total_games < min_games:
        return None

    # The scenario-specific percentages divide by the number of home (or away)
    # games; without any such game there is nothing to compute, so treat it as
    # insufficient history instead of raising ZeroDivisionError.
    n_ha_games = n_home_games if cenario == 'H' else n_away_games
    if n_ha_games == 0:
        return None

    home_wins, home_draws, home_losses = get_games_results(home_games, 'H')
    away_wins, away_draws, away_losses = get_games_results(away_games, 'A')

    total_wins = home_wins + away_wins
    total_draws = home_draws + away_draws
    total_losses = home_losses + away_losses

    win_pct = total_wins * 100 / total_games
    draw_pct = total_draws * 100 / total_games
    loss_pct = total_losses * 100 / total_games

    # Share of the maximum attainable points (3 per game) actually earned.
    points_achieved = total_wins * 3 + total_draws
    points_pct = (points_achieved * 100) / (total_games * 3)

    if cenario == 'H':
        ha_win_pct = home_wins * 100 / n_ha_games
        ha_draw_pct = home_draws * 100 / n_ha_games
        ha_loss_pct = home_losses * 100 / n_ha_games
    else:
        ha_win_pct = away_wins * 100 / n_ha_games
        ha_draw_pct = away_draws * 100 / n_ha_games
        ha_loss_pct = away_losses * 100 / n_ha_games

    scored_mean, conceded_mean, ha_scored_mean, ha_conceded_mean = get_goals_mean(previous_games, team_id, cenario)

    return [points_pct, win_pct, draw_pct, loss_pct, ha_win_pct, ha_draw_pct, ha_loss_pct, scored_mean, conceded_mean, ha_scored_mean, ha_conceded_mean]
        

In [9]:
# Assemble one feature row per fixture for which both teams have enough
# previous games in the same season; other fixtures are skipped.
data_model = []
fixture_count = len(fixtures_df.index)
for index, game in fixtures_df.iterrows():
    # Replace the previous progress line instead of appending a new one.
    clear_output(wait=True)
    print("{}/{}".format(index, fixture_count))

    home_stats = get_team_previous_games_stats(game['home_id'], game['season'], game['date'], 'H')
    if home_stats:
        away_stats = get_team_previous_games_stats(game['away_id'], game['season'], game['date'], 'A')
        if away_stats:
            match_info = [
                game[field]
                for field in ('id', 'date', 'season', 'home_team', 'away_team', 'home_odds', 'away_odds', 'draw_odds')
            ]
            # Row layout: match info, home features, away features, outcome label.
            data_model.append(match_info + home_stats + away_stats + [game['winner']])

3695/3696


In [10]:
# Spot-check one assembled feature row; index 100 is an arbitrary sample and
# raises IndexError if fewer than 101 rows were built.
print(data_model[100])

[193640, Timestamp('2013-01-01 12:00:00'), 2012, 'Tottenham', 'Reading', 1.33, 10.33, 4.99, 60.0, 55.0, 15.0, 30.0, 50.0, 30.0, 20.0, 1.6, 0.6, 0.5, 0.0, 21.666666666666668, 10.0, 35.0, 55.0, 0.0, 20.0, 80.0, 0.6, 1.8, 0.0, 2.0, 'H']


In [11]:
# Column names in the same order as the rows appended to data_model:
# match info, home-team features, away-team features, outcome label.
columns = (
    ['GAME_ID', 'GAME_DATE', 'SEASON', 'HOME_TEAM', 'AWAY_TEAM', 'HOME_ODDS', 'AWAY_ODDS', 'DRAW_ODDS']
    + [
        'HOME_PTS_PCT',
        'HOME_WIN_PCT',
        'HOME_DRAW_PCT',
        'HOME_LOSS_PCT',
        'HOME_HOME_WIN_PCT',
        'HOME_HOME_DRAW_PCT',
        'HOME_HOME_LOSS_PCT',
        f'HOME_SCORED_LAST_{n_last_games}',
        f'HOME_CONCEDED_LAST_{n_last_games}',
        f'HOME_HOME_SCORED_LAST_{n_last_games}',
        f'HOME_HOME_CONCEDED_LAST_{n_last_games}',
    ]
    + [
        'AWAY_PTS_PCT',
        'AWAY_WIN_PCT',
        'AWAY_DRAW_PCT',
        'AWAY_LOSS_PCT',
        'AWAY_AWAY_WIN_PCT',
        'AWAY_AWAY_DRAW_PCT',
        'AWAY_AWAY_LOSS_PCT',
        f'AWAY_SCORED_LAST_{n_last_games}',
        f'AWAY_CONCEDED_LAST_{n_last_games}',
        f'AWAY_AWAY_SCORED_LAST_{n_last_games}',
        f'AWAY_AWAY_CONCEDED_LAST_{n_last_games}',
    ]
    + ['OUTCOME']
)
data_df = pd.DataFrame(data=data_model, columns=columns)

In [12]:
# Preview the last rows of the assembled feature table.
data_df.tail()

Unnamed: 0,GAME_ID,GAME_DATE,SEASON,HOME_TEAM,AWAY_TEAM,HOME_ODDS,AWAY_ODDS,DRAW_ODDS,HOME_PTS_PCT,HOME_WIN_PCT,...,AWAY_DRAW_PCT,AWAY_LOSS_PCT,AWAY_AWAY_WIN_PCT,AWAY_AWAY_DRAW_PCT,AWAY_AWAY_LOSS_PCT,AWAY_SCORED_LAST_5,AWAY_CONCEDED_LAST_5,AWAY_AWAY_SCORED_LAST_5,AWAY_AWAY_CONCEDED_LAST_5,OUTCOME
2690,710931,2022-05-22 12:00:00,2021,Crystal Palace,Manchester Utd,2.81,2.54,3.53,40.540541,27.027027,...,27.027027,29.72973,33.333333,27.777778,38.888889,1.0,2.4,0.333333,3.666667,H
2691,710932,2022-05-22 12:00:00,2021,Leicester,Southampton,1.72,4.56,4.32,44.144144,35.135135,...,35.135135,40.540541,16.666667,33.333333,50.0,0.8,2.2,0.666667,2.333333,H
2692,710933,2022-05-22 12:00:00,2021,Liverpool,Wolves,1.15,18.58,9.3,80.18018,72.972973,...,16.216216,43.243243,44.444444,16.666667,38.888889,0.8,2.4,1.0,1.5,H
2693,710934,2022-05-22 12:00:00,2021,Manchester City,Aston Villa,1.18,16.41,8.32,81.081081,75.675676,...,16.216216,48.648649,38.888889,5.555556,55.555556,1.6,1.0,3.0,1.0,H
2694,710935,2022-05-22 12:00:00,2021,Norwich,Tottenham,10.07,1.32,5.64,19.81982,13.513514,...,13.513514,29.72973,44.444444,22.222222,33.333333,1.6,0.4,0.5,0.5,A


In [13]:
def parse_df_to_csv(dataframe, path, filename):
    """Write ``dataframe`` as CSV to ``path/filename``, creating ``path`` if needed.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to export (written with ``DataFrame.to_csv`` defaults, so the
        index is included as the first column).
    path : str
        Target directory; missing intermediate directories are created.
    filename : str
        Name of the CSV file inside ``path``.
    """
    # exist_ok avoids the check-then-create race of testing os.path.exists first.
    os.makedirs(path, exist_ok=True)
    dataframe.to_csv(os.path.join(path, filename))

In [14]:
# Export the feature table to leagues/<league_id>/data/<start_season>-<end_season>.csv
# (relative to the current working directory).
parse_df_to_csv(data_df, f'leagues/{league_id}/data', '{}-{}.csv'.format(start_season, end_season))