In [57]:
import requests
import json
import time
import pandas as pd
import mysql.connector
from IPython.display import clear_output
from datetime import datetime
from config import api_football_key, conn_host, conn_database, conn_user, conn_password
import os

In [58]:
def connect_to_db():
    """Open and return a new MySQL connection.

    The host, database name, user and password are the values imported
    from the local ``config`` module at the top of the notebook.
    """
    connection_settings = {
        'host': conn_host,
        'database': conn_database,
        'user': conn_user,
        'password': conn_password,
    }
    return mysql.connector.connect(**connection_settings)

def execute_query(query, read_only = True, params = None):
    """Run a SQL statement against the project database.

    Parameters
    ----------
    query : str
        SQL text to execute.
    read_only : bool, default True
        When True the statement is run through ``pd.read_sql_query`` and the
        result is returned as a DataFrame. When False the statement is run on
        a cursor and the transaction is committed.
    params : sequence or mapping, optional
        Values bound to the placeholders in ``query`` by the database driver.
        Prefer this over formatting values into the SQL text.

    Returns
    -------
    pandas.DataFrame or None
        The query result for read-only statements; ``None`` for write
        statements or when an error occurred.

    Notes
    -----
    Errors are reported with ``print`` and swallowed, so callers must be
    prepared to receive ``None``.
    """
    resp = None
    # Bind the name up front: if connect_to_db() itself fails there is no
    # connection to close, and closing unconditionally would raise NameError
    # and hide the original error.
    db = None
    try:
        db = connect_to_db()
        if read_only:
            resp = pd.read_sql_query(query, db, params=params)
        else:
            mycursor = db.cursor()
            mycursor.execute(query, params)

            db.commit()
    except Exception as e:
        print(e)
    finally:
        # Always release the connection, whether or not the query succeeded.
        if db is not None:
            db.close()
    return resp

In [59]:
def get_winner(home_score, away_score):
    """Classify a final score as a home win, away win or draw.

    Returns 'H' when the home score is strictly greater, 'A' when the away
    score is strictly greater, and 'D' in every other case.
    """
    if home_score > away_score:
        return 'H'
    if away_score > home_score:
        return 'A'
    # Neither side is strictly ahead.
    return 'D'

In [60]:
# League to build the dataset for; the value is substituted into the SQL
# filter on leagues.id and into the output directory name.
league_id = 253 # Choose the league id
# First and last season to include. The fixtures query filters with
# season >= start_season AND season <= end_season, so both ends are inclusive.
start_season = 2014
end_season = 2022

In [61]:
# Fixtures of the selected league and season range that have home odds
# recorded, joined with team and league names and ordered by kick-off date.
# The adjacent literals concatenate into a single-line SQL statement.
fixtures_query = (
    "SELECT m.id, m.date, m.season, l.name AS league, "
    "ht.id as home_id, at.id as away_id, "
    "ht.name as home_team, at.name as away_team, "
    "m.home_score, m.away_score, "
    "m.home_odds, m.away_odds, m.draw_odds "
    "FROM matches AS m "
    "INNER JOIN teams AS ht ON (m.home_id = ht.id) "
    "INNER JOIN teams AS at ON (m.away_id = at.id) "
    "INNER JOIN leagues AS l ON (m.league_id = l.id) "
    f"WHERE (l.id = {league_id} AND m.home_odds IS NOT NULL "
    f"AND m.season >= {start_season} AND m.season <= {end_season}) "
    "ORDER BY m.date ASC"
)
fixtures_df = execute_query(fixtures_query)

# Label every fixture with its result code ('H', 'A' or 'D').
fixtures_df['winner'] = fixtures_df.apply(
    lambda fixture: get_winner(fixture['home_score'], fixture['away_score']),
    axis=1,
)



In [62]:
# Preview the most recent fixtures to sanity-check the query result.
fixtures_df.tail()

Unnamed: 0,id,date,season,league,home_id,away_id,home_team,away_team,home_score,away_score,home_odds,away_odds,draw_odds,winner
2805,817001,2022-07-17 20:30:00,2022,Major League Soccer (USA),1613,2242,Columbus Crew,FC Cincinnati,2,0,1.81,4.02,3.92,H
2806,817002,2022-07-17 21:30:00,2022,Major League Soccer (USA),9569,1616,Nashville SC,Los Angeles FC,1,2,2.46,2.81,3.39,A
2807,817003,2022-07-17 22:30:00,2022,Major League Soccer (USA),1606,1611,Real Salt Lake,Sporting Kansas City,3,0,1.78,4.48,3.67,H
2808,817004,2022-07-17 22:30:00,2022,Major League Soccer (USA),1596,1600,San Jose Earthquakes,Houston Dynamo,1,2,2.16,3.04,3.77,A
2809,817005,2022-07-17 23:30:00,2022,Major League Soccer (USA),1617,1603,Portland Timbers,Vancouver Whitecaps,1,1,1.66,4.74,4.1,D


In [63]:
# Size of the recent-form window: get_goals_mean averages goals over this many
# most recent games, and the value is embedded in the *_LAST_<n> column names.
n_last_games = 5

In [70]:
def get_games_results(games, cenario):
    """Count wins, draws and losses in ``games`` from one side's perspective.

    ``cenario`` is the result code counted as a win. The code counted as a
    loss is 'A' when ``cenario`` is 'H' and 'H' otherwise; 'D' is always the
    draw. Returns a ``(wins, draws, losses)`` tuple of integers.
    """
    def _count(result_code):
        # Number of rows whose 'winner' column equals the given code.
        return len(games.loc[games['winner'] == result_code].index)

    if cenario == 'H':
        losing_code = 'A'
    else:
        losing_code = 'H'

    return _count(cenario), _count('D'), _count(losing_code)

def get_goals_mean(games, team_id, cenario, n_last=None):
    """Average goals scored and conceded by a team over its most recent games.

    Parameters
    ----------
    games : pandas.DataFrame
        Games involving the team, with ``home_id``, ``away_id``,
        ``home_score`` and ``away_score`` columns. Only the last ``n_last``
        rows are considered, so the frame is expected to be in chronological
        order.
    team_id
        Identifier matched against ``home_id`` / ``away_id``.
    cenario : str
        'H' to report the venue-specific averages for home games; any other
        value reports them for away games.
    n_last : int, optional
        Number of most recent rows to use. Defaults to the notebook-level
        ``n_last_games`` setting.

    Returns
    -------
    list
        ``[scored_mean, conceded_mean, venue_scored_mean, venue_conceded_mean]``.
        An average over zero games is returned as NaN.
    """
    if n_last is None:
        n_last = n_last_games

    def _per_game(goals, n_games):
        # The window can hold no games at the requested venue (or no games at
        # all); report that as NaN instead of dividing by zero.
        return goals / n_games if n_games else float('nan')

    recent_games = games.iloc[-n_last:, :]

    home_games = recent_games.loc[(recent_games['home_id'] == team_id)]
    away_games = recent_games.loc[(recent_games['away_id'] == team_id)]
    n_home = len(home_games.index)
    n_away = len(away_games.index)
    total_games = n_home + n_away

    home_scored_goals = home_games['home_score'].sum()
    away_scored_goals = away_games['away_score'].sum()
    total_scored_goals = home_scored_goals + away_scored_goals

    home_conceded_goals = home_games['away_score'].sum()
    away_conceded_goals = away_games['home_score'].sum()
    total_conceded_goals = home_conceded_goals + away_conceded_goals

    return_list = [_per_game(total_scored_goals, total_games), _per_game(total_conceded_goals, total_games)]
    if cenario == 'H':
        return_list.extend([_per_game(home_scored_goals, n_home), _per_game(home_conceded_goals, n_home)])
    else:
        return_list.extend([_per_game(away_scored_goals, n_away), _per_game(away_conceded_goals, n_away)])

    return return_list
    

def get_team_previous_games_stats(team_id, season, game_date, cenario):
    """Summarise a team's form in ``season`` before ``game_date``.

    Reads the notebook-level ``fixtures_df`` and keeps the fixtures in which
    the team played (home or away), dated strictly before ``game_date`` and
    belonging to ``season``.

    Returns ``None`` when the team has fewer than 10 such games, or fewer
    than 5 at the venue selected by ``cenario`` ('H' for home, 'A' for away).
    Otherwise returns a list of eleven values in this order: points
    percentage, win/draw/loss percentages over all games, win/draw/loss
    percentages at the selected venue, then the four averages produced by
    ``get_goals_mean``.
    """
    involves_team = (fixtures_df['home_id'] == team_id) | (fixtures_df['away_id'] == team_id)
    earlier_in_season = (fixtures_df['date'] < game_date) & (fixtures_df['season'] == season)
    previous_games = fixtures_df.loc[involves_team & earlier_in_season]

    home_games = previous_games.loc[previous_games['home_id'] == team_id]
    away_games = previous_games.loc[previous_games['away_id'] == team_id]
    n_home = len(home_games.index)
    n_away = len(away_games.index)
    total_games = n_home + n_away

    # Not enough history to produce meaningful statistics.
    if total_games < 10:
        return None
    if cenario == 'H' and n_home < 5:
        return None
    if cenario == 'A' and n_away < 5:
        return None

    home_wins, home_draws, home_losses = get_games_results(home_games, 'H')
    away_wins, away_draws, away_losses = get_games_results(away_games, 'A')

    total_wins = home_wins + away_wins
    total_draws = home_draws + away_draws
    total_losses = home_losses + away_losses

    win_pct = total_wins * 100 / total_games
    draw_pct = total_draws * 100 / total_games
    loss_pct = total_losses * 100 / total_games

    # Share of the available points actually won (3 per win, 1 per draw).
    points_achieved = total_wins * 3 + total_draws
    points_pct = (points_achieved * 100) / (total_games * 3)

    # Venue-specific record: home games for 'H', away games otherwise.
    if cenario == 'H':
        venue_record, venue_games = (home_wins, home_draws, home_losses), n_home
    else:
        venue_record, venue_games = (away_wins, away_draws, away_losses), n_away
    ha_win_pct, ha_draw_pct, ha_loss_pct = [count * 100 / venue_games for count in venue_record]

    goal_means = get_goals_mean(previous_games, team_id, cenario)
    scored_mean, conceded_mean, ha_scored_mean, ha_conceded_mean = goal_means

    return [
        points_pct, win_pct, draw_pct, loss_pct,
        ha_win_pct, ha_draw_pct, ha_loss_pct,
        scored_mean, conceded_mean, ha_scored_mean, ha_conceded_mean,
    ]
        

In [71]:
# Build one feature row per fixture for which both teams have enough history.
data_model = []
fixture_count = len(fixtures_df.index)
for index, game in fixtures_df.iterrows():
    # Replace the previous progress line instead of flooding the output.
    clear_output(wait=True)
    print("{}/{}".format(index, fixture_count))

    home_stats = get_team_previous_games_stats(game['home_id'], game['season'], game['date'], 'H')
    if home_stats:
        # The away side is only evaluated once the home side qualifies.
        away_stats = get_team_previous_games_stats(game['away_id'], game['season'], game['date'], 'A')
        if away_stats:
            game_info = [
                game['id'], game['date'], game['season'],
                game['home_team'], game['away_team'],
                game['home_odds'], game['away_odds'], game['draw_odds'],
            ]
            data_model.append(game_info + home_stats + away_stats + [game['winner']])

2809/2810
1617 2022 2022-07-17 23:30:00 H 7 11
1603 2022 2022-07-17 23:30:00 A 10 9


In [72]:
# Spot-check one assembled row; this raises IndexError if fewer than 101 rows
# were built.
print(data_model[100])

[491711, Timestamp('2014-08-30 18:00:00'), 2014, 'Toronto FC', 'New England Revolution', 1.99, 3.68, 3.43, 48.484848484848484, 40.90909090909091, 22.727272727272727, 36.36363636363637, 50.0, 10.0, 40.0, 1.6, 2.2, 2.0, 2.0, 42.028985507246375, 39.130434782608695, 8.695652173913043, 52.17391304347826, 25.0, 8.333333333333334, 66.66666666666667, 1.4, 1.0, 1.0, 2.0, 'A']


In [73]:
# Identification and odds fields, in the order each data_model row stores them.
game_info_columns = ['GAME_ID', 'GAME_DATE', 'SEASON', 'HOME_TEAM', 'AWAY_TEAM', 'HOME_ODDS', 'AWAY_ODDS', 'DRAW_ODDS']

# The eleven per-team features follow one naming pattern, emitted first for
# the home side and then for the away side.
team_feature_columns = [
    name
    for side in ('HOME', 'AWAY')
    for name in (
        f'{side}_PTS_PCT', f'{side}_WIN_PCT', f'{side}_DRAW_PCT', f'{side}_LOSS_PCT',
        f'{side}_{side}_WIN_PCT', f'{side}_{side}_DRAW_PCT', f'{side}_{side}_LOSS_PCT',
        f'{side}_SCORED_LAST_{n_last_games}', f'{side}_CONCEDED_LAST_{n_last_games}',
        f'{side}_{side}_SCORED_LAST_{n_last_games}', f'{side}_{side}_CONCEDED_LAST_{n_last_games}',
    )
]

columns = game_info_columns + team_feature_columns + ['OUTCOME']
data_df = pd.DataFrame(data_model, columns=columns)

In [74]:
# Preview the last rows of the assembled feature table.
data_df.tail()

Unnamed: 0,GAME_ID,GAME_DATE,SEASON,HOME_TEAM,AWAY_TEAM,HOME_ODDS,AWAY_ODDS,DRAW_ODDS,HOME_PTS_PCT,HOME_WIN_PCT,...,AWAY_DRAW_PCT,AWAY_LOSS_PCT,AWAY_AWAY_WIN_PCT,AWAY_AWAY_DRAW_PCT,AWAY_AWAY_LOSS_PCT,AWAY_SCORED_LAST_5,AWAY_CONCEDED_LAST_5,AWAY_AWAY_SCORED_LAST_5,AWAY_AWAY_CONCEDED_LAST_5,OUTCOME
1647,817001,2022-07-17 20:30:00,2022,Columbus Crew,FC Cincinnati,1.81,4.02,3.92,48.148148,33.333333,...,35.294118,35.294118,37.5,37.5,25.0,2.0,2.0,1.5,1.5,H
1648,817002,2022-07-17 21:30:00,2022,Nashville SC,Los Angeles FC,2.46,2.81,3.39,50.0,40.0,...,17.647059,17.647059,42.857143,14.285714,42.857143,1.8,1.0,0.5,1.0,A
1649,817003,2022-07-17 22:30:00,2022,Real Salt Lake,Sporting Kansas City,1.78,4.48,3.67,47.368421,36.842105,...,23.809524,52.380952,16.666667,25.0,58.333333,1.0,1.4,1.25,1.5,H
1650,817004,2022-07-17 22:30:00,2022,San Jose Earthquakes,Houston Dynamo,2.16,3.04,3.77,38.888889,27.777778,...,23.529412,41.176471,28.571429,0.0,71.428571,1.2,1.8,0.5,2.5,A
1651,817005,2022-07-17 23:30:00,2022,Portland Timbers,Vancouver Whitecaps,1.66,4.74,4.1,40.740741,27.777778,...,21.052632,42.105263,22.222222,11.111111,66.666667,1.2,1.0,2.0,1.0,D


In [75]:
def parse_df_to_csv(dataframe, path, filename):
    """Write ``dataframe`` as a CSV file named ``filename`` inside ``path``.

    The directory (including any missing parents) is created when it does
    not exist yet. The frame is written with ``DataFrame.to_csv`` defaults,
    so the index is included as the first column.
    """
    # exist_ok avoids the check-then-create race and makes repeated runs of
    # the cell safe.
    os.makedirs(path, exist_ok=True)
    dataframe.to_csv(os.path.join(path, filename))

In [76]:
# Save the dataset under a per-league folder, named after the season range.
output_dir = f'leagues/{league_id}/data'
output_name = f'{start_season}-{end_season}.csv'
parse_df_to_csv(data_df, output_dir, output_name)