In [1]:
import requests
import json
import time
import pandas as pd
import mysql.connector
from IPython.display import clear_output
from datetime import datetime
from config import api_football_key, conn_host, conn_database, conn_user, conn_password

In [2]:
def connect_to_db():
    """Open and return a new MySQL connection built from the settings imported from ``config``."""
    settings = {
        'host': conn_host,
        'database': conn_database,
        'user': conn_user,
        'password': conn_password,
    }
    return mysql.connector.connect(**settings)

def execute_query(query, read_only = True):
    """Run a SQL statement on a fresh connection obtained from ``connect_to_db``.

    Args:
        query: SQL text to execute.
        read_only: when True the statement is run through
            ``pd.read_sql_query`` and its result is returned; when False the
            statement is executed on a cursor and committed.

    Returns:
        The DataFrame produced by ``pd.read_sql_query`` for a read-only
        query, otherwise ``None``. ``None`` is also returned when any step
        fails: the error is printed, not raised (best-effort behaviour).
    """
    resp = None
    db = None  # stays None when the connection attempt itself fails
    try:
        db = connect_to_db()
        if read_only:
            resp = pd.read_sql_query(query, db)
        else:
            mycursor = db.cursor()
            try:
                mycursor.execute(query)
                db.commit()
            finally:
                # Release the cursor whether or not the statement succeeded.
                mycursor.close()
    except Exception as e:
        print(e)
    finally:
        # Only close what was actually opened; closing unconditionally would
        # raise on an unbound name and hide the original connection error.
        if db is not None:
            db.close()
    return resp

In [15]:
def get_winner(home_score, away_score):
    """Return the match outcome code for a final score.

    'H' when the home side scored more, 'A' when the away side scored more,
    and 'D' in every other case.
    """
    if home_score > away_score:
        return 'H'
    return 'A' if away_score > home_score else 'D'

In [16]:
# Load every match of seasons 2014-2021 with league and team names, oldest first.
# The adjacent string literals concatenate into a single SQL statement.
fixtures_df = execute_query(
    "SELECT m.id, m.date, m.season, l.name AS league, "
    "ht.id as home_id, at.id as away_id, "
    "ht.name as home_team, at.name as away_team, "
    "m.home_score, m.away_score "
    "FROM matches AS m "
    "INNER JOIN teams AS ht ON (m.home_id = ht.id) "
    "INNER JOIN teams AS at ON (m.away_id = at.id) "
    "INNER JOIN leagues AS l ON (m.league_id = l.id) "
    "WHERE m.season > 2013 AND m.season < 2022 "
    "ORDER BY m.date ASC"
)
# Label each fixture with its outcome code ('H', 'A' or 'D').
fixtures_df['winner'] = fixtures_df.apply(
    lambda row: get_winner(row['home_score'], row['away_score']),
    axis=1,
)



In [17]:
# Preview the first fixtures together with the derived 'winner' column.
fixtures_df.head()

Unnamed: 0,id,date,season,league,home_id,away_id,home_team,away_team,home_score,away_score,winner
0,190447,2014-04-19 18:30:00,2014,Serie A (Brazil),124,137,Fluminense,Figueirense,3,0,H
1,190448,2014-04-19 18:30:00,2014,Serie A (Brazil),119,136,Internacional,Vitoria,1,0,H
2,190449,2014-04-19 21:00:00,2014,Serie A (Brazil),132,147,Chapecoense-SC,Coritiba,0,0,D
3,190450,2014-04-20 16:00:00,2014,Serie A (Brazil),126,120,Sao Paulo,Botafogo RJ,3,0,H
4,190451,2014-04-20 16:00:00,2014,Serie A (Brazil),134,130,Athletico-PR,Gremio,1,0,H


In [58]:
# Accumulates one feature row per fixture; filled by the loop over fixtures_df.
data_model = []
# Number of most recent games get_goals_mean averages over; also embedded in
# the *_LAST_<n> column names of the final feature table.
n_last_games = 5

In [59]:
def get_games_results(games, cenario):
    """Count wins, draws and losses in ``games`` from the side given by ``cenario``.

    A row counts as a win when its 'winner' value equals ``cenario``, as a
    draw when it is 'D', and as a loss when it equals the opposite side
    ('A' for ``cenario == 'H'``, otherwise 'H').

    Returns:
        A ``(wins, draws, losses)`` tuple of ints.
    """
    opposite = 'H' if cenario != 'H' else 'A'
    outcomes = games['winner']

    def tally(code):
        return len(games.loc[outcomes == code].index)

    return tally(cenario), tally('D'), tally(opposite)

def get_goals_mean(games, team_id, cenario, n_last=None):
    """Average goals scored and conceded by a team over its most recent games.

    Only the last ``n_last`` rows of ``games`` are considered, so ``games``
    must be ordered oldest to newest (the fixtures query orders by date
    ascending and the callers in this notebook only filter it).

    Args:
        games: DataFrame with 'home_id', 'away_id', 'home_score' and
            'away_score' columns.
        team_id: id of the team the averages are computed for.
        cenario: 'H' to report the venue-specific averages for home games,
            anything else for away games.
        n_last: size of the window, a non-negative integer. Defaults to the
            notebook-level ``n_last_games`` when omitted.

    Returns:
        ``[scored_mean, conceded_mean, venue_scored_mean, venue_conceded_mean]``.
        The venue-specific means are taken over the home (or away) games that
        fall inside the same window, not over the last ``n_last`` home (or
        away) games. An average over zero games is NaN.
    """
    window = n_last_games if n_last is None else n_last
    # tail(0) yields an empty frame, whereas iloc[-0:] would select every row.
    recent = games.tail(window)

    home_games = recent.loc[recent['home_id'] == team_id]
    away_games = recent.loc[recent['away_id'] == team_id]
    n_home = len(home_games.index)
    n_away = len(away_games.index)
    n_total = n_home + n_away

    home_scored_goals = home_games['home_score'].sum()
    away_scored_goals = away_games['away_score'].sum()
    home_conceded_goals = home_games['away_score'].sum()
    away_conceded_goals = away_games['home_score'].sum()

    def _mean(total, count):
        # Explicit NaN for an empty sample instead of relying on a 0/0 division.
        return total / count if count else float('nan')

    return_list = [
        _mean(home_scored_goals + away_scored_goals, n_total),
        _mean(home_conceded_goals + away_conceded_goals, n_total),
    ]
    if cenario == 'H':
        return_list.extend([_mean(home_scored_goals, n_home), _mean(home_conceded_goals, n_home)])
    else:
        return_list.extend([_mean(away_scored_goals, n_away), _mean(away_conceded_goals, n_away)])

    return return_list
    

def get_team_previous_games_stats(team_id, season, game_date, cenario, min_games=10):
    """Summarise a team's results in a season before a given date.

    Looks up, in the notebook-level ``fixtures_df``, every game of ``season``
    dated strictly before ``game_date`` in which ``team_id`` played.

    Args:
        team_id: id of the team to summarise.
        season: season the previous games must belong to.
        game_date: only games dated before this value are used.
        cenario: 'H' to report venue-specific figures for home games,
            anything else for away games.
        min_games: minimum number of previous games required to produce
            statistics (default 10).

    Returns:
        ``None`` when the team has fewer than ``min_games`` previous games, or
        none at the requested venue (venue percentages would be undefined).
        Otherwise a list:
        ``[points_pct, win_pct, draw_pct, loss_pct, venue_win_pct,
        venue_draw_pct, venue_loss_pct, scored_mean, conceded_mean,
        venue_scored_mean, venue_conceded_mean]`` where the last four values
        come from ``get_goals_mean``.
    """
    involved = (fixtures_df['home_id'] == team_id) | (fixtures_df['away_id'] == team_id)
    earlier = fixtures_df['date'] < game_date
    same_season = fixtures_df['season'] == season
    previous_games = fixtures_df.loc[involved & earlier & same_season]
    home_games = previous_games.loc[previous_games['home_id'] == team_id]
    away_games = previous_games.loc[previous_games['away_id'] == team_id]

    n_home = len(home_games.index)
    n_away = len(away_games.index)
    total_games = n_home + n_away
    if total_games < min_games:
        return None

    # A team can reach the threshold with games at a single venue only; the
    # venue percentages below would then divide by zero.
    venue_games = n_home if cenario == 'H' else n_away
    if venue_games == 0:
        return None

    home_wins, home_draws, home_losses = get_games_results(home_games, 'H')
    away_wins, away_draws, away_losses = get_games_results(away_games, 'A')

    total_wins = home_wins + away_wins
    total_draws = home_draws + away_draws
    total_losses = home_losses + away_losses

    win_pct = total_wins * 100 / total_games
    draw_pct = total_draws * 100 / total_games
    loss_pct = total_losses * 100 / total_games

    # 3 points per win, 1 per draw, relative to the maximum obtainable.
    points_achieved = total_wins * 3 + total_draws
    points_pct = (points_achieved * 100) / (total_games * 3)

    if cenario == 'H':
        venue_wins, venue_draws, venue_losses = home_wins, home_draws, home_losses
    else:
        venue_wins, venue_draws, venue_losses = away_wins, away_draws, away_losses
    ha_win_pct = venue_wins * 100 / venue_games
    ha_draw_pct = venue_draws * 100 / venue_games
    ha_loss_pct = venue_losses * 100 / venue_games

    scored_mean, conceded_mean, ha_scored_mean, ha_conceded_mean = get_goals_mean(previous_games, team_id, cenario)

    return [points_pct, win_pct, draw_pct, loss_pct, ha_win_pct, ha_draw_pct, ha_loss_pct, scored_mean, conceded_mean, ha_scored_mean, ha_conceded_mean]
        

In [60]:
# Build one feature row per fixture: game id, game date, the home team's
# pre-match statistics (home scenario) and the away team's (away scenario).
# Fixtures for which either side has no statistics yet are skipped.

# Empty the list first so that re-running this cell does not append a second
# copy of every row (data_model is created in an earlier cell).
data_model.clear()
total_fixtures = len(fixtures_df.index)  # loop-invariant, computed once

for index, game in fixtures_df.iterrows():
    # Overwrite the previous progress line instead of flooding the output.
    clear_output(wait=True)
    print("{}/{}".format(index, total_fixtures))

    home_stats = get_team_previous_games_stats(game['home_id'], game['season'], game['date'], 'H')
    if not home_stats:
        continue

    away_stats = get_team_previous_games_stats(game['away_id'], game['season'], game['date'], 'A')
    if not away_stats:
        continue

    data_model.append([game['id'], game['date']] + home_stats + away_stats)

3039/3040


In [61]:
# Spot-check one assembled row: game id, game date, then the home-team
# statistics followed by the away-team statistics.
print(data_model[100])

[190648, Timestamp('2014-09-13 18:30:00'), 33.333333333333336, 25.0, 25.0, 50.0, 40.0, 30.0, 30.0, 0.6, 1.6, 0.5, 0.0, 51.666666666666664, 45.0, 20.0, 35.0, 20.0, 20.0, 60.0, 1.4, 1.6, 0.0, 3.0]


In [63]:
# Column labels in the same order as the rows appended to data_model:
# identifiers first, then the home-team block, then the away-team block.
columns = (
    ['GAME_ID', 'GAME_DATE']
    + [
        'HOME_PTS_PCT', 'HOME_WIN_PCT', 'HOME_DRAW_PCT', 'HOME_LOSS_PCT',
        'HOME_HOME_WIN_PCT', 'HOME_HOME_DRAW_PCT', 'HOME_HOME_LOSS_PCT',
        f'HOME_SCORED_LAST_{n_last_games}',
        f'HOME_CONCEDED_LAST_{n_last_games}',
        f'HOME_HOME_SCORED_LAST_{n_last_games}',
        f'HOME_HOME_CONCEDED_LAST_{n_last_games}',
    ]
    + [
        'AWAY_PTS_PCT', 'AWAY_WIN_PCT', 'AWAY_DRAW_PCT', 'AWAY_LOSS_PCT',
        'AWAY_AWAY_WIN_PCT', 'AWAY_AWAY_DRAW_PCT', 'AWAY_AWAY_LOSS_PCT',
        f'AWAY_SCORED_LAST_{n_last_games}',
        f'AWAY_CONCEDED_LAST_{n_last_games}',
        f'AWAY_AWAY_SCORED_LAST_{n_last_games}',
        f'AWAY_AWAY_CONCEDED_LAST_{n_last_games}',
    ]
)
data_df = pd.DataFrame(data=data_model, columns=columns)

In [64]:
# Preview the first rows of the assembled feature table.
data_df.head()

Unnamed: 0,GAME_ID,GAME_DATE,HOME_PTS_PCT,HOME_WIN_PCT,HOME_DRAW_PCT,HOME_LOSS_PCT,HOME_HOME_WIN_PCT,HOME_HOME_DRAW_PCT,HOME_HOME_LOSS_PCT,HOME_SCORED_LAST_5,...,AWAY_WIN_PCT,AWAY_DRAW_PCT,AWAY_LOSS_PCT,AWAY_AWAY_WIN_PCT,AWAY_AWAY_DRAW_PCT,AWAY_AWAY_LOSS_PCT,AWAY_SCORED_LAST_5,AWAY_CONCEDED_LAST_5,AWAY_AWAY_SCORED_LAST_5,AWAY_AWAY_CONCEDED_LAST_5
0,190549,2014-07-19 18:30:00,23.333333,20.0,10.0,70.0,0.0,0.0,100.0,0.8,...,40.0,40.0,20.0,20.0,40.0,40.0,0.4,0.4,0.0,0.5
1,190550,2014-07-19 21:00:00,30.0,20.0,30.0,50.0,33.333333,66.666667,0.0,1.0,...,10.0,40.0,50.0,0.0,40.0,60.0,0.8,1.2,0.0,1.5
2,190551,2014-07-20 16:00:00,43.333333,40.0,10.0,50.0,50.0,0.0,50.0,0.2,...,70.0,10.0,20.0,60.0,0.0,40.0,2.2,0.6,1.5,1.0
3,190552,2014-07-20 16:00:00,23.333333,10.0,40.0,50.0,0.0,25.0,75.0,0.8,...,50.0,40.0,10.0,50.0,50.0,0.0,1.8,0.8,4.0,1.0
4,190553,2014-07-20 16:00:00,53.333333,40.0,40.0,20.0,80.0,0.0,20.0,1.2,...,10.0,40.0,50.0,0.0,25.0,75.0,0.6,1.4,0.0,1.5
