In [34]:
import requests
import json
import time
import pandas as pd
import mysql.connector
from IPython.display import clear_output
from datetime import datetime
from config import api_football_key, conn_host, conn_database, conn_user, conn_password
import os
import warnings
warnings.filterwarnings('ignore')

In [35]:
def connect_to_db():
    """Open a new MySQL connection with the settings imported from ``config``.

    Returns:
        The connection object returned by ``mysql.connector.connect``.
        The caller is responsible for closing it.
    """
    connection_settings = {
        'host': conn_host,
        'database': conn_database,
        'user': conn_user,
        'password': conn_password,
    }
    return mysql.connector.connect(**connection_settings)

def execute_query(query, read_only = True):
    """Run a SQL statement against the project database.

    A fresh connection is opened through ``connect_to_db`` for every call and
    is always closed again before returning.

    Args:
        query: SQL text to execute.
        read_only: When True the statement is run through
            ``pd.read_sql_query`` and its result is returned. When False the
            statement is executed on a cursor and committed.

    Returns:
        The DataFrame produced by ``pd.read_sql_query`` when ``read_only`` is
        True and the query succeeded; otherwise ``None``.

    Note:
        Errors are deliberately not raised: any exception is printed and
        ``None`` is returned, so callers must handle a ``None`` result.
    """
    resp = None
    # Bound before the try so the cleanup below is safe even when opening the
    # connection itself fails (previously that path hit an unbound ``db``).
    db = None
    try:
        db = connect_to_db()
        if read_only:
            resp = pd.read_sql_query(query, db)
        else:
            mycursor = db.cursor()
            mycursor.execute(query)

            db.commit()
    except Exception as e:
        print(e)
    finally:
        # Only close a connection that was actually opened.
        if db is not None:
            db.close()
    return resp

In [36]:
def get_winner(home_score, away_score):
    """Classify a match result from its final score.

    Args:
        home_score: Goals scored by the home side.
        away_score: Goals scored by the away side.

    Returns:
        'H' when the home side scored more, 'A' when the away side scored
        more, and 'D' in every other case.
    """
    home_ahead = home_score > away_score
    if home_ahead:
        return 'H'
    # Anything that is not a strict away lead falls through to a draw.
    return 'A' if away_score > home_score else 'D'

In [59]:
# Dataset scope. These values are interpolated into the SQL query below and
# into the output path of the exported CSV.
league_id = 39 # Choose the league id
# Inclusive season bounds (the query filters with >= start and <= end).
start_season = 2015
end_season = 2022

In [60]:
# Load every match of the selected league and season range, oldest first,
# with team names, odds and the per-side match statistics.
# The filter values come from the notebook constants defined above.
query = f"SELECT m.id, m.date, m.season, l.name AS league, ht.id as home_id, at.id as away_id, ht.name as home_team, at.name as away_team, m.home_score, m.away_score, m.home_odds, m.away_odds, m.draw_odds, " + \
                            "m.home_shots_on_goal, m.home_shots_off_goal, m.home_total_shots, m.home_blocked_shots, m.home_shots_inside_box, m.home_shots_outside_box, m.home_fouls, m.home_corners, m.home_offsides, m.home_possession, m.home_yellow_cards, m.home_red_cards, m.home_saves, m.home_total_passes, m.home_passes_accurate, m.home_passes_pct, " + \
                            "m.away_shots_on_goal, m.away_shots_off_goal, m.away_total_shots, m.away_blocked_shots, m.away_shots_inside_box, m.away_shots_outside_box, m.away_fouls, m.away_corners, m.away_offsides, m.away_possession, m.away_yellow_cards, m.away_red_cards, m.away_saves, m.away_total_passes, m.away_passes_accurate, m.away_passes_pct " + \
                            f"FROM matches AS m INNER JOIN teams AS ht ON (m.home_id = ht.id) INNER JOIN teams AS at ON (m.away_id = at.id) INNER JOIN leagues AS l ON (m.league_id = l.id) WHERE (l.id = {league_id} AND m.season >= {start_season} AND m.season <= {end_season}) ORDER BY m.date ASC"
# NOTE(review): execute_query prints errors and returns None on failure, in
# which case the next line fails with a TypeError rather than the DB error.
fixtures_df = execute_query(query)
# Label each fixture with its outcome code ('H', 'A' or 'D') from the score.
fixtures_df['winner'] = fixtures_df.apply(lambda x: get_winner(x['home_score'], x['away_score']), axis=1)

In [61]:
# Sanity check: number of fixtures returned by the query.
print(len(fixtures_df.index))

2820


In [62]:
# The query orders by date ascending, so the tail shows the latest fixtures.
fixtures_df.tail()

Unnamed: 0,id,date,season,league,home_id,away_id,home_team,away_team,home_score,away_score,...,away_corners,away_offsides,away_possession,away_yellow_cards,away_red_cards,away_saves,away_total_passes,away_passes_accurate,away_passes_pct,winner
2815,868104,2022-11-12 12:00:00,2022,Premier League (England),48,46,West Ham,Leicester,0,2,...,2.0,0.0,46.0,1.0,0.0,6.0,422.0,330.0,78.0,A
2816,868101,2022-11-12 14:30:00,2022,Premier League (England),34,49,Newcastle,Chelsea,1,0,...,3.0,0.0,51.0,2.0,0.0,2.0,471.0,396.0,84.0,H
2817,868105,2022-11-12 16:45:00,2022,Premier League (England),39,42,Wolves,Arsenal,0,2,...,4.0,1.0,63.0,2.0,0.0,2.0,579.0,517.0,89.0,A
2818,868097,2022-11-13 11:00:00,2022,Premier League (England),51,66,Brighton,Aston Villa,1,2,...,4.0,1.0,35.0,7.0,0.0,1.0,293.0,211.0,72.0,A
2819,868098,2022-11-13 13:30:00,2022,Premier League (England),36,33,Fulham,Manchester Utd,1,2,...,3.0,2.0,47.0,1.0,0.0,6.0,433.0,344.0,79.0,A


In [63]:
# Size of the "recent form" window: the helper functions below slice the last
# n_last_games rows of a team's season games for the *_LAST_GAMES features.
n_last_games = 5

In [64]:
# List the available columns; the renaming helper below relies on the
# home_* / away_* naming of the per-side columns.
print(fixtures_df.columns)

Index(['id', 'date', 'season', 'league', 'home_id', 'away_id', 'home_team',
       'away_team', 'home_score', 'away_score', 'home_odds', 'away_odds',
       'draw_odds', 'home_shots_on_goal', 'home_shots_off_goal',
       'home_total_shots', 'home_blocked_shots', 'home_shots_inside_box',
       'home_shots_outside_box', 'home_fouls', 'home_corners', 'home_offsides',
       'home_possession', 'home_yellow_cards', 'home_red_cards', 'home_saves',
       'home_total_passes', 'home_passes_accurate', 'home_passes_pct',
       'away_shots_on_goal', 'away_shots_off_goal', 'away_total_shots',
       'away_blocked_shots', 'away_shots_inside_box', 'away_shots_outside_box',
       'away_fouls', 'away_corners', 'away_offsides', 'away_possession',
       'away_yellow_cards', 'away_red_cards', 'away_saves',
       'away_total_passes', 'away_passes_accurate', 'away_passes_pct',
       'winner'],
      dtype='object')


In [65]:
# Suffix of every per-side column in ``fixtures_df`` mapped to the suffix it
# receives once a fixture is re-expressed as "team vs. opponent". Only the
# name column changes (``home_team`` -> ``team_name``); the rest keep theirs.
_SIDE_COLUMN_SUFFIXES = {
    'id': 'id',
    'team': 'name',
    'score': 'score',
    'shots_on_goal': 'shots_on_goal',
    'shots_off_goal': 'shots_off_goal',
    'total_shots': 'total_shots',
    'blocked_shots': 'blocked_shots',
    'shots_inside_box': 'shots_inside_box',
    'shots_outside_box': 'shots_outside_box',
    'fouls': 'fouls',
    'corners': 'corners',
    'offsides': 'offsides',
    'possession': 'possession',
    'yellow_cards': 'yellow_cards',
    'red_cards': 'red_cards',
    'saves': 'saves',
    'total_passes': 'total_passes',
    'passes_accurate': 'passes_accurate',
    'passes_pct': 'passes_pct',
    'odds': 'odds',
}


def _perspective_rename_map(team_side, opp_side):
    """Map ``<team_side>_*`` columns to ``team_*`` and ``<opp_side>_*`` to ``opp_*``."""
    rename_map = {}
    for suffix, new_suffix in _SIDE_COLUMN_SUFFIXES.items():
        rename_map['{}_{}'.format(team_side, suffix)] = 'team_{}'.format(new_suffix)
        rename_map['{}_{}'.format(opp_side, suffix)] = 'opp_{}'.format(new_suffix)
    return rename_map


def get_team_previous_games(team_id, game_date, season):
    """Collect a team's earlier fixtures, expressed from that team's viewpoint.

    Reads the notebook-level ``fixtures_df``. Every fixture dated strictly
    before ``game_date`` in which the team played is selected; per-side
    columns are renamed so the team's own values are ``team_*`` and the other
    side's are ``opp_*``, and a ``scenario`` column records whether the team
    played at home ('H') or away ('A'). ``fixtures_df`` itself is not modified.

    Args:
        team_id: Identifier matched against ``home_id`` / ``away_id``.
        game_date: Cut-off; only fixtures with ``date`` before it are used.
        season: Season the returned frames are restricted to.

    Returns:
        ``None`` when the team has no earlier home fixture or no earlier away
        fixture at all (checked across every season in ``fixtures_df``).
        Otherwise a tuple ``(all_games, home_games, away_games)`` restricted
        to ``season``; ``all_games`` is sorted by date.
    """
    played_before = fixtures_df['date'] < game_date
    home_rows = fixtures_df.loc[(fixtures_df['home_id'] == team_id) & played_before]
    away_rows = fixtures_df.loc[(fixtures_df['away_id'] == team_id) & played_before]

    if len(home_rows.index) == 0 or len(away_rows.index) == 0:
        return None

    # rename/assign return new frames, so the filtered selections are never
    # mutated in place (no reliance on suppressed SettingWithCopy warnings).
    home_previous_games = (
        home_rows
        .rename(columns=_perspective_rename_map('home', 'away'))
        .assign(scenario='H')
    )
    away_previous_games = (
        away_rows
        .rename(columns=_perspective_rename_map('away', 'home'))
        .assign(scenario='A')
    )

    previous_games = pd.concat([home_previous_games, away_previous_games], axis=0, ignore_index=True)
    previous_games = previous_games.sort_values('date')

    previous_season_games = previous_games.loc[previous_games['season'] == season]
    home_previous_season_games = home_previous_games.loc[home_previous_games['season'] == season]
    away_previous_season_games = away_previous_games.loc[away_previous_games['season'] == season]

    return previous_season_games, home_previous_season_games, away_previous_season_games

In [66]:
def get_games_results(games, scenario):
    """Count wins, draws and losses for a team playing in ``scenario``.

    Args:
        games: Frame with a ``winner`` column holding 'H', 'A' or 'D'.
        scenario: 'H' when the team was the home side; any other value is
            treated as the away side.

    Returns:
        Tuple ``(wins, draws, losses)`` of row counts.
    """
    losing_code = 'H' if scenario != 'H' else 'A'
    outcomes = games['winner']

    wins = int((outcomes == scenario).sum())
    draws = int((outcomes == 'D').sum())
    losses = int((outcomes == losing_code).sum())
    return wins, draws, losses

def get_stats_mean(games, team_id, scenario):
    """Average a team's match statistics over its most recent games.

    Only the last ``n_last_games`` rows of ``games`` (a notebook-level
    setting) are considered. Opponent-side statistics other than
    ``opp_score`` are not included.

    Args:
        games: Frame of a team's games with ``team_*`` / ``opp_*`` columns,
            expected in chronological order.
        team_id: Unused; kept for the existing call signature.
        scenario: Unused; kept for the existing call signature.

    Returns:
        List with the mean of each averaged column, in the fixed order of
        ``averaged_columns`` below.
    """
    averaged_columns = (
        'team_score', 'opp_score',
        'team_shots_on_goal', 'team_shots_off_goal', 'team_total_shots',
        'team_blocked_shots', 'team_shots_inside_box', 'team_shots_outside_box',
        'team_fouls', 'team_corners', 'team_offsides', 'team_possession',
        'team_yellow_cards', 'team_red_cards', 'team_saves',
        'team_total_passes', 'team_passes_accurate', 'team_passes_pct',
    )
    recent_games = games.iloc[-n_last_games:, :]
    return [recent_games[column].mean() for column in averaged_columns]

def get_historical_stats(games, home_games, away_games):
    """Summarise a team's record over a set of games.

    Args:
        games: All games considered; only its row count is used.
        home_games: The subset played at home.
        away_games: The subset played away.

    Returns:
        Tuple ``(points_pct, win_pct, draw_pct, loss_pct, home_wins,
        home_draws, home_losses, away_wins, away_draws, away_losses)``.
        Percentages are relative to ``len(games)``; ``points_pct`` is the
        share of the maximum points (3 per game) actually earned.
    """
    games_played = len(games.index)

    home_wins, home_draws, home_losses = get_games_results(home_games, 'H')
    away_wins, away_draws, away_losses = get_games_results(away_games, 'A')

    def share_of_games(count):
        # Percentage of all games played that ``count`` represents.
        return count * 100 / games_played

    wins = home_wins + away_wins
    draws = home_draws + away_draws
    losses = home_losses + away_losses

    win_pct = share_of_games(wins)
    draw_pct = share_of_games(draws)
    loss_pct = share_of_games(losses)

    # Three points for a win, one for a draw.
    points_earned = wins * 3 + draws
    points_pct = (points_earned * 100) / (games_played * 3)

    return (points_pct, win_pct, draw_pct, loss_pct,
            home_wins, home_draws, home_losses,
            away_wins, away_draws, away_losses)
    

def get_team_previous_games_stats(team_id, season, game_date, scenario, min_total_games=10, min_side_games=5):
    """Build the pre-match feature list for one team in one fixture.

    Uses only games of ``season`` played before ``game_date``, as returned by
    ``get_team_previous_games``.

    Args:
        team_id: Team to describe.
        season: Season of the fixture.
        game_date: Kick-off of the fixture; later games are ignored.
        scenario: 'H' if the team is the home side in the fixture, 'A' if it
            is the away side.
        min_total_games: Minimum number of season games required before any
            features are produced. Values below 1 can lead to a division by
            zero when the team has no games.
        min_side_games: Minimum number of season games in the same scenario
            (home games for 'H', away games for 'A'). Values below 1 can lead
            to a division by zero.

    Returns:
        ``None`` when there is not enough history. Otherwise a list with
        points/win/draw/loss percentages for the season, win/draw/loss
        percentages in the given scenario, win/draw/loss percentages over the
        last ``n_last_games`` games, followed by the values returned by
        ``get_stats_mean``.
    """
    response = get_team_previous_games(team_id, game_date, season)
    if not response:
        return None

    previous_season_games, home_previous_season_games, away_previous_season_games = response

    home_count = len(home_previous_season_games.index)
    away_count = len(away_previous_season_games.index)

    # Skip fixtures where the team's history is too thin to be informative.
    too_few_total = len(previous_season_games.index) < min_total_games
    too_few_home = scenario == 'H' and home_count < min_side_games
    too_few_away = scenario == 'A' and away_count < min_side_games
    if too_few_total or too_few_home or too_few_away:
        return None

    (points_pct, win_pct, draw_pct, loss_pct,
     home_wins, home_draws, home_losses,
     away_wins, away_draws, away_losses) = get_historical_stats(
        previous_season_games, home_previous_season_games, away_previous_season_games)

    # Recent form: the same summary restricted to the last n_last_games games.
    previous_last_games = previous_season_games.iloc[-n_last_games:, :]
    home_last_games = previous_last_games.loc[previous_last_games['scenario'] == 'H']
    away_last_games = previous_last_games.loc[previous_last_games['scenario'] == 'A']
    last_games_stats = get_historical_stats(previous_last_games, home_last_games, away_last_games)
    # Only the win/draw/loss percentages of the recent window are used.
    win_pct_last_games, draw_pct_last_games, loss_pct_last_games = last_games_stats[1:4]

    # Record in the same scenario (home or away) as the upcoming fixture.
    if scenario == 'H':
        ha_win_pct = home_wins * 100 / home_count
        ha_draw_pct = home_draws * 100 / home_count
        ha_loss_pct = home_losses * 100 / home_count
    else:
        ha_win_pct = away_wins * 100 / away_count
        ha_draw_pct = away_draws * 100 / away_count
        ha_loss_pct = away_losses * 100 / away_count

    game_stats = get_stats_mean(previous_season_games, team_id, scenario)

    return [points_pct, win_pct, draw_pct, loss_pct, ha_win_pct, ha_draw_pct, ha_loss_pct, win_pct_last_games, draw_pct_last_games, loss_pct_last_games] + game_stats
        

In [67]:
# Build the modelling table: one row per fixture for which both teams have
# enough prior history in the same season.
data_model = []

for index, game in fixtures_df.iterrows():
    # Replace the previous progress line instead of appending a new one.
    clear_output(wait=True)
    
    # NOTE(review): index is the 0-based row label, so the last line printed
    # is "<n-1>/<n>".
    print("{}/{}".format(index, len(fixtures_df.index)))
    
    # Skip fixtures without odds or without match statistics.
    if(pd.isnull(game['home_odds']) or pd.isnull(game['home_passes_accurate'])):
        continue
    
    # Features of the home side from its earlier games; None means the team
    # does not have enough history yet.
    home_stats = get_team_previous_games_stats(game['home_id'], game['season'], game['date'], 'H')
    if not home_stats:
        continue
        
    # Same for the away side.
    away_stats = get_team_previous_games_stats(game['away_id'], game['season'], game['date'], 'A')
    if not away_stats:
        continue
        
    # Row layout: fixture info and odds, home features, away features, then
    # the outcome and final score. It must match the column list defined for
    # the resulting DataFrame.
    data_model.append([game['id'], game['date'], game['season'], game['home_team'], game['away_team'], game['home_odds'], game['away_odds'], game['draw_odds']] + home_stats + away_stats + [game['winner'], game['home_score'], game['away_score']])

2819/2820


In [68]:
# Column names for the modelling table, in the exact order of the values
# appended to data_model: fixture info and odds, 28 home-side features,
# 28 away-side features, then outcome and final score.
# NOTE(review): the positions must stay aligned with the list returned by
# get_team_previous_games_stats; nothing checks this automatically.
columns = ['GAME_ID', 'GAME_DATE', 'SEASON', 'HOME_TEAM', 'AWAY_TEAM', 'HOME_ODDS', 'AWAY_ODDS', 'DRAW_ODDS',
           'HOME_PTS_PCT', 'HOME_WIN_PCT', 'HOME_DRAW_PCT', 'HOME_LOSS_PCT', 'HOME_HOME_WIN_PCT', 'HOME_HOME_DRAW_PCT', 'HOME_HOME_LOSS_PCT', f'HOME_WIN_PCT_LAST_GAMES', f'HOME_DRAW_PCT_LAST_GAMES', f'HOME_LOSS_PCT_LAST_GAMES','HOME_SCORE_LAST_GAMES', 'HOME_CONCEDED_LAST_GAMES', 'HOME_SHOTS_ON_GOAL_LAST_GAMES', 'HOME_SHOTS_OFF_GOAL_LAST_GAMES', 'HOME_TOTAL_SHOTS_LAST_GAMES', 'HOME_BLOCKED_SHOTS_LAST_GAMES','HOME_SHOTS_INSIDE_BOX_LAST_GAMES', 'HOME_SHOTS_OUTSIDE_BOX_LAST_GAMES', 'HOME_FOULS_LAST_GAMES', 'HOME_CORNERS_LAST_GAMES', 'HOME_OFFSIDES_LAST_GAMES','HOME_POSSESSION_LAST_GAMES', 'HOME_YELLOW_CARDS_LAST_GAMES', 'HOME_RED_CARDS_LAST_GAMES', 'HOME_SAVES_LAST_GAMES', 'HOME_TOTAL_PASSES_LAST_GAMES','HOME_PASSES_ACCURATE_LAST_GAMES', 'HOME_PASSES_PCT_LAST_GAMES',
           'AWAY_PTS_PCT', 'AWAY_WIN_PCT', 'AWAY_DRAW_PCT', 'AWAY_LOSS_PCT', 'AWAY_AWAY_WIN_PCT', 'AWAY_AWAY_DRAW_PCT', 'AWAY_AWAY_LOSS_PCT', f'AWAY_WIN_PCT_LAST_GAMES', f'AWAY_DRAW_PCT_LAST_GAMES', f'AWAY_LOSS_PCT_LAST_GAMES','AWAY_SCORE_LAST_GAMES', 'AWAY_CONCEDED_LAST_GAMES', 'AWAY_SHOTS_ON_GOAL_LAST_GAMES', 'AWAY_SHOTS_OFF_GOAL_LAST_GAMES', 'AWAY_TOTAL_SHOTS_LAST_GAMES', 'AWAY_BLOCKED_SHOTS_LAST_GAMES','AWAY_SHOTS_INSIDE_BOX_LAST_GAMES', 'AWAY_SHOTS_OUTSIDE_BOX_LAST_GAMES', 'AWAY_FOULS_LAST_GAMES', 'AWAY_CORNERS_LAST_GAMES', 'AWAY_OFFSIDES_LAST_GAMES','AWAY_POSSESSION_LAST_GAMES', 'AWAY_YELLOW_CARDS_LAST_GAMES', 'AWAY_RED_CARDS_LAST_GAMES', 'AWAY_SAVES_LAST_GAMES', 'AWAY_TOTAL_PASSES_LAST_GAMES','AWAY_PASSES_ACCURATE_LAST_GAMES', 'AWAY_PASSES_PCT_LAST_GAMES',
           'OUTCOME', 'HOME_SCORE', 'AWAY_SCORE']
# Materialise the collected rows as a DataFrame.
data_df = pd.DataFrame(data_model, columns=columns)

In [69]:
data_df.tail()

Unnamed: 0,GAME_ID,GAME_DATE,SEASON,HOME_TEAM,AWAY_TEAM,HOME_ODDS,AWAY_ODDS,DRAW_ODDS,HOME_PTS_PCT,HOME_WIN_PCT,...,AWAY_POSSESSION_LAST_GAMES,AWAY_YELLOW_CARDS_LAST_GAMES,AWAY_RED_CARDS_LAST_GAMES,AWAY_SAVES_LAST_GAMES,AWAY_TOTAL_PASSES_LAST_GAMES,AWAY_PASSES_ACCURATE_LAST_GAMES,AWAY_PASSES_PCT_LAST_GAMES,OUTCOME,HOME_SCORE,AWAY_SCORE
1973,868104,2022-11-12 12:00:00,2022,West Ham,Leicester,2.09,3.7,3.5,33.333333,26.666667,...,45.4,1.0,0.0,2.8,459.8,373.0,80.2,A,0,2
1974,868101,2022-11-12 14:30:00,2022,Newcastle,Chelsea,2.21,3.51,3.39,62.222222,46.666667,...,55.2,2.2,0.0,4.6,498.2,411.0,81.4,H,1,0
1975,868105,2022-11-12 16:45:00,2022,Wolves,Arsenal,6.47,1.56,4.15,24.444444,13.333333,...,59.25,1.25,0.0,2.25,527.75,450.75,85.0,A,0,2
1976,868097,2022-11-13 11:00:00,2022,Brighton,Aston Villa,1.75,4.99,3.77,51.111111,40.0,...,41.6,1.4,0.2,3.2,350.0,273.8,78.0,A,1,2
1977,868098,2022-11-13 13:30:00,2022,Fulham,Manchester Utd,4.17,1.88,3.82,44.444444,33.333333,...,55.6,2.4,0.0,2.0,518.4,428.2,82.4,A,1,2


In [70]:
def parse_df_to_csv(dataframe, path, filename):
    """Write ``dataframe`` to ``<path>/<filename>`` as CSV.

    Args:
        dataframe: Frame to export; written with ``DataFrame.to_csv`` using
            its default options (the index is included).
        path: Target directory. It is created, including missing parents,
            when it does not exist yet.
        filename: Name of the CSV file inside ``path``.
    """
    # exist_ok=True replaces the separate exists() check, which could race
    # with another process creating the directory in between.
    os.makedirs(path, exist_ok=True)
    dataframe.to_csv("{}/{}".format(path, filename))

In [71]:
# Export the modelling table to leagues/<league_id>/data/<start>-<end>.csv,
# relative to the notebook's working directory.
parse_df_to_csv(data_df, f'leagues/{league_id}/data', '{}-{}.csv'.format(start_season, end_season))