In [1]:
import requests
import json
import time
import pandas as pd
from IPython.display import clear_output
from datetime import datetime
import os
import utils.helper_functions as hf
import warnings
# Suppress every warning for the rest of the session: no category, module or
# message filter is passed, so nothing emitted by later cells will be shown.
warnings.filterwarnings('ignore')

In [2]:
def get_winner(home_score, away_score):
    """Return the result label of a match from its final score.

    Parameters
    ----------
    home_score, away_score : number or None
        Goals scored by the home side and by the away side.

    Returns
    -------
    str or None
        'H' when the home side scored more, 'A' when the away side scored
        more, 'D' when both scored the same, and None when either score is
        missing (None or NaN).
    """
    # A NaN score compares False against everything, so without this guard it
    # would fall through to the draw branch; a None score would raise TypeError.
    if pd.isnull(home_score) or pd.isnull(away_score):
        return None
    if home_score > away_score:
        return 'H'
    if away_score > home_score:
        return 'A'
    return 'D'

In [3]:
# Notebook parameters: the league to export and the first/last season used
# (both bounds are included by the query's >= / <= filters).
league_id = 40
start_season, end_season = 2015, 2022

In [4]:
# Select every match of the chosen league within the season range, ordered by
# date ascending: ids, team names, final score, the three odds columns and the
# per-side match statistics.
query = (
    "SELECT m.id, m.date, m.season, l.name AS league, "
    "ht.id as home_id, at.id as away_id, ht.name as home_team, at.name as away_team, "
    "m.home_score, m.away_score, m.home_odds, m.away_odds, m.draw_odds, "
    "m.home_shots_on_goal, m.home_shots_off_goal, m.home_total_shots, m.home_blocked_shots, "
    "m.home_shots_inside_box, m.home_shots_outside_box, m.home_fouls, m.home_corners, "
    "m.home_offsides, m.home_possession, m.home_yellow_cards, m.home_red_cards, "
    "m.home_saves, m.home_total_passes, m.home_passes_accurate, m.home_passes_pct, "
    "m.away_shots_on_goal, m.away_shots_off_goal, m.away_total_shots, m.away_blocked_shots, "
    "m.away_shots_inside_box, m.away_shots_outside_box, m.away_fouls, m.away_corners, "
    "m.away_offsides, m.away_possession, m.away_yellow_cards, m.away_red_cards, "
    "m.away_saves, m.away_total_passes, m.away_passes_accurate, m.away_passes_pct "
    "FROM matches AS m "
    "INNER JOIN teams AS ht ON (m.home_id = ht.id) "
    "INNER JOIN teams AS at ON (m.away_id = at.id) "
    "INNER JOIN leagues AS l ON (m.league_id = l.id) "
    f"WHERE (l.id = {league_id} AND m.season >= {start_season} AND m.season <= {end_season}) "
    "ORDER BY m.date ASC"
)
fixtures_df = hf.execute_query(query)

# Label each match with 'H' / 'A' / 'D' from its final score.
fixtures_df['winner'] = fixtures_df.apply(
    lambda match: get_winner(match['home_score'], match['away_score']),
    axis=1,
)

In [5]:
# Number of matches returned by the query.
print(len(fixtures_df))

4161


In [6]:
# Preview the last rows of the query result plus the derived 'winner' column.
fixtures_df.tail()

Unnamed: 0,id,date,season,league,home_id,away_id,home_team,away_team,home_score,away_score,...,away_corners,away_offsides,away_possession,away_yellow_cards,away_red_cards,away_saves,away_total_passes,away_passes_accurate,away_passes_pct,winner
4156,880978,2022-12-10 12:00:00,2022,Championship (England),62,37,Sheffield Utd,Huddersfield,1,0,...,1.0,1.0,39.0,2.0,0.0,3.0,384.0,259.0,67.0,H
4157,880979,2022-12-10 12:00:00,2022,Championship (England),75,43,Stoke,Cardiff,2,2,...,5.0,2.0,56.0,2.0,0.0,2.0,465.0,356.0,77.0,D
4158,880981,2022-12-10 12:00:00,2022,Championship (England),76,71,Swansea,Norwich,0,1,...,4.0,3.0,35.0,5.0,0.0,7.0,348.0,271.0,78.0,A
4159,880975,2022-12-11 10:00:00,2022,Championship (England),72,44,QPR,Burnley,0,3,...,6.0,1.0,57.0,2.0,0.0,2.0,537.0,455.0,85.0,A
4160,880982,2022-12-11 12:00:00,2022,Championship (England),38,64,Watford,Hull,0,0,...,5.0,0.0,48.0,4.0,0.0,3.0,473.0,384.0,81.0,D


In [7]:
# Passed to hf.get_team_previous_games_stats for both the home and the away team.
# NOTE(review): presumably the number of previous matches behind the *_LAST_GAMES
# features - confirm in utils.helper_functions.
n_last_games = 5

In [8]:
# List the columns available on each fixture row (query columns plus 'winner').
print(fixtures_df.columns)

Index(['id', 'date', 'season', 'league', 'home_id', 'away_id', 'home_team',
       'away_team', 'home_score', 'away_score', 'home_odds', 'away_odds',
       'draw_odds', 'home_shots_on_goal', 'home_shots_off_goal',
       'home_total_shots', 'home_blocked_shots', 'home_shots_inside_box',
       'home_shots_outside_box', 'home_fouls', 'home_corners', 'home_offsides',
       'home_possession', 'home_yellow_cards', 'home_red_cards', 'home_saves',
       'home_total_passes', 'home_passes_accurate', 'home_passes_pct',
       'away_shots_on_goal', 'away_shots_off_goal', 'away_total_shots',
       'away_blocked_shots', 'away_shots_inside_box', 'away_shots_outside_box',
       'away_fouls', 'away_corners', 'away_offsides', 'away_possession',
       'away_yellow_cards', 'away_red_cards', 'away_saves',
       'away_total_passes', 'away_passes_accurate', 'away_passes_pct',
       'winner'],
      dtype='object')


In [9]:
# Build one feature row per fixture: match identifiers and odds, then the stats
# returned by the helper for the home team, then for the away team, then the
# outcome and final score.
data_model = []
total_games = len(fixtures_df.index)  # loop-invariant, so computed once

for position, (row_label, game) in enumerate(fixtures_df.iterrows(), start=1):
    clear_output(wait=True)

    # 1-based counter: the final line reads total/total instead of stopping one
    # short, and it no longer depends on the frame having a default 0..n-1 index.
    print("{}/{}".format(position, total_games))

    # Skip fixtures without home odds or without home pass statistics.
    if pd.isnull(game['home_odds']) or pd.isnull(game['home_passes_accurate']):
        continue

    # A falsy result from the helper means no usable history for that team
    # (presumably too few previous games - confirm in utils.helper_functions).
    home_stats = hf.get_team_previous_games_stats(game['home_id'], game['season'], game['date'], 'H', n_last_games, fixtures_df)
    if not home_stats:
        continue

    away_stats = hf.get_team_previous_games_stats(game['away_id'], game['season'], game['date'], 'A', n_last_games, fixtures_df)
    if not away_stats:
        continue

    match_info = [game['id'], game['date'], game['season'], game['home_team'], game['away_team'], game['home_odds'], game['away_odds'], game['draw_odds']]
    result_info = [game['winner'], game['home_score'], game['away_score']]
    data_model.append(match_info + home_stats + away_stats + result_info)

4160/4161


In [10]:
# Leading columns: match identification and the three odds values.
match_columns = ['GAME_ID', 'GAME_DATE', 'SEASON', 'HOME_TEAM', 'AWAY_TEAM', 'HOME_ODDS', 'AWAY_ODDS', 'DRAW_ODDS']

# Statistic names that appear as <SIDE>_<STAT>_LAST_GAMES, in output order.
last_games_stats = [
    'WIN_PCT', 'DRAW_PCT', 'LOSS_PCT',
    'SCORE', 'CONCEDED',
    'SHOTS_ON_GOAL', 'SHOTS_OFF_GOAL', 'TOTAL_SHOTS', 'BLOCKED_SHOTS',
    'SHOTS_INSIDE_BOX', 'SHOTS_OUTSIDE_BOX',
    'FOULS', 'CORNERS', 'OFFSIDES', 'POSSESSION',
    'YELLOW_CARDS', 'RED_CARDS', 'SAVES',
    'TOTAL_PASSES', 'PASSES_ACCURATE', 'PASSES_PCT',
]


def _team_columns(side):
    """Return the 28 feature column names for one side ('HOME' or 'AWAY')."""
    overall = [f'{side}_PTS_PCT', f'{side}_WIN_PCT', f'{side}_DRAW_PCT', f'{side}_LOSS_PCT']
    # Columns whose name repeats the side prefix, e.g. HOME_HOME_WIN_PCT.
    same_venue = [f'{side}_{side}_WIN_PCT', f'{side}_{side}_DRAW_PCT', f'{side}_{side}_LOSS_PCT']
    recent = [f'{side}_{stat}_LAST_GAMES' for stat in last_games_stats]
    return overall + same_venue + recent


# Same order as the rows appended to data_model: match info, home features,
# away features, then outcome and final score.
columns = match_columns + _team_columns('HOME') + _team_columns('AWAY') + ['OUTCOME', 'HOME_SCORE', 'AWAY_SCORE']
data_df = pd.DataFrame(data_model, columns=columns)

In [11]:
# Preview the last rows of the assembled feature table.
data_df.tail()

Unnamed: 0,GAME_ID,GAME_DATE,SEASON,HOME_TEAM,AWAY_TEAM,HOME_ODDS,AWAY_ODDS,DRAW_ODDS,HOME_PTS_PCT,HOME_WIN_PCT,...,AWAY_POSSESSION_LAST_GAMES,AWAY_YELLOW_CARDS_LAST_GAMES,AWAY_RED_CARDS_LAST_GAMES,AWAY_SAVES_LAST_GAMES,AWAY_TOTAL_PASSES_LAST_GAMES,AWAY_PASSES_ACCURATE_LAST_GAMES,AWAY_PASSES_PCT_LAST_GAMES,OUTCOME,HOME_SCORE,AWAY_SCORE
3127,880973,2022-12-10 12:00:00,2022,Middlesbrough,Luton,1.96,4.05,3.47,44.444444,33.333333,...,50.8,2.0,0.0,2.8,325.6,215.6,65.8,H,2,1
3128,880974,2022-12-10 12:00:00,2022,Millwall,Wigan,1.98,4.29,3.27,47.619048,38.095238,...,47.4,1.8,0.0,1.2,366.0,268.0,72.0,D,1,1
3129,880978,2022-12-10 12:00:00,2022,Sheffield Utd,Huddersfield,1.51,6.7,4.31,61.904762,52.380952,...,36.6,2.8,0.0,3.2,319.2,225.8,68.4,H,1,0
3130,880979,2022-12-10 12:00:00,2022,Stoke,Cardiff,2.11,3.78,3.26,36.507937,28.571429,...,50.0,1.0,0.0,1.8,420.4,326.6,77.2,D,2,2
3131,880981,2022-12-10 12:00:00,2022,Swansea,Norwich,2.44,2.95,3.4,46.031746,33.333333,...,48.2,2.8,0.0,2.4,410.4,319.0,76.6,A,0,1


In [12]:
def parse_df_to_csv(dataframe, path, filename):
    """Write ``dataframe`` as CSV to ``path/filename``, creating ``path`` if needed.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to export; written with ``DataFrame.to_csv`` defaults, so the
        index is included as the first column.
    path : str
        Target directory. Missing parent directories are created. An empty
        string means the current working directory.
    filename : str
        Name of the CSV file inside ``path``.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs;
    # the guard is needed because os.makedirs('') raises FileNotFoundError.
    if path:
        os.makedirs(path, exist_ok=True)
    dataframe.to_csv(os.path.join(path, filename))

In [13]:
# Export the feature table to leagues/<league_id>/data/<start_season>-<end_season>.csv
output_dir = f'leagues/{league_id}/data'
output_name = f'{start_season}-{end_season}.csv'
parse_df_to_csv(data_df, output_dir, output_name)