In [23]:
import requests
import json
import time
import pandas as pd
from IPython.display import clear_output
from datetime import datetime
import os
import utils.helper_functions as hf
import warnings
import numpy as np
warnings.filterwarnings('ignore')

In [2]:
def get_winner(home_score, away_score):
    if home_score > away_score:
        return 'H'
    elif away_score > home_score:
        return 'A'
    else:
        return 'D'

In [9]:
league = 'premier-league'
start_season = 2017
end_season = 2023

In [26]:
seasons = []

for season in range(start_season, end_season):
    seasons.append(pd.read_csv(f"./leagues_v2/{league}/{season}-{season + 1}.csv", index_col = 0).reset_index(drop=True))
    seasons[-1]['season'] = season

fixtures_df = pd.concat(seasons, axis=0).reset_index(drop=True)
fixtures_df['winner'] = fixtures_df.apply(lambda x: get_winner(x['home_score'], x['away_score']), axis=1)

In [27]:
print(len(fixtures_df))

2267


In [28]:
display(fixtures_df)

Unnamed: 0,date,week,home_team,home_xg,home_score,away_score,away_xg,away_team,home_odds,away_odds,draw_odds,season,winner
0,2017-08-11,1,Arsenal,2.5,4,3,1.5,Leicester City,1.49,6.84,4.61,2017,H
1,2017-08-12,1,Watford,2.1,3,3,2.6,Liverpool,6.45,1.56,4.23,2017,D
2,2017-08-12,1,Crystal Palace,1.1,0,3,1.5,Huddersfield,1.80,5.00,3.56,2017,A
3,2017-08-12,1,West Brom,1.3,1,0,0.5,Bournemouth,2.56,3.01,3.22,2017,H
4,2017-08-12,1,Chelsea,1.5,2,3,0.6,Burnley,1.30,12.44,5.63,2017,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2262,2023-05-20,37,Wolves,1.6,1,1,2.0,Everton,3.14,2.31,3.44,2022,D
2263,2023-05-20,37,Nott'ham Forest,0.6,1,0,0.6,Arsenal,5.18,1.64,4.16,2022,H
2264,2023-05-21,37,West Ham,2.1,3,1,0.9,Leeds United,2.61,2.68,3.46,2022,H
2265,2023-05-21,37,Brighton,2.4,3,1,0.6,Southampton,1.21,12.68,7.30,2022,H


In [17]:
n_last_games = 5

In [18]:
print(fixtures_df.columns)

Index(['date', 'week', 'home_team', 'home_xg', 'home_score', 'away_score',
       'away_xg', 'away_team', 'home_odds', 'away_odds', 'draw_odds', 'season',
       'winner'],
      dtype='object')


In [19]:
def get_games_results(games, scenario):
    loser = 'A' if scenario == 'H' else 'H'
    return len(games.loc[games['winner'] == scenario].index), len(games.loc[games['winner'] == 'D'].index), len(games.loc[games['winner'] == loser].index)

def get_stats_mean(games, n_last_games, scenario):
    games = games.iloc[-n_last_games:,:]
    
    team_stats = [games['team_score'].mean(), games['opp_score'].mean(), games['team_xg'].mean(), games['opp_xg'].mean()]
    
    return team_stats

def get_historical_stats(games, home_games, away_games):
    total_games = len(games.index)
    home_wins, home_draws, home_losses = get_games_results(home_games, 'H')
    away_wins, away_draws, away_losses = get_games_results(away_games, 'A')
    
    total_wins = home_wins + away_wins
    total_draws = home_draws + away_draws
    total_losses = home_losses + away_losses
    
    win_pct = total_wins * 100 / total_games
    draw_pct = total_draws * 100 / total_games
    loss_pct = total_losses * 100 / total_games
    
    points_achieved = total_wins * 3 + total_draws
    points_pct = (points_achieved * 100) / (total_games * 3)
    
    return points_pct, win_pct, draw_pct, loss_pct, home_wins, home_draws, home_losses, away_wins, away_draws, away_losses

def get_team_previous_games(team, game_date, season, fixtures_df):
    home_previous_games = fixtures_df.loc[(fixtures_df['home_team'] == team) & (fixtures_df['date'] < game_date)]
    away_previous_games = fixtures_df.loc[(fixtures_df['away_team'] == team) & (fixtures_df['date'] < game_date)]
    
    if len(home_previous_games.index) == 0 or len(away_previous_games.index) == 0:
        return None
    
    home_previous_games.rename(columns = {
        'home_team': 'team', 'home_score': 'team_score', 'home_xg': 'team_xg',
                                          
        'away_team': 'opp', 'away_score': 'opp_score', 'away_xg': 'opp_xg',
                                          
        'home_odds': 'team_odds', 'away_odds': 'opp_odds'}, inplace=True)
    home_previous_games['scenario'] = 'H'
    
    away_previous_games.rename(columns = {
        'away_team': 'team', 'away_score': 'away_score', 'away_xg': 'team_xg',
                                          
        'home_team': 'opp', 'home_score': 'home_score', 'home_xg': 'opp_xg',
                                          
        'away_odds': 'team_odds', 'home_odds': 'opp_odds'}, inplace=True)
    away_previous_games['scenario'] = 'A'
    
    previous_games = pd.concat([home_previous_games, away_previous_games], axis=0, ignore_index=True)
    previous_games.sort_values('date', inplace=True)
    
    previous_season_games = previous_games.loc[previous_games['season'] == season]
    home_previous_season_games = home_previous_games.loc[home_previous_games['season'] == season]
    away_previous_season_games = away_previous_games.loc[away_previous_games['season'] == season]
    
    return previous_season_games, home_previous_season_games, away_previous_season_games

def get_team_previous_games_stats(team, season, game_date, scenario, n_last_games, fixtures_df):
    response = get_team_previous_games(team, game_date, season, fixtures_df)
    if not response: return None
    
    previous_season_games, home_previous_season_games, away_previous_season_games = response
    
    total_games = len(previous_season_games.index)
    if total_games < 10 or (len(home_previous_season_games.index) < 5 and scenario == 'H') or (len(away_previous_season_games.index) < 5 and scenario == 'A'):
        return
    
    points_pct, win_pct, draw_pct, loss_pct, home_wins, home_draws, home_losses, away_wins, away_draws, away_losses = get_historical_stats(previous_season_games, home_previous_season_games, away_previous_season_games)
    
    previous_last_games = previous_season_games.iloc[-n_last_games:,:]
    home_last_games = previous_last_games.loc[previous_last_games['scenario'] == 'H']
    away_last_games = previous_last_games.loc[previous_last_games['scenario'] == 'A']
    
    points_pct_last_games, win_pct_last_games, draw_pct_last_games, loss_pct_last_games, home_wins_last_games, home_draws_last_games, home_losses_last_games, away_wins_last_games, away_draws_last_games, away_losses_last_games = get_historical_stats(previous_last_games, home_last_games, away_last_games)
    
    if scenario == 'H':
        ha_win_pct = home_wins * 100 / len(home_previous_season_games.index)
        ha_draw_pct = home_draws * 100 / len(home_previous_season_games.index)
        ha_loss_pct = home_losses * 100 / len(home_previous_season_games.index)
    else:
        ha_win_pct = away_wins * 100 / len(away_previous_season_games.index)
        ha_draw_pct = away_draws * 100 / len(away_previous_season_games.index)
        ha_loss_pct = away_losses * 100 / len(away_previous_season_games.index)
        
    game_stats = get_stats_mean(previous_season_games, n_last_games, scenario)
    
    if any([np.isnan(s) for s in game_stats]): return None
    
    return [points_pct, win_pct, draw_pct, loss_pct, ha_win_pct, ha_draw_pct, ha_loss_pct, win_pct_last_games, draw_pct_last_games, loss_pct_last_games] + game_stats

In [29]:
data_model = []

for index, game in fixtures_df.iterrows():
    clear_output(wait=True)
    
    print("{}/{}".format(index, len(fixtures_df.index)))
    
    if pd.isnull(game['home_odds']):
        continue
    
    home_stats = get_team_previous_games_stats(game['home_team'], game['season'], game['date'], 'H', n_last_games, fixtures_df)
    if not home_stats:
        continue
        
    away_stats = get_team_previous_games_stats(game['away_team'], game['season'], game['date'], 'A', n_last_games, fixtures_df)
    if not away_stats:
        continue
        
    data_model.append([game['date'], game['season'], game['home_team'], game['away_team'], game['home_odds'], game['away_odds'], game['draw_odds']] + home_stats + away_stats + [game['winner'], game['home_score'], game['away_score']])

2266/2267


In [31]:
columns = ['game_date', 'season', 'home_team', 'away_team', 'home_odds', 'away_odds', 'draw_odds', 
           'home_pts_pct', 'home_win_pct', 'home_draw_pct', 'home_loss_pct', 'home_home_win_pct', 'home_home_draw_pct', 'home_home_loss_pct', 'home_win_pct_last_games', 'home_draw_pct_last_games', 'home_loss_pct_last_games', 'home_score_last_games', 'home_conceded_last_games', 'home_xg_last_games', 'home_conceded_xg_last_games', 
           'away_pts_pct', 'away_win_pct', 'away_draw_pct', 'away_loss_pct', 'away_away_win_pct', 'away_away_draw_pct', 'away_away_loss_pct', 'away_win_pct_last_games', 'away_draw_pct_last_games', 'away_loss_pct_last_games', 'away_score_last_games', 'away_conceded_last_games', 'away_xg_last_games', 'away_conceded_xg_last_games', 
           'outcome', 'home_score', 'away_score']
data_df = pd.DataFrame(data_model, columns=columns)

In [33]:
display(data_df)

Unnamed: 0,game_date,season,home_team,away_team,home_odds,away_odds,draw_odds,home_pts_pct,home_win_pct,home_draw_pct,...,away_win_pct_last_games,away_draw_pct_last_games,away_loss_pct_last_games,away_score_last_games,away_conceded_last_games,away_xg_last_games,away_conceded_xg_last_games,outcome,home_score,away_score
0,2017-11-04,2017,Stoke City,Leicester City,2.62,2.90,3.25,36.666667,30.000000,20.000000,...,40.0,40.0,20.0,1.666667,1.333333,1.58,0.94,D,2,2
1,2017-11-04,2017,Southampton,Burnley,1.67,6.36,3.65,43.333333,30.000000,40.000000,...,40.0,40.0,20.0,0.666667,0.333333,0.60,1.18,A,0,1
2,2017-11-04,2017,Swansea City,Brighton,2.56,3.19,3.02,26.666667,20.000000,20.000000,...,40.0,40.0,20.0,1.000000,0.666667,1.00,1.28,A,0,1
3,2017-11-04,2017,Huddersfield,West Brom,2.86,2.91,2.94,40.000000,30.000000,30.000000,...,0.0,40.0,60.0,2.000000,2.500000,0.98,1.84,H,1,0
4,2017-11-04,2017,Newcastle Utd,Bournemouth,1.84,4.72,3.61,46.666667,40.000000,20.000000,...,20.0,20.0,60.0,0.000000,0.500000,0.70,1.44,A,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1650,2023-05-20,2022,Wolves,Everton,3.14,2.31,3.44,37.037037,30.555556,19.444444,...,20.0,40.0,40.0,0.500000,3.500000,1.66,2.06,D,1,1
1651,2023-05-20,2022,Nott'ham Forest,Arsenal,5.18,1.64,4.16,31.481481,22.222222,27.777778,...,40.0,20.0,40.0,2.000000,2.333333,1.38,1.52,H,1,0
1652,2023-05-21,2022,West Ham,Leeds United,2.61,2.68,3.46,34.259259,27.777778,19.444444,...,0.0,40.0,60.0,1.500000,1.500000,1.24,2.14,H,3,1
1653,2023-05-21,2022,Brighton,Southampton,1.21,12.68,7.30,55.238095,48.571429,20.000000,...,0.0,20.0,80.0,0.000000,1.500000,1.22,2.18,H,3,1


In [34]:
def parse_df_to_csv(dataframe, path, filename):
    if not os.path.exists(path):
        os.makedirs(path)
    dataframe.to_csv("{}/{}".format(path, filename))

In [37]:
parse_df_to_csv(data_df, f'leagues_data/{league}', '{}-{}.csv'.format(start_season, end_season))