In [1]:
import requests
import json
import time
import pandas as pd
from IPython.display import clear_output
from datetime import datetime
import os
import utils.helper_functions as hf
import warnings
import numpy as np
warnings.filterwarnings('ignore')

In [2]:
def get_winner(home_score, away_score):
    if home_score > away_score:
        return 'H'
    elif away_score > home_score:
        return 'A'
    else:
        return 'D'

In [3]:
league = 'mls'
start_season = 2018
end_season = 2024

In [40]:
seasons = []

for season in range(start_season, end_season):
    seasons.append(pd.read_csv(f"./leagues_v2/{league}/data/{season}-{season + 1}.csv", index_col = 0).reset_index(drop=True))
    seasons[-1]['season'] = season

fixtures_df = pd.concat(seasons, axis=0).reset_index(drop=True)
fixtures_df.dropna(subset=['home_odds', 'away_odds', 'away_odds'], inplace=True)
fixtures_df['winner'] = fixtures_df.apply(lambda x: get_winner(x['home_score'], x['away_score']), axis=1)

In [41]:
print(len(fixtures_df))

2182


In [43]:
display(fixtures_df)

Unnamed: 0,date,week,home_team,home_xg,home_score,away_score,away_xg,away_team,home_odds,away_odds,draw_odds,season,winner
0,2018-03-03,Regular Season,Toronto FC,1.4,0,2,1.5,Columbus Crew,1.54,5.85,4.09,2018,A
1,2018-03-03,Regular Season,Dynamo,1.9,4,0,1.8,Atlanta Utd,2.47,2.73,3.4,2018,H
2,2018-03-04,Regular Season,Seattle,1.9,0,1,0.3,Los Angeles FC,1.65,4.83,3.95,2018,A
3,2018-03-04,Regular Season,Vancouver,1.2,2,1,0.9,Montreal Impact,1.82,4.15,3.63,2018,H
4,2018-03-03,Regular Season,Philadelphia,3.5,2,0,0.9,New England,1.91,3.86,3.55,2018,H
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2301,2023-05-20,Sat,FC Dallas,1.6,1,1,0.6,Dynamo FC,1.87,4.27,3.45,2023,D
2302,2023-05-20,Sat,Portland Timbers,0.7,0,1,0.6,Minnesota Utd,1.96,3.65,3.64,2023,A
2303,2023-05-20,Sat,Austin,1.2,1,0,0.2,Toronto FC,2.03,3.58,3.46,2023,H
2304,2023-05-20,Sat,Colorado Rapids,1.1,2,3,0.8,Real Salt Lake,1.81,4.25,3.66,2023,A


In [44]:
n_last_games = 5

In [45]:
print(fixtures_df.columns)

Index(['date', 'week', 'home_team', 'home_xg', 'home_score', 'away_score',
       'away_xg', 'away_team', 'home_odds', 'away_odds', 'draw_odds', 'season',
       'winner'],
      dtype='object')


In [46]:
def get_games_results(games, scenario):
    loser = 'A' if scenario == 'H' else 'H'
    return len(games.loc[games['winner'] == scenario].index), len(games.loc[games['winner'] == 'D'].index), len(games.loc[games['winner'] == loser].index)

def get_stats_mean(games, n_last_games, scenario):
    games = games.iloc[-n_last_games:,:]
    
    team_stats = [games['team_score'].mean(), games['opp_score'].mean(), games['team_xg'].mean(), games['opp_xg'].mean()]
    
    return team_stats

def get_historical_stats(games, home_games, away_games):
    total_games = len(games.index)
    home_wins, home_draws, home_losses = get_games_results(home_games, 'H')
    away_wins, away_draws, away_losses = get_games_results(away_games, 'A')
    
    total_wins = home_wins + away_wins
    total_draws = home_draws + away_draws
    total_losses = home_losses + away_losses
    
    win_pct = total_wins * 100 / total_games
    draw_pct = total_draws * 100 / total_games
    loss_pct = total_losses * 100 / total_games
    
    points_achieved = total_wins * 3 + total_draws
    points_pct = (points_achieved * 100) / (total_games * 3)
    
    return points_pct, win_pct, draw_pct, loss_pct, home_wins, home_draws, home_losses, away_wins, away_draws, away_losses

def get_team_previous_games(team, game_date, season, fixtures_df):
    home_previous_games = fixtures_df.loc[(fixtures_df['home_team'] == team) & (fixtures_df['date'] < game_date)]
    away_previous_games = fixtures_df.loc[(fixtures_df['away_team'] == team) & (fixtures_df['date'] < game_date)]
    
    if len(home_previous_games.index) == 0 or len(away_previous_games.index) == 0:
        return None
    
    home_previous_games.rename(columns = {
        'home_team': 'team', 'home_score': 'team_score', 'home_xg': 'team_xg',
                                          
        'away_team': 'opp', 'away_score': 'opp_score', 'away_xg': 'opp_xg',
                                          
        'home_odds': 'team_odds', 'away_odds': 'opp_odds'}, inplace=True)
    home_previous_games['scenario'] = 'H'
    
    away_previous_games.rename(columns = {
        'away_team': 'team', 'away_score': 'team_score', 'away_xg': 'team_xg',
                                          
        'home_team': 'opp', 'home_score': 'opp_score', 'home_xg': 'opp_xg',
                                          
        'away_odds': 'team_odds', 'home_odds': 'opp_odds'}, inplace=True)
    away_previous_games['scenario'] = 'A'
    
    previous_games = pd.concat([home_previous_games, away_previous_games], axis=0, ignore_index=True)
    previous_games.sort_values('date', inplace=True)
    
    previous_season_games = previous_games.loc[previous_games['season'] == season]
    home_previous_season_games = home_previous_games.loc[home_previous_games['season'] == season]
    away_previous_season_games = away_previous_games.loc[away_previous_games['season'] == season]
    
    return previous_season_games, home_previous_season_games, away_previous_season_games

def get_team_previous_games_stats(team, season, game_date, scenario, n_last_games, fixtures_df):
    response = get_team_previous_games(team, game_date, season, fixtures_df)
    if not response: return None
    
    previous_season_games, home_previous_season_games, away_previous_season_games = response
    
    total_games = len(previous_season_games.index)
    if total_games < 10 or (len(home_previous_season_games.index) < 5 and scenario == 'H') or (len(away_previous_season_games.index) < 5 and scenario == 'A'):
        return
    
    points_pct, win_pct, draw_pct, loss_pct, home_wins, home_draws, home_losses, away_wins, away_draws, away_losses = get_historical_stats(previous_season_games, home_previous_season_games, away_previous_season_games)
    
    previous_last_games = previous_season_games.iloc[-n_last_games:,:]
    home_last_games = previous_last_games.loc[previous_last_games['scenario'] == 'H']
    away_last_games = previous_last_games.loc[previous_last_games['scenario'] == 'A']
    
    points_pct_last_games, win_pct_last_games, draw_pct_last_games, loss_pct_last_games, home_wins_last_games, home_draws_last_games, home_losses_last_games, away_wins_last_games, away_draws_last_games, away_losses_last_games = get_historical_stats(previous_last_games, home_last_games, away_last_games)
    
    if scenario == 'H':
        ha_win_pct = home_wins * 100 / len(home_previous_season_games.index)
        ha_draw_pct = home_draws * 100 / len(home_previous_season_games.index)
        ha_loss_pct = home_losses * 100 / len(home_previous_season_games.index)
    else:
        ha_win_pct = away_wins * 100 / len(away_previous_season_games.index)
        ha_draw_pct = away_draws * 100 / len(away_previous_season_games.index)
        ha_loss_pct = away_losses * 100 / len(away_previous_season_games.index)
        
    game_stats = get_stats_mean(previous_season_games, n_last_games, scenario)
    
    if any([np.isnan(s) for s in game_stats]): return None
    
    return [points_pct, win_pct, draw_pct, loss_pct, ha_win_pct, ha_draw_pct, ha_loss_pct, win_pct_last_games, draw_pct_last_games, loss_pct_last_games] + game_stats

In [47]:
data_model = []

for index, game in fixtures_df.iterrows():
    clear_output(wait=True)
    
    print("{}/{}".format(index, len(fixtures_df.index)))
    
    if pd.isnull(game['home_odds']):
        continue
    
    home_stats = get_team_previous_games_stats(game['home_team'], game['season'], game['date'], 'H', n_last_games, fixtures_df)
    if not home_stats:
        continue
        
    away_stats = get_team_previous_games_stats(game['away_team'], game['season'], game['date'], 'A', n_last_games, fixtures_df)
    if not away_stats:
        continue
        
    data_model.append([game['date'], game['season'], game['home_team'], game['away_team'], game['home_odds'], game['away_odds'], game['draw_odds']] + home_stats + away_stats + [game['winner'], game['home_score'], game['away_score']])

2305/2182


In [48]:
columns = ['game_date', 'season', 'home_team', 'away_team', 'home_odds', 'away_odds', 'draw_odds', 
           'home_pts_pct', 'home_win_pct', 'home_draw_pct', 'home_loss_pct', 'home_home_win_pct', 'home_home_draw_pct', 'home_home_loss_pct', 'home_win_pct_last_games', 'home_draw_pct_last_games', 'home_loss_pct_last_games', 'home_score_last_games', 'home_conceded_last_games', 'home_xg_last_games', 'home_conceded_xg_last_games', 
           'away_pts_pct', 'away_win_pct', 'away_draw_pct', 'away_loss_pct', 'away_away_win_pct', 'away_away_draw_pct', 'away_away_loss_pct', 'away_win_pct_last_games', 'away_draw_pct_last_games', 'away_loss_pct_last_games', 'away_score_last_games', 'away_conceded_last_games', 'away_xg_last_games', 'away_conceded_xg_last_games', 
           'outcome', 'home_score', 'away_score']
data_df = pd.DataFrame(data_model, columns=columns)

In [49]:
display(data_df)

Unnamed: 0,game_date,season,home_team,away_team,home_odds,away_odds,draw_odds,home_pts_pct,home_win_pct,home_draw_pct,...,away_win_pct_last_games,away_draw_pct_last_games,away_loss_pct_last_games,away_score_last_games,away_conceded_last_games,away_xg_last_games,away_conceded_xg_last_games,outcome,home_score,away_score
0,2018-05-19,2018,Philadelphia,Real Salt Lake,1.72,4.52,3.83,36.666667,30.000000,20.000000,...,40.0,0.0,60.0,602.6,8046.4,1.70,2.46,H,4,1
1,2018-05-19,2018,New England,Columbus Crew,2.29,2.85,3.6,56.666667,50.000000,20.000000,...,60.0,40.0,0.0,4402.6,4200.0,1.92,0.82,A,0,1
2,2018-05-20,2018,Minnesota Utd,Sporting KC,3.77,1.89,3.69,36.363636,36.363636,0.000000,...,60.0,20.0,20.0,5202.4,4020.0,2.62,1.06,D,1,1
3,2018-05-26,2018,NY Red Bulls,Philadelphia,1.48,6.17,4.41,70.000000,70.000000,0.000000,...,60.0,0.0,40.0,6004.8,4620.2,1.30,1.10,D,0,0
4,2018-05-25,2018,Dynamo,NYCFC,2.06,3.25,3.64,50.000000,40.000000,30.000000,...,40.0,20.0,40.0,604.8,6284.0,1.90,1.54,H,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1277,2023-05-20,2023,FC Dallas,Dynamo FC,1.87,4.27,3.45,58.333333,50.000000,25.000000,...,20.0,40.0,40.0,0.4,0.6,0.64,1.58,D,1,1
1278,2023-05-20,2023,Portland Timbers,Minnesota Utd,1.96,3.65,3.64,41.025641,30.769231,30.769231,...,20.0,20.0,60.0,0.6,1.4,1.46,1.18,A,0,1
1279,2023-05-20,2023,Austin,Toronto FC,2.03,3.58,3.46,36.111111,25.000000,33.333333,...,20.0,20.0,60.0,0.6,1.6,0.98,1.54,H,1,0
1280,2023-05-20,2023,Colorado Rapids,Real Salt Lake,1.81,4.25,3.66,30.769231,15.384615,46.153846,...,20.0,60.0,20.0,0.6,0.8,1.10,1.16,A,2,3


In [50]:
def parse_df_to_csv(dataframe, path, filename):
    if not os.path.exists(path):
        os.makedirs(path)
    dataframe.to_csv("{}/{}".format(path, filename))

In [51]:
parse_df_to_csv(data_df, f'leagues_v2/{league}/formatted_data', '{}-{}.csv'.format(start_season, end_season))