In [1]:
import requests
import json
import time
import pandas as pd
from IPython.display import clear_output
from datetime import datetime
import os
import utils.builder_functions as bf
import warnings
warnings.filterwarnings('ignore')

In [2]:
league = 'mls'
start_season = 2018
end_season = 2020

In [3]:
seasons = []

for season in range(start_season, end_season):
    seasons.append(pd.read_csv(f"./leagues_v2/{league}/data/{season}-{season + 1}.csv", index_col = 0).reset_index(drop=True))
    seasons[-1]['season'] = season

fixtures_df = pd.concat(seasons, axis=0).reset_index(drop=True)
fixtures_df.dropna(subset=['home_odds', 'away_odds', 'away_odds'], inplace=True)
fixtures_df['winner'] = fixtures_df.apply(lambda x: bf.get_winner(x['home_score'], x['away_score']), axis=1)

In [4]:
print(len(fixtures_df))

799


In [5]:
display(fixtures_df)

Unnamed: 0,date,week,home_team,home_xg,home_score,away_score,away_xg,away_team,home_odds,away_odds,draw_odds,season,winner
0,2018-03-03,Regular Season,Toronto FC,1.4,0,2,1.5,Columbus Crew,1.54,5.85,4.09,2018,A
1,2018-03-03,Regular Season,Dynamo,1.9,4,0,1.8,Atlanta Utd,2.47,2.73,3.40,2018,H
2,2018-03-04,Regular Season,Seattle,1.9,0,1,0.3,Los Angeles FC,1.65,4.83,3.95,2018,A
3,2018-03-04,Regular Season,Vancouver,1.2,2,1,0.9,Montreal Impact,1.82,4.15,3.63,2018,H
4,2018-03-03,Regular Season,Philadelphia,3.5,2,0,0.9,New England,1.91,3.86,3.55,2018,H
...,...,...,...,...,...,...,...,...,...,...,...,...,...
811,2019-10-06,Regular Season,Toronto FC,2.0,1,0,0.9,Columbus Crew,1.57,5.04,4.26,2019,H
812,2019-10-06,Regular Season,Philadelphia,2.2,1,2,1.5,NYCFC,1.91,3.44,4.00,2019,A
813,2019-10-06,Regular Season,Orlando City,3.7,2,5,2.3,Chicago Fire,2.53,2.43,3.82,2019,A
814,2019-10-06,Regular Season,D.C. United,2.7,0,0,0.4,FC Cincinnati,1.63,4.88,4.08,2019,D


In [6]:
n_last_games = 5

In [7]:
print(fixtures_df.columns)

Index(['date', 'week', 'home_team', 'home_xg', 'home_score', 'away_score',
       'away_xg', 'away_team', 'home_odds', 'away_odds', 'draw_odds', 'season',
       'winner'],
      dtype='object')


In [8]:
data_model = []

for index, game in fixtures_df.iterrows():
    clear_output(wait=True)
    
    print("{}/{}".format(index, len(fixtures_df.index)))
    
    if pd.isnull(game['home_odds']):
        continue
    
    home_stats = bf.get_team_previous_games_stats(game['home_team'], game['season'], game['date'], 'H', n_last_games, fixtures_df)
    if not home_stats:
        continue
        
    away_stats = bf.get_team_previous_games_stats(game['away_team'], game['season'], game['date'], 'A', n_last_games, fixtures_df)
    if not away_stats:
        continue
        
    data_model.append([game['date'], game['season'], game['home_team'], game['away_team'], game['home_odds'], game['away_odds'], game['draw_odds']] + home_stats + away_stats + [game['winner'], game['home_score'], game['away_score']])

815/799


In [9]:
data_df = bf.build_formatted_csv(data_model)

In [10]:
display(data_df)

Unnamed: 0,game_date,season,home_team,away_team,home_odds,away_odds,draw_odds,home_pts_pct,home_win_pct,home_draw_pct,...,away_win_pct_last_games,away_draw_pct_last_games,away_loss_pct_last_games,away_score_last_games,away_conceded_last_games,away_xg_last_games,away_conceded_xg_last_games,outcome,home_score,away_score
0,2018-05-19,2018,Philadelphia,Real Salt Lake,1.72,4.52,3.83,36.666667,30.000000,20.000000,...,40.0,0.0,60.0,602.6,8046.4,1.70,2.46,H,4,1
1,2018-05-19,2018,New England,Columbus Crew,2.29,2.85,3.60,56.666667,50.000000,20.000000,...,60.0,40.0,0.0,4402.6,4200.0,1.92,0.82,A,0,1
2,2018-05-20,2018,Minnesota Utd,Sporting KC,3.77,1.89,3.69,36.363636,36.363636,0.000000,...,60.0,20.0,20.0,5202.4,4020.0,2.62,1.06,D,1,1
3,2018-05-26,2018,NY Red Bulls,Philadelphia,1.48,6.17,4.41,70.000000,70.000000,0.000000,...,60.0,0.0,40.0,6004.8,4620.2,1.30,1.10,D,0,0
4,2018-05-25,2018,Dynamo,NYCFC,2.06,3.25,3.64,50.000000,40.000000,30.000000,...,40.0,20.0,40.0,604.8,6284.0,1.90,1.54,H,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
516,2019-10-06,2019,Toronto FC,Columbus Crew,1.57,5.04,4.26,47.474747,36.363636,33.333333,...,60.0,40.0,0.0,2.0,0.8,1.22,1.78,H,1,0
517,2019-10-06,2019,Philadelphia,NYCFC,1.91,3.44,4.00,55.555556,48.484848,21.212121,...,40.0,40.0,20.0,1.6,1.2,1.40,1.78,A,1,2
518,2019-10-06,2019,Orlando City,Chicago Fire,2.53,2.43,3.82,37.373737,27.272727,30.303030,...,20.0,60.0,20.0,1.6,1.0,1.94,1.08,A,2,5
519,2019-10-06,2019,D.C. United,FC Cincinnati,1.63,4.88,4.08,49.494949,39.393939,30.303030,...,20.0,40.0,40.0,0.6,1.6,1.20,1.54,D,0,0


In [11]:
def parse_df_to_csv(dataframe, path, filename):
    if not os.path.exists(path):
        os.makedirs(path)
    dataframe.to_csv("{}/{}".format(path, filename))

In [12]:
parse_df_to_csv(data_df, f'leagues_v2/{league}/formatted_data', '{}-{}.csv'.format(start_season, end_season))