In [1]:
import requests
import json
import time
import pandas as pd
from IPython.display import clear_output
from datetime import datetime
import os
import utils.builder_functions as bf
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Notebook configuration.
# `league` is the folder name under leagues/ used for both the input CSVs and
# the formatted output.
league = 'serie-a'
# Seasons are loaded with range(start_season, end_season), so start_season is
# the first starting year read and end_season itself is not used as a starting
# year (the last file read is "{end_season - 1}-{end_season}.csv").
start_season = 2019
end_season = 2024

In [3]:
# Load one CSV per season and tag every row with the season's starting year.
# range() excludes end_season, so the last file read is
# "{end_season - 1}-{end_season}.csv".
seasons = []

for season in range(start_season, end_season):
    season_df = pd.read_csv(f"./leagues/{league}/data/{season}-{season + 1}.csv", index_col=0).reset_index(drop=True)
    season_df['season'] = season
    seasons.append(season_df)

fixtures_df = pd.concat(seasons, axis=0).reset_index(drop=True)

# Drop fixtures missing ANY of the three odds. The previous subset listed
# 'away_odds' twice and never checked 'draw_odds', so rows without draw odds
# slipped through. Rebinding instead of inplace=True keeps the cell easy to
# reason about on re-run.
fixtures_df = fixtures_df.dropna(subset=['home_odds', 'away_odds', 'draw_odds'])

# Label each fixture's result via the project helper (values such as
# 'H' / 'A' / 'D' appear in the displayed output).
fixtures_df['winner'] = fixtures_df.apply(lambda x: bf.get_winner(x['home_score'], x['away_score']), axis=1)

In [4]:
# Row count of the combined fixtures table after dropping rows without odds.
print(fixtures_df.shape[0])

1600


In [5]:
# Preview the combined fixtures table (all loaded seasons plus the added
# 'season' and 'winner' columns).
display(fixtures_df)

Unnamed: 0,date,week,home_team,home_xg,home_score,away_score,away_xg,away_team,home_odds,away_odds,draw_odds,season,winner
0,2019-04-27,1,São Paulo,1.1,2,0,0.5,Botafogo (RJ),1.74,5.1,3.35,2019,H
1,2019-04-27,1,Chapecoense,1.4,2,0,1.2,Internacional,2.9,2.6,2.93,2019,H
2,2019-04-27,1,Atlético Mineiro,2.4,2,1,0.6,Avaí,1.53,6.25,3.91,2019,H
3,2019-04-28,1,Grêmio,3.0,1,2,0.7,Santos,1.69,5.35,3.41,2019,A
4,2019-04-28,1,Bahia,1.4,3,2,0.4,Corinthians,2.21,3.45,3.04,2019,H
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1595,2023-05-28,8,Internacional,0.9,2,0,1.2,Bahia,2.06,3.55,3.42,2023,H
1596,2023-05-28,8,Bragantino,1.4,2,0,0.3,Santos,1.75,5.06,3.41,2023,H
1597,2023-05-28,8,Atlético Mineiro,0.9,1,1,1.1,Palmeiras,2.23,3.29,3.24,2023,D
1598,2023-05-28,8,Botafogo (RJ),2.2,2,0,0.3,América (MG),1.79,4.51,3.59,2023,H


In [6]:
# Passed to bf.get_team_previous_games_stats for both the home and away team;
# presumably the size of the recent-form window (the "*_last_games" columns) —
# confirm in utils/builder_functions.
n_last_games = 5

In [7]:
# List the columns available on each fixture row before building features.
print(fixtures_df.columns)

Index(['date', 'week', 'home_team', 'home_xg', 'home_score', 'away_score',
       'away_xg', 'away_team', 'home_odds', 'away_odds', 'draw_odds', 'season',
       'winner'],
      dtype='object')


In [8]:
# Build one feature row per fixture: fixture info + home-team stats +
# away-team stats + result. Fixtures for which the helper returns no stats
# for either side are skipped.
data_model = []

for index, game in fixtures_df.iterrows():
    # Single-line progress indicator: wipe the previous output, then print
    # the current index label against the number of fixtures.
    clear_output(wait=True)
    print(f"{index}/{len(fixtures_df.index)}")

    # No home odds -> nothing to model for this fixture.
    if pd.isnull(game['home_odds']):
        continue

    home_stats = bf.get_team_previous_games_stats(
        game['home_team'], game['season'], game['date'], 'H', n_last_games, fixtures_df
    )
    if not home_stats:
        continue

    away_stats = bf.get_team_previous_games_stats(
        game['away_team'], game['season'], game['date'], 'A', n_last_games, fixtures_df
    )
    if not away_stats:
        continue

    fixture_info = [
        game['date'],
        game['season'],
        game['home_team'],
        game['away_team'],
        game['home_odds'],
        game['away_odds'],
        game['draw_odds'],
    ]
    result_info = [game['winner'], game['home_score'], game['away_score']]

    data_model.append(fixture_info + home_stats + away_stats + result_info)

1599/1600


In [9]:
# Hand the collected rows to the project helper. The result is displayed and
# written with .to_csv below, so it is presumably a DataFrame with named
# columns — confirm in utils/builder_functions.
data_df = bf.build_formatted_csv(data_model)

In [10]:
# Preview the formatted, model-ready table before exporting it.
display(data_df)

Unnamed: 0,game_date,season,home_team,away_team,home_odds,away_odds,draw_odds,home_pts_pct,home_win_pct,home_draw_pct,...,away_win_pct_last_games,away_draw_pct_last_games,away_loss_pct_last_games,away_score_last_games,away_conceded_last_games,away_xg_last_games,away_conceded_xg_last_games,outcome,home_score,away_score
0,2019-07-20,2019,Vasco da Gama,Fluminense,2.33,3.08,3.17,30.000000,20.000000,30.000000,...,0.0,60.0,40.0,0.8,1.6,1.30,1.30,H,2,1
1,2019-07-20,2019,Bahia,Cruzeiro,1.66,5.44,3.56,46.666667,40.000000,20.000000,...,0.0,60.0,40.0,0.6,1.0,1.12,0.96,D,0,0
2,2019-07-20,2019,Internacional,Grêmio,2.21,3.63,2.91,53.333333,50.000000,10.000000,...,80.0,0.0,20.0,1.0,0.4,1.36,0.94,D,1,1
3,2019-07-20,2019,CSA,Atl Paranaense,3.29,2.2,3.21,20.000000,10.000000,30.000000,...,40.0,0.0,60.0,1.4,1.2,1.60,1.64,A,0,4
4,2019-07-20,2019,Ceará,Palmeiras,4.76,1.81,3.25,36.666667,30.000000,20.000000,...,80.0,20.0,0.0,1.4,0.4,1.60,0.66,H,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1082,2022-11-13,2022,Bragantino,Fluminense,2.4,2.74,3.52,39.639640,29.729730,29.729730,...,80.0,20.0,0.0,2.2,0.6,1.86,0.58,A,0,1
1083,2022-11-13,2022,América (MG),Atl Goianiense,1.59,5.14,4.17,46.846847,40.540541,18.918919,...,20.0,40.0,40.0,1.2,1.4,1.26,0.86,D,1,1
1084,2022-11-13,2022,Goiás,São Paulo,4.85,1.69,3.75,41.441441,29.729730,35.135135,...,40.0,20.0,40.0,1.4,1.6,0.70,1.20,A,0,4
1085,2022-11-13,2022,Internacional,Palmeiras,2.48,2.71,3.44,63.063063,51.351351,35.135135,...,80.0,20.0,0.0,2.6,0.6,2.70,0.58,H,3,0


In [11]:
def parse_df_to_csv(dataframe, path, filename):
    """Write ``dataframe`` as CSV to ``path/filename``, creating ``path`` if needed.

    Args:
        dataframe: Object exposing ``to_csv(target)`` (a pandas DataFrame in
            this notebook).
        path: Directory to write into; missing intermediate directories are
            created.
        filename: Name of the CSV file inside ``path``.

    Raises:
        OSError: If ``path`` cannot be created (for example, it already
            exists as a regular file).
    """
    # exist_ok=True replaces the exists()-then-makedirs() pair, which could
    # raise FileExistsError if the directory appeared between the two calls.
    os.makedirs(path, exist_ok=True)
    dataframe.to_csv(f"{path}/{filename}")

In [12]:
# Persist the formatted table under the league's formatted_data folder, with
# the file named after the configured season bounds.
output_dir = f'leagues/{league}/formatted_data'
output_name = f'{start_season}-{end_season}.csv'
parse_df_to_csv(data_df, output_dir, output_name)