In [1]:
# (Re)load IPython's autoreload extension. Mode 2 re-imports modules before each cell
# executes, so edits to the local `soccer` package are picked up without a kernel restart.
%reload_ext autoreload
%autoreload 2

import pandas as pd

# Display settings for rendered frames: up to 500 columns and 200 rows are shown,
# and floats are formatted with three decimals.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 200)
pd.set_option('display.float_format', '{:.3f}'.format)
import joblib
import warnings
# Suppress every warning (no category/module filter is given) for the rest of the
# kernel session. NOTE(review): this also hides deprecation warnings from libraries.
warnings.filterwarnings('ignore')

from soccer.preprocessing import DataPreprocessor
from soccer.glicko_soccer import GlickoSoccer

Preprocessing

In [2]:
# Read the raw match results from the project's CSV file and preview the first five rows.
raw_matches = pd.read_csv(filepath_or_buffer='data/matches.csv')
raw_matches.head(n=5)

Unnamed: 0,date,home_team,away_team,home_score,away_score,notes,country,league,season,tournament_type
0,05.11.2011,Kalju,Paide,1,2,,Estonia,Meistriliiga,2011,first
1,05.11.2011,Levadia,Kuressaare,4,0,,Estonia,Meistriliiga,2011,first
2,05.11.2011,Narva,Sillamae,2,3,,Estonia,Meistriliiga,2011,first
3,05.11.2011,Tammeka,Lasnamae,9,0,,Estonia,Meistriliiga,2011,first
4,05.11.2011,Viljandi,Flora,2,4,,Estonia,Meistriliiga,2011,first


In [3]:
# Configure the preprocessor (is_actual_draw_predictions=True), then run it over the raw
# frame; `matches` is the frame every later cell works from.
preprocessor = DataPreprocessor(is_actual_draw_predictions=True)
matches = preprocessor.preprocessing(raw_matches)

# Show the last five preprocessed rows.
matches.tail()

Unnamed: 0,match_id,date,home_team,away_team,home_score,away_score,country,league,season,tournament_type,tournament,outcome,is_pandemic,draw_probability
370292,274719,2022-06-05,Jedinstvo Bihac,Gradina Srebrenik,1,0,Bosnia And Herzegovina,Prva Liga - FBiH,2021,2,Bosnia And Herzegovina. Second,H,0,0.235
370297,294572,2022-06-11,Girona,Tenerife,0,0,Spain,LaLiga2 - Promotion - Play Offs,2021,2,Spain. Second,D,0,0.279
370298,108308,2022-06-11,Veria,Lamia,1,2,Greece,Super League - Relegation,2021,1,Greece. First,A,0,0.329
370299,108307,2022-06-18,Lamia,Veria,1,1,Greece,Super League - Relegation,2021,1,Greece. First,D,0,0.281
370300,294571,2022-06-19,Tenerife,Girona,1,3,Spain,LaLiga2 - Promotion - Play Offs,2021,2,Spain. Second,A,0,0.305


Fit parameters

In [None]:
# Fit the league parameters on a season-restricted copy of the data (max_season=2019;
# presumably seasons up to 2019 — confirm against DataPreprocessor).
#
# The restricted frame gets its own name on purpose: rebinding `matches` here would make
# every later cell (ratings, train/validation/test split, prediction merge) silently run
# on the truncated data when the notebook is executed top to bottom.
# `raw_matches` is passed explicitly, matching the preprocessing cell above.
fit_matches = DataPreprocessor(is_actual_draw_predictions=True, max_season=2019).preprocessing(raw_matches)

# Long-running: 100 fitting iterations; is_params_initialization=False is passed through
# unchanged (its exact meaning is defined in GlickoSoccer.fit_params).
league_params = GlickoSoccer().fit_params(fit_matches, number_iterations=100, is_params_initialization=False)
league_params

Ratings

In [4]:
# Load the stored league parameters.
# NOTE(review): joblib.load unpickles its input — only use it on trusted files.
league_params = joblib.load('data/league_params.pkl')

# Rate every team, then convert the ratings into a DataFrame. A fresh GlickoSoccer
# instance is used for each step.
team_ratings = GlickoSoccer().rate_teams(matches, league_params)
club_ratings = GlickoSoccer().ratings_to_df(team_ratings, matches)

# League-level tables, requested with number_top_teams=5; the call yields the European
# table first and the South American table second.
league_tables = GlickoSoccer().league_ratings(club_ratings, number_top_teams=5)
europe_leagues_ratings, south_america_leagues_ratings = league_tables

club_ratings.head(n=30)

Unnamed: 0,#,team,rating,league,is_europe
0,1,Liverpool,2324.768,England. First,True
1,2,Manchester City,2322.166,England. First,True
2,3,Real Madrid,2286.032,Spain. First,True
3,4,Bayern Munich,2255.672,Germany. First,True
4,5,Inter,2225.255,Italy. First,True
5,6,Chelsea,2217.639,England. First,True
6,7,Atl. Madrid,2170.242,Spain. First,True
7,8,FC Porto,2169.666,Portugal. First,True
8,9,Barcelona,2165.708,Spain. First,True
9,10,AC Milan,2159.974,Italy. First,True


In [5]:
# Display the European league table returned by GlickoSoccer().league_ratings(...).
# (This cell previously showed the South American variable, duplicating the next cell.)
europe_leagues_ratings

Unnamed: 0,#,league,rating
0,1,England. First,2222.616
1,2,Spain. First,2173.791
2,3,Italy. First,2134.528
3,4,Germany. First,2125.74
4,5,France. First,2047.183
5,6,Portugal. First,2003.65
6,7,Netherlands. First,1969.51
7,8,Spain. Second,1927.749
8,9,Austria. First,1927.117
9,10,Ukraine. First,1917.165


In [6]:
# Display the South American league table returned by GlickoSoccer().league_ratings(...).
south_america_leagues_ratings

Unnamed: 0,#,league,rating
0,1,Brazil. First,1881.52
1,2,Argentina. First,1822.127
2,3,Ecuador. First,1739.322
3,4,Colombia. First,1731.711
4,5,Paraguay. First,1720.733
5,6,Uruguay. First,1684.345
6,7,Brazil. Second,1670.574
7,8,Argentina. Second,1658.215
8,9,Bolivia. First,1651.507
9,10,Chile. First,1639.772


In [6]:
# NOTE(review): the recorded run of this cell ended with
# `NameError: name 'CatBoost' is not defined`. No such name is referenced in this cell, so
# the error presumably originates inside one of the imported `soccer` modules — confirm
# and fix it there before relying on the cells below.
from soccer.outcomes_features import TrainCreator
from soccer.outcomes_catboost import OutcomesCatBoost

# Split the preprocessed matches into three frames: train, validation and test.
train, validation, test = TrainCreator().train_validation_test(matches)

NameError: name 'CatBoost' is not defined

In [None]:
# Run cross-validation scoring on a freshly constructed OutcomesCatBoost. No data is
# passed in here, so the inputs are obtained inside the class (not visible in this notebook).
OutcomesCatBoost().cross_val_score()

In [None]:
# Hyper-parameter search via OutcomesCatBoost.optuna_optimization; the argument 1000 is
# presumably the number of trials — confirm against the method signature.
experiments = OutcomesCatBoost().optuna_optimization(1000)

In [None]:
# Show the first 200 rows of the optimization result.
experiments.head(200)

In [None]:
# Call save_model on a new OutcomesCatBoost instance. NOTE(review): no fitted model or path
# is passed, so what is trained/saved and where is decided inside the class — confirm there.
OutcomesCatBoost().save_model()

In [None]:
# Produce predictions from a new OutcomesCatBoost instance; the result is later joined to
# `matches` on `match_id`, so it is expected to carry that column.
predictions = OutcomesCatBoost().predict()

In [None]:
# Attach the match details to each prediction by joining on match_id.
# The joined frame gets its own name so `predictions` is left untouched: re-running this
# cell then always merges the original predictions, instead of merging an already-merged
# frame a second time (which would produce suffixed duplicate columns).
predictions_with_matches = predictions.merge(matches, on=['match_id'])
predictions_with_matches.head(20)

Compare Models

In [9]:
from soccer.compare_predictions import compare_predictions

# Compare the models' predictions, passing 2020 positionally, and preview the result.
# NOTE(review): this cell repeats the comparison done in the following cell (which passes
# start_season=2020) and carries a later execution count (9 vs 7); its recorded output
# lacks the "Original Glicko-2" columns, so it looks like a stale duplicate — consider
# removing it.
predictions = compare_predictions(2020)
predictions.head()

Catboost Loss: 0.5894359213389553
Glicko Loss: 0.584571185497871


Unnamed: 0,match_id,season,date,home_team,away_team,outcome,home_win_cb,draw_cb,away_win_cb,home_win_glicko,draw_glicko,away_win_glicko
0,7403,2020,2020-01-14,Nacional Uruguay,Wanderers,H,0.62,0.216,0.163,0.723,0.166,0.111
1,205262,2020,2020-01-16,Danubio,Cerro Largo,A,0.309,0.279,0.412,0.334,0.284,0.382
2,205261,2020,2020-01-16,CA Cerro,Penarol,D,0.301,0.295,0.404,0.156,0.221,0.623
3,205259,2020,2020-01-17,Fenix,River Plate Uruguay,D,0.357,0.272,0.371,0.341,0.283,0.376
4,205260,2020,2020-01-17,Progreso,Montevideo City,D,0.494,0.262,0.243,0.573,0.237,0.189


In [7]:
from soccer.compare_predictions import compare_predictions

# Compare the models' predictions with start_season=2020 and preview the first five rows
# of the combined per-match probability table.
first_season = 2020
predictions = compare_predictions(start_season=first_season)
predictions.head(n=5)

Catboost Loss: 0.5894359213389553
Glicko-2 Loss: 0.584571185497871
Original Glicko-2 Loss: 0.5933047650134288


Unnamed: 0,match_id,season,date,home_team,away_team,outcome,home_win_cb,draw_cb,away_win_cb,home_win_glicko,draw_glicko,away_win_glicko,home_win_original_glicko,draw_original_glicko,away_win_original_glicko
0,7403,2020,2020-01-14,Nacional Uruguay,Wanderers,H,0.62,0.216,0.163,0.723,0.166,0.111,0.629,0.212,0.158
1,205262,2020,2020-01-16,Danubio,Cerro Largo,A,0.309,0.279,0.412,0.334,0.284,0.382,0.296,0.272,0.431
2,205261,2020,2020-01-16,CA Cerro,Penarol,D,0.301,0.295,0.404,0.156,0.221,0.623,0.156,0.21,0.634
3,205259,2020,2020-01-17,Fenix,River Plate Uruguay,D,0.357,0.272,0.371,0.341,0.283,0.376,0.296,0.272,0.431
4,205260,2020,2020-01-17,Progreso,Montevideo City,D,0.494,0.262,0.243,0.573,0.237,0.189,0.493,0.26,0.247
