In [4]:
%reload_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd

pd.options.display.max_columns, pd.options.display.max_rows = 500, 200
pd.options.display.float_format = '{:.1f}'.format
import warnings

warnings.filterwarnings('ignore')
import joblib

from euro_soccer.preprocessing import DataPreprocessor
from euro_soccer.glicko_soccer import GlickoSoccer

In [5]:
# Read the match table from 'matches.csv' (no preprocessing yet) and preview its first rows.
matches = pd.read_csv('matches.csv')
matches.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,notes,country,league,season,tournament_type
0,03.08.2019,Влазния,Бесалиджа Лежа,3,1,,Albania,Суперлига - Борьба за выживание - плей-офф,2019-2020,first
1,29.07.2019,Бесалиджа Лежа,Поградец,3,2,,Albania,Суперлига - Борьба за выживание - плей-офф,2019-2020,first
2,30.05.2021,Кастриоти,Томори Берат,2,1,,Albania,Суперлига - Борьба за выживание - плей-офф,2020-2021,first
3,26.05.2021,Буррели,Томори Берат,1,2,,Albania,Суперлига - Борьба за выживание - плей-офф,2020-2021,first
4,23.05.2022,Кораби Пешкопи,Аполония Фиери,3,2,,Albania,Суперлига - Борьба за выживание - плей-офф,2021-2022,first


In [None]:
# Re-read the CSV and feed it straight into the project preprocessor; from this
# cell on `matches` is the preprocessor's output rather than the raw file.
matches = pd.read_csv('matches.csv').pipe(
    DataPreprocessor(is_actual_draw_predictions=False).preprocessing
)
# Preview the last five rows (tail() defaults to n=5).
matches.tail()

In [3]:
# (rows, columns) of the current `matches` frame.
matches.shape

(122271, 12)

In [21]:
# Call the preprocessor's `test_data` routine on `matches` and unpack its five
# return values; each one is displayed in its own cell below.
(
    count_matches,
    count_seasons,
    count_tournaments,
    teams_countries,
    uefa_teams_without_national,
) = DataPreprocessor().test_data(matches)

In [22]:
# First value returned by DataPreprocessor().test_data(matches).
count_matches

Unnamed: 0,country,tournament_type,season,number_matches


In [23]:
# Second value returned by DataPreprocessor().test_data(matches).
count_seasons

Unnamed: 0,country,tournament_type,number_seasons
60,Europa Conference League,3,1
61,Europa Conference League,4,1


In [24]:
# Third value returned by DataPreprocessor().test_data(matches).
count_tournaments

Unnamed: 0,country,number_tournaments,tournament_type


In [25]:
# Fourth value returned by DataPreprocessor().test_data(matches).
teams_countries

Unnamed: 0,home_team,teams_countries


In [26]:
# Fifth value returned by DataPreprocessor().test_data(matches).
# NOTE(review): the name suggests teams seen in UEFA competitions but in no
# national tournament — confirm against test_data.
uefa_teams_without_national

[]

In [11]:
# Number of rows per country among matches whose tournament_type is "cups".
# NOTE(review): a recorded matches.info() in this notebook lists tournament_type
# as int64 — confirm the "cups" label exists at the point this cell is run.
(
    matches
    .loc[matches['tournament_type'].eq("cups"), 'country']
    .value_counts()
)

Europa League               2042
Belgium                     1549
Slovakia                    1107
Champions League            1040
France                       950
Turkey                       848
Portugal                     824
Hungary                      708
Czech Republic               667
Spain                        604
Sweden                       593
Norway                       572
Denmark                      519
Romania                      516
Netherlands                  494
Greece                       471
England                      463
Wales                        460
Scotland                     458
Estonia                      449
Northern Ireland             444
Europa Conference League     422
Iceland                      370
Italy                        361
Poland                       342
Austria                      315
Germany                      314
Finland                      305
Switzerland                  288
Albania                      282
Luxembourg

In [38]:
# Print the column names, non-null counts, dtypes and memory usage of `matches`.
matches.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 116731 entries, 0 to 130989
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   index             116731 non-null  int64         
 1   date              116731 non-null  datetime64[ns]
 2   home_team         116731 non-null  object        
 3   away_team         116731 non-null  object        
 4   country           116731 non-null  object        
 5   tournament        116731 non-null  object        
 6   season            116731 non-null  int64         
 7   tournament_type   116731 non-null  int64         
 8   outcome           116731 non-null  object        
 9   is_pandemic       116731 non-null  int64         
 10  draw_probability  116731 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(4), object(5)
memory usage: 10.7+ MB


In [39]:
# matches['draw_probability'].mean()

In [23]:
# # matches.loc[(matches['home_team'].str.contains('B')) & (matches['away_team'].str.contains('AS Roma'))]

In [24]:
# matches.info()

In [25]:
# matches['season'].value_counts()

In [26]:
# matches.loc[matches['tournament'].str.contains('Con')]

# Fit Params

In [None]:
# Fit the rating-model parameters on `matches`.
# NOTE(review): the meaning of the positional argument 20 is not visible in this
# notebook — confirm it against GlickoSoccer.fit_params.
# NOTE(review): the returned league_params is not saved in this cell, while the
# Ratings section loads 'data/league_params.pkl' — confirm fit_params writes it.
glicko = GlickoSoccer()  # 42049.32433702407 (presumably the loss of an earlier fit — TODO confirm)
league_params = glicko.fit_params(matches, 20, is_params_initialization=False)

Current Loss: 42044.85784972906
Slovenia. First 1473
Armenia. First 1554
Hungary. First 1649
Bulgaria. First 1439
Portugal. First 1829
Croatia. Second 1324
Spain. Second 1775
Moldova. First 1285
Cyprus. First 1619
Russia. First 1649
Croatia. First 1664
England. First 2014
Ukraine. Second 1399
Northern Ireland. Second 1067
North Macedonia. Second 1028
Sweden. Second 1528
Israel. First 1572
Northern Ireland. First 1377
Scotland. Second 1370
Switzerland. Second 1667
Georgia. First 1357
Portugal. Second 1665
Albania. First 1293
Armenia. Second 1253
Scotland. First 1653
Israel. Second 1425
Belarus. Second 1093
Serbia. First 1433
Austria. Second 1627
Greece. Second 1550
Serbia. Second 1247
Germany. First 1982
Georgia. Second 1432
Bosnia And Herzegovina. Second 1294
Bulgaria. Second 1272
Latvia. First 1437
Netherlands. First 1762
Kazakhstan. First 1432
Albania. Second 1257
Montenegro. First 1337
North Macedonia. First 1282
Belarus. First 1347
Lithuania. Second 1157
Ukraine. First 1597
Belgium

{'init_mu': 1645,
 'init_rd': 150,
 'update_rd': 30,
 'lift_update_mu': 70,
 'home_advantage': 30,
 'pandemic_home_advantage': 24,
 'new_team_update_mu': 0}

# Ratings

In [29]:
# Rate every team with the stored league parameters, convert the result to a
# DataFrame, then derive the league ranking from it.
glicko = GlickoSoccer()
league_params = joblib.load('data/league_params.pkl')
ratings = glicko.ratings_to_df(
    glicko.rate_teams(matches, league_params),
    matches,
)
# NOTE(review): number_top_teams=10 presumably restricts each league's rating to
# its 10 best teams — confirm against GlickoSoccer.league_ratings.
leagues_ratings = glicko.league_ratings(ratings, number_top_teams=10)
leagues_ratings

Unnamed: 0,#,league,rating
0,1,England. First,2182.6
1,2,Spain. First,2125.8
2,3,Germany. First,2111.3
3,4,Italy. First,2098.4
4,5,France. First,2035.6
5,6,Portugal. First,1985.5
6,7,Netherlands. First,1936.3
7,8,Belgium. First,1895.2
8,9,Spain. Second,1881.4
9,10,Germany. Second,1864.6


In [32]:
# Lay rows 0-24 and 25-49 of the league ranking side by side; each slice gets a
# fresh 0-based index so the two halves align row by row.
pd.concat(
    [leagues_ratings[start:stop].reset_index(drop=True)
     for start, stop in ((0, 25), (25, 50))],
    axis=1,
)

Unnamed: 0,#,league,rating,#.1,league.1,rating.1
0,1,England. First,2182.6,26,Portugal. Second,1737.1
1,2,Spain. First,2125.8,27,Italy. Second,1736.8
2,3,Germany. First,2111.3,28,France. Second,1728.6
3,4,Italy. First,2098.4,29,Israel. First,1728.6
4,5,France. First,2035.6,30,Hungary. First,1696.7
5,6,Portugal. First,1985.5,31,Romania. First,1694.1
6,7,Netherlands. First,1936.3,32,Switzerland. Second,1666.6
7,8,Belgium. First,1895.2,33,Serbia. First,1655.8
8,9,Spain. Second,1881.4,34,Austria. Second,1634.0
9,10,Germany. Second,1864.6,35,Russia. Second,1630.5


In [31]:
# Same side-by-side layout for the remainder of the ranking: rows 50-73 on the
# left, row 74 through the end on the right (stop=None means "to the last row").
pd.concat(
    [leagues_ratings[start:stop].reset_index(drop=True)
     for start, stop in ((50, 74), (74, None))],
    axis=1,
)

Unnamed: 0,#,league,rating,#.1,league.1,rating.1
0,51,Romania. Second,1537.9,75,Czech Republic. Second,1380.6
1,52,Kazakhstan. First,1537.7,76,Faroe Islands. First,1370.3
2,53,Ireland. First,1531.3,77,Bosnia And Herzegovina. Second,1355.6
3,54,Denmark. Second,1527.6,78,Wales. First,1347.7
4,55,Belgium. Second,1522.9,79,Serbia. Second,1343.2
5,56,Albania. First,1517.0,80,Andorra. First,1341.1
6,57,Belarus. First,1515.9,81,Kazakhstan. Second,1322.3
7,58,Northern Ireland. First,1512.8,82,Georgia. Second,1306.2
8,59,Luxembourg. First,1486.3,83,Bulgaria. Second,1301.4
9,60,Malta. First,1484.9,84,Slovakia. Second,1299.8


In [9]:
# Write the league ranking to '20.csv' without the index column.
# NOTE(review): the file name may refer to the 20 passed to fit_params — confirm.
leagues_ratings.to_csv('20.csv', index=False)

In [6]:
# Preview the first 50 rows of the team ratings table.
ratings.head(50)

Unnamed: 0,#,team,rating,league
0,1,Ливерпуль,2226.6,England. First
1,2,Манчестер Сити,2209.9,England. First
2,3,Порту,2143.5,Portugal. First
3,4,Реал Мадрид,2141.1,Spain. First
4,5,Бавария,2127.8,Germany. First
5,6,Интер,2124.4,Italy. First
6,7,Аякс,2106.9,Netherlands. First
7,8,Челси,2102.0,England. First
8,9,Милан,2070.3,Italy. First
9,10,Спортинг,2063.9,Portugal. First


# Skellam

In [9]:
# DrawLightGBM has to be imported here: nothing else in this notebook brings it
# into scope, so with the import commented out the call below raises NameError
# on a fresh kernel.
# Environment note: LightGBM needs the OpenMP runtime (libomp) on macOS; when it
# is missing, loading lib_lightgbm fails with an OSError from dlopen
# ("Library not loaded: .../libomp.dylib").
from euro_soccer.draw_model import DrawLightGBM

# Run the draw model's cross-validation and display whatever it returns.
DrawLightGBM().cross_val_score()

OSError: dlopen(/Users/andreyshelopugin/opt/anaconda3/lib/python3.8/site-packages/lightgbm/lib_lightgbm.so, 6): Library not loaded: /usr/local/opt/libomp/lib/libomp.dylib
  Referenced from: /Users/andreyshelopugin/opt/anaconda3/lib/python3.8/site-packages/lightgbm/lib_lightgbm.so
  Reason: image not found

In [5]:
# Load the pickled `train` object (joblib is already imported in the setup cell)
# and preview its first rows.
train = joblib.load('data/train.pkl')
train.head()

Unnamed: 0,team,score,index,is_home,avg_scoring_10,avg_scoring_20,is_pandemic,mean_score_10,median_score_10,mean_score_20,mean_score_10_against,median_score_10_against,mean_score_20_against
129670,Den Bosch,2,69077,0,2.5,2.5,1,1.8,1.5,1.5,1.7,1.0,2.0
4917,Dubnica,5,67608,1,1.444444,,0,0.777778,1.0,1.0,3.222222,3.0,3.0
17732,Rapid Vienna,2,27656,1,2.4,2.4,0,1.1,0.5,1.0,0.6,0.0,1.0
132205,Chambly,2,51277,0,1.2,1.2,1,1.1,0.5,1.0,0.8,0.5,1.0
115031,Tobol,0,10064,0,1.3,1.3,0,1.2,1.0,1.0,0.3,0.0,0.0


In [6]:
# Rows of `train` for Manchester City. The statement is restored (it was
# commented out) so the table recorded under this cell matches the code.
train.loc[train['team'] == 'Manchester City']

Unnamed: 0,team,score,index,is_home,avg_scoring_10,avg_scoring_20,is_pandemic,mean_score_10,median_score_10,mean_score_20,mean_score_10_against,median_score_10_against,mean_score_20_against
99593,Manchester City,2,8107,0,2.8,2.8,0,1.700000,1.5,2.0,1.000000,1.0,0.5
97208,Manchester City,1,75728,0,2.7,2.7,0,1.900000,2.0,2.0,1.300000,1.5,1.0
101412,Manchester City,3,8079,0,2.6,2.6,0,1.700000,1.5,2.0,0.800000,0.5,0.5
22198,Manchester City,1,8075,1,1.5,1.5,0,3.300000,3.0,3.0,0.900000,0.5,1.0
112351,Manchester City,3,89736,0,1.7,1.7,0,2.500000,2.5,2.0,0.700000,0.5,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
139024,Manchester City,4,82632,0,1.4,1.4,1,1.100000,1.0,1.0,0.400000,0.0,0.5
51661,Manchester City,2,82643,1,3.1,3.1,1,2.900000,2.5,2.0,0.600000,0.0,1.0
89840,Manchester City,3,80587,0,2.6,2.6,0,1.666667,1.0,1.0,1.777778,1.0,2.0
16367,Manchester City,3,8163,1,2.2,2.2,0,3.200000,3.0,3.0,0.900000,1.0,1.0


In [7]:
# Mean of the avg_scoring_10 column over all of `train`. The statement is
# restored (it was commented out) so the value recorded under this cell matches the code.
train['avg_scoring_10'].mean()

1.364716147298895