In [136]:
# https://platform.olimpiada-ai.ro/en/problems/86

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

In [137]:
train = pd.read_csv("/kaggle/input/fairplay/train.csv")
test = pd.read_csv("/kaggle/input/fairplay/test.csv")

train.shape, test.shape

((8000, 14), (1500, 13))

In [138]:
train.head()

Unnamed: 0,MatchID,Season,MatchWeek,HomeTeam,AwayTeam,FullTimeResult,Goals,Shots,Corners,YellowCards,RedCards,TotalCards,TeamStyles,chaos_label
0,2000-2001_Charlton_Man City,2000-2001,1,Charlton,Man City,H,4,25.0,12.0,3.0,0.0,3.0,"RiskTaker, DirectPlay",0
1,2000-2001_Chelsea_West Ham,2000-2001,1,Chelsea,West Ham,H,6,29.0,14.0,3.0,0.0,3.0,"RiskTaker, HighPressure, ChaosInducer",1
2,2000-2001_Coventry_Middlesbrough,2000-2001,1,Coventry,Middlesbrough,A,4,22.0,12.0,8.0,1.0,10.0,"AggressiveTackler, ChaosInducer, LowTempoContr...",1
3,2000-2001_Derby_Southampton,2000-2001,1,Derby,Southampton,D,4,19.0,13.0,2.0,0.0,2.0,"Disciplined, LowTempoController",0
4,2000-2001_Leeds_Everton,2000-2001,1,Leeds,Everton,H,2,29.0,10.0,4.0,0.0,4.0,"HighPressure, Opportunistic",0


In [139]:
train['chaos_label'].value_counts(normalize=True)

chaos_label
0    0.74275
1    0.25725
Name: proportion, dtype: float64

In [140]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   MatchID         8000 non-null   object 
 1   Season          8000 non-null   object 
 2   MatchWeek       8000 non-null   int64  
 3   HomeTeam        8000 non-null   object 
 4   AwayTeam        8000 non-null   object 
 5   FullTimeResult  8000 non-null   object 
 6   Goals           8000 non-null   int64  
 7   Shots           8000 non-null   float64
 8   Corners         8000 non-null   float64
 9   YellowCards     8000 non-null   float64
 10  RedCards        8000 non-null   float64
 11  TotalCards      8000 non-null   float64
 12  TeamStyles      8000 non-null   object 
 13  chaos_label     8000 non-null   int64  
dtypes: float64(5), int64(3), object(6)
memory usage: 875.1+ KB


In [141]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   MatchID         1500 non-null   object 
 1   Season          1500 non-null   object 
 2   MatchWeek       1500 non-null   int64  
 3   HomeTeam        1500 non-null   object 
 4   AwayTeam        1500 non-null   object 
 5   FullTimeResult  1500 non-null   object 
 6   Goals           1500 non-null   int64  
 7   Shots           1500 non-null   float64
 8   Corners         1500 non-null   float64
 9   YellowCards     1500 non-null   float64
 10  RedCards        1500 non-null   float64
 11  TotalCards      1500 non-null   float64
 12  TeamStyles      1500 non-null   object 
dtypes: float64(5), int64(2), object(6)
memory usage: 152.5+ KB


In [142]:
train['Season'].value_counts(normalize=True)

Season
2000-2001    0.0475
2001-2002    0.0475
2002-2003    0.0475
2003-2004    0.0475
2004-2005    0.0475
2005-2006    0.0475
2006-2007    0.0475
2007-2008    0.0475
2008-2009    0.0475
2009-2010    0.0475
2010-2011    0.0475
2011-2012    0.0475
2012-2013    0.0475
2013-2014    0.0475
2014-2015    0.0475
2015-2016    0.0475
2016-2017    0.0475
2017-2018    0.0475
2018-2019    0.0475
2019-2020    0.0475
2020-2021    0.0475
2021-2022    0.0025
Name: proportion, dtype: float64

In [143]:
train['HomeTeam'].value_counts(normalize=True)

HomeTeam
Chelsea             0.050000
Man United          0.050000
Arsenal             0.050000
Liverpool           0.050000
Tottenham           0.050000
Everton             0.050000
Man City            0.047625
Newcastle           0.045250
West Ham            0.042875
Aston Villa         0.042875
Fulham              0.035625
Southampton         0.033375
Sunderland          0.033250
West Brom           0.030875
Blackburn           0.026125
Bolton              0.026125
Leicester           0.023875
Middlesbrough       0.023750
Stoke               0.023750
Crystal Palace      0.021500
Wigan               0.019000
Burnley             0.016750
Wolves              0.016750
Swansea             0.016625
Charlton            0.016625
Portsmouth          0.016625
Birmingham          0.016625
Norwich             0.014375
Watford             0.014375
Leeds               0.012000
Hull                0.011875
Bournemouth         0.011875
Brighton            0.009625
Reading             0.007125
Derby

In [144]:
train['FullTimeResult'].value_counts(normalize=True)

FullTimeResult
H    0.460625
A    0.289375
D    0.250000
Name: proportion, dtype: float64

In [145]:
styles = set()

for ls in train['TeamStyles'].apply(lambda x: [s.strip() for s in x.split(',')]):
    styles.update(ls)
for ls in test['TeamStyles'].apply(lambda x: [s.strip() for s in x.split(',')]):
    styles.update(ls)

styles = list(styles)
print(styles)

['Opportunistic', 'AggressiveTackler', 'HighPressure', 'RiskTaker', 'Disciplined', 'LowTempoController', 'ChaosInducer', 'DirectPlay']


In [146]:
def process_df(df):
    temp_df = pd.DataFrame()

    for s in styles:
        temp_df[f'{s}_present'] = df['TeamStyles'].apply(lambda x: [s.strip() for s in x.split(',')]).apply(lambda x: 1 if s in x else 0)
    
    return pd.concat([df, temp_df], axis=1)

In [147]:
train = process_df(train)
test = process_df(test)

In [148]:
from sklearn.model_selection import train_test_split

features = [c for c in train.columns if c not in ['MathcID', 'chaos_label', 'TeamStyles']]
cat_features = [c for c in train.select_dtypes('object') if c in features]
target_col = 'chaos_label'

X, y = train[features], train[target_col]
X_test = test[features]

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1, stratify=y, random_state=42)
X_train.shape, X_valid.shape

((7200, 20), (800, 20))

In [149]:
from catboost import Pool

train_pool = Pool(X_train, y_train, cat_features=cat_features)
valid_pool = Pool(X_valid, y_valid, cat_features=cat_features)
full_pool = Pool(X, y, cat_features=cat_features)

In [150]:
from catboost import CatBoostClassifier

params = {
    'iterations': 1000,
    'loss_function': 'Logloss',
    'eval_metric': 'TotalF1:average=Macro',
    'metric_period': 100,
    'max_depth': 2,
    'random_state': 42
}

model = CatBoostClassifier(**params)

model.fit(train_pool, eval_set=valid_pool)

Learning rate set to 0.051602
0:	learn: 0.8483577	test: 0.8208101	best: 0.8208101 (0)	total: 4.46ms	remaining: 4.45s
100:	learn: 0.9346988	test: 0.9105907	best: 0.9105907 (100)	total: 330ms	remaining: 2.93s
200:	learn: 0.9388584	test: 0.9177632	best: 0.9177632 (200)	total: 690ms	remaining: 2.74s
300:	learn: 0.9403040	test: 0.9196195	best: 0.9196195 (300)	total: 1.06s	remaining: 2.47s
400:	learn: 0.9420263	test: 0.9217448	best: 0.9217448 (400)	total: 1.43s	remaining: 2.14s
500:	learn: 0.9428307	test: 0.9235797	best: 0.9235797 (500)	total: 1.81s	remaining: 1.8s
600:	learn: 0.9439408	test: 0.9238448	best: 0.9238448 (600)	total: 2.19s	remaining: 1.45s
700:	learn: 0.9454612	test: 0.9238448	best: 0.9238448 (600)	total: 2.56s	remaining: 1.09s
800:	learn: 0.9468239	test: 0.9222868	best: 0.9238448 (600)	total: 2.91s	remaining: 724ms
900:	learn: 0.9477759	test: 0.9222868	best: 0.9238448 (600)	total: 3.28s	remaining: 360ms
999:	learn: 0.9487080	test: 0.9241068	best: 0.9241068 (999)	total: 3.64s	r

<catboost.core.CatBoostClassifier at 0x7fc2e106d490>

In [151]:
from sklearn.metrics import f1_score

y_pred = model.predict(X_valid)
score = f1_score(y_valid, y_pred, average='macro')

print(f'F1 Macro: {score:.5f}')

F1 Macro: 0.92411


In [152]:
model = CatBoostClassifier(**params)

model.fit(full_pool)

Learning rate set to 0.025035
0:	learn: 0.8350629	total: 4.47ms	remaining: 4.46s
100:	learn: 0.9308941	total: 347ms	remaining: 3.09s
200:	learn: 0.9350803	total: 703ms	remaining: 2.79s
300:	learn: 0.9354368	total: 1.08s	remaining: 2.5s
400:	learn: 0.9379804	total: 1.46s	remaining: 2.18s
500:	learn: 0.9388701	total: 1.84s	remaining: 1.83s
600:	learn: 0.9392461	total: 2.21s	remaining: 1.47s
700:	learn: 0.9394236	total: 2.58s	remaining: 1.1s
800:	learn: 0.9396215	total: 2.96s	remaining: 736ms
900:	learn: 0.9399560	total: 3.35s	remaining: 368ms
999:	learn: 0.9401333	total: 3.73s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7fc2e12f5010>

In [153]:
y_pred = model.predict(X_test)

In [154]:
((train['HomeTeam'] == 'Chelsea') | (train['AwayTeam'] == 'Chelsea')).sum()

np.int64(800)

In [155]:
((test['HomeTeam'] == 'Chelsea') | (test['AwayTeam'] == 'Chelsea')).sum()

np.int64(150)

In [156]:
aggresives = ['AggressiveTackler', 'RiskTaker', 'HighPressure', 'ChaosInducer']

temp_df = pd.DataFrame()

for ag in aggresives:
    temp_df[ag] = test['TeamStyles'].apply(lambda x: [s.strip() for s in x.split(',')]).apply(lambda x: 1 if ag in x else 0)

aggresive_answers = temp_df.sum(axis=1) / test['TeamStyles'].apply(lambda x: len(x.split(',')))

In [157]:
subm = pd.DataFrame({
    'subtaskID': [1] + [2] * len(test) + [3] * len(test),
    'datapointID': [1] + test['MatchID'].tolist() + test['MatchID'].tolist(),
    'answer': [150] + aggresive_answers.tolist() + y_pred.tolist()
})

subm.to_csv("submission.csv", index=False)
subm.head()

Unnamed: 0,subtaskID,datapointID,answer
0,1,1,150.0
1,2,2021-2022_Man City_Arsenal,0.75
2,2,2021-2022_Aston Villa_Brentford,1.0
3,2,2021-2022_Brighton_Everton,0.0
4,2,2021-2022_Newcastle_Southampton,1.0
