In [78]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import label_binarize

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRFRegressor, XGBRegressor, XGBClassifier, XGBRFClassifier

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

## ML классификация

После неудачи с регрессионными моделями мы решили просто предсказывать исход матча: победу одной из команд или ничью.

In [62]:
# Load the match table from the local CSV.
# The file is published at https://disk.yandex.ru/d/riyyxNXKSEKxVQ
match = pd.read_csv(filepath_or_buffer='match_ml.csv')

In [63]:
# Remove the 'Unnamed: 0' column (presumably a row index saved into the CSV — verify).
# Rebinding instead of inplace=True, together with errors='ignore', keeps the cell
# idempotent: re-running it after the column is already gone no longer raises KeyError.
match = match.drop(columns='Unnamed: 0', errors='ignore')

In [64]:
# Preview the first five rows of the loaded table.
match.head(n=5)

Unnamed: 0,stage,height_hp1,weight_hp1,age_hp1,overall_rating_hp1,sprint_speed_hp1,gk_diving_hp1,gk_handling_hp1,gk_kicking_hp1,gk_positioning_hp1,...,age_ap11,overall_rating_ap11,dribbling_ap11,preferred_foot_right_ap11,pace_ap11,shooting_ap11,passing_ap11,defending_ap11,physical_ap11,goals_diff
0,1,185.42,183,21.0,68.0,43.0,72.0,63.0,71.0,64.0,...,20.0,62.0,67.45,0,71.6,49.75,55.65,31.5,47.35,2
1,1,187.96,181,21.0,69.0,27.0,69.0,65.0,66.0,69.0,...,24.0,62.0,59.15,1,63.2,64.6,61.55,35.2,70.25,-1
2,2,193.04,192,23.0,65.0,29.0,69.0,64.0,60.0,63.0,...,24.0,69.0,73.1,0,87.45,61.1,59.55,29.8,53.5,1
3,2,185.42,190,31.0,68.0,43.0,71.0,63.0,62.0,66.0,...,26.0,67.0,68.7,1,83.9,62.75,53.35,31.1,72.45,0
4,3,187.96,181,21.0,69.0,27.0,69.0,65.0,66.0,69.0,...,26.0,67.0,68.7,1,83.9,62.75,53.35,31.1,72.45,1


In [65]:
def _outcome_code(goal_difference):
    """Map a goal difference to a class label: 0 if negative, 2 if positive, else 1."""
    if goal_difference < 0:
        return 0
    if goal_difference > 0:
        return 2
    return 1


# Derive the three-class target from the goal difference, element by element.
match['match_status'] = match['goals_diff'].apply(_outcome_code)

In [66]:
# match_status was derived from goals_diff, so the raw difference must not stay
# among the features (it would leak the target).
# Rebinding with errors='ignore' instead of inplace=True makes the cell safe to
# re-run: a second execution no longer raises KeyError on the missing column.
match = match.drop(columns=['goals_diff'], errors='ignore')

Здесь 0 — победа гостей, 1 — ничья, 2 — победа домашней команды.

In [67]:
# Class balance of the target: number of rows per match_status value.
status_counts = match['match_status'].value_counts()
status_counts

2    3939
0    2621
1    2185
Name: match_status, dtype: int64

In [68]:
# Separate the feature matrix (every column except the target) from the target vector.
target_column = 'match_status'
X = match.drop(columns=[target_column])
y = match[target_column]

In [69]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=7,
)

In [73]:
def metrics(y_true, y_pred, classes=(0, 1, 2)):
    """Compute the macro-averaged one-vs-rest ROC AUC of a multiclass prediction.

    The score is computed from the arguments only; no notebook-level variables
    are read.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True class labels.
    y_pred : array-like
        Either per-class scores of shape (n_samples, n_classes), such as the
        output of ``predict_proba`` with columns ordered like ``classes``, or
        hard class labels of shape (n_samples,). Hard labels are one-hot
        encoded before scoring, so the result is still defined but carries
        less information than a score based on probabilities.
    classes : sequence, default (0, 1, 2)
        All class labels, in the column order of ``y_pred``. Intended for
        three or more classes.

    Returns
    -------
    dict
        ``{'roc_auc': float}`` with the macro-averaged one-vs-rest ROC AUC.
    """
    class_list = list(classes)
    # Binarize the truth the same way the rest of the notebook does, so the
    # value matches the directly printed roc_auc_score results.
    y_true_bin = label_binarize(y_true, classes=class_list)

    y_score = np.asarray(y_pred)
    if y_score.ndim == 1:
        # Hard labels were passed: turn them into a 0/1 indicator matrix.
        y_score = label_binarize(y_score, classes=class_list)

    return {
        'roc_auc': roc_auc_score(y_true_bin, y_score, average='macro', multi_class='ovr')
    }

In [80]:
# One-hot (indicator) encoding of the test and train targets for the three classes,
# used as y_true when computing ROC AUC.
y_test_bin, y_train_bin = (
    label_binarize(labels, classes=[0, 1, 2]) for labels in (y_test, y_train)
)

## LogisticRegression

Построим обычную логистическую регрессию. Так как у нас больше двух классов, здесь и далее будем использовать подход «One vs Rest», а качество модели измерять с помощью макро-усреднённого ROC AUC.

In [52]:
# One-vs-rest logistic regression with a generous iteration budget.
logreg = LogisticRegression(
    max_iter=20000,
    multi_class='ovr',
)
logreg.fit(X=X_train, y=y_train)

In [53]:
# ROC AUC is a ranking metric, so it needs class-membership probabilities rather
# than the hard labels returned by predict(); pass predict_proba() output instead.
logreg_metric_train = metrics(y_train, logreg.predict_proba(X_train))
logreg_metric_test = metrics(y_test, logreg.predict_proba(X_test))

In [81]:
# Report macro one-vs-rest ROC AUC of the logistic regression on both splits.
for title, y_bin, features in (
    ('Лог регрессия на трейне:', y_train_bin, X_train),
    ('Лог регрессия на тесте:', y_test_bin, X_test),
):
    print(title)
    print(roc_auc_score(y_bin, logreg.predict_proba(features), average='macro', multi_class='ovr'))

Лог регрессия на трейне:
0.7019500455131555
Лог регрессия на тесте:
0.6289126765859069


Такие результаты нас уже намного больше устраивают, но мы пойдем дальше

## XGBClassifier

In [82]:
# Gradient-boosted trees with a fixed seed and multiclass log-loss as the eval metric.
xgb = XGBClassifier(
    random_state=42,
    eval_metric='mlogloss',
    use_label_encoder=False,
)
xgb.fit(X=X_train, y=y_train)

In [101]:
# Report macro one-vs-rest ROC AUC of the boosted model on both splits.
for title, y_bin, features in (
    ('XGB классификация на трейне:', y_train_bin, X_train),
    ('XGB классификация на тесте:', y_test_bin, X_test),
):
    print(title)
    print(roc_auc_score(y_bin, xgb.predict_proba(features), average='macro', multi_class='ovr'))

XGB классификация на трейне:
1.0
XGB классификация на тесте:
0.6205257944917115


К сожалению, XGB сильно переобучается, так что попробуем построить случайный лес.

In [84]:
# XGBoost random-forest classifier with all hyper-parameters left at their defaults.
xgbrfc = XGBRFClassifier()
xgbrfc.fit(X=X_train, y=y_train)

In [85]:
# Recomputes the same indicator-encoded targets that an earlier cell already built
# from y_test / y_train; the inputs are unchanged, so the values are identical.
y_test_bin, y_train_bin = (
    label_binarize(labels, classes=[0, 1, 2]) for labels in (y_test, y_train)
)

In [86]:
# Report macro one-vs-rest ROC AUC of the default random forest on both splits.
for title, y_bin, features in (
    ('XGBRF классификация на трейне:', y_train_bin, X_train),
    ('XGBRF классификация на тесте:', y_test_bin, X_test),
):
    print(title)
    print(roc_auc_score(y_bin, xgbrfc.predict_proba(features), average='macro', multi_class='ovr'))

XGBRF классификация на трейне:
0.8616799965492149
XGBRF классификация на тесте:
0.6297406496159296


Теперь попробуем поискать новые параметры с помощью GridSearchCV.

In [89]:
# Hyper-parameter grid for the XGBoost random forest (6 * 5 * 4 = 120 combinations).
# NOTE(review): XGBoost documents eta (learning_rate) as lying in [0, 1]; the values
# 2 and 3 fall outside that range — confirm they are intended.
params = {
    'n_estimators': [50, 100, 150, 200, 300, 500],
    'max_depth': [None, 3, 5, 7, 9],
    'eta': [0.5, 1, 2, 3]
}
# Select the model by the metric this notebook reports (macro one-vs-rest ROC AUC).
# Without `scoring`, GridSearchCV ranks candidates by the estimator's default score
# (accuracy for classifiers), which is not what the printed results measure.
grid_search = GridSearchCV(
    XGBRFClassifier(),
    params,
    scoring='roc_auc_ovr',
    n_jobs=-1,
    cv=5,
)

# 120 candidates x 5 folds = 600 fits; this cell is slow.
grid_search.fit(X_train, y_train)

In [95]:
# Parameter combination that achieved the best cross-validated score.
best_params = grid_search.best_params_
best_params

{'eta': 0.5, 'max_depth': None, 'n_estimators': 100}

In [96]:
# Keep a handle on the estimator selected by the grid search.
# NOTE(review): with GridSearchCV's default refit=True this should be the model
# refit on the whole of X_train using best_params_ — confirm against the installed version.
xgbrfc_cv = grid_search.best_estimator_

In [97]:
# Report macro one-vs-rest ROC AUC of the grid-searched random forest on both splits.
for title, y_bin, features in (
    ('XGBRF классификация на трейне:', y_train_bin, X_train),
    ('XGBRF классификация на тесте:', y_test_bin, X_test),
):
    print(title)
    print(roc_auc_score(y_bin, xgbrfc_cv.predict_proba(features), average='macro', multi_class='ovr'))

XGBRF классификация на трейне:
0.8616799965492149
XGBRF классификация на тесте:
0.6297406496159296


In [100]:
# One-row table of the selected model's feature importances, labelled by feature name.
tuned_model = grid_search.best_estimator_
importances = tuned_model.feature_importances_
pd.DataFrame([importances], columns=X_train.columns)

Unnamed: 0,stage,height_hp1,weight_hp1,age_hp1,overall_rating_hp1,sprint_speed_hp1,gk_diving_hp1,gk_handling_hp1,gk_kicking_hp1,gk_positioning_hp1,...,weight_ap11,age_ap11,overall_rating_ap11,dribbling_ap11,preferred_foot_right_ap11,pace_ap11,shooting_ap11,passing_ap11,defending_ap11,physical_ap11
0,0.001251,0.001192,0.001631,0.001483,0.017718,0.001904,0.002776,0.003097,0.003278,0.003044,...,0.003375,0.00313,0.005333,0.004038,0.002526,0.003664,0.003716,0.004752,0.003411,0.003915


Видим, что значимость остальных переменных, помимо общего рейтинга игрока, невелика. Так почему бы нам не убрать эти переменные и тем самым не снизить переобучение моделей? Об этом — наш следующий файл с ML.