In [1]:
import yaml
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.utils.class_weight import compute_sample_weight
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

from skopt.space import Real, Integer
from skopt import BayesSearchCV

from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn import tree

from matplotlib import pyplot as plt
from seaborn import heatmap

import pickle

In [2]:
# Parse the YAML settings in config.yaml (path is relative to the working directory)
with open("config.yaml", 'r') as configuration:
    config = yaml.safe_load(configuration.read())

In [3]:
# Historical match tables, one CSV per league, loaded in a single pass.
# The order of the names on the left matches the order of the paths below.
PL_df, BL_df, SP_df, IT_df, FR_df = (
    pd.read_csv(csv_path, sep=',')
    for csv_path in (
        'hist_PremierLeague.csv',
        'hist_Bundesliga.csv',
        'hist_LaLiga.csv',
        'hist_SerieA.csv',
        'hist_Ligue1.csv',
    )
)

In [4]:
# Optional (currently disabled): stack all five league tables into one frame
#data = pd.concat([PL_df,BL_df,SP_df,IT_df,FR_df])
#data = data.reset_index(drop=True)

In [5]:
# Use only the frame read from hist_Ligue1.csv as the working dataset
data = FR_df

In [6]:
# Set the FTR column aside as the reference labels the predictions are scored against
y_true = data['FTR']

# Remove the label and the other columns that are not passed to the model
non_feature_cols = ['FTR', 'Date', 'HomeTeam', 'AwayTeam', 'h_course', 'd_course', 'a_course']
data = data.drop(columns=non_feature_cols)

In [7]:
# ---------------------------------------
# MODEL COMPONENTS
# Load the fitted models and the label-translation dictionaries from disk.
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
def _load_pickle(path):
    """Return the object unpickled from `path`, closing the file afterwards."""
    with open(path, "rb") as pickle_file:
        return pickle.load(pickle_file)


# Forward slashes are accepted by open() on Windows as well as on POSIX systems,
# so the notebook no longer depends on the Windows-only backslash separator.
xgb_model = _load_pickle('model/xgb_model.pkl')
tree_model = _load_pickle('model/tree_model.pkl')
dicts2translate = _load_pickle('model/dicts2translate.pkl')

# Prediction: one row of class probabilities per match
preds = pd.DataFrame(xgb_model.predict_proba(data))

# assumes probability columns 0/1/2 mean home win / draw / away win
# -- TODO confirm against the label encoding used when the model was trained
pred_df = pd.DataFrame({
    'pr_h_won': preds[0],
    'pr_draw': preds[1],
    'pr_a_won': preds[2],
})

# Add a column with information about who will win the match according to the decision tree.
# The tree takes the probability frame as input; its outputs are mapped to
# string labels through dicts2translate['idx2str'].
tree_preds = tree_model.predict(preds)
preds_after_translation = [dicts2translate['idx2str'][elem] for elem in tree_preds]
pred_df['prediction'] = preds_after_translation
pred_df = pred_df.round({'pr_h_won': 4, 'pr_draw': 4, 'pr_a_won': 4})
# ---------------------------------------
pred_df

Unnamed: 0,pr_h_won,pr_draw,pr_a_won,prediction
0,0.5922,0.1958,0.2120,H
1,0.2065,0.2035,0.5901,A
2,0.2911,0.2225,0.4864,A
3,0.3762,0.2704,0.3534,D
4,0.4749,0.1822,0.3430,A
...,...,...,...,...
374,0.4113,0.3023,0.2864,D
375,0.5250,0.2443,0.2307,H
376,0.5619,0.2343,0.2038,H
377,0.7782,0.1407,0.0811,H


In [8]:
# ACCURACY AND BALANCED ACCURACY of the translated predictions against y_true
# NOTE(review): the printed label says TRAIN -- confirm this data is the training set.
# Built-in round() is used instead of the `.round` method because it works for
# both plain Python floats and NumPy scalars, whichever the metric returns.
print('--- ACCURACY ---')
print('TRAIN:', round(accuracy_score(y_true, pred_df.prediction), 2))

print('--- BALANCED ACCURACY ---')
print('TRAIN:', round(balanced_accuracy_score(y_true, pred_df.prediction), 2))

--- ACCURACY ---
TRAIN: 0.48
--- BALANCED ACCURACY ---
TRAIN: 0.46
