<a href="https://colab.research.google.com/github/cappelchi/calcio_notebooks/blob/main/draft/football_live_validation_experimental_heft_3_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[CatBoost - An In-Depth Guide [Python API]](https://coderzcolumn.com/tutorials/machine-learning/catboost-an-in-depth-guide-python#9)<br>
[Catboost](https://catboost.ai/en/docs/concepts/python-reference_pool)<br>
[Cross-Validation Techniques](https://medium.com/geekculture/cross-validation-techniques-33d389897878)

### Project config

In [1]:
try:
    import neptune.new as neptune
except:
    !pip install neptune-client >> None
    import neptune.new as neptune
#from neptune.new.integrations.tensorflow_keras import NeptuneCallback
def get_credential(frmwork = 'neptune_team'):
    with open('credential.txt', 'r') as container:
        for line in container:
            if frmwork in line:
                login, psw = line.split(' ')[1], line.split(' ')[2].split('\n')[0]
                return login, psw
     

In [2]:
#@title Set API key for neptune.ai
set_api = True #@param {type:"boolean"}
if set_api:
    username, api_key = get_credential()

### Installations

In [3]:
!pip install catboost >> None

### Downloads

In [4]:
data_version = 'football_live_npz_230131/'
project = neptune.init_project(
    project="scomesse/football", 
    api_token = api_key
    )
project[data_version + 'dataset'].download('./dataset.npz')
project[data_version + 'description'].download('./save_discription.txt')
project[data_version + 'additional_data'].download('./additional_data.npz')
project[data_version + 'time'].download('./time.csv')
params = project[data_version + 'params'].fetch()
project.stop()

https://app.neptune.ai/scomesse/football/
Remember to stop your project once you’ve finished logging your metadata (https://docs.neptune.ai/api/project#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
Shutting down background jobs, please wait a moment...
Done!
All 0 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/scomesse/football/metadata


### Imports

In [5]:
import pandas as pd
import numpy as np
pd.options.display.max_columns = 50
pd.options.display.max_rows = 100
print(pd.__version__)
print(np.__version__)

1.3.5
1.21.6


In [6]:
import plotly.graph_objects as go
import plotly.express as px
import plotly.subplots as sp
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

In [7]:
from catboost import CatBoost
from catboost import utils
from catboost import CatBoostClassifier, CatBoostRegressor
from catboost import Pool, cv
from catboost.utils import eval_metric
np.random.seed(147)

In [8]:
from tqdm import tqdm
from scipy.stats import poisson
import os, psutil

In [9]:
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.metrics import multilabel_confusion_matrix

### Code

#### Functions

In [10]:
def calculate_multiclass(probability_3x:np.array, line_3x:np.array):
    probline = probability_3x * line_3x
    best_odd_result = np.argmax(probline, axis = 1)
    best_odd_float = np.take_along_axis(
        probline,
        best_odd_result.reshape(-1, 1),
        axis = 1
    )
    return {
        'argmax':best_odd_result,
        'float':best_odd_float
    }

In [11]:
def get_f1_curve(y, y_pred, weighted = False):
    threshold = []
    f1 = []
    if weighted:

        for th in np.linspace(0.3,0.6,26):
            threshold.append(th)
            f1.append(
                f1_score(y_test, (test_preds * [1., 1. + th,  1.]).argmax(axis = 1), average = 'weighted')
                )
    else:
        for th in np.linspace(0,1,26):
            threshold.append(th)
            f1.append(f1_score(y, (y_pred > th).astype(int)))
    return threshold, f1

In [12]:
def plot_f1(y_true, x_predicted, data_split = 'train', weighted = False):
    threshold, f1 = get_f1_curve(y_true, x_predicted, weighted)
    fig = px.area(
        x = threshold, y = f1,
        title=f'F1 Curve {data_split}',
        labels=dict(x='threshold', y='F1'),
    )
    fig.update_yaxes(scaleanchor="x", scaleratio=1)
    fig.update_xaxes(constrain='domain')
    fig.update_layout(
        width = 600,
        title_x=0.5,
        paper_bgcolor='rgb(229, 237, 247)',
        plot_bgcolor='rgb(229, 237, 247)',    
        )
    fig.show()
    return fig

In [13]:
def plot_multi_matrix(y_true, x_pred, data_split = 'train', threshold = ''):
    title_text = 'confusion matrix ' + data_split
    if threshold != '':
        title_text = title_text +' | ' + f'threshold = {threshold}'

    z = np.zeros((3,3))
    for pred_class in range(3):
        for true_class in range(3):
            z[true_class, pred_class] = np.sum((x_pred == pred_class) & (y_true == true_class))
    x = ['Away', 'Draw', 'Home']
    y = ['Away', 'Draw', 'Home']

    fig = px.imshow(z, x=x, y=y, color_continuous_scale='Purples', text_auto=True)
    fig.update_xaxes(title_text = 'Predicted Label')
    fig.update_yaxes(title_text = 'True Label')
    fig.update_layout(
        height = 400,
        width = 600,
        title_text = title_text,
        title_font_size=20,
        title_x=0.5,
        paper_bgcolor='rgb(229, 237, 247)',
        plot_bgcolor='rgb(229, 237, 247)',    
        )
    fig.update_coloraxes(showscale=False)
    fig.show()
    return fig

In [14]:
def plot_confusion_matrix(cfm, data_split = 'train', threshold = ''):
    title_text = 'confusion matrix ' + data_split
    if threshold != '':
        title_text = title_text +' | ' + f'threshold = {threshold}'
    x = ['Away', 'Home']
    y = ['Away', 'Home']
    fig = px.imshow(cfm, x=x, y=y, color_continuous_scale='Purples', text_auto=True)
    fig.update_xaxes(title_text = 'Predicted Label')
    fig.update_yaxes(title_text = 'True Label')
    fig.update_layout(
        height = 400,
        width = 600,
        title_text = title_text,
        title_font_size=20,
        title_x=0.5,
        paper_bgcolor='rgb(229, 237, 247)',
        plot_bgcolor='rgb(229, 237, 247)',    
        )
    fig.update_coloraxes(showscale=False)
    fig.show()
    return fig

In [15]:
def get_profit_validation(y_true, x_predicted, Line_production, model_name, reverse_bet = False):
    '''
    y_true - numpy вектор, shape:(x,) истинные значения в формате 0|1
    x_predicted - numpy вектор, shape:(x,) предикт (probability (float)) в формате 0.
    Line_production - numpy вектор, shape:(x,) вектор коэфициентов в формате float 1.
    '''
    scatters_dicts = dict(
        scatter1 = dict(x = [], y = [], name = '', fill = 'tozeroy', yaxis = '', xaxis = ''),
        scatter2 = dict(x = [], y = [], name = '', line = dict(color='rgb(33,113,181)', dash='dash'), yaxis = '', xaxis = ''),
        scatter3 = dict(x = [], y = [], name = '', line = dict(color='rgb(107,174,214)', dash='dash'), yaxis = '', xaxis = '')
                    )
    qty_color = 'blue'
    prof_qty_color = 'rgb(8,48,107)'
    bet_type_list = ['fixed', 'divk']
    bet_size_list = ['1', '1/K']
    strategy_list = ['simple', 'complex']
    strategy_name_list = ['threshold', 'pred*k']
    domain_list = [[0.55, 1], [0., 0.5]]
    layout_dict = {}
    data_list = []
    title_text = f'Profit & bet qty for validation model in neptune.ai: {model_name}'
    for cnt_str, strategy in enumerate(strategy_list):
        for cnt_bet, bet_type in enumerate(bet_type_list):
            y_anchor = str((9 * cnt_str) + (cnt_bet*3) + 1)
            threshold, profit, bet_qty_list = get_profit_curve(
                y_true, x_predicted, Line_production,
                bet_type = bet_type, strategy = strategy, reverse_bet = reverse_bet)
            xaxis_num = str((2 * cnt_str) + (cnt_bet + 1))
            layout_dict.update({
                    'xaxis' + xaxis_num:{
                        'domain':[0.5 * cnt_bet, 0.5 * cnt_bet + 0.5],
                        'title':f'bet={bet_size_list[cnt_bet]}, strategy:{strategy_name_list[cnt_str]}',
                        'anchor':'y' + y_anchor}
                                })
            for cnt_scatter in range(1, 4):
                yaxis_num = str((9 * cnt_str) + (cnt_bet*3) + cnt_scatter)
                scatter_num = 'scatter' + str(cnt_scatter)
                scatters_dicts[scatter_num]['x'] = threshold
                if cnt_scatter == 1:
                    scatters_dicts[scatter_num]['y'] = profit
                    scatters_dicts[scatter_num]['name'] = 'profit_' +xaxis_num
                    scatters_dicts[scatter_num]['xaxis'] = 'x' + xaxis_num
                    scatters_dicts[scatter_num]['yaxis'] = 'y' + yaxis_num
                    layout_dict.update({
                        'yaxis' + yaxis_num:{
                            'domain':domain_list[cnt_str],
                            'title':'', #'Profit',
                            'range':[-10,int(max(profit) * 1.1)],
                            'anchor':'x' + xaxis_num
                    }})
                elif cnt_scatter == 2:
                    scatters_dicts[scatter_num]['y'] = np.array(profit) / np.array(bet_qty_list)
                    scatters_dicts[scatter_num]['name'] = 'profit_' + xaxis_num
                    scatters_dicts[scatter_num]['xaxis'] = 'x' + xaxis_num
                    scatters_dicts[scatter_num]['yaxis'] = 'y' + yaxis_num
                    layout_dict.update({
                        'yaxis' + yaxis_num:{
                            'domain':domain_list[cnt_str],
                            'title':'', 'zeroline':True,
                            'side':'right', 'anchor':'x' + xaxis_num,
                            'overlaying':'y' + y_anchor}})
                elif cnt_scatter == 3:
                    scatters_dicts[scatter_num]['y'] = np.array(bet_qty_list) / 1000
                    scatters_dicts[scatter_num]['name'] = 'profit_' + xaxis_num
                    scatters_dicts[scatter_num]['xaxis'] = 'x' + xaxis_num
                    scatters_dicts[scatter_num]['yaxis'] = 'y' + yaxis_num
                    layout_dict.update({
                        'yaxis' + yaxis_num:{
                            'domain':domain_list[cnt_str],
                            'visible':False, 'showgrid':True,
                            'side':'right', 'anchor':'x' + xaxis_num,
                            'overlaying':'y' + y_anchor}})
                data_list += [go.Scatter(**scatters_dicts['scatter' + str(cnt_scatter)])]
    layout_dict.update({
        'width':1400,
        'height':800,
        'title_x':0.5,
        'title_text':title_text,
        'paper_bgcolor':'rgb(229, 237, 247)',
        'plot_bgcolor':'rgb(229, 237, 247)',
        'showlegend':False 
                        })
    layout = go.Layout(**layout_dict)
    return go.Figure(data=data_list, layout=layout)
     

In [16]:
def get_profit_curve(y, y_pred, Line_production, bet_type = 'fixed', strategy = 'simple', reverse_bet = False):
    #fixed, divk, divk-1
    #simple, complex
    threshold = []
    profit = []
    bet_qty_list = []
    if bet_type == 'divk':
        profit_size = (Line_production - 1) / Line_production
        bet_size = Line_production
    else:
        profit_size = (Line_production - 1)
        bet_size = Line_production /Line_production
    for th in np.linspace(0,1,1001):
        threshold.append(th)
        if reverse_bet:
            if strategy == 'simple':
                vector_th = y_pred < th
            if strategy == 'complex':
                vector_th = (y_pred * Line_production / 10) < th
            bet_qty_list.append(vector_th.sum())
            profit.append(((-1) * (y[vector_th] - 1) * profit_size[vector_th] + ((-1) * y[vector_th]) / bet_size[vector_th]).sum())
        else:            
            if strategy == 'simple':
                vector_th = y_pred > th
            if strategy == 'complex':
                vector_th = (y_pred * Line_production / 10) > th
            bet_qty_list.append(vector_th.sum())
            profit.append((y[vector_th] * profit_size[vector_th] + (y[vector_th] - 1) / bet_size[vector_th]).sum())

    return threshold, profit, bet_qty_list

In [17]:
def plot_equity_timeline(y_true, y_pred, Line_production,  th, model_name, x_date = np.empty(()), bet_type = 'fixed', 
                strategy = 'simple', data_split = 'validation', reverse_bet = False):\
    #fixed, divk, divk-1
    #simple, complex
    if bet_type == 'divk':
        profit_size = (Line_production - 1) / Line_production
        bet_size = Line_production
    else:
        profit_size = (Line_production - 1)
        bet_size = Line_production /Line_production
    if strategy == 'simple':
        vector_th = y_pred > th
    if strategy == 'complex':
        vector_th = (y_pred * Line_production / 10) > th
    mean_bet = np.mean(1/bet_size[vector_th])
    y = (y_true[vector_th] * profit_size[vector_th] + (y_true[vector_th] - 1) / bet_size[vector_th]).cumsum()
    bet_qty = vector_th.sum()
    title_text = f'Equity Curve {data_split} | threshold={th} | bet_type:{bet_type} | strategy:{strategy}<br>' + \
                f'bet_mean: {np.round(mean_bet, 2)} | ROI: {np.round(y[-1] * 100 / np.sum(1/bet_size[vector_th]), 4)}%' + \
                f' | Bet quantity: {bet_qty}<br>' + f'Model in neptune.ai: FOOT-{model_name}'
    layout_dict = {}
    #fig = go.Figure()
    if x_date.shape:
        trace_equity = go.Scatter(y = y, x= x_date[vector_th], fill='tozeroy', xaxis = 'x1') #'toself'
        layout_dict.update({'xaxis1':{'anchor':'y1', 'showgrid':True, 'ticklabelmode':'period', 'tickformat':'%d\n%b\n%Y'}})
    else:
        trace_equity = go.Scatter(y = y, fill='tozeroy', xaxis = 'x1') #'toself'

    layout_dict.update({
    'width':1400,
    'height':600,
    'title_x':0.5,
    'title_text':title_text,
    'paper_bgcolor':'rgb(229, 237, 247)',
    'plot_bgcolor':'rgb(229, 237, 247)',
    'showlegend':False,
                    })
    data_list = [trace_equity] #, trace_time_dd]
    layout = go.Layout(**layout_dict)
    return go.Figure(data=data_list, layout=layout)

In [18]:
def plot_equity(y_true, y_pred, Line_production, th, model_name, bet_type = 'fixed', 
                strategy = 'simple', data_split = 'validation', reverse_bet = False):\
    #fixed, divk, divk-1
    #simple, complex
    if bet_type == 'divk':
        profit_size = (Line_production - 1) / Line_production
        bet_size = Line_production
    else:
        profit_size = (Line_production - 1)
        bet_size = Line_production /Line_production
    if strategy == 'simple':
        vector_th = y_pred > th
    if strategy == 'complex':
        vector_th = (y_pred * Line_production / 10) > th
    mean_bet = np.mean(1/bet_size[vector_th])
    y = (y_true[vector_th] * profit_size[vector_th] + (y_true[vector_th] - 1) / bet_size[vector_th]).cumsum()
    bet_qty = vector_th.sum()
    title_text = f'Equity Curve {data_split} | threshold={th} | bet_type:{bet_type} | strategy:{strategy}<br>' + \
                f'bet_mean: {np.round(mean_bet, 2)} | ROI: {np.round(y[-1] * 100 / np.sum(1/bet_size[vector_th]), 4)}%' + \
                f' | Bet quantity: {bet_qty}<br>' + f'Model in neptune.ai: FOOT-{model_name}'
    layout_dict = {}
    #fig = go.Figure()
    trace_equity = go.Scatter(y = y, fill='tozeroy', xaxis = 'x1', yaxis = 'y1') #'toself'
    layout_dict.update({'xaxis1':{'anchor':'y1'}, 'yaxis1':{'domain':[0.4, 1]}})
    max_profit = np.maximum.accumulate(y)
    trace_drawdown =  go.Scatter(y = (y - max_profit) / mean_bet, fill='tozeroy', xaxis = 'x2', yaxis = 'y2')
    layout_dict.update({'xaxis2':{'anchor':'y2'}, 'yaxis2':{'domain':[0., 0.35], 'title':'drawdown inmean(bet)'}})

    #cnt = 0
    #dd = []
    #for var1 in (y - max_profit):
    #    if var1 < 0:
    #        cnt += 1
    #    else:
    #        cnt = 0
    #    dd.append(cnt)
    #trace_time_dd = go.Bar(y = dd, xaxis = 'x3', yaxis = 'y3')
    #layout_dict.update({'xaxis3':{'anchor':'y3', 'title':'time from last max in bets'}, 'yaxis3':{'domain':[0., 0.2]}})

    layout_dict.update({
    'width':1400,
    'height':800,
    'title_x':0.5,
    'title_text':title_text,
    'paper_bgcolor':'rgb(229, 237, 247)',
    'plot_bgcolor':'rgb(229, 237, 247)',
    'showlegend':False 
                    })
    data_list = [trace_equity, trace_drawdown] #, trace_time_dd]
    layout = go.Layout(**layout_dict)
    return go.Figure(data=data_list, layout=layout)

#### Load Data

1. регрессия
2. мультиклассовая класификация {AWAY:0,DRAW:1, HOME:2} 
3. бинарная классификация: <br>
    a. HOME vs (DRAW & AWAY)<br>
    б. DRAW vs (HOME & AWAY)<br>
    в. AWAY vs (HOME & DRAW)<br>

In [19]:
dataset_name = './dataset.npz'
data_npz = np.load(dataset_name)
X_train, X_test, X_holdout = data_npz['X_train'], data_npz['X_test'], data_npz['X_holdout']

In [20]:
#'Active.1', 
#'W1', 'WX', 'W2', 
#'X1', 'X2', 'W12', 
#'TotalValue', 'Over', 'Under', 
#'Hand1Value', 'H1', 'H2'

In [21]:
data_npz.files

['X_train',
 'X_test',
 'X_holdout',
 'y_train_bin',
 'y_test_bin',
 'y_holdout_bin',
 'y_train_multi',
 'y_test_multi',
 'y_holdout_multi',
 'y_train_diff',
 'y_test_diff',
 'y_holdout_diff',
 'y_train_regression1',
 'y_train_regression2',
 'y_test_regression1',
 'y_test_regression2',
 'y_holdout_regression1',
 'y_holdout_regression2',
 'score1_train',
 'score2_train',
 'result1_train',
 'result2_train',
 'score1_test',
 'score2_test',
 'result1_test',
 'result2_test',
 'score1_holdout',
 'score2_holdout',
 'result1_holdout',
 'result2_holdout',
 'K_train',
 'K_test',
 'K_holdout']

In [22]:
#@title Выбор таргета
target_type = "regression" #@param ["regression", "multiclass", "binary_home", "binary_draw", "binary_away"]
if target_type == 'regression':
    scores_list = ['score1_train', 'score2_train', 'result1_train', 'result2_train', 'score1_test', 
    'score2_test', 'result1_test', 'result2_test', 'score1_holdout', 'score2_holdout',
    'result1_holdout', 'result2_holdout']
    scores_dict = {}
    for item in scores_list:
        scores, data_type, team = item.split('_')[0][:-1], item.split('_')[1], item.split('_')[0][-1]
        if scores in scores_dict:
            if data_type in scores_dict[scores]:
                scores_dict[scores][data_type][team] = data_npz[item]
            else:
                scores_dict[scores].update({data_type:{team:data_npz[item]}})
        else:
            scores_dict[scores] = {data_type:{team:data_npz[item]}}

    y_train, line_train, active_train =  data_npz['y_train_multi'], data_npz['K_train'][:,1:], data_npz['K_train'][:,0] == 1
    y_train_regr1 = data_npz['y_train_regression1']
    y_train_regr2 = data_npz['y_train_regression2']
    y_test, line_test, active_test  =  data_npz['y_test_multi'], data_npz['K_test'][:,1:], data_npz['K_test'][:,0] == 1
    y_test_regr1 = data_npz['y_test_regression1']
    y_test_regr2 = data_npz['y_test_regression2']
    y_holdout, line_holdout, active_holdout = data_npz['y_holdout_multi'], data_npz['K_holdout'][:,1:], data_npz['K_holdout'][:,0] == 1
    y_holdout_regr1 = data_npz['y_holdout_regression1']
    y_holdout_regr2 = data_npz['y_holdout_regression2']
    zero_train, zero_test, zero_holdout = line_train > 1, line_test > 1, line_holdout > 1

    model_head = 'FOOT-LIVEBST1'
    model_head2 = 'FOOT-LIVEBST2'
elif target_type == 'multiclass':
    y_train, line_train, active_train  =  data_npz['y_train_multi'], data_npz['K_train'][:,1:4], data_npz['K_train'][:,0] == 1
    y_test, line_test, active_test  =  data_npz['y_test_multi'], data_npz['K_test'][:,1:4], data_npz['K_test'][:,0] == 1
    y_holdout, line_holdout, active_holdout = data_npz['y_holdout_multi'], data_npz['K_holdout'][:,1:4], data_npz['K_holdout'][:,0] == 1
    zero_train, zero_test, zero_holdout = line_train > 1, line_test > 1, line_holdout > 1
    model_head = 'FOOT-LIVEMC'
elif target_type == 'binary_home':
    kf_col = 1
    kf_res = 2
    y_train, line_train, active_train  =  1 * (data_npz['y_train_multi'] == kf_res), data_npz['K_train'][:,kf_col], data_npz['K_train'][:,0] == 1
    y_test, line_test, active_test  =  1 * (data_npz['y_test_multi'] == kf_res), data_npz['K_test'][:,kf_col], data_npz['K_test'][:,0] == 1
    y_holdout, line_holdout, active_holdout = 1 * (data_npz['y_holdout_multi'] == kf_res), data_npz['K_holdout'][:,kf_col], data_npz['K_holdout'][:,0] == 1
    zero_train, zero_test, zero_holdout = line_train > 1, line_test > 1, line_holdout > 1
    model_head = 'FOOT-LIVEBC'
elif target_type == 'binary_draw':
    kf_col = 2
    kf_res = 1
    y_train, line_train, active_train  =  1 * (data_npz['y_train_multi'] == kf_res), data_npz['K_train'][:,kf_col], data_npz['K_train'][:,0] == 1
    y_test, line_test, active_test  =  1 * (data_npz['y_test_multi'] == kf_res), data_npz['K_test'][:,kf_col], data_npz['K_test'][:,0] == 1
    y_holdout, line_holdout, active_holdout = 1 * (data_npz['y_holdout_multi'] == kf_res), data_npz['K_holdout'][:,kf_col], data_npz['K_holdout'][:,0] == 1
    zero_train, zero_test, zero_holdout = line_train > 1, line_test > 1, line_holdout > 1
    model_head = 'FOOT-LIVEBCDRAW'
elif target_type == 'binary_away':
    kf_col = 3
    kf_res = 0
    y_train, line_train, active_train  =  1 * (data_npz['y_train_multi'] == kf_res), data_npz['K_train'][:,kf_col], data_npz['K_train'][:,0] == 1
    y_test, line_test, active_test  =  1 * (data_npz['y_test_multi'] == kf_res), data_npz['K_test'][:,kf_col], data_npz['K_test'][:,0] == 1
    y_holdout, line_holdout, active_holdout = 1 * (data_npz['y_holdout_multi'] == kf_res), data_npz['K_holdout'][:,kf_col], data_npz['K_holdout'][:,0] == 1
    zero_train, zero_test, zero_holdout = line_train > 1, line_test > 1, line_holdout > 1
    model_head = 'FOOT-LIVEBCAWAY'
#---------------
process = psutil.Process(os.getpid())
print('Mem usage: ', round(process.memory_info().rss / 1024 ** 3, 2), 'GiB')  # in bytes 
#---------------

Mem usage:  3.2 GiB


In [23]:
#dataset_name = './dataset.npz'
#data_npz = np.load(dataset_name)
#X_train, X_test = data_npz['X_train'], data_npz['X_test']
#y_train1, y_test1 = data_npz['y_train_regression1'], data_npz['y_test_regression1']
#y_train2, y_test2 = data_npz['y_train_regression2'], data_npz['y_test_regression2']

In [24]:
X_train.shape, X_test.shape, X_holdout.shape

((11197708, 41), (2798988, 41), (144925, 41))

In [25]:
y_train.shape, y_test.shape, y_holdout.shape

((11197708,), (2798988,), (144925,))

In [26]:
cols = [element for element in
params['features'].replace('[', '').replace(']','').replace(' ','').replace("'","").split(',')]

#### Download models and restore predicts

In [27]:
# Модель для 1-ой команды
model_num = 3 # Указываем номер модели
neptune_model = model_head
neptune_model_version = neptune_model + f'-{model_num}'
model_version_params = dict(
    project = 'scomesse/football',
    model = neptune_model,
    api_token = api_key,
    with_id = neptune_model_version
)
PATH_TO_MODEL = './booster.model'
model_version = neptune.init_model_version(**model_version_params)
model_version['model'].download(PATH_TO_MODEL)
params1 = model_version['team_parameters'].fetch()
model_version.stop()

https://app.neptune.ai/scomesse/football/m/FOOT-LIVEBST1/v/FOOT-LIVEBST1-3
Remember to stop your model_version once you’ve finished logging your metadata (https://docs.neptune.ai/api/model_version#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
Shutting down background jobs, please wait a moment...
Done!
All 0 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/scomesse/football/m/FOOT-LIVEBST1/v/FOOT-LIVEBST1-3/metadata


In [28]:
if target_type == 'regression':
    # Модель для 2-ой команды
    model_num = 4 # Указываем номер модели
    neptune_model = model_head2
    neptune_model_version = neptune_model + f'-{model_num}'
    model_version_params2 = dict(
        project = 'scomesse/football',
        model = neptune_model,
        api_token = api_key,
        with_id = neptune_model_version
    )
    PATH_TO_MODEL = './booster2.model'
    model_version = neptune.init_model_version(**model_version_params2)
    model_version['model'].download(PATH_TO_MODEL)
    params2 = model_version['team_parameters'].fetch()
    model_version.stop()

https://app.neptune.ai/scomesse/football/m/FOOT-LIVEBST2/v/FOOT-LIVEBST2-4
Remember to stop your model_version once you’ve finished logging your metadata (https://docs.neptune.ai/api/model_version#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
Shutting down background jobs, please wait a moment...
Done!
All 0 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/scomesse/football/m/FOOT-LIVEBST2/v/FOOT-LIVEBST2-4/metadata


In [29]:
if target_type == 'regression':
    preds_dict = {}
    booster = CatBoost()
    booster.load_model('./booster.model')
    preds_dict['train'] = {1:booster.predict(X_train) * 21.0}
    preds_dict['test'] = {1:booster.predict(X_test) * 21.0}
    preds_dict['holdout'] = {1:booster.predict(X_holdout) * 21.0}

    booster2 = CatBoost()
    booster2.load_model('./booster2.model')
    preds_dict['train'].update({2:booster2.predict(X_train) * 21.0})
    preds_dict['test'].update({2:booster2.predict(X_test) * 21.0})
    preds_dict['holdout'].update({2:booster2.predict(X_holdout) * 21.0})

elif target_type == 'multiclass':
    booster = CatBoost()
    booster.load_model('./booster.model')
    train_preds_int = booster.predict(X_train, prediction_type="Class")
    test_preds_int = booster.predict(X_test, prediction_type="Class")
    holdout_preds_int = booster.predict(X_holdout, prediction_type="Class")
    print("Train Accuracy : % 4f"% eval_metric(y_train, train_preds_int, "Accuracy")[0])
    print("Test  Accuracy : %.4f"%eval_metric(y_test, test_preds_int, "Accuracy")[0])
    print("Holdout  Accuracy : %.4f"%eval_metric(y_holdout, holdout_preds_int, "Accuracy")[0])
    train_preds = np.flip(booster.predict(X_train, prediction_type="Probability"), axis = 1)
    test_preds = np.flip(booster.predict(X_test, prediction_type="Probability"), axis = 1)
    holdout_preds = np.flip(booster.predict(X_holdout, prediction_type="Probability"), axis = 1)
elif target_type == 'binary_home':
    booster = CatBoost()
    booster.load_model('./booster.model')
    train_preds = booster.predict(X_train, prediction_type="Probability")
    test_preds = booster.predict(X_test, prediction_type="Probability")
    holdout_preds = booster.predict(X_holdout, prediction_type="Probability")
    print("Train Accuracy : % 4f"% eval_metric(y_train, train_preds, "Accuracy")[0])
    print("Test  Accuracy : %.4f"%eval_metric(y_test, test_preds, "Accuracy")[0])
    print("Holdout  Accuracy : %.4f"%eval_metric(y_holdout, holdout_preds, "Accuracy")[0])
    train_preds = train_preds[:,1]
    test_preds = test_preds[:,1]
    holdout_preds = holdout_preds[:,1]
elif target_type == 'binary_draw':
    booster = CatBoost()
    booster.load_model('./booster.model')
    train_preds = booster.predict(X_train, prediction_type="Probability")
    test_preds = booster.predict(X_test, prediction_type="Probability")
    holdout_preds = booster.predict(X_holdout, prediction_type="Probability")
    print("Train Accuracy : % 4f"% eval_metric(y_train, train_preds, "Accuracy")[0])
    print("Test  Accuracy : %.4f"%eval_metric(y_test, test_preds, "Accuracy")[0])
    print("Holdout  Accuracy : %.4f"%eval_metric(y_holdout, holdout_preds, "Accuracy")[0])
    train_preds = train_preds[:,1]
    test_preds = test_preds[:,1]
    holdout_preds = holdout_preds[:,1]
elif target_type == 'binary_away':
    booster = CatBoost()
    booster.load_model('./booster.model')
    train_preds = booster.predict(X_train, prediction_type="Probability")
    test_preds = booster.predict(X_test, prediction_type="Probability")
    holdout_preds = booster.predict(X_holdout, prediction_type="Probability")
    print("Train Accuracy : % 4f"% eval_metric(y_train, train_preds, "Accuracy")[0])
    print("Test  Accuracy : %.4f"%eval_metric(y_test, test_preds, "Accuracy")[0])
    print("Holdout  Accuracy : %.4f"%eval_metric(y_holdout, holdout_preds, "Accuracy")[0])
    train_preds = train_preds[:,1]
    test_preds = test_preds[:,1]
    holdout_preds = holdout_preds[:,1]
#---------------
process = psutil.Process(os.getpid())
print('Mem usage: ', round(process.memory_info().rss / 1024 ** 3, 2), 'GiB')  # in bytes 
#---------------

Mem usage:  3.49 GiB


In [30]:
active_train.shape,  line_train.shape

((11197708,), (11197708, 12))

In [31]:
active_rows_dict ={}
if 'binary' in target_type:
    active_rows_dict['train'] = (active_train & (line_train > 0))
    active_rows_dict['test'] = (active_test & (line_test > 0))
    active_rows_dict['holdout'] = (active_holdout & (line_holdout > 0))
else:
    active_rows_dict['train'] = (active_train & (line_train[:,:3].sum(axis = 1) > 0))
    active_rows_dict['test'] = (active_test & (line_test[:,:3].sum(axis = 1) > 0))
    active_rows_dict['holdout'] = (active_holdout & (line_holdout[:,:3].sum(axis = 1) > 0))

In [32]:
if target_type == 'regression':
    poisson_dict = {}
    for name in ['train', 'test', 'holdout']:
        poisson_dict[name] = {}
        poisson_dict[name][1] = {}
        poisson_dict[name][2] = {}
        for goal in range(7):
            poisson_dict[name][1][goal] = poisson.pmf(goal, preds_dict[name][1][active_rows_dict[name]])
            poisson_dict[name][2][goal] = poisson.pmf(goal, preds_dict[name][2][active_rows_dict[name]])

    # Считаем вероятности ничьих для разных стейтов
    draw_prob_matrix_dict = {}
    for name in ['train', 'test', 'holdout']:
        draw_prob_matrix_dict[name] = np.zeros((np.sum(active_rows_dict[name]), 13))
        for goal1 in range(7):
            for goal2 in range(7):
                draw_prob_matrix_dict[name][:, goal1 - goal2 + 6] = draw_prob_matrix_dict[name][:, goal1 - goal2 + 6] + \
                poisson_dict[name][1][goal2] * poisson_dict[name][2][goal1]

    # Считаем вероятности победы дома для разных стейтов 
    home_prob_matrix_dict = {}
    for name in ['train', 'test', 'holdout']:
        home_prob_matrix_dict[name] = np.hstack((
            np.zeros(np.sum(active_rows_dict[name])).reshape(-1,1), 
            np.cumsum(draw_prob_matrix_dict[name], axis = 1)
                                            ))[:, :-1]

    # Считаем вероятности победы в гостях для разных стейтов
    away_prob_matrix_dict = {}
    for name in ['train', 'test', 'holdout']:
        away_prob_matrix_dict[name] = np.hstack((
            np.flip(np.cumsum(np.flip(draw_prob_matrix_dict[name], axis = 1), axis = 1), axis = 1), 
            np.zeros(np.sum(active_rows_dict[name])).reshape(-1,1)
                                        ))[:, 1:]
                            
    current_state_dict = {}
    for name in ['train', 'test', 'holdout']:
        current_state_dict[name] = scores_dict['score'][name]['1'][active_rows_dict[name]] - \
                                scores_dict['score'][name]['2'][active_rows_dict[name]] + 6

    prob_dict = {}
    for name in ['train', 'test', 'holdout']:
        prob_dict[name] = np.hstack((
            np.take_along_axis(home_prob_matrix_dict[name], current_state_dict[name].reshape(-1, 1), axis = 1),
            np.take_along_axis(draw_prob_matrix_dict[name], current_state_dict[name].reshape(-1, 1), axis = 1),
            np.take_along_axis(away_prob_matrix_dict[name], current_state_dict[name].reshape(-1, 1), axis = 1)
        ))

In [80]:
# Считаем вероятности суммы забитых мячей
total_matrix_dict = {}
for name in ['train', 'test', 'holdout']:
    total_matrix_dict[name] = np.zeros((np.sum(active_rows_dict[name]), 13))
    for goal1 in range(7):
        for goal2 in range(7):
            total_matrix_dict[name][:, goal1 + goal2] = total_matrix_dict[name][:, goal1 + goal2] + \
            poisson_dict[name][1][goal2] * poisson_dict[name][2][goal1]

In [61]:
# Считаем текущей стэйт суммы забитых мячей
total_state_dict = {}
for name in ['train', 'test', 'holdout']:
    total_state_dict[name] = scores_dict['score'][name]['1'][active_rows_dict[name]] + \
                            scores_dict['score'][name]['2'][active_rows_dict[name]]

In [None]:
# Считаем вероятности забить не менее определенного количества мячей
over_matrix_dict = {}
for name in ['train', 'test', 'holdout']:
    over_matrix_dict[name] = np.hstack((
        np.flip(np.cumsum(np.flip(total_matrix_dict[name], axis = 1), axis = 1), axis = 1), 
        np.zeros(np.sum(active_rows_dict[name])).reshape(-1,1)
                                    ))[:, 1:]

In [88]:
# Считаем вероятности забить не менее определенного количества мячей
over_matrix_dict = {}
for name in ['train', 'test', 'holdout']:
    over_matrix_dict[name] = np.hstack((
        np.zeros(np.sum(active_rows_dict[name])).reshape(-1,1), 
        np.cumsum(total_matrix_dict[name][:,1:], axis = 1)
                                        ))[:, :-1]

In [None]:
# Считаем вероятности забить не более определенного количества
under_matrix_dict = {}
for name in ['train', 'test', 'holdout']:
    under_matrix_dict[name] = np.cumsum(total_matrix_dict[name], axis = 1)

In [None]:
# Считаем вероятности победы дома для разных стейтов 
under_matrix_dict = {}
for name in ['train', 'test', 'holdout']:
    under_matrix_dict[name] =

np.cumsum(under_matrix_dict[name], axis = 1)
                                        ))[:, :-1]

In [82]:
total_matrix_dict['holdout'][:,:5]

array([[0.06537365, 0.17831552, 0.24318992, 0.22111119, 0.15077772],
       [0.07403912, 0.1927358 , 0.25086123, 0.21767744, 0.14166239],
       [0.07737223, 0.19800539, 0.2533605 , 0.21612727, 0.1382743 ],
       ...,
       [0.1075199 , 0.2397779 , 0.26736188, 0.19874607, 0.11080488],
       [0.11156932, 0.2446837 , 0.26830903, 0.19614367, 0.10754112],
       [0.11707536, 0.25111933, 0.26931762, 0.19255648, 0.1032554 ]])

In [90]:
over_matrix_dict['holdout'][:,1:6]

array([[0.17831552, 0.42150543, 0.64261663, 0.79339434, 0.87564769],
       [0.1927358 , 0.44359703, 0.66127447, 0.80293686, 0.87669088],
       [0.19800539, 0.45136589, 0.66749316, 0.80576746, 0.87653977],
       ...,
       [0.2397779 , 0.50713978, 0.70588585, 0.81669072, 0.86611146],
       [0.2446837 , 0.51299273, 0.7091364 , 0.81667752, 0.8638474 ],
       [0.25111933, 0.52043695, 0.71299343, 0.81624883, 0.86054411]])

In [44]:
scores_dict['score']['holdout']['1'][active_rows_dict[name]] + \
scores_dict['score']['holdout']['2'][active_rows_dict[name]]

array([0, 0, 0, ..., 2, 2, 2], dtype=int8)

In [39]:
line_holdout[:,6][active_rows_dict[name]] - \
scores_dict['score']['holdout']['1'][active_rows_dict[name]] - \
scores_dict['score']['holdout']['2'][active_rows_dict[name]]

array([3. , 2.5, 2.5, ..., 2.5, 2.5, 2.5], dtype=float16)

In [37]:
scores_dict['score']['holdout']['2'][active_rows_dict[name]]

array([0, 0, 0, ..., 2, 2, 2], dtype=int8)

In [100]:
line_test[:, 6:9].shape

(2798988, 3)

In [104]:
pd.DataFrame(line_test[:,6][active_rows_dict['test']]).value_counts()

2.5     208582
2.0     156028
1.5     123033
3.0     122494
3.5      97906
4.0      43239
4.5      24536
1.0      13160
5.0       9544
5.5       5491
6.0       2218
6.5       1175
7.0        402
7.5        170
8.0         91
8.5         60
0.5         44
9.0          9
10.0         4
10.5         4
11.0         4
9.5          4
0.0          3
dtype: int64

In [49]:
y_holdout_regr1[active_rows_dict[name]] * 21 + \
y_holdout_regr2[active_rows_dict[name]] * 21 + \
scores_dict['score']['holdout']['1'][active_rows_dict[name]] + \
scores_dict['score']['holdout']['2'][active_rows_dict[name]]

array([2., 2., 2., ..., 2., 2., 2.])

In [51]:
 current_state_dict['holdout']

array([6, 6, 6, ..., 4, 4, 4], dtype=int8)

In [42]:
y_holdout_regr1[active_rows_dict[name]] * 21 + \
y_holdout_regr2[active_rows_dict[name]] * 21

array([2., 2., 2., ..., 0., 0., 0.])

In [35]:
line_holdout[:,6][active_rows_dict[name]] - scores_dict['score']['holdout']['1'][active_rows_dict[name]]  - scores_dict['score']['holdout']['2'][active_rows_dict[name]]

array([3. , 2.5, 2.5, ..., 2.5, 2.5, 2.5], dtype=float16)

In [36]:
 #current_state_dict['holdout']

In [37]:
create_table = False
if create_table:
    add_data_name = './additional_data.npz'
    add_data = np.load(add_data_name)
    id_train, id_test, id_holdout = add_data['id_train'], add_data['id_test'], add_data['id_holdout']
    min_train, min_test, min_holdout = add_data['min_train'], add_data['min_test'], add_data['min_holdout']
    df = pd.DataFrame(id_test[active_rows_dict['test']], columns = ['id'])
    df['min'] = (min_test[active_rows_dict['test']] * 50).astype(np.int8)
    df[['W1', 'WX', 'W2']] = line_test[active_rows_dict['test'], :3]
    df['result'] = 2 - y_test[active_rows_dict['test']]
    if target_type == 'regression':
        df[['PR1', 'PRX', 'PR2']] = prob_dict['test']
    elif target_type == 'multiclass':
        df[['MC1', 'MCX', 'MC2']] = test_preds[active_rows_dict['test']]
    elif target_type == 'binary_home':
        df['BC1'] = test_preds[active_rows_dict['test']]
    elif target_type == 'binary_draw':
        df['BCX'] = test_preds[active_rows_dict['test']]
    elif target_type == 'binary_away':
        df['BC2'] = test_preds[active_rows_dict['test']]
        df.to_csv('./probabilities.csv', index = False)
    #---------------
    process = psutil.Process(os.getpid())
    print('Mem usage: ', round(process.memory_info().rss / 1024 ** 3, 2), 'GiB')  # in bytes 
    #---------------

In [97]:
create_timeline = True
if create_timeline:
    time_df = pd.read_csv('./time.csv', index_col = 'Id', parse_dates = ['StatTime'])
    add_data_name = './additional_data.npz'
    add_data = np.load(add_data_name)
    id_test = add_data['id_test']
    time_df = time_df[time_df.index.isin(id_test)]
    time_dict = time_df.to_dict()
    timeline_test_array = np.sort([np.datetime64(time_dict['StatTime'][id]) for id in id_test])
    #---------------
    process = psutil.Process(os.getpid())
    print('Mem usage: ', round(process.memory_info().rss / 1024 ** 3, 2), 'GiB')  # in bytes 
    #---------------

Mem usage:  7.13 GiB


In [101]:
timeline_test_array.shape

(2798988,)

#### F1 score

##### F1 sore train

In [None]:
if target_type == 'multiclass':
    weighted = True
else:
    weighted = False
fig = plot_f1(
    y_train,
    train_preds[:,0],
    data_split = 'train',
    weighted = weighted
)
model_version = neptune.init_model_version(**model_version_params)
model_version['f1_train'].upload(neptune.types.File.as_html(fig))
model_version.stop()

https://app.neptune.ai/scomesse/football/m/FOOT-LIVEMC/v/FOOT-LIVEMC-2
Remember to stop your model_version once you’ve finished logging your metadata (https://docs.neptune.ai/api/model_version#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 1 operations to synchronize with Neptune. Do not kill this process.
All 1 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/scomesse/football/m/FOOT-LIVEMC/v/FOOT-LIVEMC-2/metadata


##### F1 score test

In [None]:
if target_type == 'multiclass':
    weighted = True
else:
    weighted = False
fig = plot_f1(
    y_test,
    test_preds,
    data_split = 'test',
    weighted = weighted
)
model_version = neptune.init_model_version(**model_version_params)
model_version['f1_test'].upload(neptune.types.File.as_html(fig))
model_version.stop()

https://app.neptune.ai/scomesse/football/m/FOOT-LIVEMC/v/FOOT-LIVEMC-2
Remember to stop your model_version once you’ve finished logging your metadata (https://docs.neptune.ai/api/model_version#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 1 operations to synchronize with Neptune. Do not kill this process.
All 1 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/scomesse/football/m/FOOT-LIVEMC/v/FOOT-LIVEMC-2/metadata


#### Confusion Multilabel Matrix

In [None]:
if target_type == 'multiclass':
    fig = plot_multi_matrix(y_train, train_preds_int[:,0], data_split = 'train', threshold = 0.5)
elif target_type == 'regression':
    train_preds_int = calculate_multiclass(prob_dict['train'], line_train[active_rows_dict['train'],:3])['argmax']
    fig = plot_multi_matrix(y_train[active_rows_dict['train']], train_preds_int, data_split = 'train')
model_version = neptune.init_model_version(**model_version_params)
model_version[f'confusion_matrix_train_0.5'].upload(neptune.types.File.as_html(fig))
model_version.stop()

In [None]:
if target_type == 'multiclass':
    fig = plot_multi_matrix(y_test, test_preds_int[:,0], data_split = 'test', threshold = 0.5)
elif target_type == 'regression':
    test_preds_int = calculate_multiclass(prob_dict['test'], line_test[active_rows_dict['test'],:3])['argmax']
    fig = plot_multi_matrix(y_test[active_rows_dict['test']], test_preds_int, data_split = 'test')

model_version = neptune.init_model_version(**model_version_params)
model_version[f'confusion_matrix_test_0.5'].upload(neptune.types.File.as_html(fig))
model_version.stop()

In [None]:
if target_type == 'multiclass':
    fig = plot_multi_matrix(y_holdout, holdout_preds_int[:,0], data_split = 'holdout', threshold = 0.5)
elif target_type == 'regression':
    holdout_preds_int = calculate_multiclass(prob_dict['holdout'], line_holdout[active_rows_dict['holdout'],:3])['argmax']
    fig = plot_multi_matrix(y_holdout[active_rows_dict['holdout']], holdout_preds_int, data_split = 'holdout')

model_version = neptune.init_model_version(**model_version_params)
model_version[f'confusion_matrix_holdout_0.5'].upload(neptune.types.File.as_html(fig))
model_version.stop()

In [None]:
weight = 1.432

In [None]:
if target_type == 'multiclass':
    fig = plot_multi_matrix(y_train, (train_preds * [1., weight, 1.]).argmax(axis = 1), data_split = 'train', threshold = weight)
    model_version = neptune.init_model_version(**model_version_params)
    model_version[f'confusion_matrix_train_{weight}'].upload(neptune.types.File.as_html(fig))
    model_version.stop()

In [None]:
if target_type == 'multiclass':
    fig = plot_multi_matrix(y_test, (test_preds * [1., weight, 1.]).argmax(axis = 1), data_split = 'test', threshold = weight)

    model_version = neptune.init_model_version(**model_version_params)
    model_version[f'confusion_matrix_test_{weight}'].upload(neptune.types.File.as_html(fig))
    model_version.stop()

In [None]:
if target_type == 'multiclass':
    fig = plot_multi_matrix(y_holdout, (holdout_preds * [1., weight, 1.]).argmax(axis = 1), data_split = 'holdout', threshold = weight)

    model_version = neptune.init_model_version(**model_version_params)
    model_version[f'confusion_matrix_holdout_{weight}'].upload(neptune.types.File.as_html(fig))
    model_version.stop()

#### Confusion Matrix

In [None]:
treshold = 0.22

In [None]:
cfm_train = confusion_matrix(y_train, (train_preds > 0.5).round().astype(int))
fig = plot_confusion_matrix(cfm_train, data_split = 'train', threshold = 0.5)
model_version = neptune.init_model_version(**model_version_params)
model_version[f'confusion_matrix_train_0.5'].upload(neptune.types.File.as_html(fig))
model_version.stop()

In [None]:
cfm_train = confusion_matrix(y_train, (train_preds > treshold).round().astype(int))
fig = plot_confusion_matrix(cfm_train, data_split = 'train', threshold = treshold)
model_version = neptune.init_model_version(**model_version_params)
model_version[f'confusion_matrix_train_{treshold}'].upload(neptune.types.File.as_html(fig))
model_version.stop()

In [None]:
cfm_test = confusion_matrix(y_test, (test_preds > 0.5).round().astype(int))
fig = plot_confusion_matrix(cfm_test, data_split = 'test', threshold = 0.5)
model_version = neptune.init_model_version(**model_version_params)
model_version[f'confusion_matrix_test_0.5'].upload(neptune.types.File.as_html(fig))
model_version.stop()

In [None]:
cfm_test = confusion_matrix(y_test, (test_preds > treshold).round().astype(int))
fig = plot_confusion_matrix(cfm_test, data_split = 'test', threshold = treshold)
model_version = neptune.init_model_version(**model_version_params)
model_version[f'confusion_matrix_test_{treshold}'].upload(neptune.types.File.as_html(fig))
model_version.stop()

In [None]:
cfm_holdout = confusion_matrix(y_holdout, (holdout_preds > 0.5).round().astype(int))
fig = plot_confusion_matrix(cfm_holdout, data_split = 'holdout', threshold = 0.5)
model_version = neptune.init_model_version(**model_version_params)
model_version[f'confusion_matrix_holdout_0.5'].upload(neptune.types.File.as_html(fig))
model_version.stop()

In [None]:
cfm_holdout = confusion_matrix(y_holdout, (holdout_preds > treshold).round().astype(int))
fig = plot_confusion_matrix(cfm_holdout, data_split = 'holdout', threshold = treshold)
model_version = neptune.init_model_version(**model_version_params)
model_version[f'confusion_matrix_holdout_{treshold}'].upload(neptune.types.File.as_html(fig))
model_version.stop()

#### Profit validation

##### TRAIN Profit validation

In [None]:
if target_type == 'regression':
    train_preds_int, train_preds_float = calculate_multiclass(prob_dict['train'], line_train[active_rows_dict['train'],:3]).values()
    train_preds_float = train_preds_float[:,0]
    y_train_prof = ((2 - y_train[active_rows_dict['train']]) == train_preds_int) * 1
    Line_production = np.take_along_axis(line_train[active_rows_dict['train'],:3], train_preds_int.reshape(-1,1), axis = 1)[:, 0]
    fig = get_profit_validation(
        y_train_prof, 
        train_preds_float / 10, 
        Line_production, 
        'train: ' + neptune_model + f'-{model_num}'
        )
    fig.show()
elif target_type == 'multiclass':
    train_preds_int, train_preds_float = calculate_multiclass(train_preds[active_rows_dict['train']], line_train[active_rows_dict['train'],:3]).values()
    train_preds_float = train_preds_float[:,0]
    y_train_prof = ((2 - y_train[active_rows_dict['train']]) == train_preds_int) * 1
    Line_production = np.take_along_axis(line_train[active_rows_dict['train'],:3], train_preds_int.reshape(-1,1), axis = 1)[:, 0]
    fig = get_profit_validation(
        y_train_prof, 
        train_preds_float / 10, 
        Line_production, 
        'train: ' + neptune_model + f'-{model_num}'
        )
    fig.show()
else:
    fig = get_profit_validation(
        y_train[zero_train & active_train], 
        train_preds[zero_train & active_train], 
        line_train[zero_train & active_train], 
        neptune_model + f'-{model_num}'
        )
    fig.show()

In [135]:
fig.write_html(f'train: {neptune_model}-{model_num}_profit_report.html') #neptune_model + f'-{model_num}'
model_version = neptune.init_model_version(**model_version_params)
model_version[f'profit_validation_train'].upload(neptune.types.File.as_html(fig))
model_version.stop()

https://app.neptune.ai/scomesse/football/m/FOOT-LIVEMC/v/FOOT-LIVEMC-2
Remember to stop your model_version once you’ve finished logging your metadata (https://docs.neptune.ai/api/model_version#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 1 operations to synchronize with Neptune. Do not kill this process.
All 1 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/scomesse/football/m/FOOT-LIVEMC/v/FOOT-LIVEMC-2/metadata


In [34]:
y_train[zero_train & active_train]

array([0, 0, 0, ..., 1, 1, 1])

In [None]:
if target_type == 'regression':    
    threshold = 0.104
    bet_type = 'divk'
    strategy = 'complex'
    fig = plot_equity(
        y_train_prof, 
        train_preds_float / 10,
        Line_production,
        threshold, neptune_model + f'-{model_num}',
        bet_type = bet_type, 
        strategy = strategy, 
        data_split = 'train', 
        reverse_bet = False
        )
    fig.show()
elif target_type == 'multiclass':
    threshold = 0.057
    bet_type = 'divk'
    strategy = 'complex'
    fig = plot_equity(
        y_train_prof, 
        train_preds_float / 10,
        Line_production,
        threshold, neptune_model + f'-{model_num}',
        bet_type = bet_type, 
        strategy = strategy, 
        data_split = 'train', 
        reverse_bet = False
        )
    fig.show()
else:
    threshold = 0.12
    bet_type = 'fixed'
    strategy = 'complex'
    fig = plot_equity(
        y_train[zero_train & active_train], 
        train_preds[zero_train & active_train],
        line_train[zero_train & active_train],
        threshold, neptune_model + f'-{model_num}',
        bet_type = bet_type, 
        strategy = strategy, 
        data_split = 'train', 
        reverse_bet = False
        )
    fig.show()

In [146]:
model_version = neptune.init_model_version(**model_version_params)
model_version[f'equity_train_{bet_type}_{strategy}_th_{threshold}'].upload(neptune.types.File.as_html(fig))
model_version.stop()

https://app.neptune.ai/scomesse/football/m/FOOT-LIVEMC/v/FOOT-LIVEMC-2
Remember to stop your model_version once you’ve finished logging your metadata (https://docs.neptune.ai/api/model_version#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 1 operations to synchronize with Neptune. Do not kill this process.
All 1 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/scomesse/football/m/FOOT-LIVEMC/v/FOOT-LIVEMC-2/metadata


##### TEST Profit validation

In [94]:
if target_type == 'regression':
    test_preds_int, test_preds_float = calculate_multiclass(prob_dict['test'], line_test[active_rows_dict['test'],:3]).values()
    test_preds_float = test_preds_float[:,0]
    y_test_prof = ((2 - y_test[active_rows_dict['test']]) == test_preds_int) * 1
    Line_production = np.take_along_axis(line_test[active_rows_dict['test'],:3], test_preds_int.reshape(-1,1), axis = 1)[:, 0]
    fig = get_profit_validation(
        y_test_prof, 
        test_preds_float / 10, 
        Line_production, 
        'test: ' + neptune_model + f'-{model_num}'
        )
    fig.show()
elif target_type == 'multiclass':
    test_preds_int, test_preds_float = calculate_multiclass(test_preds[active_rows_dict['test']], line_test[active_rows_dict['test'],:3]).values()
    test_preds_float = test_preds_float[:,0]
    y_test_prof = ((2 - y_test[active_rows_dict['test']]) == test_preds_int) * 1
    Line_production = np.take_along_axis(line_test[active_rows_dict['test'],:3], test_preds_int.reshape(-1,1), axis = 1)[:, 0]
    fig = get_profit_validation(
        y_test_prof, 
        test_preds_float / 10, 
        Line_production, 
        'test: ' + neptune_model + f'-{model_num}'
        )
    fig.show()
else:
    fig = get_profit_validation(
        y_test[zero_test & active_test], 
        test_preds[zero_test & active_test], 
        line_test[zero_test & active_test], 
        'test: ' + neptune_model + f'-{model_num}'
        )
    fig.show()


invalid value encountered in true_divide



In [153]:
fig.write_html(f'test: {neptune_model}-{model_num}_profit_report.html') #neptune_model + f'-{model_num}'
model_version = neptune.init_model_version(**model_version_params)
model_version[f'profit_validation_test'].upload(neptune.types.File.as_html(fig))
model_version.stop()

https://app.neptune.ai/scomesse/football/m/FOOT-LIVEMC/v/FOOT-LIVEMC-2
Remember to stop your model_version once you’ve finished logging your metadata (https://docs.neptune.ai/api/model_version#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 1 operations to synchronize with Neptune. Do not kill this process.
All 1 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/scomesse/football/m/FOOT-LIVEMC/v/FOOT-LIVEMC-2/metadata


In [95]:
if (target_type == 'regression') | (target_type == 'multiclass'):
    timestr = '_timeline'
    threshold = 0.106
    bet_type = 'divk'
    strategy = 'simple'
    #fig = plot_equity(
    fig_timeline =  plot_equity_timeline(
        y_test_prof, 
        test_preds_float / 10,
        Line_production,
        #time_df['date'],
        threshold, neptune_model + f'-{model_num}',
        x_date = timeline_test_array[active_rows_dict['test']],
        bet_type = bet_type, 
        strategy = strategy, 
        data_split = 'test', 
        reverse_bet = False
        )
    fig_timeline.show()
else:
    timestr = '_timeline'
    threshold = 0.108
    bet_type = 'divk' #'fixed' #'divk'
    strategy = 'complex' #'simple' #'complex'
    #fig =  plot_equity(
    fig_timeline =  plot_equity_timeline(
        y_test[active_rows_dict['test']], 
        test_preds[active_rows_dict['test']], 
        line_test[active_rows_dict['test']],
        threshold, neptune_model + f'-{model_num}',
        x_date = timeline_test_array[active_rows_dict['test']],
        bet_type = bet_type, 
        strategy = strategy, 
        data_split = 'test', 
        reverse_bet = False
        )
    fig_timeline.show()

In [97]:
model_version = neptune.init_model_version(**model_version_params)
model_version[f'equity{timestr}_test_{bet_type}_{strategy}_th_{threshold}'].upload(neptune.types.File.as_html(fig_timeline))
model_version.stop()

https://app.neptune.ai/scomesse/football/m/FOOT-LIVEBCAWAY/v/FOOT-LIVEBCAWAY-7
Remember to stop your model_version once you’ve finished logging your metadata (https://docs.neptune.ai/api/model_version#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 1 operations to synchronize with Neptune. Do not kill this process.
All 1 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/scomesse/football/m/FOOT-LIVEBCAWAY/v/FOOT-LIVEBCAWAY-7/metadata


In [None]:
if target_type == 'regression':
    holdout_preds_int, holdout_preds_float = calculate_multiclass(prob_dict['holdout'], line_holdout[active_rows_dict['holdout'],:3]).values()
    holdout_preds_float = holdout_preds_float[:,0]
    y_holdout_prof = ((2 - y_holdout[active_rows_dict['holdout']]) == holdout_preds_int) * 1
    Line_production = np.take_along_axis(line_holdout[active_rows_dict['holdout'],:3], holdout_preds_int.reshape(-1,1), axis = 1)[:, 0]
    fig = get_profit_validation(
        y_holdout_prof, 
        holdout_preds_float / 10, 
        Line_production, 
        'holdout: ' + neptune_model + f'-{model_num}'
        )
    fig.show()
elif target_type == 'multiclass':
    holdout_preds_int, holdout_preds_float = calculate_multiclass(holdout_preds[active_rows_dict['holdout']], line_holdout[active_rows_dict['holdout'],:3]).values()
    holdout_preds_float = holdout_preds_float[:,0]
    y_holdout_prof = ((2 - y_holdout[active_rows_dict['holdout']]) == holdout_preds_int) * 1
    Line_production = np.take_along_axis(line_holdout[active_rows_dict['holdout'],:3], holdout_preds_int.reshape(-1,1), axis = 1)[:, 0]
    fig = get_profit_validation(
        y_holdout_prof, 
        holdout_preds_float / 10, 
        Line_production, 
        'holdout: ' + neptune_model + f'-{model_num}'
        )
    fig.show()
else:
    fig = get_profit_validation(
        y_holdout[zero_holdout & active_holdout], 
        holdout_preds[zero_holdout & active_holdout], 
        line_holdout[zero_holdout & active_holdout], 
        'holdout: ' + neptune_model + f'-{model_num}'
        )
    fig.show()

In [157]:
fig.write_html(f'holdout: {neptune_model}-{model_num}_profit_report.html') #neptune_model + f'-{model_num}'
model_version = neptune.init_model_version(**model_version_params)
model_version[f'profit_validation_holdout'].upload(neptune.types.File.as_html(fig))
model_version.stop()

https://app.neptune.ai/scomesse/football/m/FOOT-LIVEMC/v/FOOT-LIVEMC-2
Remember to stop your model_version once you’ve finished logging your metadata (https://docs.neptune.ai/api/model_version#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 1 operations to synchronize with Neptune. Do not kill this process.
All 1 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/scomesse/football/m/FOOT-LIVEMC/v/FOOT-LIVEMC-2/metadata


In [None]:
np.sum(zero_holdout & active_holdout)

117272

In [None]:
if (target_type == 'regression') | (target_type == 'multiclass'):      
    threshold = 0.039
    bet_type = 'divk'
    strategy = 'complex'
    fig = plot_equity(
        y_holdout_prof, 
        holdout_preds_float / 10,
        Line_production,
        threshold, neptune_model + f'-{model_num}',
        bet_type = bet_type, 
        strategy = strategy, 
        data_split = 'holdout', 
        reverse_bet = False
        )
    fig.show()
else:
    threshold = 0.101
    bet_type = 'divk'
    strategy = 'complex'
    fig = plot_equity(
        y_holdout[zero_holdout & active_holdout], 
        holdout_preds[zero_holdout & active_holdout], 
        line_holdout[zero_holdout & active_holdout], 
        threshold, neptune_model + f'-{model_num}',
        bet_type = bet_type, 
        strategy = strategy, 
        data_split = 'holdout', 
        reverse_bet = False
        )
    fig.show()

In [160]:
model_version = neptune.init_model_version(**model_version_params)
model_version[f'equity_holdout_{bet_type}_{strategy}_th_{threshold}'].upload(neptune.types.File.as_html(fig))
model_version.stop()

https://app.neptune.ai/scomesse/football/m/FOOT-LIVEMC/v/FOOT-LIVEMC-2
Remember to stop your model_version once you’ve finished logging your metadata (https://docs.neptune.ai/api/model_version#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 1 operations to synchronize with Neptune. Do not kill this process.
All 1 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/scomesse/football/m/FOOT-LIVEMC/v/FOOT-LIVEMC-2/metadata


### Draft

In [None]:
active_rows = (active_test & (line_test.sum(axis = 1) > 0))

In [None]:
win_vector = y_test[active_rows] == test_preds[active_rows].argmax(axis = 1)

In [None]:
bet = 1 / np.take_along_axis(
    line_test[active_rows], 
    2 - test_preds[active_rows].argmax(axis = 1)[:, None], 
    axis = 1
    )

In [None]:
kf = np.take_along_axis(
    line_test[active_rows], 
    2 - test_preds[active_rows].argmax(axis = 1)[:, None], 
    axis = 1
    ) - 1

In [None]:
np.sum(bet[win_vector].astype(np.float32) * kf[win_vector].astype(np.float32)) - np.sum(bet[~win_vector].astype(np.float32))

-28413.11

In [None]:
weight = 1.432
bet_a = 1 / np.take_along_axis(
    line_test[active_rows], 
    2 - (test_preds * [1., weight, 1.])[active_rows].argmax(axis = 1)[:, None], 
    axis = 1
    )

In [None]:
kf_a = np.take_along_axis(
    line_test[active_rows], 
    2 - (test_preds * [1., weight, 1.])[active_rows].argmax(axis = 1)[:, None], 
    axis = 1
    ) - 1

In [None]:
np.sum(bet_a[win_vector].astype(np.float32) * kf_a[win_vector].astype(np.float32)) - np.sum(bet_a[~win_vector].astype(np.float32))

-11128.875

In [None]:
prob = test_preds[active_rows].max(axis = 1)
prob_a = (test_preds * [1., weight, 1.])[active_rows].max(axis = 1)

In [None]:
np.sum(bet[win_vector].astype(np.float32) * kf[win_vector].astype(np.float32) * prob[win_vector].astype(np.float32).reshape(-1, 1)) - \
np.sum(bet[~win_vector].astype(np.float32) * prob[~win_vector].astype(np.float32).reshape(-1, 1))

-16344.102

In [None]:
np.sum(bet_a[win_vector].astype(np.float32) * kf_a[win_vector].astype(np.float32) * prob_a[win_vector].astype(np.float32).reshape(-1, 1)) - \
np.sum(bet_a[~win_vector].astype(np.float32) * prob_a[~win_vector].astype(np.float32).reshape(-1, 1))

-8718.695

In [None]:
#model_version = neptune.init_model_version(**model_version_params)
#del model_version['equity_holdout_divk_complex_th_0.107']
#model_version.stop()

https://app.neptune.ai/scomesse/football/m/FOOT-LIVEBCAWAY/v/FOOT-LIVEBCAWAY-7
Remember to stop your model_version once you’ve finished logging your metadata (https://docs.neptune.ai/api/model_version#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 1 operations to synchronize with Neptune. Do not kill this process.
All 1 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/scomesse/football/m/FOOT-LIVEBCAWAY/v/FOOT-LIVEBCAWAY-7/metadata


In [None]:
#print("Test  RMSE : %.4f"%eval_metric(y_test, test_preds, "Accuracy")[0])
#print("Train RMSE : % 4f"% eval_metric(y_train1, train_preds1, "RMSE")[0])
#print("Test  R2 : %.4f"%eval_metric(y_test1, test_preds1, "R2")[0])
#print("Train R2 : % 4f"%eval_metric(y_train1, train_preds1, "R2")[0])

Test  RMSE : 0.7164


In [None]:
# Модель для 1-ой команды
model_num = 1 # Указываем номер модели
neptune_model = f'FOOT-LIVEBST2'
neptune_model_version = neptune_model + f'-{model_num}'
model_version_params = dict(
    project = 'scomesse/football',
    model = neptune_model,
    api_token = api_key,
    with_id = neptune_model_version
)
PATH_TO_MODEL = './booster_team2.model'
model_version = neptune.init_model_version(**model_version_params)
model_version['team2_model'].download(PATH_TO_MODEL)
params2 = model_version['team_parameters'].fetch()
model_version.stop()

https://app.neptune.ai/scomesse/football/m/FOOT-LIVEBST2/v/FOOT-LIVEBST2-1
Remember to stop your model_version once you’ve finished logging your metadata (https://docs.neptune.ai/api/model_version#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
Shutting down background jobs, please wait a moment...
Done!
All 0 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/scomesse/football/m/FOOT-LIVEBST2/v/FOOT-LIVEBST2-1/metadata


In [None]:
booster_team2 = CatBoost()
booster_team2.load_model('./booster_team2.model')

<catboost.core.CatBoost at 0x7fbec4bf1790>

In [None]:
test_preds2 = booster_team2.predict(X_test)
train_preds2 = booster_team2.predict(X_train)

In [None]:
print("Test  RMSE : %.4f"%eval_metric(y_test2, test_preds2, "RMSE")[0])
print("Train RMSE : % 4f"% eval_metric(y_train2, train_preds2, "RMSE")[0])
print("Test  R2 : %.4f"%eval_metric(y_test2, test_preds2, "R2")[0])
print("Train R2 : % 4f"%eval_metric(y_train2, train_preds2, "R2")[0])

Test  RMSE : 0.0497
Train RMSE :  0.049284
Test  R2 : 0.1868
Train R2 :  0.193117


#### Load Curren Score & Final Results

In [None]:
score1_train, score2_train =  data_npz['score1_train'], data_npz['score2_train']
result1_train, result2_train =  data_npz['result1_train'], data_npz['result2_train']
score1_test, score2_test =  data_npz['score1_test'], data_npz['score2_test']
result1_test, result2_test =  data_npz['result1_test'], data_npz['result2_test']

In [None]:
sc1_test_array = np.vstack([poisson.pmf(score, mu = test_preds1 * 21, loc = 0) for score in range(7)]).T
sc2_test_array = np.vstack([poisson.pmf(score, mu = test_preds2 * 21, loc = 0) for score in range(7)]).T

In [None]:
prob_dict = {}
prob_dict[0] = np.sum((sc1_test_array * sc2_test_array), axis = 1)
for diff in range(1, 7):
    prob_dict[diff] = np.sum(sc1_test_array[:, diff:] * sc2_test_array[:, :-diff], axis = 1)
    prob_dict[-diff] = np.sum(sc1_test_array[:, :-diff] * sc2_test_array[:, diff:], axis = 1)

In [None]:
diff_prob_arr = np.hstack([prob_dict[6 - arr].reshape(-1, 1) for arr in range(13)])

In [None]:
curdiff = score1_test - score2_test
curdiff[curdiff > 6] = 6
curdiff[curdiff < -6] = -6
#curdiff = curdiff + 6

In [None]:
line_prob = np.zeros((curdiff.shape[0], 3))
for score_diff in range(13):
    line_prob[:,0] += diff_prob_arr[:,score_diff] * np.array([curdiff > -6 + score_diff])[0]
    line_prob[:,1] += diff_prob_arr[:,score_diff] * np.array([curdiff == -6 + score_diff])[0]
    line_prob[:,2] += diff_prob_arr[:,score_diff] * np.array([curdiff < -6 + score_diff])[0]

In [None]:
np.sum(line_prob[:, 0] > 0.5) / curdiff.shape[0]

0.3724400980120842

In [None]:
np.sum(np.argmax(line_prob, axis = 1) == 0), np.sum(np.argmax(line_prob, axis = 1) == 1), np.sum(np.argmax(line_prob, axis = 1) == 2)

(1555138, 90997, 930282)

In [None]:
np.sum(test_preds1 > test_preds2)

1670851

In [None]:
if target_type == 'regression':
    active_rows_holdout = (active_holdout & (line_holdout[:,:3].sum(axis = 1) > 0))
    # Считаем Пуассона для разного количества голов одной команды
    poisson_holdout_dict1 = {}
    poisson_holdout_dict2 = {}
    for goal in range(7):
        poisson_holdout_dict1[goal] = poisson.pmf(goal, holdout_preds[active_rows_holdout])
        poisson_holdout_dict2[goal] = poisson.pmf(goal, holdout_preds2[active_rows_holdout])

    # Считаем вероятности ничьих для разных стейтов
    holdout_draw_prob_matrix = np.zeros((np.sum(active_rows_holdout), 13))
    for goal1 in range(7):
        for goal2 in range(7):
            holdout_draw_prob_matrix[:, goal1 - goal2 + 6] = holdout_draw_prob_matrix[:, goal1 - goal2 + 6] + \
            poisson_holdout_dict1[goal2] * poisson_holdout_dict2[goal1]
    # Считаем вероятности победы дома для разных стейтов        
    holdout_home_prob_matrix = np.hstack((
        np.zeros(np.sum(active_rows_holdout)).reshape(-1,1), 
        np.cumsum(holdout_draw_prob_matrix, axis = 1)
                                        ))[:, :-1]
     # Считаем вероятности победы в гостях для разных стейтов
    holdout_away_prob_matrix = np.hstack((
        np.flip(np.cumsum(np.flip(holdout_draw_prob_matrix, axis = 1), axis = 1), axis = 1), 
        np.zeros(np.sum(active_rows_holdout)).reshape(-1,1)
                                    ))[:, 1:]

    current_state_holdout = scores_dict['score']['holdout']['1'][active_rows_holdout] - \
                            scores_dict['score']['holdout']['2'][active_rows_holdout] + 6

    holdout_prob = np.hstack((
        np.take_along_axis(holdout_home_prob_matrix, current_state_holdout.reshape(-1, 1), axis = 1),
        np.take_along_axis(holdout_draw_prob_matrix, current_state_holdout.reshape(-1, 1), axis = 1),
        np.take_along_axis(holdout_away_prob_matrix, current_state_holdout.reshape(-1, 1), axis = 1)
    ))
    prob_line_holdout = holdout_prob * line_holdout[active_rows_holdout,:3]
    #best odds (0 - home, 1 - draw, 2 - away)
    best_odd_result = np.argmax(prob_line_holdout, axis = 1)
    best_odd_float = np.take_along_axis(
    prob_line_holdout, 
    best_odd_result.reshape(-1,1), 
    axis = 1
    )