<a href="https://colab.research.google.com/github/cappelchi/calcio_notebooks/blob/main/catboost/football_live_prediction_catboost_cv_handicap.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


[CatBoost - An In-Depth Guide [Python API]](https://coderzcolumn.com/tutorials/machine-learning/catboost-an-in-depth-guide-python#9)<br>
[Catboost](https://catboost.ai/en/docs/concepts/python-reference_pool)<br>
[Cross-Validation Techniques](https://medium.com/geekculture/cross-validation-techniques-33d389897878)<br>
[https://github.com/catboost/tutorials/blob/master/cross_validation/cv_tutorial.ipynb](https://github.com/catboost/tutorials/blob/master/cross_validation/cv_tutorial.ipynb)


### Project config

In [None]:
try:
    import neptune.new as neptune
except:
    !pip install neptune-client >> None
    import neptune.new as neptune


  from neptune.version import version as neptune_client_version
  import neptune.new as neptune


In [None]:
def get_credential(frmwork = 'neptune_team'):
    '''
    Look up a login / secret pair in ``credential.txt``.

    The file is opened relative to the current working directory. The first
    line containing ``frmwork`` is split on whitespace and its second and
    third fields are returned.

    Parameters
    ----------
    frmwork : str
        Substring that identifies the wanted line.

    Returns
    -------
    tuple of (str, str) or None
        ``(login, psw)`` from the matching line, or ``None`` when no line
        contains ``frmwork``.

    Raises
    ------
    FileNotFoundError
        If ``credential.txt`` does not exist.
    IndexError
        If the matching line has fewer than three fields.
    '''
    with open('credential.txt', 'r') as container:
        for line in container:
            if frmwork in line:
                # split() without arguments tolerates repeated spaces, tabs
                # and any trailing line terminator.
                fields = line.split()
                login, psw = fields[1], fields[2]
                return login, psw
    return None

#@title Set API key for neptune.ai
# Colab form toggle: when enabled, the credentials are read from credential.txt
# via get_credential(). NOTE(review): `api_key` is only assigned inside this
# branch, so later cells that pass `api_key` need the toggle to be on.
set_api = True #@param {type:"boolean"}
if set_api:
    username, api_key = get_credential()

### Installations

In [None]:
!pip install catboost >> None
!pip install deap >> None

### Downloads

In [None]:
# Open the Neptune project that stores the dataset artifacts.
project = neptune.init_project(
    project="scomesse/football",
    api_token = api_key
    )

try:
    # Main part of the dataset.
    data_version = 'football_live_main_part_npz_230510/'
    project[data_version + 'dataset'].download('./dataset.npz')
    project[data_version + 'additional_data'].download('./additional_data.npz')
    project[data_version + 'time'].download('./time.csv')
    data_params = project[data_version + 'params'].fetch()

    # Update part of the dataset (stored with the `_upd` suffix).
    data_version = 'football_live_upd_230510/'
    project[data_version + 'dataset'].download('./dataset_upd.npz')
    project[data_version + 'additional_data'].download('./additional_data_upd.npz')
    project[data_version + 'time'].download('./time_upd.csv')
    data_params_upd = project[data_version + 'params'].fetch()
finally:
    # Close the connection even when a download fails, so the handle is not
    # left open behind a half-finished cell.
    project.stop()

https://app.neptune.ai/scomesse/football/
Shutting down background jobs, please wait a moment...
Done!
All 0 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/scomesse/football/metadata


### Imports

In [None]:
import pandas as pd
import numpy as np
# Widen the pandas display limits used when frames are rendered in the notebook.
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 100)
# Record the library versions this run was executed with.
for _lib in (pd, np):
    print(_lib.__version__)

1.5.3
1.22.4


In [None]:
from tqdm import tqdm
import plotly.express as px
import os, psutil, time
import gc

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy.stats import poisson
from neptune.utils import stringify_unsupported

In [None]:
from catboost import CatBoost
from catboost import utils
from catboost import CatBoostClassifier, CatBoostRegressor
from catboost import Pool, cv
from catboost.utils import eval_metric
np.random.seed(147)

In [None]:
import operator
import random
from deap import base
from deap import creator
from deap import tools
from deap import algorithms

### Code

##### Functions

In [None]:
def calculate_multiclass(probability_2x:np.array, line_2x:np.array):
    '''
    Choose, per row, the outcome whose probability-times-odds product is largest.

    The choice is made on probabilities rescaled so that every row sums to 1.
    The product of the chosen column is reported both for the rescaled and
    for the raw probabilities.

    input : 0 - over, 1 - under
    output: 0 - over, 1 - under

    Parameters
    ----------
    probability_2x : 2-D array, one column per outcome.
    line_2x : array of odds, multiplied element-wise with the probabilities.

    Returns
    -------
    dict, in this key order (callers unpack ``.values()``):
        'argmax'    - index of the chosen column per row, shape (n,)
        'float'     - raw probability * odds of the chosen column, shape (n, 1)
        'float_adj' - rescaled probability * odds of the chosen column, shape (n, 1)
    '''
    row_totals = np.sum(probability_2x, axis = 1, keepdims = True)
    value_adj = (probability_2x / row_totals) * line_2x
    value_raw = probability_2x * line_2x
    chosen = np.argmax(value_adj, axis = 1)
    chosen_col = chosen[:, np.newaxis]
    return dict(
        argmax = chosen,
        float = np.take_along_axis(value_raw, chosen_col, axis = 1),
        float_adj = np.take_along_axis(value_adj, chosen_col, axis = 1),
    )

In [None]:
def get_profit_curve(prob_dict, final_goal_sum, Line_production, bet_type = 'fixed'):
    '''
    Sweep a value threshold and compute the betting profit at every step.

    Parameters
    ----------
    prob_dict : 2-column array of outcome probabilities (0 - over, 1 - under),
        passed straight to ``calculate_multiclass``.
    final_goal_sum : per-row final result that is added to the line
        (column 0 of ``Line_production``) and compared with zero.
    Line_production : array with columns [line, over odds, under odds].
    bet_type : 'divk' stakes 1/odds per bet; any other value stakes one unit.

    Returns
    -------
    (threshold, profit, bet_qty_list) : three lists with 101 entries each,
        one per threshold between 0.5 and 2.0. ``bet_qty_list`` counts the
        rows whose rescaled probability * odds exceeds the threshold.
    '''
    # Predicted outcome and its (rescaled probability * odds) value per row.
    odds_2x = Line_production[:, 1:3]
    picks = calculate_multiclass(prob_dict, odds_2x)
    preds_int = picks['argmax']
    preds_float_adj = picks['float_adj'][:, 0]
    preds_vec = 2 * preds_int - 1 # -1:over, 1:under

    # Result vector: -1 when result + line is positive, 1 when negative, 0 when equal.
    margin = final_goal_sum + Line_production[:, 0]
    res_vec = (margin > 0) * (-1) + (margin < 0) * (1)

    outcome = preds_vec * res_vec
    win_vec = outcome == 1
    lose_vec = outcome == -1
    # Odds of the predicted outcome for every row.
    line_vec = np.take_along_axis(odds_2x, preds_int.reshape(-1, 1), axis = 1)[:, 0]

    threshold, profit, bet_qty_list = [], [], []
    for th in np.linspace(0.5, 2.0, 101):
        vector_th = preds_float_adj > th
        won = win_vec & vector_th
        lost = lose_vec & vector_th
        if bet_type == 'divk':
            # Stake is 1/odds: a win pays stake * (odds - 1), a loss costs the stake.
            won_odds = line_vec[won]
            gain = np.sum((1/won_odds.astype(np.float32)) * (won_odds - 1).astype(np.float32))
            loss = np.sum((1/line_vec[lost].astype(np.float32)))
            profit.append(gain - loss)
        else:
            # Unit stake: a win pays odds - 1, a loss costs 1.
            profit.append(np.sum((line_vec[won] - 1).astype(np.float32)) - np.sum(lost))
        threshold.append(th)
        bet_qty_list.append(vector_th.sum())

    return threshold, profit, bet_qty_list

In [None]:
def make_filter(hda, preds_int):
    '''
    Build a boolean row mask selecting predictions of one side.

    hda : 'all' keeps every row with a prediction above -1; 'home' keeps rows
          predicted as 0 and 'away' rows predicted as 1. Any other value
          raises KeyError.
    preds_int : array of predicted outcome indices.
    '''
    if hda == 'all':
        return preds_int > -1
    side_code = {'home':0, 'away':1}[hda]
    return preds_int == side_code

In [None]:
def profit_validation_by_type(prob_dict, active_rows_dict, preds_dict, line_dict, scores_dict):
    '''
    Plot threshold-vs-profit curves for every data split and predicted side.

    One subplot row is drawn per split ('holdout', 'test', 'train'); each row
    holds three curves ('all', 'home', 'away') computed by
    ``get_profit_curve`` with ``bet_type = 'divk'``.

    Parameters
    ----------
    prob_dict : dict split -> 2-column probability array. It is indexed with
        the side filter directly, so it must already be restricted to the
        active rows of the split.
    active_rows_dict : dict split -> boolean mask applied to the lines and results.
    preds_dict : not used; kept so existing calls keep working.
    line_dict : dict split -> array with columns [line, over odds, under odds].
    scores_dict : ``scores_dict['result'][split]['1' | '2']`` hold the final
        results; their difference ('1' minus '2') is passed on as the result.

    Returns
    -------
    The plotly figure with title, height and width set.
    '''
    data_splits = ['holdout', 'test', 'train']
    hda_list = ['all', 'home', 'away']
    colors = ['rgb(93, 164, 214)', 'rgb(255, 144, 14)',  'rgb(44, 160, 101)', 'rgb(255, 65, 54)']
    fig = make_subplots(rows = len(data_splits), cols = 1,
                        shared_xaxes = True,
                        vertical_spacing = 0.02,
                        subplot_titles = tuple(data_splits)
                        )

    for row, data_split in enumerate(data_splits, start = 1):
        active = active_rows_dict[data_split]
        lines_active = line_dict[data_split][active]
        result_diff = scores_dict['result'][data_split]['1'] - scores_dict['result'][data_split]['2']
        result_active = result_diff[active]
        preds_int = calculate_multiclass(prob_dict[data_split], lines_active[:, 1:3])['argmax']

        for cnt_hda, hda in enumerate(hda_list):
            sfilter = make_filter(hda, preds_int)
            threshold, profit, _ = get_profit_curve(
                                        prob_dict[data_split][sfilter],
                                        result_active[sfilter],
                                        lines_active[sfilter],
                                        bet_type = 'divk'
                                        )
            # add_trace(row=, col=) replaces the deprecated append_trace.
            fig.add_trace(go.Scatter(
                                x = threshold,
                                y = profit,
                                name = hda,
                                # the 'all' curve is drawn thick, the per-side curves thin
                                line = dict(color = colors[cnt_hda], width = 4 - 3 * bool(cnt_hda))),
                          row = row, col = 1)

        # Axis properties are per subplot row, so set them once per row.
        fig.update_yaxes(rangemode = 'nonnegative', row = row, col = 1)

    return fig.update_layout(title = '<b>Handicap</b> profit validation', height = 800, width = 900)

##### Prepare Data

1. регрессия


In [None]:
# Split the update dataset by time: the Ids from the earlier half of
# time_upd.csv (ordered by StatTime) are merged into the training matrix.
# Files are read from the same relative paths the download cell wrote to.
id_time_df = pd.read_csv('./time_upd.csv', parse_dates = ['StatTime'], dayfirst = True)
ids_seq = id_time_df.sort_values(by = 'StatTime')['Id'].values[:len(id_time_df) // 2]
additional_data_upd = np.load('./additional_data_upd.npz')
# True for update rows whose id belongs to the earlier half.
id_vector = np.isin(additional_data_upd['id'], ids_seq)

data_npz = np.load('./dataset.npz')
data_upd_npz = np.load('./dataset_upd.npz')
# Training features: the whole main dataset plus the selected update rows.
X = np.vstack((data_npz['X'], data_upd_npz['X'][id_vector]))
print(X.shape)


(14604604, 37)


In [None]:
#@title Выбор таргета
target_type = "regression2" #@param ["regression1", "regression2"]
# Map the form choice to (target array key, regressor index). An unknown
# choice leaves `y` and `reg_num` untouched.
target_spec = {
    'regression1': ('y_regression1', 1),
    'regression2': ('y_regression2', 2),
}.get(target_type)
if target_spec is not None:
    target_key, target_reg = target_spec
    # Stack the target the same way as X: main dataset + selected update rows.
    y = np.hstack((data_npz[target_key], data_upd_npz[target_key][id_vector]))
    reg_num = target_reg
print(X.shape, y.shape)
#---------------
# Resident memory of the kernel process after loading the data.
process = psutil.Process(os.getpid())
print('mem usage: ', round(process.memory_info().rss / 1024 ** 3, 2), 'GiB')  # in bytes
#---------------

(14604604, 37) (14604604,)
mem usage:  2.4 GiB


In [None]:
!mkdir -p ./models

##### Folding

In [None]:
# Contiguous K-fold split: fold `kfold_num` is the validation slice, all other
# rows are used for training. Rows past kfold_size * kfold_splits never fall
# into a validation slice.
kfold_splits = 3
kfold_num = 0
kfold_size = len(y) // kfold_splits
kfold_start = kfold_num * kfold_size
kfold_end = kfold_start + kfold_size
# Boolean mask, True on the validation rows of the current fold.
kfold_vector = np.zeros(len(y), dtype = bool)
kfold_vector[kfold_start:kfold_end] = True

In [None]:
# CatBoost pools: rows outside the current fold train the model, rows inside
# the fold are the evaluation set.
train_data = Pool(data = X[~kfold_vector], label = y[~kfold_vector])
test_data = Pool(data = X[kfold_vector], label = y[kfold_vector])
#---------------
# Resident memory of the kernel process after building the pools.
process = psutil.Process(os.getpid())
print('mem usage: ', round(process.memory_info().rss / 1024 ** 3, 2), 'GiB')  # in bytes
#---------------

mem usage:  6.45 GiB


In [None]:
# Ask CatBoost how many GPU devices it reports and print the count.
gpu_cnt = utils.get_gpu_device_count()
print("Number of GPU Count : ",gpu_cnt)

Number of GPU Count :  1


##### Train catboost model

In [None]:
# Hyper-parameters forwarded as keyword arguments to CatBoostRegressor in the
# training cell; they are also uploaded to Neptune with the model version.
# NOTE(review): training is pinned to the GPU ('task_type' / 'devices');
# `gpu_cnt` computed above is not consulted, so a CPU-only runtime needs
# these two entries changed by hand.
cparams = {
    'loss_function':'RMSE',
    'learning_rate':0.3,
    'l2_leaf_reg':0.9,
    'max_depth':8,
    'min_data_in_leaf':12,
    #'leaf_estimation_method':leafmethod,
    #'early_stopping_rounds':15,
    'verbose':10,
    'iterations':350,
    #"eval_metric" : 'TotalF1',
    'task_type':"GPU",
    'devices':'0'
}

In [None]:
# Fit one regressor on the training pool, evaluating on the held-out fold.
booster = CatBoostRegressor(**cparams)
booster.fit(X = train_data, eval_set = test_data)

# Store the model under a name encoding the target regressor and the fold.
booster.save_model(f'./models/booster_reg{reg_num}_{kfold_num}.model')

0:	learn: 0.0522288	test: 0.0533151	best: 0.0533151 (0)	total: 240ms	remaining: 1m 23s
10:	learn: 0.0494051	test: 0.0504136	best: 0.0504136 (10)	total: 1.63s	remaining: 50.1s
20:	learn: 0.0492479	test: 0.0502750	best: 0.0502750 (20)	total: 2.71s	remaining: 42.4s
30:	learn: 0.0491465	test: 0.0502342	best: 0.0502342 (30)	total: 3.8s	remaining: 39.1s
40:	learn: 0.0490639	test: 0.0502096	best: 0.0502096 (40)	total: 4.87s	remaining: 36.7s
50:	learn: 0.0489906	test: 0.0501968	best: 0.0501968 (50)	total: 5.95s	remaining: 34.9s
60:	learn: 0.0489360	test: 0.0501936	best: 0.0501920 (59)	total: 7.04s	remaining: 33.4s
70:	learn: 0.0488793	test: 0.0501983	best: 0.0501917 (61)	total: 8.1s	remaining: 31.8s
80:	learn: 0.0488223	test: 0.0502056	best: 0.0501917 (61)	total: 9.17s	remaining: 30.4s
90:	learn: 0.0487647	test: 0.0502091	best: 0.0501917 (61)	total: 10.2s	remaining: 29.2s
100:	learn: 0.0487134	test: 0.0502194	best: 0.0501917 (61)	total: 11.4s	remaining: 28.1s
110:	learn: 0.0486599	test: 0.0502

In [None]:
# Drop the pools and ask the garbage collector to reclaim them.
del train_data, test_data
gc.collect()

0

In [None]:
#!tar -zcvf ./models.tar.gz ./models/*.*
!tar -xzvf ./models/models.tar.gz
#!cp ./booster.model ./models/booster_reg1_0.model
#!cp ./booster2.model ./models/booster_reg2_0.model
#!rm ./models/*

./models/booster_reg1_0.model
./models/booster_reg1_1.model
./models/booster_reg1_2.model
./models/booster_reg2_0.model
./models/booster_reg2_1.model
./models/booster_reg2_2.model


In [None]:
# Predictions of the single fold model `kfold_num` for both regressors.
# NOTE(review): 'test' and 'holdout' are computed from the same rows
# (~id_vector) - confirm that this is intended.
preds_dict = {'train':{}, 'test':{}, 'holdout':{}}
#kfold_num = 1
div_num = 3
# Every key access on an npz archive reads the array again, so slice it once.
X_rest = data_upd_npz['X'][~id_vector]
for reg_num in range(1, 3):
    # Load each model once and reuse it for all splits.
    fold_model = CatBoost().load_model(
                f'./models/booster_reg{reg_num}_{kfold_num}.model'
                    )
    # NOTE(review): the factor 21 presumably undoes a target scaling - confirm.
    preds_dict['train'][reg_num] = fold_model.predict(X) * 21

    rest_pred = fold_model.predict(X_rest) * 21
    preds_dict['test'][reg_num] = rest_pred
    # Independent copy, so the two entries never alias each other.
    preds_dict['holdout'][reg_num] = rest_pred.copy()
del X_rest

#---------------
process = psutil.Process(os.getpid())
print('mem usage: ', round(process.memory_info().rss / 1024 ** 3, 2), 'GiB')  # in bytes
#---------------

mem usage:  6.48 GiB


In [None]:
# Predictions averaged over all fold models, for both regressors.
# NOTE(review): 'test' and 'holdout' are computed from the same rows
# (~id_vector) - confirm that this is intended.
preds_dict = {'train':{}, 'test':{}, 'holdout':{}}
#kfold_num = 1
# One model per fold was trained, so average over exactly that many models.
div_num = kfold_splits
# Every key access on an npz archive reads the array again, so slice it once.
X_rest = data_upd_npz['X'][~id_vector]
for reg_num in range(1, 3):
    # Load every fold model once and reuse it for all splits.
    fold_models = [
        CatBoost().load_model(f'./models/booster_reg{reg_num}_{fold}.model')
        for fold in range(kfold_splits)
    ]
    # NOTE(review): the factor 21 presumably undoes a target scaling - confirm.
    preds_dict['train'][reg_num] = sum(
            model.predict(X) * 21 for model in fold_models
                                ) / div_num
    rest_mean = sum(
            model.predict(X_rest) * 21 for model in fold_models
                                ) / div_num
    preds_dict['test'][reg_num] = rest_mean
    # Independent copy, so the two entries never alias each other.
    preds_dict['holdout'][reg_num] = rest_mean.copy()
del X_rest, fold_models

#---------------
process = psutil.Process(os.getpid())
print('mem usage: ', round(process.memory_info().rss / 1024 ** 3, 2), 'GiB')  # in bytes
#---------------

mem usage:  4.94 GiB


In [None]:
preds_dict['holdout']

{1: array([1.20753088, 1.17642642, 1.16575435, ..., 0.52568328, 0.51120993,
        0.48787503]),
 2: array([1.38474816, 1.40526863, 1.36047703, ..., 1.31576274, 1.29699041,
        1.23460783])}

In [None]:
# Every key access on an npz archive reads the array again, so fetch the big
# arrays once and slice the in-memory copies.
K_main = data_npz['K_train']
K_upd = data_upd_npz['K_train']
X_rest = data_upd_npz['X'][~id_vector]

# Columns 10:13 of K_train hold [line, over odds, under odds]
# (layout as documented in get_profit_curve).
K = np.vstack((K_main[:,10:13], K_upd[:,10:13][id_vector]))
lines_rest = K_upd[:, 10:13][~id_vector]
# NOTE(review): 'test' and 'holdout' are built from the same rows (~id_vector).
line_dict = {'train':K, 'test':lines_rest, 'holdout':lines_rest.copy()}
#y_dict = {'train':y, 'test':data_upd_npz['y_multi'][~id_vector], 'holdout':data_upd_npz['y_multi'][~id_vector]}
X_dict = {'train':X, 'test':X_rest, 'holdout':X_rest.copy()}

# A row is active when column 0 of K_train equals 1 and its two odds sum to more than 2.
A = np.hstack((K_main[:,0], K_upd[:,0][id_vector]))
active_rest = ((K_upd[:, 0][~id_vector] == 1) & (lines_rest[:, 1:3].sum(axis = 1) > 2))
active_rows_dict ={}
active_rows_dict['train'] = ((A == 1) & (K[:,1:3].sum(axis = 1) > 2))
active_rows_dict['holdout'] = active_rest
active_rows_dict['test'] = active_rest.copy()
del K_main, K_upd

#---------------
process = psutil.Process(os.getpid())
print('mem usage: ', round(process.memory_info().rss / 1024 ** 3, 2), 'GiB')  # in bytes
#---------------

mem usage:  8.57 GiB


#### Calculate probabilities

In [None]:
split_names = ['train', 'test', 'holdout']

# Poisson pmf of 0..6 goals for each team, using the predictions of the
# active rows as the Poisson mean.
poisson_dict = {}
for name in split_names:
    active = active_rows_dict[name]
    poisson_dict[name] = {
        team: {
            goal: poisson.pmf(goal, preds_dict[name][team][active])
            for goal in range(7)
        }
        for team in (1, 2)
    }

# Probability of every goal difference: column c holds P(goal1 - goal2 == c - 6).
total_matrix_dict = {}
for name in split_names:
    diff_probs = np.zeros((np.sum(active_rows_dict[name]), 13))
    for goal1 in range(7):
        for goal2 in range(7):
            diff_probs[:, goal1 - goal2 + 6] += \
                poisson_dict[name][1][goal1] * poisson_dict[name][2][goal2]
    total_matrix_dict[name] = diff_probs

# Cumulative tables. over = home_win: column k >= 1 holds P(diff >= 7 - k),
# column 0 is zero. under = away_win: column j <= 12 holds P(diff <= 6 - j),
# column 13 is zero.
over_matrix_dict = {}
under_matrix_dict = {}
for name in split_names:
    diff_probs = total_matrix_dict[name]
    zero_col = np.zeros((diff_probs.shape[0], 1))
    over_matrix_dict[name] = np.hstack((
        zero_col,
        np.cumsum(np.flip(diff_probs, axis = 1), axis = 1)
    ))
    under_matrix_dict[name] = np.hstack((
        np.flip(np.cumsum(diff_probs, axis = 1), axis = 1),
        zero_col
    ))

# Dictionary of current scores ('score') and final results ('result') per team.
scores_list = ['score', 'result']
team_num_list = ['1', '2']
scores_dict = {key_type: {name: {} for name in split_names} for key_type in scores_list}
for key_type in scores_list:
    for team_num in team_num_list:
        field = key_type + team_num
        upd_values = data_upd_npz[field]
        scores_dict[key_type]['train'][team_num] = np.hstack((
            data_npz[field],
            upd_values[id_vector]
        ))
        scores_dict[key_type]['test'][team_num] = upd_values[~id_vector]
        scores_dict[key_type]['holdout'][team_num] = upd_values[~id_vector]

# Current state per active row: line + current goal difference, shifted by 6
# and limited to the range 0..12 so it can address the tables above.
total_state_dict = {}
for name in split_names:
    active = active_rows_dict[name]
    state = line_dict[name][:,0][active] + \
            scores_dict['score'][name]['1'][active] - \
            scores_dict['score'][name]['2'][active] + 6
    state[state < 0] = 0
    state[state > 12] = 12
    total_state_dict[name] = state

# Look up both probabilities for the current state of every row.
prob_dict = {} #clmn - 0:over(home_win) ||| clmn - 1:under (away_win)
for name in split_names:
    state_col = total_state_dict[name].reshape(-1, 1)
    over_idx = (state_col + 0.5).astype(np.int8)
    under_idx = (state_col + 1).astype(np.int8)
    prob_dict[name] = np.hstack((
        np.take_along_axis(over_matrix_dict[name], over_idx, axis = 1),
        np.take_along_axis(under_matrix_dict[name], under_idx, axis = 1),
    ))

#### Visualization

In [None]:
# Build the profit-vs-threshold figure for all splits and render it inline.
# `fig` is reused by the last cell, which uploads it to Neptune as HTML.
fig = profit_validation_by_type(prob_dict, active_rows_dict, preds_dict, line_dict, scores_dict)
fig.show()

In [None]:
# Metadata stored next to the uploaded models.
# NOTE(review): the two data versions repeat the literals of the download
# cell; keep them in sync by hand when the dataset version changes.
description_dict = {
    'kfold_splits':kfold_splits,
    'data_version':'football_live_main_part_npz_230510/',
    'data_version_upd':'football_live_upd_230510/'
}

In [None]:
# Open a model version for the chosen regressor and remember its system
# fields; the upload cells re-open the same version through model_sys['id'].
reg_num = 2
model_version = neptune.init_model_version(
    model = f'FOOT-LIVEBST{reg_num}',
    project = 'scomesse/football',
    api_token = api_key # your credentials
)
try:
    model_sys = model_version['sys'].fetch()
finally:
    # The handle is not used again (later cells open their own), so close it
    # here instead of leaving the connection open.
    model_version.stop()

https://app.neptune.ai/scomesse/football/m/FOOT-LIVEBST2/v/FOOT-LIVEBST2-6


In [None]:
print(model_sys)

{'creation_time': datetime.datetime(2023, 5, 19, 14, 58, 59, 601000, tzinfo=tzlocal()), 'id': 'FOOT-LIVEBST2-6', 'model_id': 'FOOT-LIVEBST2', 'modification_time': datetime.datetime(2023, 5, 19, 14, 58, 59, 601000, tzinfo=tzlocal()), 'monitoring_time': 0, 'owner': 'scomesse', 'ping_time': datetime.datetime(2023, 5, 19, 14, 58, 59, 601000, tzinfo=tzlocal()), 'running_time': 0.0, 'size': 0.0, 'stage': 'none', 'state': 'running', 'trashed': False}


In [None]:
# Connection parameters that re-open the model version created above;
# reused by the figure-upload cell.
model_version_params = dict(
    project = 'scomesse/football',
    model = model_sys['model_id'],
    api_token = api_key,
    with_id = model_sys['id']
)
model_version = neptune.init_model_version(**model_version_params)
try:
    # One model file per fold, followed by the training parameters and the
    # dataset description.
    for kfold_num in range(kfold_splits):
        model_version[f'/models/model_reg{reg_num}_{kfold_num}'].upload(f'./models/booster_reg{reg_num}_{kfold_num}.model')
    model_version[f'/model_reg{reg_num}_parameters'] = stringify_unsupported(cparams)
    model_version[f'/model_reg{reg_num}_description'] = stringify_unsupported(description_dict)
finally:
    # Close the connection even when an upload fails.
    model_version.stop()

https://app.neptune.ai/scomesse/football/m/FOOT-LIVEBST2/v/FOOT-LIVEBST2-5
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 15 operations to synchronize with Neptune. Do not kill this process.
All 15 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/scomesse/football/m/FOOT-LIVEBST2/v/FOOT-LIVEBST2-5/metadata


In [None]:
# Attach the profit figure to the model version as an interactive HTML file.
model_version = neptune.init_model_version(**model_version_params)
try:
    model_version['threshold_model_all_fold'].upload(neptune.types.File.as_html(fig))
finally:
    # Close the connection even when the upload fails.
    model_version.stop()

https://app.neptune.ai/scomesse/football/m/FOOT-LIVEBST2/v/FOOT-LIVEBST2-5
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 1 operations to synchronize with Neptune. Do not kill this process.
All 1 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/scomesse/football/m/FOOT-LIVEBST2/v/FOOT-LIVEBST2-5/metadata
