<a href="https://colab.research.google.com/github/cappelchi/calcio_notebooks/blob/main/draft/football_live_multiclass_folding_validation_heft_15.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


[CatBoost - An In-Depth Guide [Python API]](https://coderzcolumn.com/tutorials/machine-learning/catboost-an-in-depth-guide-python#9)<br>
[Catboost](https://catboost.ai/en/docs/concepts/python-reference_pool)<br>
[Cross-Validation Techniques](https://medium.com/geekculture/cross-validation-techniques-33d389897878)<br>
[https://github.com/catboost/tutorials/blob/master/cross_validation/cv_tutorial.ipynb](https://github.com/catboost/tutorials/blob/master/cross_validation/cv_tutorial.ipynb)


### Project config

In [1]:
try:
    import neptune.new as neptune
except:
    !pip install neptune-client >> None
    import neptune.new as neptune


  from neptune.version import version as neptune_client_version
  import neptune.new as neptune


In [2]:
def get_credential(frmwork = 'neptune_team'):
    """Read a login/secret pair for ``frmwork`` from ``credential.txt``.

    The file is opened relative to the current working directory and scanned
    line by line. The first line that contains ``frmwork`` and has at least
    three whitespace-separated fields is used.

    Args:
        frmwork: Text that identifies the wanted entry (substring match on the line).

    Returns:
        Tuple ``(login, secret)``: the second and third fields of the matching line.

    Raises:
        FileNotFoundError: if ``credential.txt`` does not exist.
        LookupError: if no line mentioning ``frmwork`` carries both fields.
    """
    with open('credential.txt', 'r') as container:
        for line in container:
            if frmwork not in line:
                continue
            # split() without arguments tolerates repeated spaces, tabs and the
            # trailing newline, unlike split(' ').
            fields = line.split()
            if len(fields) >= 3:
                return fields[1], fields[2]
    # Fail loudly here instead of returning None, which would make the caller's
    # tuple unpacking die with an unrelated-looking TypeError.
    raise LookupError(f"no usable {frmwork!r} entry found in credential.txt")

#@title Set API key for neptune.ai
set_api = True #@param {type:"boolean"}
# NOTE(review): `api_key` is only defined when `set_api` is True. Later cells pass
# `api_key` to neptune, so with the flag off they fail with NameError unless the
# key is provided some other way. `username` is not used in the visible cells.
if set_api:
    username, api_key = get_credential()

### Installations

In [3]:
!pip install catboost >> None
!pip install deap >> None

### Downloads

In [4]:
# Open the Neptune project that stores the dataset artifacts.
project = neptune.init_project(project="scomesse/football", api_token=api_key)

# For each data version: which project fields to download and where to put them.
# Order matters only for readability; it mirrors main part first, update second.
download_plan = (
    ('football_live_main_part_npz_230510/', {
        'dataset': './dataset.npz',
        'additional_data': './additional_data.npz',
        'time': './time.csv',
    }),
    ('football_live_upd_230510/', {
        'dataset': './dataset_upd.npz',
        'additional_data': './additional_data_upd.npz',
        'time': './time_upd.csv',
    }),
)

fetched_params = []
for data_version, targets in download_plan:
    for field, local_path in targets.items():
        project[data_version + field].download(local_path)
    # The stored parameters are fetched as values rather than downloaded as files.
    fetched_params.append(project[data_version + 'params'].fetch())

data_params, data_params_upd = fetched_params
project.stop()

https://app.neptune.ai/scomesse/football/
Shutting down background jobs, please wait a moment...
Done!
All 0 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/scomesse/football/metadata


### Imports

In [5]:
import pandas as pd
import numpy as np
# Widen pandas' display limits so wide/long frames are not truncated in cell output.
pd.options.display.max_columns = 50
pd.options.display.max_rows = 100
# Record the library versions this run used.
print(pd.__version__)
print(np.__version__)

1.5.3
1.22.4


In [6]:
from tqdm import tqdm
import os, psutil, time
import gc

In [7]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
from neptune.utils import stringify_unsupported

In [8]:
from catboost import CatBoost
from catboost import utils
from catboost import CatBoostClassifier, CatBoostRegressor
from catboost import Pool, cv
from catboost.utils import eval_metric
# Seed NumPy's global RNG for reproducibility.
# NOTE(review): Python's `random` module is imported below but is not seeded in the visible cells.
np.random.seed(147)

In [9]:
import operator
import random
from deap import base
from deap import creator
from deap import tools
from deap import algorithms

### Code

##### Functions

In [63]:
def calculate_multiclass(probability_2x: np.ndarray, line_2x: np.ndarray):
    """Pick, for every row, the column with the largest ``probability * line`` product.

    Args:
        probability_2x: 2-D array-like, one row per sample and one column per class.
            Presumably predicted class probabilities; not checked here.
        line_2x: 2-D array-like of the same shape. Presumably the bookmaker line
            for each class; not checked here.

    Returns:
        Dict with two 1-D arrays, in this key order (callers unpack ``.values()``):
            ``'argmax'``: index of the best column in each row,
            ``'float'``: the product value at that column.
    """
    probline = np.asarray(probability_2x) * np.asarray(line_2x)
    best_odd_result = np.argmax(probline, axis = 1)
    # Select one value per row: row i, column best_odd_result[i].
    best_odd_float = probline[np.arange(probline.shape[0]), best_odd_result]
    return {
        'argmax': best_odd_result,
        'float': best_odd_float,
    }

In [46]:
def precalculate_validation(data_split, active_rows_dict, preds_dict, line_dict, y_dict):
    """Collect the per-row inputs of the profit curve for one data split.

    Every dict argument is indexed by ``data_split``; ``active_rows_dict`` supplies
    the row selector applied to the predictions, the lines and the targets.

    Returns:
        Tuple ``(y_prof, preds_int, preds_float, Line_production)``:
            y_prof: 1 where the target equals the chosen class, otherwise 0,
            preds_int: chosen class index per row (from ``calculate_multiclass``),
            preds_float: ``probability * line`` value of the chosen class,
            Line_production: the line value of the chosen class.
    """
    selector = active_rows_dict[data_split]
    selected_lines = line_dict[data_split][selector]
    selected_preds = preds_dict[data_split][selector]

    choice = calculate_multiclass(selected_preds, selected_lines)
    preds_int = choice['argmax']
    preds_float = choice['float']

    hit = y_dict[data_split][selector] == preds_int
    y_prof = hit * 1
    # Pull, for each row, the line of the class that was chosen.
    Line_production = np.take_along_axis(selected_lines, preds_int[:, None], axis = 1)[:, 0]
    return y_prof, preds_int, preds_float, Line_production

In [47]:
def make_filter(hda, preds_int):
    """Build an element-wise boolean selector over ``preds_int`` for one outcome.

    Args:
        hda: ``'all'``, ``'home'``, ``'draw'`` or ``'away'``.
        preds_int: array of predicted class indices (home=0, draw=1, away=2).

    Returns:
        Boolean array: ``preds_int > -1`` for ``'all'``, otherwise equality with
        the index of the requested outcome.

    Raises:
        KeyError: if ``hda`` is none of the four accepted values.
    """
    if hda == 'all':
        return preds_int > -1
    outcome_index = {'home': 0, 'draw': 1, 'away': 2}[hda]
    return preds_int == outcome_index


In [48]:
def get_profit_curve(y, y_pred, Line_production, bet_type = 'fixed', strategy = 'simple',
                     thresholds = None):
    """Total profit and number of bets for a sweep of score thresholds.

    A row is bet on when its score is strictly greater than the threshold.

    Args:
        y: 1-D array-like, 1 for a won bet and 0 for a lost one.
        y_pred: 1-D array-like score of each row.
        Line_production: 1-D array-like line (odds) of each row.
        bet_type: ``'divk'`` stakes ``1 / line`` (win ``(line - 1) / line``, lose
            ``1 / line``). Any other value, including the default ``'fixed'``,
            stakes 1 (win ``line - 1``, lose 1).
        strategy: ``'simple'`` scores a row by ``y_pred``; ``'complex'`` by
            ``y_pred * Line_production / 10``.
        thresholds: optional iterable of thresholds to evaluate. Defaults to
            ``np.linspace(0.9, 1.4, 1001)``.

    Returns:
        Tuple of three equally long lists ``(threshold, profit, bet_qty_list)``.

    Raises:
        ValueError: if ``strategy`` is not ``'simple'`` or ``'complex'`` (this
            used to surface as an UnboundLocalError inside the loop).
    """
    y = np.asarray(y)
    y_pred = np.asarray(y_pred)
    Line_production = np.asarray(Line_production)

    if strategy == 'simple':
        score = y_pred
    elif strategy == 'complex':
        score = y_pred * Line_production / 10
    else:
        raise ValueError(f"unknown strategy {strategy!r}; expected 'simple' or 'complex'")

    if bet_type == 'divk':
        profit_size = (Line_production - 1) / Line_production
        bet_size = Line_production
    else:
        profit_size = Line_production - 1
        # Unit stake. Written as explicit ones rather than line / line, which
        # would turn into NaN for a zero line.
        bet_size = np.ones_like(Line_production, dtype = float)

    # Profit of each row if it is bet on. It does not depend on the threshold,
    # so it is computed once instead of once per threshold.
    row_profit = y * profit_size + (y - 1) / bet_size

    if thresholds is None:
        thresholds = np.linspace(0.9, 1.4, 1001)

    threshold = []
    profit = []
    bet_qty_list = []
    for th in np.asarray(thresholds, dtype = float).ravel():
        vector_th = score > th
        threshold.append(th)
        bet_qty_list.append(vector_th.sum())
        profit.append(row_profit[vector_th].sum())

    return threshold, profit, bet_qty_list

In [49]:
def profit_validation_by_type(active_rows_dict, preds_dict, line_dict, y_dict):
    """Plot profit against score threshold for every data split and outcome filter.

    One subplot row is drawn per split (``'holdout'``, ``'test'``, ``'train'``).
    Each row holds four curves: all predictions, and predictions of home, draw
    and away only. Profit is computed with ``bet_type='divk'``.

    Args:
        active_rows_dict: row selector per split.
        preds_dict: predicted class probabilities per split.
        line_dict: lines (odds) per split.
        y_dict: target classes per split.
        All four are indexed by the split names listed above.

    Returns:
        The plotly figure, with title and size already applied.
    """
    data_splits = ['holdout', 'test', 'train']
    hda_list = ['all', 'home', 'draw', 'away']
    colors = ['rgb(93, 164, 214)', 'rgb(255, 144, 14)',  'rgb(44, 160, 101)', 'rgb(255, 65, 54)']
    fig = make_subplots(rows = len(data_splits), cols = 1,
                        shared_xaxes = True,
                        vertical_spacing = 0.02,
                        # Titles come from the split list so they cannot drift apart.
                        subplot_titles = data_splits
                        )
    for row, data_split in enumerate(data_splits, start = 1):
        y_prof, preds_int, preds_float, Line_production = precalculate_validation(
                        data_split, active_rows_dict, preds_dict, line_dict, y_dict
                        )
        for color, hda in zip(colors, hda_list):
            sfilter = make_filter(hda, preds_int)
            threshold, profit, _ = get_profit_curve(
                                                y_prof[sfilter],
                                                preds_float[sfilter],
                                                Line_production[sfilter],
                                                bet_type = 'divk'
                                                            )
            # add_trace(row=, col=) replaces the deprecated append_trace.
            fig.add_trace(go.Scatter(
                                    x = threshold,
                                    y = profit,
                                    name = hda,
                                    # One legend entry per outcome that toggles the
                                    # matching curve in every subplot.
                                    legendgroup = hda,
                                    showlegend = (row == 1),
                                    # The 'all' curve is drawn thick, the rest thin.
                                    line = dict(color = color, width = 4 if hda == 'all' else 1)),
                          row = row, col = 1)
        # Axis properties are per subplot, so set them once per row.
        fig.update_yaxes(rangemode = 'nonnegative', row = row, col = 1)

    return fig.update_layout(title = 'profit validation', height = 800, width = 900)

##### Prepare Data

1. регрессия
2. мультиклассовая классификация {AWAY:2, DRAW:1, HOME:0}
3. бинарная классификация: <br>
    a. HOME vs (DRAW & AWAY)<br>
    б. DRAW vs (HOME & AWAY)<br>
    в. AWAY vs (HOME & DRAW)<br>

In [10]:
#2010: 145536
#2021: 78600
#2022: 114949
#2023: 27456

In [11]:
# Split match ids into holdout and non-holdout: the earlier half of the update
# matches (ordered by StatTime) joins the training data, the rest is held out.
# Paths are relative so they match where the download cell stored the files.
id_time_df = pd.read_csv('./time_upd.csv', parse_dates = ['StatTime'], dayfirst = True)
ids_seq = id_time_df.sort_values(by = 'StatTime')['Id'].values[:len(id_time_df) // 2]
additional_data_upd = np.load('./additional_data_upd.npz')
# True for update rows whose match id belongs to the earlier half.
# NOTE(review): assumes additional_data_upd['id'] is row-aligned with the arrays
# in dataset_upd.npz - confirm.
id_vector = np.isin(additional_data_upd['id'], ids_seq)

# Assemble the training features: main dataset + the non-holdout part of the update dataset.
data_npz = np.load('./dataset.npz')
data_upd_npz = np.load('./dataset_upd.npz')
X = np.vstack((data_npz['X'], data_upd_npz['X'][id_vector]))
print(X.shape)

(14604604, 37)


In [12]:
# NOTE(review): `target_type` is not read anywhere in the visible cells.
target_type = "multiclass"
# Training target: main dataset labels followed by the non-holdout update rows,
# in the same order used to stack X.
y = np.hstack((data_npz['y_multi'], data_upd_npz['y_multi'][id_vector]))
print(X.shape, y.shape)
#---------------
# Report the resident memory of this kernel process.
process = psutil.Process(os.getpid())
print('mem usage: ', round(process.memory_info().rss / 1024 ** 3, 2), 'GiB')  # rss is in bytes; 1024 ** 3 converts it to GiB
#---------------

(14604604, 37) (14604604,)
mem usage:  2.4 GiB


In [13]:
# Local folder for the downloaded model files; -p makes this a no-op if it already exists.
!mkdir -p ./models

In [14]:
# Model for the 1st team
model_num = 4 # Specify the model number
# Neptune model-version id, e.g. 'FOOT-LIVEMC-4'.
neptune_model_version = 'FOOT-LIVEMC' + f'-{model_num}'
model_version_params = dict(
    project = 'scomesse/football',
    model = 'FOOT-LIVEMC',
    api_token = api_key,
    with_id = neptune_model_version
)
model_version = neptune.init_model_version(**model_version_params)
# Fetch the three per-fold model files into ./models (the folder must already exist).
for kfold_num in range(3):
    model_version[f'/models/model_{kfold_num}'].download(f'./models/booster_{kfold_num}.model')
model_version.stop()

https://app.neptune.ai/scomesse/football/m/FOOT-LIVEMC/v/FOOT-LIVEMC-4
Shutting down background jobs, please wait a moment...
Done!
All 0 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/scomesse/football/m/FOOT-LIVEMC/v/FOOT-LIVEMC-4/metadata


In [15]:
# Per-fold class probabilities: preds_dict[fold]['train' | 'holdout'].
# NOTE(review): the next cell rebinds `preds_dict` to a fresh dict keyed by split
# name, so everything computed here is discarded. This cell can be skipped unless
# the per-fold predictions are inspected before that.
preds_dict = {}
for kfold_num in range(3):
    preds_dict[kfold_num] = {}
    booster = CatBoost()
    booster.load_model(f'./models/booster_{kfold_num}.model')
    preds_dict[kfold_num]['train'] = booster.predict(X, prediction_type="Probability")
    # Holdout = update rows whose match id is NOT in the earlier half.
    preds_dict[kfold_num]['holdout'] = booster.predict(data_upd_npz['X'][~id_vector], prediction_type="Probability")

In [44]:
# Load every fold model once and reuse it for all splits (previously each model
# file was loaded three times, once per split).
fold_boosters = [
    CatBoost().load_model(f'./models/booster_{kfold_num}.model')
    for kfold_num in range(3)
]


def _mean_fold_probability(features):
    """Average the class probabilities predicted by all fold models for ``features``."""
    return sum(
        booster.predict(features, prediction_type="Probability")
        for booster in fold_boosters
    ) / len(fold_boosters)


# preds_dict[split] -> averaged class probabilities, one row per sample.
preds_dict = {}
preds_dict['train'] = _mean_fold_probability(X)
preds_dict['holdout'] = _mean_fold_probability(data_upd_npz['X'][~id_vector])
# 'test' is evaluated on exactly the same rows as 'holdout', so the prediction is
# reused rather than recomputed. The copy keeps the two entries independent.
preds_dict['test'] = preds_dict['holdout'].copy()

In [54]:
# Read each array out of the update archive once: indexing an .npz archive loads
# the array from disk on every access.
K_upd = data_upd_npz['K_train'][:, :4]
holdout_rows = ~id_vector
K_holdout = K_upd[holdout_rows]
y_holdout = data_upd_npz['y_multi'][holdout_rows]
X_holdout = data_upd_npz['X'][holdout_rows]

# Training rows: main dataset followed by the non-holdout update rows (same order as X and y).
K = np.vstack((data_npz['K_train'][:, :4], K_upd[id_vector]))

# 'test' and 'holdout' deliberately refer to the same rows, and here to the same
# array objects; downstream code only reads them.
line_dict = {'train': K[:, 1:4], 'test': K_holdout[:, 1:4], 'holdout': K_holdout[:, 1:4]}
y_dict = {'train': y, 'test': y_holdout, 'holdout': y_holdout}
X_dict = {'train': X, 'test': X_holdout, 'holdout': X_holdout}


def _active_rows(k_block):
    """Rows whose column 0 equals 1 and whose columns 1..3 sum to more than 3."""
    # NOTE(review): presumably column 0 is an availability flag and columns 1..3
    # are the home/draw/away lines - confirm against the dataset description.
    return (k_block[:, 0] == 1) & (k_block[:, 1:4].sum(axis = 1) > 3)


active_rows_dict = {
    'train': _active_rows(K),
    'holdout': _active_rows(K_holdout),
    'test': _active_rows(K_holdout),
}

#---------------
process = psutil.Process(os.getpid())
print('mem usage: ', round(process.memory_info().rss / 1024 ** 3, 2), 'GiB')  # rss is in bytes; 1024 ** 3 converts it to GiB
#---------------

mem usage:  6.71 GiB


In [62]:
# Sanity check: one row per training sample, one column per class.
preds_dict['train'].shape

(14604604, 3)

In [64]:
# Draw the profit-vs-threshold curves for the holdout, test and train splits.
fig = profit_validation_by_type(active_rows_dict, preds_dict, line_dict, y_dict)
fig.show()

(534433, 3) (534433, 3)
(534433, 3) (534433, 3)
(4513975, 3) (4513975, 3)
