<a href="https://colab.research.google.com/github/cappelchi/calcio_notebooks/blob/main/draft/football_live_prediction_catboost_cv_heft_14.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


[CatBoost - An In-Depth Guide [Python API]](https://coderzcolumn.com/tutorials/machine-learning/catboost-an-in-depth-guide-python#9)<br>
[Catboost](https://catboost.ai/en/docs/concepts/python-reference_pool)<br>
[Cross-Validation Techniques](https://medium.com/geekculture/cross-validation-techniques-33d389897878)<br>
[https://github.com/catboost/tutorials/blob/master/cross_validation/cv_tutorial.ipynb](https://github.com/catboost/tutorials/blob/master/cross_validation/cv_tutorial.ipynb)


### Project config

In [None]:
try:
    import neptune.new as neptune
except:
    !pip install neptune-client >> None
    import neptune.new as neptune


  from neptune.version import version as neptune_client_version
  import neptune.new as neptune


In [None]:
def get_credential(frmwork = 'neptune_team'):
    """Look up a login/secret pair in ``credential.txt`` (current working directory).

    The file is scanned line by line; the first line that contains ``frmwork``
    as a substring and has at least three whitespace-separated fields wins.

    Args:
        frmwork: Marker text identifying the wanted line.

    Returns:
        ``(login, psw)`` taken from the second and third fields of the matching
        line, or ``None`` when no usable line is found.
    """
    with open('credential.txt', 'r') as container:
        for line in container:
            if frmwork not in line:
                continue
            # split() with no argument collapses runs of spaces/tabs and drops the
            # trailing newline, so extra whitespace cannot shift the fields.
            fields = line.split()
            if len(fields) < 3:
                # Marker present but the line is incomplete: keep scanning.
                continue
            return fields[1], fields[2]
    return None

#@title Set API key for neptune.ai
set_api = True #@param {type:"boolean"}
if set_api:
    credential = get_credential()
    if credential is None:
        # Fail with an actionable message instead of a tuple-unpacking TypeError.
        raise RuntimeError("No usable 'neptune_team' entry found in credential.txt")
    username, api_key = credential

### Installations

In [None]:
!pip install catboost >> None
!pip install deap >> None

### Downloads

In [None]:
# Download both dataset snapshots (main part + update) from the Neptune project.
project = neptune.init_project(
    project="scomesse/football", 
    api_token = api_key
    )

# try/finally guarantees the project connection is stopped even if a download fails.
try:
    data_version = 'football_live_main_part_npz_230510/'
    project[data_version + 'dataset'].download('./dataset.npz')
    project[data_version + 'additional_data'].download('./additional_data.npz')
    project[data_version + 'time'].download('./time.csv')
    data_params = project[data_version + 'params'].fetch()

    data_version = 'football_live_upd_230510/'
    project[data_version + 'dataset'].download('./dataset_upd.npz')
    project[data_version + 'additional_data'].download('./additional_data_upd.npz')
    project[data_version + 'time'].download('./time_upd.csv')
    data_params_upd = project[data_version + 'params'].fetch()
finally:
    project.stop()

https://app.neptune.ai/scomesse/football/
Shutting down background jobs, please wait a moment...
Done!
All 0 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/scomesse/football/metadata


### Imports

In [None]:
import pandas as pd
import numpy as np
pd.options.display.max_columns = 50
pd.options.display.max_rows = 100
print(pd.__version__)
print(np.__version__)

1.5.3
1.22.4


In [None]:
from tqdm import tqdm
import plotly.express as px
import os, psutil, time
import gc

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from neptune.utils import stringify_unsupported

In [None]:
from catboost import CatBoost
from catboost import utils
from catboost import CatBoostClassifier, CatBoostRegressor
from catboost import Pool, cv
from catboost.utils import eval_metric
np.random.seed(147)

In [None]:
import operator
import random
from deap import base
from deap import creator
from deap import tools
from deap import algorithms

### Code

##### Functions

In [None]:
def calculate_multiclass(probability_2x: np.ndarray, line_2x: np.ndarray):
    """Pick, per row, the column with the largest ``probability * line`` product.

    Args:
        probability_2x: 2-D array-like of per-class values (rows x classes).
        line_2x: 2-D array-like broadcastable against ``probability_2x``
            (presumably the bookmaker odds per class; confirm with the caller).

    Returns:
        dict with two entries, in this order:
            'argmax': 1-D array with the index of the best column in each row.
            'float':  1-D array with the product value at that column.
    """
    # asarray lets callers pass nested lists as well as ndarrays.
    probline = np.asarray(probability_2x) * np.asarray(line_2x)
    best_odd_result = np.argmax(probline, axis = 1)
    # Pair every row index with its winning column to read the winning product.
    best_odd_float = probline[np.arange(probline.shape[0]), best_odd_result]
    return {
        'argmax':best_odd_result,
        'float':best_odd_float,
    }

In [None]:
def precalculate_validation(data_split, active_rows_dict, preds_dict, line_dict, y_dict):
    """Prepare the per-row quantities needed to draw a profit curve for one split.

    Every dict is keyed by split name; ``active_rows_dict[data_split]`` is the
    row selector applied to the predictions, lines and targets of that split.

    Returns:
        Tuple ``(y_prof, preds_int, preds_float, Line_production)``:
            y_prof: 1 where the target equals the picked class, else 0.
            preds_int: picked class index per row (from ``calculate_multiclass``).
            preds_float: ``probability * line`` value of the picked class.
            Line_production: the line value of the picked class.
    """
    active = active_rows_dict[data_split]
    probabilities = preds_dict[data_split][active]
    lines = line_dict[data_split][active]

    # calculate_multiclass returns {'argmax': ..., 'float': ...} in that order.
    preds_int, preds_float = calculate_multiclass(probabilities, lines).values()

    hits = y_dict[data_split][active] == preds_int
    y_prof = hits * 1
    picked_column = preds_int.reshape(-1, 1)
    Line_production = np.take_along_axis(lines, picked_column, axis = 1)[:, 0]
    return y_prof, preds_int, preds_float, Line_production

In [None]:
def make_filter(hda, preds_int):
    """Build a boolean selector over ``preds_int`` for one outcome type.

    Args:
        hda: 'all', 'home', 'draw' or 'away'; any other value raises KeyError.
        preds_int: Array of predicted class indices (home=0, draw=1, away=2).

    Returns:
        Boolean array: ``preds_int > -1`` for 'all', otherwise equality with
        the class index mapped from ``hda``.
    """
    if hda == 'all':
        return preds_int > -1
    class_index = {'home':0, 'draw':1, 'away':2}[hda]
    return preds_int == class_index


In [None]:
def get_profit_curve(y, y_pred, Line_production, bet_type = 'fixed', strategy = 'simple'):
    """Total profit and number of bets for 1001 thresholds between 0.9 and 1.4.

    Args:
        y: 1-D array-like, 1 where the bet won and 0 where it lost.
        y_pred: 1-D array-like score compared against each threshold.
        Line_production: 1-D array-like line (odds) value of each bet.
        bet_type: 'divk' stakes ``1 / line`` per bet (win ``(line - 1) / line``,
            lose ``1 / line``). Any other value uses a unit stake
            (win ``line - 1``, lose ``1``).
        strategy: 'simple' bets where ``y_pred > threshold``; 'complex' bets
            where ``y_pred * line / 10 > threshold``.

    Returns:
        Tuple of three lists of equal length: thresholds, summed profit of the
        selected bets, and count of selected bets.

    Raises:
        ValueError: if ``strategy`` is neither 'simple' nor 'complex'
            (previously this surfaced as an unbound-variable error).
    """
    y = np.asarray(y)
    y_pred = np.asarray(y_pred)
    Line_production = np.asarray(Line_production)

    # The score compared with the threshold does not depend on the threshold,
    # so compute it once instead of on every iteration.
    if strategy == 'simple':
        score = y_pred
    elif strategy == 'complex':
        score = y_pred * Line_production / 10
    else:
        raise ValueError(f"Unknown strategy: {strategy!r} (expected 'simple' or 'complex')")

    if bet_type == 'divk':
        profit_size = (Line_production - 1) / Line_production
        bet_size = Line_production
    else:
        profit_size = (Line_production - 1)
        bet_size = Line_production / Line_production

    # Per-bet result: the win term is active when y == 1, the loss term when y == 0.
    outcome = y * profit_size + (y - 1) / bet_size

    threshold = list(np.linspace(0.9, 1.4, 1001))
    profit = []
    bet_qty_list = []
    for th in threshold:
        selected = score > th
        bet_qty_list.append(selected.sum())
        profit.append(outcome[selected].sum())

    return threshold, profit, bet_qty_list

In [None]:
def profit_validation_by_type(active_rows_dict, preds_dict, line_dict, y_dict):
    """Plot profit-vs-threshold curves for every split and predicted outcome.

    One subplot row is drawn per split ('holdout', 'test', 'train'); each row
    holds four curves ('all', 'home', 'draw', 'away') computed with
    ``get_profit_curve(..., bet_type='divk')`` on the rows whose picked class
    matches the outcome.

    Args:
        active_rows_dict, preds_dict, line_dict, y_dict: dicts keyed by split
            name, forwarded to ``precalculate_validation``.

    Returns:
        The plotly figure after the final layout update.
    """
    data_splits = ['holdout', 'test', 'train']
    hda_list = ['all', 'home', 'draw', 'away']
    colors = ['rgb(93, 164, 214)', 'rgb(255, 144, 14)',  'rgb(44, 160, 101)', 'rgb(255, 65, 54)']
    fig = make_subplots(rows = len(data_splits), cols = 1,
                        shared_xaxes = True, 
                        vertical_spacing = 0.02,
                        # Derived from data_splits so titles cannot drift from the row order.
                        subplot_titles = tuple(data_splits)
                        )
    for row, data_split in enumerate(data_splits, start = 1):
        y_prof, preds_int, preds_float, Line_production = precalculate_validation(
                        data_split, active_rows_dict, preds_dict, line_dict, y_dict
                        )
        for cnt_hda, hda in enumerate(hda_list):
            sfilter = make_filter(hda, preds_int)
            threshold, profit, bet_qty_list = get_profit_curve(
                                                y_prof[sfilter], 
                                                preds_float[sfilter], 
                                                Line_production[sfilter],
                                                bet_type = 'divk'
                                                            )
            # add_trace replaces the deprecated append_trace.
            fig.add_trace(go.Scatter(
                                    x = threshold,
                                    y = profit,
                                    name = hda,
                                    # The 'all' curve (index 0) is drawn thick, the others thin.
                                    line = dict(color = colors[cnt_hda], width = 4 - 3 * bool(cnt_hda))),
                                    row = row, col = 1)
        # One axis update per subplot row is enough.
        fig.update_yaxes(rangemode = 'nonnegative', row = row, col = 1)

    return fig.update_layout(title = 'profit validation', height = 800, width = 900)

##### Prepare Data

1. регрессия
2. мультиклассовая классификация {AWAY: 2, DRAW: 1, HOME: 0}
3. бинарная классификация: <br>
    a. HOME vs (DRAW & AWAY)<br>
    б. DRAW vs (HOME & AWAY)<br>
    в. AWAY vs (HOME & DRAW)<br>

In [None]:
#2010: 145536
#2021: 78600
#2022: 114949
#2023: 27456

In [None]:
# Read from the same relative locations the download cell writes to, so the
# notebook does not depend on the working directory being /content.
id_time_df = pd.read_csv('./time_upd.csv', parse_dates = ['StatTime'], dayfirst = True)
# Ids of the earlier half of the update set, ordered by StatTime.
ids_seq = id_time_df.sort_values(by = 'StatTime')['Id'].values[:len(id_time_df) // 2]
additional_data_upd = np.load('./additional_data_upd.npz')
# True for update-set rows whose id falls in that earlier half; the remaining
# rows are used further down as the holdout split.
id_vector = np.isin(additional_data_upd['id'], ids_seq)

data_npz = np.load('./dataset.npz')
data_upd_npz = np.load('./dataset_upd.npz')
X = np.vstack((data_npz['X'], data_upd_npz['X'][id_vector]))
print(X.shape)


(14604604, 37)


In [None]:
#@title Target selection
target_type = "multiclass" #@param ["regression1", "regression2", "multiclass", "binary_home", "binary_draw", "binary_away"]

# Name of the array inside the .npz archives that backs each target type.
target_array_by_type = {
    'regression1': 'y_regression1',
    'regression2': 'y_regression2',
    'multiclass': 'y_multi',
    'binary_home': 'y_multi',
    'binary_draw': 'y_multi',
    'binary_away': 'y_multi',
}
# Class index that becomes the positive label for each one-vs-rest target.
binary_class_by_type = {'binary_home': 0, 'binary_draw': 1, 'binary_away': 2}

if target_type not in target_array_by_type:
    # Previously an unknown value left `y` undefined (or stale from an earlier run).
    raise ValueError(f"Unknown target_type: {target_type!r}")

target_array = target_array_by_type[target_type]
y = np.hstack((data_npz[target_array], data_upd_npz[target_array][id_vector]))
if target_type in binary_class_by_type:
    y = 1 * (y == binary_class_by_type[target_type])

print(X.shape, y.shape)
#---------------
process = psutil.Process(os.getpid())
print('mem usage: ', round(process.memory_info().rss / 1024 ** 3, 2), 'GiB')  # in bytes 
#---------------
#---------------

(14604604, 37) (14604604,)
mem usage:  2.39 GiB


In [None]:
!mkdir -p ./models

##### Folding

In [None]:
# Contiguous k-fold split: fold number `kfold_num` (0-based) is the evaluation fold.
kfold_splits = 3
kfold_num = 2
kfold_size = y.shape[0] // kfold_splits
kfold_start = kfold_size * kfold_num
kfold_end = kfold_start + kfold_size
# Boolean row mask: True marks rows of the evaluation fold.
kfold_vector = np.zeros(y.shape[0], dtype = np.bool_)
kfold_vector[kfold_start:kfold_end] = True

In [None]:
# Rows outside the selected fold train the model; rows inside it evaluate it.
train_rows = ~kfold_vector
train_data = Pool(X[train_rows], y[train_rows])
test_data = Pool(X[kfold_vector], y[kfold_vector])
#---------------
process = psutil.Process(os.getpid())
rss_gib = process.memory_info().rss / 1024 ** 3
print('mem usage: ', round(rss_gib, 2), 'GiB')  # in bytes 
#---------------

mem usage:  5.18 GiB


In [None]:
gpu_cnt = utils.get_gpu_device_count()
print("Number of GPU Count : ",gpu_cnt)

Number of GPU Count :  1


##### Train catboost model

In [None]:
# Keyword arguments forwarded to CatBoostClassifier in the training cell.
cparams = dict(
    loss_function = 'MultiClass',
    learning_rate = 0.02,
    l2_leaf_reg = 0.9,
    max_depth = 4,
    min_data_in_leaf = 15,
    # Currently disabled options:
    #   leaf_estimation_method = leafmethod
    #   early_stopping_rounds = 15
    verbose = 10,
    iterations = 3600,
    eval_metric = 'TotalF1',
    task_type = "GPU",
    devices = '0',
)

In [None]:
# Train on the training pool, evaluate on the held-out fold, then save the
# model under a name that carries the fold number.
booster = CatBoostClassifier(**cparams)
booster.fit(train_data, eval_set = test_data)

trained_model_path = f'./models/booster_{kfold_num}.model'
booster.save_model(trained_model_path)

In [None]:
# Drop both pools before the prediction step and collect garbage.
del train_data, test_data
gc.collect()

0

In [None]:
# Reload the saved fold model into a generic CatBoost object.
saved_model_path = f'./models/booster_{kfold_num}.model'
booster = CatBoost()
booster.load_model(saved_model_path)
print(saved_model_path)
#booster.get_best_score()['validation']['TotalF1']

./models/booster_1.model


In [None]:
# Read the update-set K array once; each `data_upd_npz[...]` access loads it again.
K_upd = data_upd_npz['K_train'][:, :4]
train_mask = ~kfold_vector
# Update-set rows that were not stacked into X/y form the holdout split.
holdout_mask = ~id_vector

K = np.vstack((data_npz['K_train'][:,:4], K_upd[id_vector]))
# Column 0 is compared with 1 below; columns 1:4 are the three lines used for betting.
K_by_split = {'train':K[train_mask], 'test':K[kfold_vector], 'holdout':K_upd[holdout_mask]}

line_dict = {split:k[:, 1:4] for split, k in K_by_split.items()}
# NOTE(review): the holdout target is always 'y_multi', whatever `target_type` is — confirm.
y_dict = {'train':y[train_mask], 'test':y[kfold_vector], 'holdout':data_upd_npz['y_multi'][holdout_mask]}
X_dict = {'train':X[train_mask], 'test':X[kfold_vector], 'holdout':data_upd_npz['X'][holdout_mask]}

# A row is active when column 0 equals 1 and the three lines sum to more than 3.
active_rows_dict = {
    split:((k[:, 0] == 1) & (k[:, 1:4].sum(axis = 1) > 3))
    for split, k in K_by_split.items()
}

# Predict from the arrays already held in X_dict instead of re-slicing X per split.
preds_dict = {
    split:booster.predict(features, prediction_type="Probability")
    for split, features in X_dict.items()
}

#---------------
process = psutil.Process(os.getpid())
print('mem usage: ', round(process.memory_info().rss / 1024 ** 3, 2), 'GiB')  # in bytes 
#---------------
#---------------

mem usage:  9.32 GiB


In [None]:
fig = profit_validation_by_type(active_rows_dict, preds_dict, line_dict, y_dict)
fig.show()

In [None]:
# Column names of the source table. The slice drops the first 18 names
# (presumably ids, odds and results that are not model inputs — confirm), and
# the four *relativ columns are removed afterwards, leaving 37 feature names.
fts = ['Id', 'Score1', 'Score2', 'Active.1', 'W1', 'WX', 'W2', 'X1', 'X2',
       'W12', 'TotalValue', 'Over', 'Under', 'Hand1Value', 'H1', 'H2',
       'Result1', 'Result2', 'min_norm', 'Score1_norm', 'Score2_norm',
       'Score_diff', 'Score_cat_1', 'Score_cat_2', 'Score_cat_3',
       'Score_cat_4', 'Score_cat_5', 'Score_cat_6', 'Score_cat_7',
       'Score_cat_8', 'Score_cat_9', 'A1_scaled', 'A2_scaled', 'A1perMIN',
       'A2perMIN', 'A1relativ', 'A2relativ', 'DA1_scaled', 'DA2_scaled',
       'DA1perMIN', 'DA2perMIN', 'DA1relativ', 'DA2relativ', 'Pos1_cleaned',
       'Pos2_cleaned', 'Off1_norm', 'Off2_norm', 'On1_norm', 'On2_norm',
       'YC1_transformed', 'YC2_transformed', 'RC1_transformed',
       'RC2_transformed', 'Sub1_transformed', 'Sub2_transformed',
       'Cor1_transformed', 'Cor2_transformed', 'P1_transformed',
       'P2_transformed'][18:]
for col in ['A1relativ', 'A2relativ', 'DA1relativ', 'DA2relativ']:  
    fts.remove(col)

In [None]:
importances = booster.get_feature_importance()
if len(importances) != len(fts):
    # zip() would silently truncate and pair importances with the wrong names.
    raise ValueError(
        f'{len(importances)} importances but {len(fts)} feature names'
    )

# Ascending by importance (ties broken by name).
importance_dict = {key:value for value, key in sorted(zip(importances, fts), reverse = False)}

# Title suffix follows the target selected earlier instead of a hand-edited string.
importance_title_by_target = {
    'regression1': 'regression team 1',
    'regression2': 'regression team 2',
    'multiclass': 'multiclass',
    'binary_home': 'binary classification home',
    'binary_draw': 'binary classification draw',
    'binary_away': 'binary classification away',
}
fig1 = px.bar(
    pd.DataFrame(importance_dict.items(), columns = ['features', 'value']),
    x = 'value',
    y = 'features',
    orientation = 'h',
    title = 'feature importance ' + importance_title_by_target.get(target_type, target_type),
    width = 600,
    height = 800
)
fig1.show()

In [None]:
# Metadata stored next to the model: fold count and the two dataset snapshots used.
description_dict = dict(
    kfold_splits = kfold_splits,
    data_version = 'football_live_main_part_npz_230510/',
    data_version_upd = 'football_live_upd_230510/',
)

In [None]:
# Open a model version of FOOT-LIVEMC and read its system metadata.
model_version = neptune.init_model_version(
    model = 'FOOT-LIVEMC',
    project = 'scomesse/football',
    api_token = api_key # your credentials
)
try:
    model_sys = model_version['sys'].fetch()
finally:
    # Later cells open their own connection, so close this one here instead of
    # leaving it running when `model_version` is rebound.
    model_version.stop()

https://app.neptune.ai/scomesse/football/m/FOOT-LIVEMC/v/FOOT-LIVEMC-4


In [None]:
print(model_sys)

{'creation_time': datetime.datetime(2023, 5, 13, 16, 58, 58, 995000, tzinfo=tzlocal()), 'id': 'FOOT-LIVEMC-4', 'model_id': 'FOOT-LIVEMC', 'modification_time': datetime.datetime(2023, 5, 13, 16, 58, 58, 995000, tzinfo=tzlocal()), 'monitoring_time': 0, 'owner': 'scomesse', 'ping_time': datetime.datetime(2023, 5, 13, 16, 58, 58, 995000, tzinfo=tzlocal()), 'running_time': 0.0, 'size': 0.0, 'stage': 'none', 'state': 'running', 'trashed': False}


In [None]:
# Replaces the `model_sys` fetched above with a fixed model id / version id;
# only these two keys are read when the upload parameters are built.
model_sys = {
    'model_id':'FOOT-LIVEMC',
    'id':'FOOT-LIVEMC-4'
}

In [None]:
# Connection parameters reused by every upload cell below.
model_version_params = dict(
    project = 'scomesse/football',
    model = model_sys['model_id'],
    api_token = api_key,
    with_id = model_sys['id']
)
model_version = neptune.init_model_version(**model_version_params)
# try/finally ensures the connection is stopped even if an upload fails.
try:
    model_version[f'/models/model_{kfold_num}'].upload(f'./models/booster_{kfold_num}.model')
    model_version[f'/models/model_{kfold_num}_parameters'] = stringify_unsupported(cparams)
    model_version[f'/models/model_{kfold_num}_description'] = stringify_unsupported(description_dict)
finally:
    model_version.stop()

https://app.neptune.ai/scomesse/football/m/FOOT-LIVEMC/v/FOOT-LIVEMC-4
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 14 operations to synchronize with Neptune. Do not kill this process.
All 14 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/scomesse/football/m/FOOT-LIVEMC/v/FOOT-LIVEMC-4/metadata


In [None]:
# Attach the profit/threshold figure for this fold as an HTML file.
model_version = neptune.init_model_version(**model_version_params)
try:
    model_version[f'threshold_model_{kfold_num}_fold'].upload(neptune.types.File.as_html(fig))
finally:
    # Stop the connection even if the upload fails.
    model_version.stop()

https://app.neptune.ai/scomesse/football/m/FOOT-LIVEMC/v/FOOT-LIVEMC-4
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 1 operations to synchronize with Neptune. Do not kill this process.
All 1 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/scomesse/football/m/FOOT-LIVEMC/v/FOOT-LIVEMC-4/metadata


In [None]:
# Attach the feature-importance figure for this fold as an HTML file.
model_version = neptune.init_model_version(**model_version_params)
try:
    model_version[f'importance_model_{kfold_num}_fold'].upload(neptune.types.File.as_html(fig1))
finally:
    # Stop the connection even if the upload fails.
    model_version.stop()

https://app.neptune.ai/scomesse/football/m/FOOT-LIVEMC/v/FOOT-LIVEMC-4
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 1 operations to synchronize with Neptune. Do not kill this process.
All 1 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/scomesse/football/m/FOOT-LIVEMC/v/FOOT-LIVEMC-4/metadata
