<a href="https://colab.research.google.com/github/cappelchi/calcio_notebooks/blob/main/EDA/football_Dataset_preparation_for_boosts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Project config

In [1]:
try:
    import neptune.new as neptune
except:
    !pip install neptune-client >> None
    import neptune.new as neptune
#from neptune.new.integrations.tensorflow_keras import NeptuneCallback
def get_credential(frmwork = 'neptune_team'):
    with open('credential.txt', 'r') as container:
        for line in container:
            if frmwork in line:
                login, psw = line.split(' ')[1], line.split(' ')[2].split('\n')[0]
                return login, psw

In [2]:
#@title Set API key for neptune.ai
set_api = True #@param {type:"boolean"}
if set_api:
    username, api_key = get_credential()

### Installations

### Downloads

In [4]:
data_version = 'football_live_221229/'
project = neptune.init_project(
    name="scomesse/football", 
    api_token = api_key
    )
project[data_version + 'data.rar'].download('./data.rar')
project[data_version + 'info.rar'].download('./info.rar')
project[data_version + 'prem.rar'].download('./prem.rar')
project[data_version + 'train_id.csv'].download('./train_id.csv')
project[data_version + 'test_id.csv'].download('./test_id.csv')
project[data_version + 'holdout_id.csv'].download('./holdout_id.csv')
project[data_version + 'data_train.csv.gz'].download('./data_train.csv.gz')
project[data_version + 'data_test.csv.gz'].download('./data_test.csv.gz')
project[data_version + 'data_hold.csv.gz'].download('./data_hold.csv.gz')

#del project[data_version]
project.stop()

https://app.neptune.ai/scomesse/football/
Remember to stop your project once you’ve finished logging your metadata (https://docs.neptune.ai/api/project#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
Shutting down background jobs, please wait a moment...
Done!
All 0 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/scomesse/football/metadata


### Imports

In [3]:
import pandas as pd
import numpy as np
pd.options.display.max_columns = 50
pd.options.display.max_rows = 100
print(pd.__version__)
print(np.__version__)

#import dask.dataframe as dd
import subprocess
import sys
from glob import glob
from tqdm import tqdm

1.3.5
1.21.6


In [4]:
import plotly.express as px

In [5]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.preprocessing import PowerTransformer, QuantileTransformer

### Code

#####Functions

In [6]:
def run_bash(bashCommand:str, nameCommand = ''):
        process = subprocess.Popen([bashCommand], 
                           shell=True)
        _, error = process.communicate()
        if error:
            print(f'{nameCommand} error:\n', error)

In [7]:
# Присоединяем итоговый результат и по первому тайму
def add_match_results (data_df:pd.DataFrame, cols = ['Id', 'Result1', 'Result2', 'Periods'], info_path = './info.csv'):
    info_df = pd.read_csv(info_path, sep = ';', usecols = cols)
    info_df[['Period1', 'Period2', 'Period3', 'Period4']] = info_df['Periods'].str.split(',', expand = True)
    info_df[['Time1Res1', 'Time1Res2']] = info_df['Period1'].str.split(':', expand = True)
    info_df = info_df[~info_df['Id'].duplicated(keep = False)]
    period1_result_dict = info_df.set_index('Id')[['Time1Res1', 'Time1Res2', 'Result1', 'Result2']].to_dict(orient = 'index')
    #data_df.loc[:, ['Time1Res1', 'Time1Res2', 'Result1', 'Result2']] =  \
    return [[period1_result_dict[id]['Time1Res1'], period1_result_dict[id]['Time1Res1'], 
    period1_result_dict[id]['Result1'], period1_result_dict[id]['Result2']] 
    if id in period1_result_dict else [None, None, None, None] for id in tqdm(data_df['Id'].values, total = len(data_df))]
    

In [8]:
# Добавляем прематчевые линии
def add_match_lines (data_df:pd.DataFrame, cols = ['P1', 'PX', 'P2', 'PR'], prem_path = './prem.csv'):
    prem_df = pd.read_csv(prem_path, sep = ';')
    prem_df = prem_df[~prem_df['Id'].duplicated(keep = False)]
    prem_dict = prem_df.set_index('Id')[cols].to_dict(orient = 'index')
    #data_df.loc[:,['P1', 'PX', 'P2', 'PR']] =  \
    return [[prem_dict[id]['P1'], prem_dict[id]['PX'], prem_dict[id]['P2'], prem_dict[id]['PR']] 
    if id in prem_dict else [None, None, None, None] for id in tqdm(data_df['Id'].values, total = len(data_df))]

In [12]:
def transform_dataset(data_df:pd.DataFrame, remain = []):

    # трансформируем минуты
    data_df['min_norm'] = data_df['Minute'].astype(np.float32) / 50
    print('1. Минуты посчитаны...')
    # трансформируем голы
    data_df['Score1_norm'] = data_df['Score1'].fillna(method = 'ffill').astype(np.float32) / 4
    data_df.loc[data_df['Score1'] > 3, ['Score1_norm']] = 1.0
    data_df['Score2_norm'] = data_df['Score2'].fillna(method = 'ffill').astype(np.float32) / 4
    data_df.loc[data_df['Score2'] > 3, ['Score2_norm']] = 1.0

    data_df['Score_diff'] = data_df['Score1'].astype(np.int16) - data_df['Score2'].astype(np.int16)
    data_df.loc[data_df['Score_diff'] < -4, ['Score_diff']] = -4
    data_df.loc[data_df['Score_diff'] > 4, ['Score_diff']] = 4
    data_df[[f'Score_cat_{n}' for n in range(1, 10)]] = pd.get_dummies(data_df['Score_diff']).values
    data_df['Score_diff'] = data_df['Score_diff'].astype(np.float32) / np.float32(4.0)
    if 'Score1' not in remain:
        data_df = data_df.drop(['Score1', 'Score2'], axis = 1)
    print('2. Голы посчитаны...')
    #трансформируем атаки
    data_df['A1_scaled'] = data_df['A1'].astype(np.float32) / 75
    data_df.loc[data_df['A1'] >= 60, ['A1_scaled']] = (60 + (data_df['A1'] - 60) / 4) / 75
    data_df['A2_scaled'] = data_df['A2'].astype(np.float32) / 75
    data_df.loc[data_df['A2'] >= 60, ['A2_scaled']] = (60 + (data_df['A2'] - 60) / 4) / 75
    # атаки в минуту
    data_df['A1perMIN'] = data_df['A1'].astype(np.float32) / data_df['Minute'].astype(np.float32)
    data_df.loc[data_df['A1perMIN'] > 4, ['A1perMIN']] = np.float32(4.0)
    data_df['A2perMIN'] = data_df['A2'].astype(np.float32) / data_df['Minute'].astype(np.float32)
    data_df.loc[data_df['A2perMIN'] > 4, ['A2perMIN']] = np.float32(4.0)
    # динамика атак
    data_df['A1relativ'] = data_df['A1'].astype(np.float32) - data_df['A1'].shift(5).astype(np.float32)
    data_df.loc[data_df['A1relativ'] < 0, ['A1relativ']] = np.float32(0.0)
    data_df['A1relativ'] = data_df['A1relativ'].fillna(0)
    data_df.loc[data_df['A1relativ'] > 15, ['A1relativ']] = np.float32(15.)
    data_df['A2relativ'] = data_df['A2'].astype(np.float32) - data_df['A2'].shift(5).astype(np.float32)
    data_df.loc[data_df['A2relativ'] < 0, ['A2relativ']] =  np.float32(0.0)
    data_df['A2relativ'] = data_df['A2relativ'].fillna(0)
    data_df.loc[data_df['A2relativ'] > 15, ['A2relativ']] = np.float32(15.)
    if 'A1' not in remain:
        data_df = data_df.drop(['A1', 'A2'], axis = 1)
    print('3. Атаки посчитаны...')
    # трансформируем опасные атаки
    data_df['DA1_scaled'] = data_df['DA1'].astype(np.float32) / 50
    data_df.loc[data_df['DA1'] >= 40, ['DA1_scaled']] = (80 + (data_df['DA1'] - 40) / 3) / 100
    data_df['DA2_scaled'] = data_df['DA2'].astype(np.float32) / 50
    data_df.loc[data_df['DA2'] >= 40, ['DA2_scaled']] = (80 + (data_df['DA2'] - 40) / 3) / 100
    # опасные атаки в минуту    
    data_df['DA1perMIN'] = data_df['DA1'].astype(np.float32) / data_df['Minute'].astype(np.float32)
    data_df.loc[data_df['DA1perMIN'] > 3, ['DA1perMIN']] = np.float32(3.0)
    data_df['DA2perMIN'] = data_df['DA2'].astype(np.float32) / data_df['Minute'].astype(np.float32)
    data_df.loc[data_df['DA2perMIN'] > 3, ['DA2perMIN']] = np.float32(3.0)
    # динамика опасных атак
    data_df['DA1relativ'] = data_df['DA1'].astype(np.float32) - data_df['DA1'].shift(5).astype(np.float32)
    data_df.loc[data_df['DA1relativ'] < 0, ['DA1relativ']] = np.float32(0.0)
    data_df['DA1relativ'] = data_df['DA1relativ'].fillna(0)
    data_df.loc[data_df['DA1relativ'] > 10, ['DA1relativ']] = np.float32(10.)
    data_df['DA2relativ'] = data_df['DA2'].astype(np.float32) - data_df['DA2'].shift(5).astype(np.float32)
    data_df.loc[data_df['DA2relativ'] < 0, ['DA2relativ']] = np.float32(0.0)
    data_df['DA2relativ'] = data_df['DA2relativ'].fillna(0)
    data_df.loc[data_df['DA2relativ'] > 10, ['DA2relativ']] = np.float32(10.)
    if 'DA1' not in remain:
        data_df = data_df.drop(['DA1', 'DA2'], axis = 1)
    if 'Minute' not in remain:
        data_df = data_df.drop(['Minute'], axis = 1)
    print('4. Опасные атаки посчитаны...')
    # трансформируем владение мячом
    data_df['Pos1_cleaned'] = data_df['Pos1'].fillna(method = 'ffill').astype(np.float32) /  np.float32(100.0)
    data_df.loc[data_df['Pos1_cleaned'] < 0.2, ['Pos1_cleaned']] = np.float32(0.2)
    data_df.loc[data_df['Pos1_cleaned'] > 0.8, ['Pos1_cleaned']] = np.float32(0.8)
    data_df['Pos2_cleaned'] = data_df['Pos2'].fillna(method = 'ffill').astype(np.float32) /  np.float32(100.0)
    data_df.loc[data_df['Pos2_cleaned'] < 0.2, ['Pos2_cleaned']] = np.float32(0.2)
    data_df.loc[data_df['Pos2_cleaned'] > 0.8, ['Pos2_cleaned']] = np.float32(0.8)
    if 'Pos1' not in remain:
        data_df = data_df.drop(['Pos1', 'Pos2'], axis = 1)
    print('5. Владение мячом посчитпно...')
    # трансформируем удары
    data_df['Off1_norm'] = data_df['Off1'].fillna(method = 'ffill').astype(np.float32) / np.float32(10.0)
    data_df.loc[data_df['Off1_norm'] > 1.0, ['Off1_norm']] = np.float32(1.0)
    data_df['Off2_norm'] = data_df['Off2'].fillna(method = 'ffill').astype(np.float32) / np.float32(10.0)
    data_df.loc[data_df['Off2_norm'] > 1.0, ['Off2_norm']] = np.float32(1.0)
    if 'Off1' not in remain:
        data_df = data_df.drop(['Off1', 'Off2'], axis = 1)
    print('6. Удары посчитаны...')
    # трансформируем удары в створ
    data_df['On1_norm'] = data_df['On1'].fillna(method = 'ffill').astype(np.float32) / np.float32(5.0)
    data_df.loc[data_df['On1_norm'] > 1.0, ['On1_norm']] = np.float32(1.0)
    data_df['On2_norm'] = data_df['On2'].fillna(method = 'ffill').astype(np.float32) / np.float32(5.0)
    data_df.loc[data_df['On2_norm'] > 1.0, ['On2_norm']] = np.float32(1.0)
    if 'On1' not in remain:
        data_df = data_df.drop(['On1', 'On2'], axis = 1)    
    print('7. Удары в створ посчитаны...')
    # трансформируем желтые карточки
    data_df['YC1_transformed'] = data_df['YC1'].fillna(0).astype(np.float32) / np.float32(2.0)
    data_df.loc[data_df['YC1_transformed'] > 1.0, ['YC1_transformed']] = np.float32(1.0)
    data_df['YC2_transformed'] = data_df['YC2'].fillna(0).astype(np.float32) / np.float32(2.0)
    data_df.loc[data_df['YC2_transformed'] > 1.0, ['YC2_transformed']] = np.float32(1.0)
    if 'YC1' not in remain:
        data_df = data_df.drop(['YC1', 'YC2'], axis = 1)
    print('8. Жёлтые карточки посчитаны...')
    # трансформируем красные карточки
    data_df['RC1_transformed'] = data_df['RC1'].fillna(0).astype(np.int8)
    data_df.loc[data_df['RC1_transformed'] > 1, ['RC1_transformed']] = np.int8(1)
    data_df['RC2_transformed'] = data_df['RC2'].fillna(0).astype(np.int8)
    data_df.loc[data_df['RC2_transformed'] > 1, ['RC2_transformed']] = np.int8(1)
    if 'RC1' not in remain:
        data_df = data_df.drop(['RC1', 'RC2'], axis = 1)
    print('9. Красные карточки посчитаны...')
    # трансформируем замены
    data_df['Sub1_transformed'] = data_df['Sub1'].fillna(0).astype(np.int8)
    data_df.loc[data_df['Sub1_transformed'] > 1, ['Sub1_transformed']] = np.int8(1)
    data_df['Sub2_transformed'] = data_df['Sub2'].fillna(0).astype(np.int8)
    data_df.loc[data_df['Sub2_transformed'] > 1, ['Sub2_transformed']] = np.int8(1)
    if 'Sub1' not in remain:
        data_df = data_df.drop(['Sub1', 'Sub2'], axis = 1)
    print('10. Замены посчитаны...')
    # трансформируем угловык
    data_df['Cor1_transformed'] = data_df['Cor1'].fillna(0).astype(np.float32) / np.float32(6.0)
    data_df.loc[data_df['Cor1_transformed'] > 1.0, ['Cor1_transformed']] = np.float32(1.0)
    data_df['Cor2_transformed'] = data_df['Cor2'].fillna(0).astype(np.float32) / np.float32(6.0)
    data_df.loc[data_df['Cor2_transformed'] > 1.0, ['Cor2_transformed']] = np.float32(1.0)
    if 'Cor1' not in remain:
        data_df = data_df.drop(['Cor1', 'Cor2'], axis = 1)
    print('11. Угловые посчитаны...')
    # трансформируем линию
    data_df['P1_transformed'] = np.log(data_df['P1'], dtype = np.float32) / 2
    data_df['P2_transformed'] = np.log(data_df['P2'], dtype = np.float32) / 2
    if 'P1' not in remain:
        data_df = data_df.drop(['P1', 'P2'], axis = 1)
    print('12. Линии посчитаны...')
    if 'Pen1' not in remain:
        data_df = data_df.drop(['Pen1', 'Pen2'], axis = 1)
    if 'Active' not in remain:
        data_df = data_df.drop(['Active'], axis = 1)
    return data_df

#### Unpack

In [9]:
for file in glob('./*'):    
    if '.rar' in file:
        print(file)
        bash = f'unrar e {file} && rm {file}'
        run_bash(bash)

./prem.rar
./info.rar
./data.rar


### Preprocessing & Feature engineering

#### Test set

In [9]:
data_df = pd.read_csv(
    './data_test.csv.gz', 
    dtype = {
        'Id':np.int32, 'Minute':np.int16, 'Active':np.int8,
        'Score1':np.int16, 'Score2':np.int16, 'A1':np.int16, 'A2':np.int16,
        'DA1':np.int16, 'DA2':np.int16,'Pos1':np.float32, 'Pos2':np.float32,
        'Off1':np.int16, 'Off2':np.int16,'On1':np.int16, 'On2':np.int16,
        'YC1':np.float32, 'YC2':np.float32, 'RC1':np.float32, 'RC2':np.float32,
        'Sub1':np.float32, 'Sub2':np.float32, 'Pen1':np.float32, 'Pen2':np.float32,
        'Cor1':np.float32, 'Cor2':np.float32,
        }
    )

In [10]:
## Добавляем финальный результатк тесту
data_df.loc[:, ['Time1Res1', 'Time1Res2', 'Result1', 'Result2']] = add_match_results (data_df)
isdigit = [str(a) for a in range(20)]
data_df.loc[~data_df['Time1Res1'].isin(isdigit), ['Time1Res1']] = -1
data_df.loc[~data_df['Time1Res2'].isin(isdigit), ['Time1Res2']] = -1
data_df[['Time1Res1', 'Time1Res2']] = data_df[['Time1Res1', 'Time1Res2']].astype(np.int16)
data_df[['Result1', 'Result2']] = data_df[['Result1', 'Result2']].astype(np.int16)
data_df = data_df.drop(['Time1Res1', 'Time1Res2'], axis = 1)

100%|██████████| 2598283/2598283 [00:29<00:00, 86647.81it/s] 


In [11]:
## Добавляем линии к тесту
data_df.loc[:,['P1', 'PX', 'P2', 'PR']] = add_match_lines (data_df)
data_df[['P1', 'PX', 'P2', 'PR']] = data_df[['P1', 'PX', 'P2', 'PR']].astype(np.float32)
data_df = data_df.drop(['PX', 'PR'], axis = 1)

100%|██████████| 2598283/2598283 [00:21<00:00, 118852.31it/s]


In [13]:
data_df = transform_dataset(data_df) 
    #remain = 
    #    ['Id', 'Minute', 'Active', 'Score1', 'Score2', 'A1', 'A2', 'DA1', 'DA2',
    #   'Pos1', 'Pos2', 'Off1', 'Off2', 'On1', 'On2', 'YC1', 'YC2', 'RC1',
    #   'RC2', 'Sub1', 'Sub2', 'Pen1', 'Pen2', 'Cor1', 'Cor2', 'P1', 'P2']
    #   )

1. Минуты посчитаны...
2. Голы посчитаны...
3. Атаки посчитаны...
4. Опасные атаки посчитаны...
5. Владение мячом посчитпно...
6. Удары посчитаны...
7. Удары в створ посчитаны...
8. Жёлтые карточки посчитаны...
9. Красные карточки посчитаны...
10. Замены посчитаны...
11. Угловые посчитаны...
12. Линии посчитаны...


In [14]:
test_df = data_df.copy(deep = True)

#### Train set

In [15]:
data_df = pd.read_csv(
    './data_train.csv.gz', 
    dtype = {
        'Id':np.int32, 'Minute':np.int16, 'Active':np.int8,
        'Score1':np.int16, 'Score2':np.int16, 'A1':np.int16, 'A2':np.int16,
        'DA1':np.int16, 'DA2':np.int16,'Pos1':np.float32, 'Pos2':np.float32,
        'Off1':np.int16, 'Off2':np.int16,'On1':np.int16, 'On2':np.int16,
        'YC1':np.float32, 'YC2':np.float32, 'RC1':np.float32, 'RC2':np.float32,
        'Sub1':np.float32, 'Sub2':np.float32, 'Pen1':np.float32, 'Pen2':np.float32,
        'Cor1':np.float32, 'Cor2':np.float32,
        }
    )

In [16]:
data_df.loc[:, ['Time1Res1', 'Time1Res2', 'Result1', 'Result2']] = add_match_results (data_df)
isdigit = [str(a) for a in range(20)]
data_df.loc[~data_df['Time1Res1'].isin(isdigit), ['Time1Res1']] = -1
data_df.loc[~data_df['Time1Res2'].isin(isdigit), ['Time1Res2']] = -1
data_df[['Time1Res1', 'Time1Res2']] = data_df[['Time1Res1', 'Time1Res2']].astype(np.int16)
data_df[['Result1', 'Result2']] = data_df[['Result1', 'Result2']].astype(np.int16)
data_df = data_df.drop(['Time1Res1', 'Time1Res2'], axis = 1)

100%|██████████| 10392040/10392040 [01:32<00:00, 112636.89it/s]


In [17]:
data_df.loc[:,['P1', 'PX', 'P2', 'PR']] = add_match_lines (data_df)
data_df[['P1', 'PX', 'P2', 'PR']] = data_df[['P1', 'PX', 'P2', 'PR']].astype(np.float32)
data_df = data_df.drop(['PX', 'PR'], axis = 1)

100%|██████████| 10392040/10392040 [01:25<00:00, 121072.90it/s]


In [18]:
if 1898989 in data_df['Id']:
    data_df = data_df[data_df['Id'] != 1898989]

In [19]:
data_df = transform_dataset(data_df)
    #data_df, 
    #remain = 
    #    ['Id', 'Minute', 'Active', 'Score1', 'Score2', 'A1', 'A2', 'DA1', 'DA2',
    #   'Pos1', 'Pos2', 'Off1', 'Off2', 'On1', 'On2', 'YC1', 'YC2', 'RC1',
    #   'RC2', 'Sub1', 'Sub2', 'Pen1', 'Pen2', 'Cor1', 'Cor2', 'P1', 'P2']
    #   )

1. Минуты посчитаны...
2. Голы посчитаны...
3. Атаки посчитаны...
4. Опасные атаки посчитаны...
5. Владение мячом посчитпно...
6. Удары посчитаны...
7. Удары в створ посчитаны...
8. Жёлтые карточки посчитаны...
9. Красные карточки посчитаны...
10. Замены посчитаны...
11. Угловые посчитаны...
12. Линии посчитаны...


In [20]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10391998 entries, 0 to 10392039
Data columns (total 44 columns):
 #   Column            Dtype  
---  ------            -----  
 0   Id                int32  
 1   Result1           int16  
 2   Result2           int16  
 3   min_norm          float32
 4   Score1_norm       float32
 5   Score2_norm       float32
 6   Score_diff        float32
 7   Score_cat_1       uint8  
 8   Score_cat_2       uint8  
 9   Score_cat_3       uint8  
 10  Score_cat_4       uint8  
 11  Score_cat_5       uint8  
 12  Score_cat_6       uint8  
 13  Score_cat_7       uint8  
 14  Score_cat_8       uint8  
 15  Score_cat_9       uint8  
 16  A1_scaled         float32
 17  A2_scaled         float32
 18  A1perMIN          float32
 19  A2perMIN          float32
 20  A1relativ         float32
 21  A2relativ         float32
 22  DA1_scaled        float32
 23  DA2_scaled        float32
 24  DA1perMIN         float32
 25  DA2perMIN         float32
 26  DA1relativ  

#### простое Версионирование под бусты

##### Home vs Away + Draw

In [21]:
id_train = data_df['Id'].values
id_test = test_df['Id'].values

In [22]:
y_train = np.array((data_df['Result1'] > data_df['Result2']) *1, dtype = np.int8)
y_test = np.array((test_df['Result1'] > test_df['Result2']) *1, dtype = np.int8)

In [24]:
X_train = data_df.drop(['Id', 'Result1', 'Result2'], axis = 1).values
X_test = test_df.drop(['Id', 'Result1', 'Result2'], axis = 1).values

In [26]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((10391998, 41), (2598283, 41), (10391998,), (2598283,))

In [25]:
np.savez_compressed('./dataset', 
                    id_train = id_train,
                    id_test = id_test,
                    y_train = y_train,
                    y_test = y_test,
                    X_train = X_train,
                    X_test = X_test
                )

In [None]:
#data_df.to_csv('data_df.csv.gz', index = False, compression={'method': 'gzip'})

In [27]:
params = {}
params['description'] = 'Первый датасет под бусты простые фичи'

In [28]:
data_version = 'football_live_npz_230105/'
project = neptune.init_project(
    name="scomesse/football", 
    api_token = api_key
    )
project[data_version + 'dataset'].upload('./dataset.npz')
project[data_version + 'params'] = params
project.stop()

https://app.neptune.ai/scomesse/football/
Remember to stop your project once you’ve finished logging your metadata (https://docs.neptune.ai/api/project#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 2 operations to synchronize with Neptune. Do not kill this process.
All 2 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/scomesse/football/metadata


### Catboost

#### Installation

In [29]:
!pip install catboost >> None

#### Imports

In [30]:
from catboost import CatBoost
from catboost import CatBoostClassifier, Pool
from catboost.utils import eval_metric
np.random.seed(147)

#### Create Tensors

In [31]:
train_data = Pool(X_train, y_train)
test_data = Pool(X_test, y_test)

In [32]:
booster = CatBoostClassifier(iterations=10)

In [33]:
booster.fit(train_data, eval_set=test_data)

Learning rate set to 0.5
0:	learn: 0.5837773	test: 0.5838870	best: 0.5838870 (0)	total: 2.65s	remaining: 23.8s
1:	learn: 0.5604733	test: 0.5590931	best: 0.5590931 (1)	total: 4.67s	remaining: 18.7s
2:	learn: 0.5508504	test: 0.5508177	best: 0.5508177 (2)	total: 6.51s	remaining: 15.2s
3:	learn: 0.5480869	test: 0.5477653	best: 0.5477653 (3)	total: 8.48s	remaining: 12.7s
4:	learn: 0.5461828	test: 0.5462240	best: 0.5462240 (4)	total: 10.4s	remaining: 10.4s
5:	learn: 0.5454131	test: 0.5455072	best: 0.5455072 (5)	total: 12.3s	remaining: 8.2s
6:	learn: 0.5445616	test: 0.5449140	best: 0.5449140 (6)	total: 14s	remaining: 5.99s
7:	learn: 0.5442970	test: 0.5445947	best: 0.5445947 (7)	total: 15.9s	remaining: 3.97s
8:	learn: 0.5438550	test: 0.5442128	best: 0.5442128 (8)	total: 17.8s	remaining: 1.98s
9:	learn: 0.5435059	test: 0.5438959	best: 0.5438959 (9)	total: 19.6s	remaining: 0us

bestTest = 0.5438958907
bestIteration = 9



<catboost.core.CatBoostClassifier at 0x7f974f2737f0>

In [36]:
print("Test  Accuracy : %.4f"%booster.score(train_data))
print("Train Accuracy : %.4f"%booster.score(test_data))

Test  Accuracy : 0.7160
Train Accuracy : 0.7157
