<a href="https://colab.research.google.com/github/cappelchi/calcio_notebooks/blob/main/draft/football_live_experimental_heft_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Project config

In [1]:
try:
    import neptune.new as neptune
except:
    !pip install neptune-client >> None
    import neptune.new as neptune
#from neptune.new.integrations.tensorflow_keras import NeptuneCallback
def get_credential(frmwork = 'neptune_team'):
    with open('credential.txt', 'r') as container:
        for line in container:
            if frmwork in line:
                login, psw = line.split(' ')[1], line.split(' ')[2].split('\n')[0]
                return login, psw

In [2]:
#@title Set API key for neptune.ai
set_api = True #@param {type:"boolean"}
if set_api:
    username, api_key = get_credential()

### Installations

### Downloads

In [3]:
data_version = 'football_live_221229/'
project = neptune.init_project(
    name="scomesse/football", 
    api_token = api_key
    )
project[data_version + 'data.rar'].download('./data.rar')
project[data_version + 'info.rar'].download('./info.rar')
project[data_version + 'prem.rar'].download('./prem.rar')
project[data_version + 'train_id.csv'].download('./train_id.csv')
project[data_version + 'test_id.csv'].download('./test_id.csv')
project[data_version + 'holdout_id.csv'].download('./holdout_id.csv')
project[data_version + 'data_train.csv.gz'].download('./data_train.csv.gz')
project[data_version + 'data_test.csv.gz'].download('./data_test.csv.gz')
project[data_version + 'data_hold.csv.gz'].download('./data_hold.csv.gz')

#del project[data_version]
project.stop()

https://app.neptune.ai/scomesse/football/
Remember to stop your project once you’ve finished logging your metadata (https://docs.neptune.ai/api/project#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
Shutting down background jobs, please wait a moment...
Done!
All 0 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/scomesse/football/metadata


### Imports

In [4]:
import pandas as pd
import numpy as np
pd.options.display.max_columns = 50
pd.options.display.max_rows = 100
print(pd.__version__)
print(np.__version__)

#import dask.dataframe as dd
import subprocess
import sys
from glob import glob
from tqdm import tqdm

1.3.5
1.21.6


In [5]:
import plotly.express as px

In [6]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.preprocessing import PowerTransformer, QuantileTransformer

### Code

#####Functions

In [7]:
def run_bash(bashCommand:str, nameCommand = ''):
        process = subprocess.Popen([bashCommand], 
                           shell=True)
        _, error = process.communicate()
        if error:
            print(f'{nameCommand} error:\n', error)

In [8]:
# Присоединяем итоговый результат и по первому тайму
def add_match_results (data_df:pd.DataFrame, cols = ['Id', 'Result1', 'Result2', 'Periods'], info_path = './info.csv'):
    info_df = pd.read_csv(info_path, sep = ';', usecols = cols)
    info_df[['Period1', 'Period2', 'Period3', 'Period4']] = info_df['Periods'].str.split(',', expand = True)
    info_df[['Time1Res1', 'Time1Res2']] = info_df['Period1'].str.split(':', expand = True)
    info_df = info_df[~info_df['Id'].duplicated(keep = False)]
    period1_result_dict = info_df.set_index('Id')[['Time1Res1', 'Time1Res2', 'Result1', 'Result2']].to_dict(orient = 'index')
    #data_df.loc[:, ['Time1Res1', 'Time1Res2', 'Result1', 'Result2']] =  \
    return [[period1_result_dict[id]['Time1Res1'], period1_result_dict[id]['Time1Res1'], 
    period1_result_dict[id]['Result1'], period1_result_dict[id]['Result2']] 
    if id in period1_result_dict else [None, None, None, None] for id in tqdm(data_df['Id'].values, total = len(data_df))]
    

In [9]:
# Добавляем прематчевые линии
def add_match_lines (data_df:pd.DataFrame, cols = ['P1', 'PX', 'P2', 'PR'], prem_path = './prem.csv'):
    prem_df = pd.read_csv(prem_path, sep = ';')
    prem_df = prem_df[~prem_df['Id'].duplicated(keep = False)]
    prem_dict = prem_df.set_index('Id')[cols].to_dict(orient = 'index')
    #data_df.loc[:,['P1', 'PX', 'P2', 'PR']] =  \
    return [[prem_dict[id]['P1'], prem_dict[id]['PX'], prem_dict[id]['P2'], prem_dict[id]['PR']] 
    if id in prem_dict else [None, None, None, None] for id in tqdm(data_df['Id'].values, total = len(data_df))]

In [10]:
def transform_dataset(data_df:pd.DataFrame, remain = []):

    # трансформируем минуты
    data_df['min_norm'] = data_df['Minute'].astype(np.float32) / 50
    print('1. Минуты посчитаны...')
    # трансформируем голы
    data_df['Score1_norm'] = data_df['Score1'].fillna(method = 'ffill').astype(np.float32) / 4
    data_df.loc[data_df['Score1'] > 3, ['Score1_norm']] = 1.0
    data_df['Score2_norm'] = data_df['Score2'].fillna(method = 'ffill').astype(np.float32) / 4
    data_df.loc[data_df['Score2'] > 3, ['Score2_norm']] = 1.0

    data_df['Score_diff'] = data_df['Score1'].astype(np.int16) - data_df['Score2'].astype(np.int16)
    data_df.loc[data_df['Score_diff'] < -4, ['Score_diff']] = -4
    data_df.loc[data_df['Score_diff'] > 4, ['Score_diff']] = 4
    data_df[[f'Score_cat_{n}' for n in range(1, 10)]] = pd.get_dummies(data_df['Score_diff']).values
    data_df['Score_diff'] = data_df['Score_diff'].astype(np.float32) / np.float32(4.0)
    if 'Score1' not in remain:
        data_df = data_df.drop(['Score1', 'Score2'], axis = 1)
    print('2. Голы посчитаны...')
    #трансформируем атаки
    data_df['A1_scaled'] = data_df['A1'].astype(np.float32) / 75
    data_df.loc[data_df['A1'] >= 60, ['A1_scaled']] = (60 + (data_df['A1'] - 60) / 4) / 75
    data_df['A2_scaled'] = data_df['A2'].astype(np.float32) / 75
    data_df.loc[data_df['A2'] >= 60, ['A2_scaled']] = (60 + (data_df['A2'] - 60) / 4) / 75
    # атаки в минуту
    data_df['A1perMIN'] = data_df['A1'].astype(np.float32) / data_df['Minute'].astype(np.float32)
    data_df.loc[data_df['A1perMIN'] > 4, ['A1perMIN']] = np.float32(4.0)
    data_df['A2perMIN'] = data_df['A2'].astype(np.float32) / data_df['Minute'].astype(np.float32)
    data_df.loc[data_df['A2perMIN'] > 4, ['A2perMIN']] = np.float32(4.0)
    # динамика атак
    data_df['A1relativ'] = data_df['A1'].astype(np.float32) - data_df['A1'].shift(5).astype(np.float32)
    data_df.loc[data_df['A1relativ'] < 0, ['A1relativ']] = np.float32(0.0)
    data_df['A1relativ'] = data_df['A1relativ'].fillna(0)
    data_df.loc[data_df['A1relativ'] > 15, ['A1relativ']] = np.float32(15.)
    data_df['A2relativ'] = data_df['A2'].astype(np.float32) - data_df['A2'].shift(5).astype(np.float32)
    data_df.loc[data_df['A2relativ'] < 0, ['A2relativ']] =  np.float32(0.0)
    data_df['A2relativ'] = data_df['A2relativ'].fillna(0)
    data_df.loc[data_df['A2relativ'] > 15, ['A2relativ']] = np.float32(15.)
    if 'A1' not in remain:
        data_df = data_df.drop(['A1', 'A2'], axis = 1)
    print('3. Атаки посчитаны...')
    # трансформируем опасные атаки
    data_df['DA1_scaled'] = data_df['DA1'].astype(np.float32) / 50
    data_df.loc[data_df['DA1'] >= 40, ['DA1_scaled']] = (80 + (data_df['DA1'] - 40) / 3) / 100
    data_df['DA2_scaled'] = data_df['DA2'].astype(np.float32) / 50
    data_df.loc[data_df['DA2'] >= 40, ['DA2_scaled']] = (80 + (data_df['DA2'] - 40) / 3) / 100
    # опасные атаки в минуту    
    data_df['DA1perMIN'] = data_df['DA1'].astype(np.float32) / data_df['Minute'].astype(np.float32)
    data_df.loc[data_df['DA1perMIN'] > 3, ['DA1perMIN']] = np.float32(3.0)
    data_df['DA2perMIN'] = data_df['DA2'].astype(np.float32) / data_df['Minute'].astype(np.float32)
    data_df.loc[data_df['DA2perMIN'] > 3, ['DA2perMIN']] = np.float32(3.0)
    # динамика опасных атак
    data_df['DA1relativ'] = data_df['DA1'].astype(np.float32) - data_df['DA1'].shift(5).astype(np.float32)
    data_df.loc[data_df['DA1relativ'] < 0, ['DA1relativ']] = np.float32(0.0)
    data_df['DA1relativ'] = data_df['DA1relativ'].fillna(0)
    data_df.loc[data_df['DA1relativ'] > 10, ['DA1relativ']] = np.float32(10.)
    data_df['DA2relativ'] = data_df['DA2'].astype(np.float32) - data_df['DA2'].shift(5).astype(np.float32)
    data_df.loc[data_df['DA2relativ'] < 0, ['DA2relativ']] = np.float32(0.0)
    data_df['DA2relativ'] = data_df['DA2relativ'].fillna(0)
    data_df.loc[data_df['DA2relativ'] > 10, ['DA2relativ']] = np.float32(10.)
    if 'DA1' not in remain:
        data_df = data_df.drop(['DA1', 'DA2'], axis = 1)
    if 'Minute' not in remain:
        data_df = data_df.drop(['Minute'], axis = 1)
    print('4. Опасные атаки посчитаны...')
    # трансформируем владение мячом
    data_df['Pos1_cleaned'] = data_df['Pos1'].fillna(method = 'ffill').astype(np.float32) /  np.float32(100.0)
    data_df.loc[data_df['Pos1_cleaned'] < 0.2, ['Pos1_cleaned']] = np.float32(0.2)
    data_df.loc[data_df['Pos1_cleaned'] > 0.8, ['Pos1_cleaned']] = np.float32(0.8)
    data_df['Pos2_cleaned'] = data_df['Pos2'].fillna(method = 'ffill').astype(np.float32) /  np.float32(100.0)
    data_df.loc[data_df['Pos2_cleaned'] < 0.2, ['Pos2_cleaned']] = np.float32(0.2)
    data_df.loc[data_df['Pos2_cleaned'] > 0.8, ['Pos2_cleaned']] = np.float32(0.8)
    if 'Pos1' not in remain:
        data_df = data_df.drop(['Pos1', 'Pos2'], axis = 1)
    print('5. Владение мячом посчитпно...')
    # трансформируем удары
    data_df['Off1_norm'] = data_df['Off1'].fillna(method = 'ffill').astype(np.float32) / np.float32(10.0)
    data_df.loc[data_df['Off1_norm'] > 1.0, ['Off1_norm']] = np.float32(1.0)
    data_df['Off2_norm'] = data_df['Off2'].fillna(method = 'ffill').astype(np.float32) / np.float32(10.0)
    data_df.loc[data_df['Off2_norm'] > 1.0, ['Off2_norm']] = np.float32(1.0)
    if 'Off1' not in remain:
        data_df = data_df.drop(['Off1', 'Off2'], axis = 1)
    print('6. Удары посчитаны...')
    # трансформируем удары в створ
    data_df['On1_norm'] = data_df['On1'].fillna(method = 'ffill').astype(np.float32) / np.float32(5.0)
    data_df.loc[data_df['On1_norm'] > 1.0, ['On1_norm']] = np.float32(1.0)
    data_df['On2_norm'] = data_df['On2'].fillna(method = 'ffill').astype(np.float32) / np.float32(5.0)
    data_df.loc[data_df['On2_norm'] > 1.0, ['On2_norm']] = np.float32(1.0)
    if 'On1' not in remain:
        data_df = data_df.drop(['On1', 'On2'], axis = 1)    
    print('7. Удары в створ посчитаны...')
    # трансформируем желтые карточки
    data_df['YC1_transformed'] = data_df['YC1'].fillna(0).astype(np.float32) / np.float32(2.0)
    data_df.loc[data_df['YC1_transformed'] > 1.0, ['YC1_transformed']] = np.float32(1.0)
    data_df['YC2_transformed'] = data_df['YC2'].fillna(0).astype(np.float32) / np.float32(2.0)
    data_df.loc[data_df['YC2_transformed'] > 1.0, ['YC2_transformed']] = np.float32(1.0)
    if 'YC1' not in remain:
        data_df = data_df.drop(['YC1', 'YC2'], axis = 1)
    print('8. Жёлтые карточки посчитаны...')
    # трансформируем красные карточки
    data_df['RC1_transformed'] = data_df['RC1'].fillna(0).astype(np.int8)
    data_df.loc[data_df['RC1_transformed'] > 1, ['RC1_transformed']] = np.int8(1)
    data_df['RC2_transformed'] = data_df['RC2'].fillna(0).astype(np.int8)
    data_df.loc[data_df['RC2_transformed'] > 1, ['RC2_transformed']] = np.int8(1)
    if 'RC1' not in remain:
        data_df = data_df.drop(['RC1', 'RC2'], axis = 1)
    print('9. Красные карточки посчитаны...')
    # трансформируем замены
    data_df['Sub1_transformed'] = data_df['Sub1'].fillna(0).astype(np.int8)
    data_df.loc[data_df['Sub1_transformed'] > 1, ['Sub1_transformed']] = np.int8(1)
    data_df['Sub2_transformed'] = data_df['Sub2'].fillna(0).astype(np.int8)
    data_df.loc[data_df['Sub2_transformed'] > 1, ['Sub2_transformed']] = np.int8(1)
    if 'Sub1' not in remain:
        data_df = data_df.drop(['Sub1', 'Sub2'], axis = 1)
    print('10. Замены посчитаны...')
    # трансформируем угловык
    data_df['Cor1_transformed'] = data_df['Cor1'].fillna(0).astype(np.float32) / np.float32(6.0)
    data_df.loc[data_df['Cor1_transformed'] > 1.0, ['Cor1_transformed']] = np.float32(1.0)
    data_df['Cor2_transformed'] = data_df['Cor2'].fillna(0).astype(np.float32) / np.float32(6.0)
    data_df.loc[data_df['Cor2_transformed'] > 1.0, ['Cor2_transformed']] = np.float32(1.0)
    if 'Cor1' not in remain:
        data_df = data_df.drop(['Cor1', 'Cor2'], axis = 1)
    print('11. Угловые посчитаны...')
    # трансформируем линию
    data_df['P1_transformed'] = np.log(data_df['P1'], dtype = np.float32) / 2
    data_df['P2_transformed'] = np.log(data_df['P2'], dtype = np.float32) / 2
    if 'P1' not in remain:
        data_df = data_df.drop(['P1', 'P2'], axis = 1)
    print('12. Линии посчитаны...')
    if 'Pen1' not in remain:
        data_df = data_df.drop(['Pen1', 'Pen2'], axis = 1)
    if 'Active' not in remain:
        data_df = data_df.drop(['Active'], axis = 1)
    return data_df

#### Unpack

In [11]:
for file in glob('./*'):    
    if '.rar' in file:
        print(file)
        bash = f'unrar e {file} && rm {file}'
        run_bash(bash)

./data.rar
./info.rar
./prem.rar


### Preprocessing & Feature engineering

#### Test set

In [12]:
data_df = pd.read_csv(
    './data_test.csv.gz', 
    dtype = {
        'Id':np.int32, 'Minute':np.int16, 'Active':np.int8,
        'Score1':np.int16, 'Score2':np.int16, 'A1':np.int16, 'A2':np.int16,
        'DA1':np.int16, 'DA2':np.int16,'Pos1':np.float32, 'Pos2':np.float32,
        'Off1':np.int16, 'Off2':np.int16,'On1':np.int16, 'On2':np.int16,
        'YC1':np.float32, 'YC2':np.float32, 'RC1':np.float32, 'RC2':np.float32,
        'Sub1':np.float32, 'Sub2':np.float32, 'Pen1':np.float32, 'Pen2':np.float32,
        'Cor1':np.float32, 'Cor2':np.float32,
        }
    )

In [13]:
## Добавляем финальный результатк тесту
data_df.loc[:, ['Time1Res1', 'Time1Res2', 'Result1', 'Result2']] = add_match_results (data_df)
isdigit = [str(a) for a in range(20)]
data_df.loc[~data_df['Time1Res1'].isin(isdigit), ['Time1Res1']] = -1
data_df.loc[~data_df['Time1Res2'].isin(isdigit), ['Time1Res2']] = -1
data_df[['Time1Res1', 'Time1Res2']] = data_df[['Time1Res1', 'Time1Res2']].astype(np.int16)
data_df[['Result1', 'Result2']] = data_df[['Result1', 'Result2']].astype(np.int16)
data_df = data_df.drop(['Time1Res1', 'Time1Res2'], axis = 1)

100%|██████████| 2598283/2598283 [00:27<00:00, 93644.65it/s] 


In [14]:
## Добавляем линии к тесту
data_df.loc[:,['P1', 'PX', 'P2', 'PR']] = add_match_lines (data_df)
data_df[['P1', 'PX', 'P2', 'PR']] = data_df[['P1', 'PX', 'P2', 'PR']].astype(np.float32)
data_df = data_df.drop(['PX', 'PR'], axis = 1)

100%|██████████| 2598283/2598283 [00:28<00:00, 89875.62it/s] 


In [15]:
data_df = transform_dataset(data_df, #) 
    remain = ['Score1', 'Score2'])
    #    ['Id', 'Minute', 'Active', 'Score1', 'Score2', 'A1', 'A2', 'DA1', 'DA2',
    #   'Pos1', 'Pos2', 'Off1', 'Off2', 'On1', 'On2', 'YC1', 'YC2', 'RC1',
    #   'RC2', 'Sub1', 'Sub2', 'Pen1', 'Pen2', 'Cor1', 'Cor2', 'P1', 'P2']
    #   )

1. Минуты посчитаны...
2. Голы посчитаны...
3. Атаки посчитаны...
4. Опасные атаки посчитаны...
5. Владение мячом посчитпно...
6. Удары посчитаны...
7. Удары в створ посчитаны...
8. Жёлтые карточки посчитаны...
9. Красные карточки посчитаны...
10. Замены посчитаны...
11. Угловые посчитаны...
12. Линии посчитаны...


In [16]:
test_df = data_df.copy(deep = True)

#### Train set

In [17]:
data_df = pd.read_csv(
    './data_train.csv.gz', 
    dtype = {
        'Id':np.int32, 'Minute':np.int16, 'Active':np.int8,
        'Score1':np.int16, 'Score2':np.int16, 'A1':np.int16, 'A2':np.int16,
        'DA1':np.int16, 'DA2':np.int16,'Pos1':np.float32, 'Pos2':np.float32,
        'Off1':np.int16, 'Off2':np.int16,'On1':np.int16, 'On2':np.int16,
        'YC1':np.float32, 'YC2':np.float32, 'RC1':np.float32, 'RC2':np.float32,
        'Sub1':np.float32, 'Sub2':np.float32, 'Pen1':np.float32, 'Pen2':np.float32,
        'Cor1':np.float32, 'Cor2':np.float32,
        }
    )

In [18]:
data_df.loc[:, ['Time1Res1', 'Time1Res2', 'Result1', 'Result2']] = add_match_results (data_df)
isdigit = [str(a) for a in range(20)]
data_df.loc[~data_df['Time1Res1'].isin(isdigit), ['Time1Res1']] = -1
data_df.loc[~data_df['Time1Res2'].isin(isdigit), ['Time1Res2']] = -1
data_df[['Time1Res1', 'Time1Res2']] = data_df[['Time1Res1', 'Time1Res2']].astype(np.int16)
data_df[['Result1', 'Result2']] = data_df[['Result1', 'Result2']].astype(np.int16)
data_df = data_df.drop(['Time1Res1', 'Time1Res2'], axis = 1)

100%|██████████| 10392040/10392040 [01:46<00:00, 97740.21it/s]


In [19]:
data_df.loc[:,['P1', 'PX', 'P2', 'PR']] = add_match_lines (data_df)
data_df[['P1', 'PX', 'P2', 'PR']] = data_df[['P1', 'PX', 'P2', 'PR']].astype(np.float32)
data_df = data_df.drop(['PX', 'PR'], axis = 1)

100%|██████████| 10392040/10392040 [01:47<00:00, 96860.01it/s] 


In [20]:
if 1898989 in data_df['Id']:
    data_df = data_df[data_df['Id'] != 1898989]

In [21]:
data_df = transform_dataset(data_df, #)
    remain = ['Score1', 'Score2'])
    #data_df, 
    #remain = 
    #    ['Id', 'Minute', 'Active', 'Score1', 'Score2', 'A1', 'A2', 'DA1', 'DA2',
    #   'Pos1', 'Pos2', 'Off1', 'Off2', 'On1', 'On2', 'YC1', 'YC2', 'RC1',
    #   'RC2', 'Sub1', 'Sub2', 'Pen1', 'Pen2', 'Cor1', 'Cor2', 'P1', 'P2']
    #   )

1. Минуты посчитаны...
2. Голы посчитаны...
3. Атаки посчитаны...
4. Опасные атаки посчитаны...
5. Владение мячом посчитпно...
6. Удары посчитаны...
7. Удары в створ посчитаны...
8. Жёлтые карточки посчитаны...
9. Красные карточки посчитаны...
10. Замены посчитаны...
11. Угловые посчитаны...
12. Линии посчитаны...


In [22]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10391998 entries, 0 to 10392039
Data columns (total 46 columns):
 #   Column            Dtype  
---  ------            -----  
 0   Id                int32  
 1   Score1            int16  
 2   Score2            int16  
 3   Result1           int16  
 4   Result2           int16  
 5   min_norm          float32
 6   Score1_norm       float32
 7   Score2_norm       float32
 8   Score_diff        float32
 9   Score_cat_1       uint8  
 10  Score_cat_2       uint8  
 11  Score_cat_3       uint8  
 12  Score_cat_4       uint8  
 13  Score_cat_5       uint8  
 14  Score_cat_6       uint8  
 15  Score_cat_7       uint8  
 16  Score_cat_8       uint8  
 17  Score_cat_9       uint8  
 18  A1_scaled         float32
 19  A2_scaled         float32
 20  A1perMIN          float32
 21  A2perMIN          float32
 22  A1relativ         float32
 23  A2relativ         float32
 24  DA1_scaled        float32
 25  DA2_scaled        float32
 26  DA1perMIN   

In [23]:
data_df = data_df[~data_df['P1_transformed'].isna()]
data_df = data_df[~data_df['P2_transformed'].isna()]

In [24]:
data_df = data_df.loc[(data_df['Result1'] - data_df['Score1']) >= 0]
data_df = data_df.loc[(data_df['Result2'] - data_df['Score2']) >= 0]

In [25]:
test_df = test_df[~test_df['P1_transformed'].isna()]
test_df = test_df[~test_df['P2_transformed'].isna()]

In [26]:
test_df = test_df.loc[(test_df['Result1'] - test_df['Score1']) >= 0]
test_df = test_df.loc[(test_df['Result2'] - test_df['Score2']) >= 0]

#### простое Версионирование под бусты

##### Home vs Away + Draw

In [27]:
id_train = data_df['Id'].values
id_test = test_df['Id'].values

In [28]:
X_train = data_df.drop(['Id', 'Score1', 'Score2', 'Result1', 'Result2'], axis = 1).values
X_test = test_df.drop(['Id', 'Score1', 'Score2', 'Result1', 'Result2'], axis = 1).values

In [29]:
y_train_bin = np.array((data_df['Result1'] > data_df['Result2']) *1, dtype = np.int8)
y_test_bin = np.array((test_df['Result1'] > test_df['Result2']) *1, dtype = np.int8)

In [30]:
X_train.shape, X_test.shape, y_train_bin.shape, y_test_bin.shape, 

((10310810, 41), (2576417, 41), (10310810,), (2576417,))

In [31]:
y_train_multi = np.sign((data_df['Result1'] - data_df['Result2']).values) + 1
y_test_multi = np.sign((test_df['Result1'] - test_df['Result2']).values) + 1

In [32]:
y_train_multi.shape, y_test_multi.shape

((10310810,), (2576417,))

In [33]:
y_train_diff = (data_df['Result1'] - data_df['Result2']).values
y_test_diff = (test_df['Result1'] - test_df['Result2']).values

In [34]:
y_train_diff.shape, y_test_diff.shape

((10310810,), (2576417,))

In [35]:
y_train_regression1 = ((data_df['Result1'] - data_df['Score1']) / 21).values
y_train_regression2 = ((data_df['Result2'] - data_df['Score2']) / 21).values
y_test_regression1 = ((test_df['Result1'] - test_df['Score1']) / 21).values
y_test_regression2 = ((test_df['Result2'] - test_df['Score2']) / 21).values

In [37]:
np.savez_compressed('./dataset', 
                    id_train = id_train,
                    id_test = id_test,
                    X_train = X_train,
                    X_test = X_test,
                    y_train_bin = y_train_bin,
                    y_test_bin = y_test_bin,
                    y_train_multi = y_train_multi,
                    y_test_multi = y_test_multi,
                    y_train_diff = y_train_diff,
                    y_test_diff = y_test_diff,
                    y_train_regression1 =y_train_regression1,
                    y_train_regression2 = y_train_regression2,
                    y_test_regression1 = y_test_regression1,
                    y_test_regression2 = y_test_regression2,
                    score1_train = data_df['Score1'].values,
                    score2_train = data_df['Score2'].values,
                    result1_train = data_df['Result1'].values,
                    result2_train = data_df['Result2'].values,
                    score1_test = test_df['Score1'].values,
                    score2_test = test_df['Score2'].values,
                    result1_test = test_df['Result1'].values,
                    result2_test = test_df['Result2'].values
                )

In [None]:
#data_df.to_csv('data_df.csv.gz', index = False, compression={'method': 'gzip'})

In [38]:
params = {}
params['description'] = 'датасет под бусты \n' + \
'удалены строки с nan в P1, P2 \n' + \
'простые фичи, \n' + \
'4 класса задач, бинарная классификация и мультикласс, и под регрессию 2 варианта '
params['features'] = list(data_df.columns)[5:]

In [39]:
data_version = 'football_live_npz_230105/'
project = neptune.init_project(
    name="scomesse/football", 
    api_token = api_key
    )
project[data_version + 'dataset'].upload('./dataset.npz')
project[data_version + 'params'] = params
project.stop()

https://app.neptune.ai/scomesse/football/
Remember to stop your project once you’ve finished logging your metadata (https://docs.neptune.ai/api/project#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 3 operations to synchronize with Neptune. Do not kill this process.


  value = StringVal(value)


All 3 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/scomesse/football/metadata


### Catboost

#### Installation

In [None]:
!pip install catboost >> None

#### Imports

In [None]:
from catboost import CatBoost
from catboost import CatBoostClassifier, Pool
from catboost.utils import eval_metric
np.random.seed(147)

#### Create Tensors

In [None]:
train_data = Pool(X_train, y_train_bin)
test_data = Pool(X_test, y_test_bin)

In [None]:
booster = CatBoostClassifier(iterations=25)

In [None]:
booster.fit(train_data, eval_set=test_data)

Learning rate set to 0.5
0:	learn: 0.5837773	test: 0.5838870	best: 0.5838870 (0)	total: 3.69s	remaining: 1m 28s
1:	learn: 0.5604733	test: 0.5590931	best: 0.5590931 (1)	total: 6.71s	remaining: 1m 17s
2:	learn: 0.5508504	test: 0.5508177	best: 0.5508177 (2)	total: 9.45s	remaining: 1m 9s
3:	learn: 0.5480869	test: 0.5477653	best: 0.5477653 (3)	total: 12.4s	remaining: 1m 5s
4:	learn: 0.5461828	test: 0.5462240	best: 0.5462240 (4)	total: 15.3s	remaining: 1m 1s
5:	learn: 0.5454131	test: 0.5455072	best: 0.5455072 (5)	total: 18.1s	remaining: 57.4s
6:	learn: 0.5445616	test: 0.5449140	best: 0.5449140 (6)	total: 20.6s	remaining: 52.9s
7:	learn: 0.5442970	test: 0.5445947	best: 0.5445947 (7)	total: 23.4s	remaining: 49.7s
8:	learn: 0.5438550	test: 0.5442128	best: 0.5442128 (8)	total: 26.3s	remaining: 46.8s
9:	learn: 0.5435059	test: 0.5438959	best: 0.5438959 (9)	total: 29.1s	remaining: 43.6s
10:	learn: 0.5431863	test: 0.5436360	best: 0.5436360 (10)	total: 31.8s	remaining: 40.5s
11:	learn: 0.5429340	test

<catboost.core.CatBoostClassifier at 0x7fbdcb3883a0>

In [None]:
print("Test  Accuracy : %.4f"%booster.score(train_data))
print("Train Accuracy : %.4f"%booster.score(test_data))

Test  Accuracy : 0.7169
Train Accuracy : 0.7164


### DEBUG PART

In [None]:
drop_transformed = True

In [None]:
data_df

Unnamed: 0,Id,Minute,Active,Score1,Score2,A1,A2,DA1,DA2,Pos1,Pos2,Off1,Off2,On1,On2,YC1,YC2,RC1,RC2,Sub1,Sub2,Pen1,Pen2,Cor1,Cor2,Result1,Result2,P1,P2
0,333041,1,1,0,0,0,1,0,1,0.0,100.0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2,1,2.1,3.8
1,333041,2,1,0,0,1,2,1,2,20.0,80.0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2,1,2.1,3.8
2,333041,3,1,0,0,2,2,1,2,28.0,72.0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2,1,2.1,3.8
3,333041,4,1,0,0,2,3,2,3,49.0,51.0,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2,1,2.1,3.8
4,333041,5,1,0,0,2,3,2,3,40.0,60.0,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2,1,2.1,3.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2598278,5196533,40,1,0,0,34,22,19,15,,,0,0,1,0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,2.0,1,0,2.3,3.3
2598279,5196533,41,1,0,0,34,25,19,15,,,0,0,1,0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,2.0,1,0,2.3,3.3
2598280,5196533,42,1,0,0,34,26,19,15,,,0,0,1,0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,2.0,1,0,2.3,3.3
2598281,5196533,43,1,0,0,34,27,19,16,,,0,0,1,0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,2.0,1,0,2.3,3.3


In [None]:
data_df['min_norm'] = data_df['Minute'].astype(np.float32) / 50

In [None]:
data_df['Score1_norm'] = data_df['Score1'].fillna(method = 'ffill').astype(np.float32) / 4
data_df.loc[data_df['Score1'] > 3, ['Score1_norm']] = 1.0
data_df['Score2_norm'] = data_df['Score2'].fillna(method = 'ffill').astype(np.float32) / 4
data_df.loc[data_df['Score2'] > 3, ['Score2_norm']] = 1.0

data_df['Score_diff'] = data_df['Score1'].astype(np.float32) - data_df['Score2'].astype(np.float32)
data_df.loc[data_df['Score_diff'] < -4, ['Score_diff']] = -4
data_df.loc[data_df['Score_diff'] > 4, ['Score_diff']] = 4
data_df[[f'Score_cat_{n}' for n in range(1, 10)]] = pd.get_dummies(data_df['Score_diff']).values
data_df['Score_diff'] = data_df['Score_diff'].astype(np.float32) / np.float32(4.0)

if drop_transformed:
    data_df = data_df.drop(['Score1', 'Score2'], axis = 1)

In [None]:
data_df['A1_scaled'] = data_df['A1'].astype(np.float32) / 100
data_df.loc[data_df['A1'] >= 80, ['A1_scaled']] = (80 + (data_df['A1'] - 80) / 5) / 100
data_df['A2_scaled'] = data_df['A2'].astype(np.float32) / 100
data_df.loc[data_df['A2'] >= 80, ['A2_scaled']] = (80 + (data_df['A2'] - 80) / 5) / 100

data_df['A1perMIN'] = data_df['A1'].astype(np.float32) / data_df['Minute'].astype(np.float32)
data_df.loc[data_df['A1perMIN'] > 4, ['A1perMIN']] = np.float32(4.0)
data_df['A2perMIN'] = data_df['A2'].astype(np.float32) / data_df['Minute'].astype(np.float32)
data_df.loc[data_df['A2perMIN'] > 4, ['A2perMIN']] = np.float32(4.0)

data_df['A1relativ'] = data_df['A1'].astype(np.float32) - data_df['A1'].shift(5).astype(np.float32)
data_df.loc[data_df['A1relativ'] < 0, ['A1relativ']] = np.float32(0.0)
data_df['A1relativ'] = data_df['A1relativ'].fillna(0)
data_df.loc[data_df['A1relativ'] > 15, ['A1relativ']] = np.float32(15.)
data_df['A2relativ'] = data_df['A2'].astype(np.float32) - data_df['A2'].shift(5).astype(np.float32)
data_df.loc[data_df['A2relativ'] < 0, ['A2relativ']] =  np.float32(0.0)
data_df['A2relativ'] = data_df['A2relativ'].fillna(0)
data_df.loc[data_df['A2relativ'] > 15, ['A2relativ']] = np.float32(15.)

if drop_transformed:
    data_df = data_df.drop(['A1', 'A2'], axis = 1)

In [None]:
data_df['DA1_scaled'] = data_df['DA1'].astype(np.float32) / 100
data_df.loc[data_df['DA1'] >= 80, ['DA1_scaled']] = (80 + (data_df['DA1'] - 80) / 5) / 100
data_df['DA2_scaled'] = data_df['DA2'].astype(np.float32) / 100
data_df.loc[data_df['DA2'] >= 80, ['DA2_scaled']] = (80 + (data_df['DA2'] - 80) / 5) / 100

data_df['DA1perMIN'] = data_df['DA1'].astype(np.float32) / data_df['Minute'].astype(np.float32)
data_df.loc[data_df['DA1perMIN'] > 3, ['DA1perMIN']] = np.float32(3.0)
data_df['DA2perMIN'] = data_df['DA2'].astype(np.float32) / data_df['Minute'].astype(np.float32)
data_df.loc[data_df['DA2perMIN'] > 3, ['DA2perMIN']] = np.float32(3.0)

data_df['DA1relativ'] = data_df['DA1'].astype(np.float32) - data_df['DA1'].shift(5).astype(np.float32)
data_df.loc[data_df['DA1relativ'] < 0, ['DA1relativ']] = np.float32(0.0)
data_df['DA1relativ'] = data_df['DA1relativ'].fillna(0)
data_df.loc[data_df['DA1relativ'] > 10, ['DA1relativ']] = np.float32(10.)
data_df['DA2relativ'] = data_df['DA2'].astype(np.float32) - data_df['DA2'].shift(5).astype(np.float32)
data_df.loc[data_df['DA2relativ'] < 0, ['DA2relativ']] = np.float32(0.0)
data_df['DA2relativ'] = data_df['DA2relativ'].fillna(0)
data_df.loc[data_df['DA2relativ'] > 10, ['DA2relativ']] = np.float32(10.)

if drop_transformed:
    data_df = data_df.drop(['DA1', 'DA2'], axis = 1)

In [None]:
data_df['Pos1_cleaned'] = data_df['Pos1'].fillna(method = 'ffill').astype(np.float32) /  np.float32(100.0)
data_df.loc[data_df['Pos1_cleaned'] < 0.2, ['Pos1_cleaned']] = np.float32(0.2)
data_df.loc[data_df['Pos1_cleaned'] > 0.8, ['Pos1_cleaned']] = np.float32(0.8)
data_df['Pos2_cleaned'] = data_df['Pos2'].fillna(method = 'ffill').astype(np.float32) /  np.float32(100.0)
data_df.loc[data_df['Pos2_cleaned'] < 0.2, ['Pos2_cleaned']] = np.float32(0.2)
data_df.loc[data_df['Pos2_cleaned'] > 0.8, ['Pos2_cleaned']] = np.float32(0.8)

if drop_transformed:
    data_df = data_df.drop(['Pos1', 'Pos2'], axis = 1)

In [None]:
data_df['Off1_norm'] = data_df['Off1'].fillna(method = 'ffill').astype(np.float32) / np.float32(10.0)
data_df.loc[data_df['Off1_norm'] > 1.0, ['Off1_norm']] = np.float32(1.0)
data_df['Off2_norm'] = data_df['Off2'].fillna(method = 'ffill').astype(np.float32) / np.float32(10.0)
data_df.loc[data_df['Off2_norm'] > 1.0, ['Off2_norm']] = np.float32(1.0)

if drop_transformed:
    data_df = data_df.drop(['Off1', 'Off2'], axis = 1)

In [None]:
data_df['On1_norm'] = data_df['On1'].fillna(method = 'ffill').astype(np.float32) / np.float32(5.0)
data_df.loc[data_df['On1_norm'] > 1.0, ['On1_norm']] = np.float32(1.0)
data_df['On2_norm'] = data_df['On2'].fillna(method = 'ffill').astype(np.float32) / np.float32(5.0)
data_df.loc[data_df['On2_norm'] > 1.0, ['On2_norm']] = np.float32(1.0)

if drop_transformed:
    data_df = data_df.drop(['On1', 'On2'], axis = 1)

In [None]:
data_df['YC1_transformed'] = data_df['YC1'].fillna(0).astype(np.float32) / np.float32(2.0)
data_df.loc[data_df['YC1_transformed'] > 1.0, ['YC1_transformed']] = np.float32(1.0)
data_df['YC2_transformed'] = data_df['YC2'].fillna(0).astype(np.float32) / np.float32(2.0)
data_df.loc[data_df['YC2_transformed'] > 1.0, ['YC2_transformed']] = np.float32(1.0)

if drop_transformed:
    data_df = data_df.drop(['YC1', 'YC2'], axis = 1)

In [None]:
data_df['RC1_transformed'] = data_df['RC1'].fillna(0).astype(np.int8)
data_df.loc[data_df['RC1_transformed'] > 1, ['RC1_transformed']] = np.int8(1)
data_df['RC2_transformed'] = data_df['RC2'].fillna(0).astype(np.int8)
data_df.loc[data_df['RC2_transformed'] > 1, ['RC2_transformed']] = np.int8(1)

if drop_transformed:
    data_df = data_df.drop(['RC1', 'RC2'], axis = 1)

In [None]:
data_df['Sub1_transformed'] = data_df['Sub1'].fillna(0).astype(np.int8)
data_df.loc[data_df['Sub1_transformed'] > 1, ['Sub1_transformed']] = np.int8(1)
data_df['Sub2_transformed'] = data_df['Sub2'].fillna(0).astype(np.int8)
data_df.loc[data_df['Sub2_transformed'] > 1, ['Sub2_transformed']] = np.int8(1)

if drop_transformed:
    data_df = data_df.drop(['Sub1', 'Sub2'], axis = 1)

In [None]:
data_df['Cor1_transformed'] = data_df['Cor1'].fillna(0).astype(np.float32) / np.float32(6.0)
data_df.loc[data_df['Cor1_transformed'] > 1.0, ['Cor1_transformed']] = np.float32(1.0)
data_df['Cor2_transformed'] = data_df['Cor2'].fillna(0).astype(np.float32) / np.float32(6.0)
data_df.loc[data_df['Cor2_transformed'] > 1.0, ['Cor2_transformed']] = np.float32(1.0)

if drop_transformed:
    data_df = data_df.drop(['Cor1', 'Cor2'], axis = 1)

In [None]:
data_df['P1_transformed'] = np.log(data_df['P1'], dtype = np.float32) / 2
data_df['P2_transformed'] = np.log(data_df['P2'], dtype = np.float32) / 2

if drop_transformed:
    data_df = data_df.drop(['P1', 'P2'], axis = 1)

In [None]:
data_df.to_csv('data_train_transformed.csv.gz', index = False, compression={'method': 'gzip'})

In [None]:
px.histogram(np.log(data_df['P2'].iloc[:1_000_000]) / 2)

In [None]:
px.histogram(np.log(data_df['P1'].iloc[:1_000_000]))

In [None]:
data_df = pd.read_csv('./data.csv', sep = ';',
                      #nrows = 10_000_000, 
                      usecols = ['Id', 'Minute', 'Score1', 'Score2', 
                                 'A1', 'A2', 'DA1', 'DA2', 'Pos1', 'Pos2', 'Off1', 
                                 'Off2', 'On1', 'On2', 'YC1', 'YC2', 'RC1', 'RC2', 
                                 'Sub1', 'Sub2', 'Pen1', 'Pen2', 'Cor1', 'Cor2'])

In [None]:
data_df.head(100)

In [None]:
!head /content/info.csv

Id;BeginTime;Home;Away;League;Country;Type;Result1;Result2;Periods;HomeId;AwayId;LeagueId;Season;Round
358904;01.01.2018 22:45:00;Bromley;Ebbsfleet United;England National League;gb;;4;2;3:1,4:2;;;;;
330509;01.01.2018 22:00:00;Hapoel Bnei Sakhnin;Maccabi Tel Aviv;Israel Premier League;il;;0;1;0:0,0:1;;;;;
575487;01.01.2018 20:30:00;Aston Villa;Bristol City;England Championship;gb;;5;0;2:0,5:0;;;;;
338563;01.01.2018 20:30:00;Everton;Man Utd;England Premier League;gb;;0;2;0:0,0:2;;;;;
340310;01.01.2018 20:15:00;Bangor City;Llandudno;Wales Premier League;gb;;0;1;0:0,0:1;;;;;
577973;01.01.2018 20:13:49;JKU FC;Taifa Jang’ombe;World Club Friendlies;;;0;0;0:0,0:0;;;;;
577112;01.01.2018 20:00:00;Hapoel Kfar Saba;Hapoel Ramat Gan;Israel Leumit Liga;il;;0;0;0:0,0:0;;;;;
571744;01.01.2018 20:00:00;Maccabi Achi Nazareth;Beitar Tel Aviv Bat Yam;Israel Leumit Liga;il;;0;0;0:0,0:0;;;;;
571518;01.01.2018 20:00:00;Ironi Ramat Hasharon;Hapoel Hadera;Israel Leumit Liga;il;;0;4;0:2,0:4;;;;;


In [None]:
info_df = pd.read_csv('./info.csv', sep = ';', usecols = ['Id', 'Result1', 'Result2', 'Periods'])

In [None]:
info_df[['Period1', 'Period2', 'Period3', 'Period4']] = info_df['Periods'].str.split(',', expand = True)

In [None]:
miss_p1_list = list(info_df[info_df['Period1'] == '']['Id'])

In [None]:
period1_result_dict = info_df.set_index('Id')['Period1'].to_dict()

In [None]:
data_df.loc[:, 'periods1'] =  [period1_result_dict[id] if id in period1_result_dict else '' for id in data_df['Id'].values]


In [None]:
(data_df['periods1'] == '').sum()

4247

In [None]:
!head /content/prem.csv

Id;IsPrem;P1;PX;P2;PR;PRTime;IsKO;KO1;KOX;KO2;KOR;KOTime;KOminute;KOscore
358904;1;2.6;3.2;2.8;0.0543;01.01.2018 22:20:17;0;;;;;;;
330509;1;9.5;4.5;1.364;0.0606;01.01.2018 21:58:29;0;;;;;;;
575487;1;2.4;3.3;3.3;0.0227;01.01.2018 20:09:47;0;;;;;;;
338563;1;5.75;3.75;1.7;0.0288;01.01.2018 19:51:18;0;;;;;;;
340310;1;1.571;4;5.75;0.0605;01.01.2018 20:03:11;0;;;;;;;
577973;1;2;3.25;3.3;0.1107;31.12.2017 20:10:35;0;;;;;;;
577112;1;1.8;3.4;4.333;0.0805;01.01.2018 18:14:51;0;;;;;;;
571744;1;2.45;3.2;2.7;0.091;01.01.2018 12:36:33;0;;;;;;;
571518;1;1.909;3.3;3.75;0.0935;01.01.2018 19:37:02;0;;;;;;;


In [None]:
prem_df = pd.read_csv('./prem.csv', sep = ';')

In [None]:
prem_df

Unnamed: 0,Id,IsPrem,P1,PX,P2,PR,PRTime,IsKO,KO1,KOX,KO2,KOR,KOTime,KOminute,KOscore
0,358904,1,2.600,3.200,2.800,0.0543,01.01.2018 22:20:17,0.0,,,,,,,
1,330509,1,9.500,4.500,1.364,0.0606,01.01.2018 21:58:29,0.0,,,,,,,
2,575487,1,2.400,3.300,3.300,0.0227,01.01.2018 20:09:47,0.0,,,,,,,
3,338563,1,5.750,3.750,1.700,0.0288,01.01.2018 19:51:18,0.0,,,,,,,
4,340310,1,1.571,4.000,5.750,0.0605,01.01.2018 20:03:11,0.0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749195,6058161,1,2.750,4.000,2.000,0.1136,26.12.2022 9:54:23,1.0,2.875,4.0,2.0,0.0978,26.12.2022 10:01:18,1.0,0.0
749196,5404664,1,2.800,3.600,2.450,0.0431,26.12.2022 9:59:53,1.0,2.750,3.6,2.5,0.0414,26.12.2022 10:01:30,1.0,0.0
749197,6063457,1,1.166,6.000,11.000,0.1152,26.12.2022 7:39:37,1.0,1.181,6.0,10.0,0.1134,26.12.2022 7:42:01,1.0,0.0
749198,6063456,1,1.083,8.500,23.000,0.0845,26.12.2022 0:36:08,1.0,1.090,8.5,21.0,0.0827,26.12.2022 5:01:48,1.0,0.0


In [None]:
prem_df[prem_df['IsPrem'] != 1]