<a href="https://colab.research.google.com/github/cappelchi/calcio_notebooks/blob/main/EDA/football_live_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Project config

In [21]:
try:
    import neptune
    from neptune.utils import stringify_unsupported
except:
    !pip install neptune >> None
    import neptune
    from neptune.utils import stringify_unsupported
#from neptune.new.integrations.tensorflow_keras import NeptuneCallback
def get_credential(frmwork = 'neptune_team'):
    with open('credential.txt', 'r') as container:
        for line in container:
            if frmwork in line:
                login, psw = line.split(' ')[1], line.split(' ')[2].split('\n')[0]
                return login, psw

In [2]:
#@title Set API key for neptune.ai
set_api = True #@param {type:"boolean"}
if set_api:
    username, api_key = get_credential()

### Downloads

In [6]:
data_version = 'football_live_upd_230510/'
project = neptune.init_project(
    project="scomesse/football",
    api_token = api_key
    )
data_link = project[data_version + 'raw/data'].fetch()
prem_link = project[data_version + 'raw/prem'].fetch()
index_link = project[data_version + 'raw/index'].fetch()
project.stop()

https://app.neptune.ai/scomesse/football/
Shutting down background jobs, please wait a moment...
Done!
All 0 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/scomesse/football/metadata


In [14]:
!wget -q -O ./data.rar https://getfile.dokpub.com/yandex/get/{data_link} >> None
!mkdir -p ./Result2
!unrar e ./data.rar ./Result2/ >> None

In [15]:
!wget -q -O ./Prem.rar https://getfile.dokpub.com/yandex/get/{prem_link} >> None
!mkdir -p ./Prem2
!unrar e ./Prem.rar ./Prem2/ >> None

In [16]:
!wget -q -O ./Index.rar https://getfile.dokpub.com/yandex/get/{index_link} >> None
!mkdir -p ./Index2
!unrar e ./Index.rar ./Index2/ >> None

### Imports

In [None]:
import pandas as pd
import numpy as np
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100
print(pd.__version__)
print(np.__version__)

1.5.3
1.22.4


In [None]:
np.iinfo(np.int8).max, np.iinfo(np.int16).max, np.iinfo(np.int32).max

(127, 32767, 2147483647)

In [None]:
np.finfo(np.float16).precision, np.finfo(np.float32).precision, np.finfo(np.float64).precision

(3, 6, 15)

In [None]:
import os, psutil, time
from glob import glob
from tqdm import tqdm
import functools
import gc

### Code

#####Functions

In [None]:
def define_files_list(data_start:str, folder:str, date_end = '') -> list:
    '''
    data_start, data_end
    folder: ./folder/
    '''
    if date_end == '':
        return sorted(
            [file
            for file in glob(folder + '*')
            if int(file.split('/')[-1].split('.csv')[0]) >= int(date_start)]
            )
    else:
        return sorted(
            [file
            for file in glob(folder + '*')
            if int(file.split('/')[-1].split('.csv')[0]) >= int(date_start) & \
                int(file.split('/')[-1].split('.csv')[0]) <= int(date_end)]
            )

In [None]:
def create_id_dict(date_start, folder, cols, date_end = '') -> dict:
    '''
    '''
    return pd.concat(map(
        functools.partial(
            pd.read_csv, sep=';', usecols =  cols
                            ),
                    define_files_list(date_start, folder, date_end = date_end)
                    ), ignore_index=True).drop_duplicates(subset = ['Id'], keep = 'last').set_index('Id').to_dict(orient = 'index')

In [None]:
def transform_dataset(data_df:pd.DataFrame, remain = [], relative = True):
    new_match_vector = data_df['Id'] != data_df['Id'].shift(1)
    # трансформируем минуты
    data_df['min_norm'] = data_df['Minute'].astype(np.float32) / 50
    print('1. Минуты посчитаны...')
    # трансформируем голы
    data_df[data_df['Score1'].isna() & new_match_vector] = 0
    data_df['Score1_norm'] = data_df['Score1'].fillna(method = 'ffill').astype(np.float32) / 4
    data_df.loc[data_df['Score1'] > 3, ['Score1_norm']] = 1.0
    data_df[data_df['Score2'].isna() & new_match_vector] = 0
    data_df['Score2_norm'] = data_df['Score2'].fillna(method = 'ffill').astype(np.float32) / 4
    data_df.loc[data_df['Score2'] > 3, ['Score2_norm']] = 1.0

    data_df['Score_diff'] = data_df['Score1'].astype(np.int16) - data_df['Score2'].astype(np.int16)
    data_df.loc[data_df['Score_diff'] < -4, ['Score_diff']] = -4
    data_df.loc[data_df['Score_diff'] > 4, ['Score_diff']] = 4
    data_df[[f'Score_cat_{n}' for n in range(1, 10)]] = pd.get_dummies(data_df['Score_diff']).values
    data_df['Score_diff'] = data_df['Score_diff'].astype(np.float32) / np.float32(4.0)
    if 'Score1' not in remain:
        data_df = data_df.drop(['Score1', 'Score2'], axis = 1)
    print('2. Голы посчитаны...')
    #трансформируем атаки
    data_df['A1_scaled'] = data_df['A1'].astype(np.float32) / 75
    data_df.loc[data_df['A1'] >= 60, ['A1_scaled']] = (60 + (data_df['A1'] - 60) / 4) / 75
    data_df['A2_scaled'] = data_df['A2'].astype(np.float32) / 75
    data_df.loc[data_df['A2'] >= 60, ['A2_scaled']] = (60 + (data_df['A2'] - 60) / 4) / 75
    # атаки в минуту
    data_df['A1perMIN'] = data_df['A1'].astype(np.float32) / data_df['Minute'].astype(np.float32)
    data_df.loc[data_df['A1perMIN'] > 4, ['A1perMIN']] = np.float32(4.0)
    data_df['A2perMIN'] = data_df['A2'].astype(np.float32) / data_df['Minute'].astype(np.float32)
    data_df.loc[data_df['A2perMIN'] > 4, ['A2perMIN']] = np.float32(4.0)
    # динамика атак
    if relative:
        new_match_vector5 = data_df['Id'] != data_df['Id'].shift(5)
        data_df['A1relativ'] = data_df['A1'].astype(np.float32) - data_df['A1'].shift(5).astype(np.float32)
        data_df.loc[new_match_vector5, ['A1relativ']] = np.float32(0.0)
        data_df['A1relativ'] = data_df['A1relativ'].fillna(0)
        data_df.loc[data_df['A1relativ'] > 15, ['A1relativ']] = np.float32(15.)
        data_df['A2relativ'] = data_df['A2'].astype(np.float32) - data_df['A2'].shift(5).astype(np.float32)
        data_df.loc[new_match_vector5, ['A2relativ']] =  np.float32(0.0)
        data_df['A2relativ'] = data_df['A2relativ'].fillna(0)
        data_df.loc[data_df['A2relativ'] > 15, ['A2relativ']] = np.float32(15.)
    if 'A1' not in remain:
        data_df = data_df.drop(['A1', 'A2'], axis = 1)
    print('3. Атаки посчитаны...')
    # трансформируем опасные атаки
    data_df['DA1_scaled'] = data_df['DA1'].astype(np.float32) / 50
    data_df.loc[data_df['DA1'] >= 40, ['DA1_scaled']] = (80 + (data_df['DA1'] - 40) / 3) / 100
    data_df['DA2_scaled'] = data_df['DA2'].astype(np.float32) / 50
    data_df.loc[data_df['DA2'] >= 40, ['DA2_scaled']] = (80 + (data_df['DA2'] - 40) / 3) / 100
    # опасные атаки в минуту
    data_df['DA1perMIN'] = data_df['DA1'].astype(np.float32) / data_df['Minute'].astype(np.float32)
    data_df.loc[data_df['DA1perMIN'] > 3, ['DA1perMIN']] = np.float32(3.0)
    data_df['DA2perMIN'] = data_df['DA2'].astype(np.float32) / data_df['Minute'].astype(np.float32)
    data_df.loc[data_df['DA2perMIN'] > 3, ['DA2perMIN']] = np.float32(3.0)
    # динамика опасных атак
    if relative:
        data_df['DA1relativ'] = data_df['DA1'].astype(np.float32) - data_df['DA1'].shift(5).astype(np.float32)
        data_df.loc[new_match_vector5, ['DA1relativ']] = np.float32(0.0)
        data_df['DA1relativ'] = data_df['DA1relativ'].fillna(0)
        data_df.loc[data_df['DA1relativ'] > 10, ['DA1relativ']] = np.float32(10.)
        data_df['DA2relativ'] = data_df['DA2'].astype(np.float32) - data_df['DA2'].shift(5).astype(np.float32)
        data_df.loc[new_match_vector5, ['DA2relativ']] = np.float32(0.0)
        data_df['DA2relativ'] = data_df['DA2relativ'].fillna(0)
        data_df.loc[data_df['DA2relativ'] > 10, ['DA2relativ']] = np.float32(10.)
    if 'DA1' not in remain:
        data_df = data_df.drop(['DA1', 'DA2'], axis = 1)
    if 'Minute' not in remain:
        data_df = data_df.drop(['Minute'], axis = 1)
    print('4. Опасные атаки посчитаны...')
    # трансформируем владение мячом
    data_df[data_df['Pos1'].isna() & new_match_vector] = 0
    data_df['Pos1_cleaned'] = data_df['Pos1'].fillna(method = 'ffill').astype(np.float32) /  np.float32(100.0)
    data_df.loc[data_df['Pos1_cleaned'] < 0.2, ['Pos1_cleaned']] = np.float32(0.2)
    data_df.loc[data_df['Pos1_cleaned'] > 0.8, ['Pos1_cleaned']] = np.float32(0.8)
    data_df[data_df['Pos2'].isna() & new_match_vector] = 0
    data_df['Pos2_cleaned'] = data_df['Pos2'].fillna(method = 'ffill').astype(np.float32) /  np.float32(100.0)
    data_df.loc[data_df['Pos2_cleaned'] < 0.2, ['Pos2_cleaned']] = np.float32(0.2)
    data_df.loc[data_df['Pos2_cleaned'] > 0.8, ['Pos2_cleaned']] = np.float32(0.8)
    if 'Pos1' not in remain:
        data_df = data_df.drop(['Pos1', 'Pos2'], axis = 1)
    print('5. Владение мячом посчитпно...')
    # трансформируем удары
    data_df[data_df['Off1'].isna() & new_match_vector] = 0
    data_df['Off1_norm'] = data_df['Off1'].fillna(method = 'ffill').astype(np.float32) / np.float32(10.0)
    data_df.loc[data_df['Off1_norm'] > 1.0, ['Off1_norm']] = np.float32(1.0)
    data_df[data_df['Off2'].isna() & new_match_vector] = 0
    data_df['Off2_norm'] = data_df['Off2'].fillna(method = 'ffill').astype(np.float32) / np.float32(10.0)
    data_df.loc[data_df['Off2_norm'] > 1.0, ['Off2_norm']] = np.float32(1.0)
    if 'Off1' not in remain:
        data_df = data_df.drop(['Off1', 'Off2'], axis = 1)
    print('6. Удары посчитаны...')
    # трансформируем удары в створ
    data_df[data_df['On1'].isna() & new_match_vector] = 0
    data_df['On1_norm'] = data_df['On1'].fillna(method = 'ffill').astype(np.float32) / np.float32(5.0)
    data_df.loc[data_df['On1_norm'] > 1.0, ['On1_norm']] = np.float32(1.0)
    data_df[data_df['On2'].isna() & new_match_vector] = 0
    data_df['On2_norm'] = data_df['On2'].fillna(method = 'ffill').astype(np.float32) / np.float32(5.0)
    data_df.loc[data_df['On2_norm'] > 1.0, ['On2_norm']] = np.float32(1.0)
    if 'On1' not in remain:
        data_df = data_df.drop(['On1', 'On2'], axis = 1)
    print('7. Удары в створ посчитаны...')
    # трансформируем желтые карточки
    data_df[data_df['YC1'].isna() & new_match_vector] = 0
    data_df['YC1_transformed'] = data_df['YC1'].fillna(0).astype(np.float32) / np.float32(2.0)
    data_df.loc[data_df['YC1_transformed'] > 1.0, ['YC1_transformed']] = np.float32(1.0)
    data_df[data_df['YC2'].isna() & new_match_vector] = 0
    data_df['YC2_transformed'] = data_df['YC2'].fillna(0).astype(np.float32) / np.float32(2.0)
    data_df.loc[data_df['YC2_transformed'] > 1.0, ['YC2_transformed']] = np.float32(1.0)
    if 'YC1' not in remain:
        data_df = data_df.drop(['YC1', 'YC2'], axis = 1)
    print('8. Жёлтые карточки посчитаны...')
    # трансформируем красные карточки
    data_df[data_df['RC1'].isna() & new_match_vector] = 0
    data_df['RC1_transformed'] = data_df['RC1'].fillna(0).astype(np.int8)
    data_df.loc[data_df['RC1_transformed'] > 1, ['RC1_transformed']] = np.int8(1)
    data_df[data_df['RC2'].isna() & new_match_vector] = 0
    data_df['RC2_transformed'] = data_df['RC2'].fillna(0).astype(np.int8)
    data_df.loc[data_df['RC2_transformed'] > 1, ['RC2_transformed']] = np.int8(1)
    if 'RC1' not in remain:
        data_df = data_df.drop(['RC1', 'RC2'], axis = 1)
    print('9. Красные карточки посчитаны...')
    # трансформируем замены
    data_df[data_df['Sub1'].isna() & new_match_vector] = 0
    data_df['Sub1_transformed'] = data_df['Sub1'].fillna(0).astype(np.int8)
    data_df.loc[data_df['Sub1_transformed'] > 1, ['Sub1_transformed']] = np.int8(1)
    data_df[data_df['Sub2'].isna() & new_match_vector] = 0
    data_df['Sub2_transformed'] = data_df['Sub2'].fillna(0).astype(np.int8)
    data_df.loc[data_df['Sub2_transformed'] > 1, ['Sub2_transformed']] = np.int8(1)
    if 'Sub1' not in remain:
        data_df = data_df.drop(['Sub1', 'Sub2'], axis = 1)
    print('10. Замены посчитаны...')
    # трансформируем угловык
    data_df[data_df['Cor1'].isna() & new_match_vector] = 0
    data_df['Cor1_transformed'] = data_df['Cor1'].fillna(0).astype(np.float32) / np.float32(6.0)
    data_df.loc[data_df['Cor1_transformed'] > 1.0, ['Cor1_transformed']] = np.float32(1.0)
    data_df[data_df['Cor2'].isna() & new_match_vector] = 0
    data_df['Cor2_transformed'] = data_df['Cor2'].fillna(0).astype(np.float32) / np.float32(6.0)
    data_df.loc[data_df['Cor2_transformed'] > 1.0, ['Cor2_transformed']] = np.float32(1.0)
    if 'Cor1' not in remain:
        data_df = data_df.drop(['Cor1', 'Cor2'], axis = 1)
    print('11. Угловые посчитаны...')
    # трансформируем линию
    data_df['P1_transformed'] = np.log(data_df['P1'], dtype = np.float32) / 2
    data_df['P2_transformed'] = np.log(data_df['P2'], dtype = np.float32) / 2
    if 'P1' not in remain:
        data_df = data_df.drop(['P1', 'P2'], axis = 1)
    print('12. Линии посчитаны...')
    if 'Pen1' not in remain:
        data_df = data_df.drop(['Pen1', 'Pen2'], axis = 1)
    if 'Active' not in remain:
        data_df = data_df.drop(['Active'], axis = 1)
    return data_df

#### Predefine data

In [None]:
import datetime
data_types_dict = {
    'Id':np.int32,
    'StatTime':str,
    'Minute':np.int8,
    'Active': np.int8, 'Score1':np.int8, 'Score2':np.int8,
    'A1':np.int16, 'A2':np.int16, 'DA1':np.int16, 'DA2':np.int16, 'Pos1':np.float32, 'Pos2':np.float32,
    'Off1':np.int8, 'Off2':np.int8, 'On1':np.int8, 'On2':np.int8, 'YC1':np.int8, 'YC2':np.int8,
    'RC1':np.int8, 'RC2':np.int8, 'Sub1':np.int8, 'Sub2':np.int8, 'Pen1':np.int8, 'Pen2':np.int8,
    'Cor1':np.int8, 'Cor2':np.int8, 'Period':np.int8,
    'D':np.datetime64,
    'I':np.int32, 'Active.1':np.int8,
    'Time':np.datetime64,
    'Minute.1':np.int8,
    'RawTime':np.datetime64,
    'Score1.1':np.int8, 'Score2.1':np.int8, 'Period.1':np.int8,
    'W1':np.float16, 'WX':np.float16, 'W2':np.float16, 'X1':np.float16, 'X2':np.float16, 'W12':np.float16, 'TotalValue':np.float16,
    'Over':np.float16, 'Under':np.float16, 'Hand1Value':np.float16, 'H1':np.float16, 'H2':np.float16
}

In [None]:
k_cols = [
    'W1', 'WX', 'W2', 'X1', 'X2', 'W12', 'TotalValue' ,'Over',
    'Under', 'Hand1Value', 'H1', 'H2'
]

In [None]:
drop_cols = [
    #'StatTime',
    'Comment', 'D', 'I', 'Time', 'Minute.1', 'RawTime',
    'Score1.1', 'Score2.1', 'Period.1', 'Period', 'Periods', 'Serve'
]

#### Load Parameters

In [None]:
info_cols = ['Id', 'Result1', 'Result2']
prem_cols = ['Id', 'P1', 'PX', 'P2']
res_cols = [cols for cols in data_types_dict if cols not in drop_cols]

info_folder = './Index2/'
prem_folder = './Prem2/'
data_folder = './Result2/'

In [None]:
### собрать датасет с даты:
year_start = '2023'
month_start = '01'
day_start = '01'
### до даты
year_end = ''
month_end = ''
day_end = ''
date_start = year_start + month_start + day_start
date_end = year_end + month_end + day_end

#### Create DataFrame

In [None]:
info_dict = create_id_dict(date_start, info_folder, info_cols)
prem_dict = create_id_dict(date_start, prem_folder, prem_cols)
if len(prem_dict) != len(info_dict):
    print('Нет в info: ', set(prem_dict) - set(info_dict))
    print('Нет в prem: ', set(info_dict) - set(prem_dict))

In [None]:
files_list =['./Result2/' + str(id) + '.csv'
    for id in set(info_dict)
    if os.path.exists('./Result2/' + str(id) + '.csv')]

In [None]:
time_point1 = time.time()
data_df = pd.concat(map(functools.partial(
    pd.read_csv,
    sep = ';',
    #parse_dates = ['StatTime'],
    usecols = res_cols
                                    ), files_list), ignore_index=True)
time_point2 = time.time()
print(f'create dataframe: {time_point2 - time_point1} sec.')
#---------------
process = psutil.Process(os.getpid())
print(round(process.memory_info().rss / 1024 ** 3, 2), 'GiB')  # in bytes
#---------------

data_df[k_cols] = data_df[k_cols].fillna(0).values # Заполняем пропущенные лайв кэфы 1
new_match_vector = data_df['Id'] != data_df['Id'].shift(1) # Отмечаем переходы между матчами в колонуах
data_cols = list(set(res_cols) - set(k_cols)) # Отбирвем колонки для заполнения пропусков метдом ffill
for col in data_cols:
    data_df.loc[data_df[col].isna() & new_match_vector, col] = 0 # Сами переходы с пропусками заполняем 0
data_df = data_df.fillna(method = 'ffill')
################ Записываем файл с временными метками и сбрасываем время матчей
#(data_df['Id'].astype(str) + ':' + data_df['StatTime'].astype(str).str.split(
#        ' ', expand=True)[0]
#        ).drop_duplicates().str.split(':', expand = True).rename(
#                                                columns = {0:'Id', 1:'StatTime'}
#                                                            ).to_csv('./time.csv', index = False)
#data_df = data_df.drop(['StatTime'], axis = 1)
############################################### Присоединяем результаты и кэфы
data_df[['Result1', 'Result2']] = [
    [info_dict[id]['Result1'], info_dict[id]['Result2']]
    for id in tqdm(data_df['Id'].values, total = len(data_df))
    ]
data_df[['P1',
         #'PX',
         'P2']] = [
    [
        prem_dict[id]['P1'],
        #prem_dict[id]['PX'],
        prem_dict[id]['P2']]
    for id in tqdm(data_df['Id'].values, total = len(data_df))
    ]

print('\n')
#---------------
process = psutil.Process(os.getpid())
print(round(process.memory_info().rss / 1024 ** 3, 2), 'GiB')  # in bytes
#---------------

create dataframe: 125.07413363456726 sec.
2.34 GiB


100%|██████████| 1224315/1224315 [00:02<00:00, 580268.50it/s]
100%|██████████| 1224315/1224315 [00:02<00:00, 579360.59it/s]




2.37 GiB


In [None]:
data_df.head()

Unnamed: 0,Id,StatTime,Minute,Active,Score1,Score2,A1,A2,DA1,DA2,Pos1,Pos2,Off1,Off2,On1,On2,YC1,YC2,RC1,RC2,Sub1,Sub2,Pen1,Pen2,Cor1,Cor2,Active.1,W1,WX,W2,X1,X2,W12,TotalValue,Over,Under,Hand1Value,H1,H2,Result1,Result2,P1,P2
0,6291479,18.02.2023 12:50:44,1,1,0,0,1,1,0,1,0.0,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1,2.05,3.7,3.06,1.33,1.68,1.23,3.0,1.9,1.88,0.0,1.58,2.33,0,1,2.0,3.1
1,6291479,18.02.2023 12:52:04,2,1,0,0,3,2,0,1,0.0,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1,2.06,3.68,3.05,1.33,1.67,1.24,3.0,1.93,1.85,0.0,1.58,2.33,0,1,2.0,3.1
2,6291479,18.02.2023 12:53:25,4,1,0,0,4,2,1,1,0.0,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1,2.07,3.6,3.08,1.32,1.666,1.24,3.0,2.07,1.74,0.0,1.58,2.33,0,1,2.0,3.1
3,6291479,18.02.2023 12:54:58,5,1,0,0,5,3,1,2,0.0,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1,2.08,3.58,3.08,1.32,1.66,1.25,3.0,2.13,1.7,0.0,1.58,2.33,0,1,2.0,3.1
4,6291479,18.02.2023 12:55:58,6,1,0,0,5,5,1,4,0.0,0.0,0,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,1,2.08,3.55,3.1,1.32,1.66,1.25,2.5,1.7,2.13,0.0,1.59,2.31,0,1,2.0,3.1


#### create time.csv

In [None]:
(data_df['Id'].astype(str) + ':' + data_df['StatTime'].astype(str).str.split(
        ' ', expand=True)[0]
        ).drop_duplicates().str.split(':', expand = True).rename(
                                                columns = {0:'Id', 1:'StatTime'}
                                                            ).to_csv('./time_upd.csv', index = False)
data_df = data_df.drop(['StatTime'], axis = 1)

In [None]:
len(data_df)

1224315

#### transform dataset

In [None]:
data_df.loc[new_match_vector & (data_df['P1'].isna()), ['P1']] = data_df['W1'][new_match_vector & (data_df['P1'].isna())]
data_df.loc[new_match_vector & (data_df['P2'].isna()), ['P2']] = data_df['W2'][new_match_vector & (data_df['P2'].isna())]
data_df['P1'] = data_df['P1'].fillna(method = 'ffill').values
data_df['P2'] = data_df['P2'].fillna(method = 'ffill').values

In [None]:
data_df = transform_dataset(
    data_df,
    remain = ['Score1', 'Score2'],
    relative = False
    )

1. Минуты посчитаны...
2. Голы посчитаны...
3. Атаки посчитаны...
4. Опасные атаки посчитаны...
5. Владение мячом посчитпно...
6. Удары посчитаны...
7. Удары в створ посчитаны...
8. Жёлтые карточки посчитаны...
9. Красные карточки посчитаны...
10. Замены посчитаны...
11. Угловые посчитаны...


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


12. Линии посчитаны...


In [None]:
# Из-за того что некоторые кэфы были пропущены, и заменены на нули в препроцессинге на их месте появились inf, удаляем
print('P1 -inf & P2 -inf: ', (data_df['P1_transformed'] == -np.inf).sum(), (data_df['P2_transformed'] == -np.inf).sum())
data_df = data_df[~(data_df['P1_transformed'] == -np.inf)]
data_df = data_df[~(data_df['P2_transformed'] == -np.inf)]

print('P1 NaN & P2 NaN: ', data_df['P1_transformed'].isna().sum(), data_df['P2_transformed'].isna().sum())

data_df = data_df[~data_df['P1_transformed'].isna()]
data_df = data_df[~data_df['P2_transformed'].isna()]

print('Score-Result error 1&2: ', (~(data_df['Result1'] - data_df['Score1']) >= 0).sum(), (~(data_df['Result1'] - data_df['Score1']) >= 0).sum())
data_df = data_df.loc[(data_df['Result1'] - data_df['Score1']) >= 0]
data_df = data_df.loc[(data_df['Result2'] - data_df['Score2']) >= 0]


#---------------
process = psutil.Process(os.getpid())
print('mem usage: ', round(process.memory_info().rss / 1024 ** 3, 2), 'GiB')  # in bytes
#---------------

P1 -inf & P2 -inf:  1054 1054
P1 NaN & P2 NaN:  0 0
Score-Result error 1&2:  20 20
mem usage:  3.04 GiB


In [None]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1223240 entries, 0 to 1224314
Data columns (total 55 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   Id                1223240 non-null  int64  
 1   Score1            1223240 non-null  int64  
 2   Score2            1223240 non-null  int64  
 3   Active.1          1223240 non-null  int64  
 4   W1                1223240 non-null  float64
 5   WX                1223240 non-null  float64
 6   W2                1223240 non-null  float64
 7   X1                1223240 non-null  float64
 8   X2                1223240 non-null  float64
 9   W12               1223240 non-null  float64
 10  TotalValue        1223240 non-null  float64
 11  Over              1223240 non-null  float64
 12  Under             1223240 non-null  float64
 13  Hand1Value        1223240 non-null  float64
 14  H1                1223240 non-null  float64
 15  H2                1223240 non-null  float64
 16  

In [None]:
data_df.describe()

Unnamed: 0,Id,Score1,Score2,Active.1,W1,WX,W2,X1,X2,W12,TotalValue,Over,Under,Hand1Value,H1,H2,Result1,Result2,min_norm,Score1_norm,Score2_norm,Score_diff,Score_cat_1,Score_cat_2,Score_cat_3,Score_cat_4,Score_cat_5,Score_cat_6,Score_cat_7,Score_cat_8,Score_cat_9,A1_scaled,A2_scaled,A1perMIN,A2perMIN,DA1_scaled,DA2_scaled,DA1perMIN,DA2perMIN,Pos1_cleaned,Pos2_cleaned,Off1_norm,Off2_norm,On1_norm,On2_norm,YC1_transformed,YC2_transformed,RC1_transformed,RC2_transformed,Sub1_transformed,Sub2_transformed,Cor1_transformed,Cor2_transformed,P1_transformed,P2_transformed
count,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0,1223240.0
mean,5985329.0,0.2983863,0.2400453,0.8742266,3.043663,3.608408,4.61347,1.32912,1.769868,1.057759,2.353216,1.706996,1.684816,-0.2539914,1.738253,1.744618,1.532956,1.22513,0.4584769,0.07446658,0.05996881,0.01450574,0.0006605409,0.00355286,0.01980887,0.1252354,0.6597928,0.1545191,0.02904908,0.005965305,0.001415912,0.3275578,0.3114248,1.106338,1.051965,0.2689399,0.2417156,0.5842383,0.5246256,0.4294243,0.4055518,0.1430056,0.1225209,0.1977681,0.1674516,0.1118444,0.1244981,0.006077303,0.006971649,0.02663337,0.028279,0.1890749,0.1605529,0.4223377,0.5986543
std,426331.2,0.5944298,0.5271992,0.3315939,4.17803,2.543143,5.886139,1.284099,1.670416,0.4787107,1.253852,0.622347,0.61073,1.185612,0.7098662,0.7122908,1.402802,1.248968,0.2593684,0.1476283,0.1314469,0.1921436,0.02569251,0.05949991,0.1393431,0.3309858,0.4737789,0.3614458,0.1679442,0.07700471,0.03760197,0.2066836,0.1981929,0.4363853,0.4246509,0.1995743,0.1850384,0.323683,0.308598,0.176726,0.1652045,0.1633827,0.1464586,0.2412493,0.2208958,0.2444014,0.2560772,0.07771984,0.08320487,0.1610095,0.1657689,0.2247133,0.205277,0.2548604,0.2976166
min,4975839.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-11.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0009989885,0.002493768
25%,5539179.0,0.0,0.0,1.0,1.3,2.78,1.57,1.022,1.104,1.098,1.5,1.74,1.72,-1.0,1.58,1.59,1.0,0.0,0.24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.16,0.1466667,0.8333333,0.7894737,0.1,0.1,0.368421,0.3225806,0.2,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2552128,0.4054651
50%,6140226.0,0.0,0.0,1.0,2.05,3.32,2.89,1.222,1.46,1.24,2.5,1.88,1.85,0.0,1.83,1.84,1.0,1.0,0.46,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.3066667,0.2933333,1.046512,1.0,0.24,0.2,0.5454546,0.5,0.46,0.42,0.1,0.1,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1666667,0.1666667,0.3827339,0.565701
75%,6320578.0,0.0,0.0,1.0,3.14,4.15,5.15,1.533,2.05,1.33,3.0,2.03,2.0,0.0,2.11,2.12,2.0,2.0,0.68,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.4666667,0.44,1.322581,1.25,0.38,0.34,0.7619048,0.6888889,0.56,0.53,0.2,0.2,0.4,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.3333333,0.3333333,0.5323554,0.7520387
max,6536463.0,8.0,6.0,1.0,69.0,26.0,126.0,18.5,18.5,2.13,12.0,10.5,3.1,10.5,6.1,10.5,13.0,14.0,0.9,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.03,1.013333,4.0,4.0,0.99,0.9433333,3.0,3.0,0.8,0.8,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.102346,2.197225


In [None]:
data_df.columns

Index(['Id', 'Score1', 'Score2', 'Active.1', 'W1', 'WX', 'W2', 'X1', 'X2',
       'W12', 'TotalValue', 'Over', 'Under', 'Hand1Value', 'H1', 'H2',
       'Result1', 'Result2', 'min_norm', 'Score1_norm', 'Score2_norm',
       'Score_diff', 'Score_cat_1', 'Score_cat_2', 'Score_cat_3',
       'Score_cat_4', 'Score_cat_5', 'Score_cat_6', 'Score_cat_7',
       'Score_cat_8', 'Score_cat_9', 'A1_scaled', 'A2_scaled', 'A1perMIN',
       'A2perMIN', 'DA1_scaled', 'DA2_scaled', 'DA1perMIN', 'DA2perMIN',
       'Pos1_cleaned', 'Pos2_cleaned', 'Off1_norm', 'Off2_norm', 'On1_norm',
       'On2_norm', 'YC1_transformed', 'YC2_transformed', 'RC1_transformed',
       'RC2_transformed', 'Sub1_transformed', 'Sub2_transformed',
       'Cor1_transformed', 'Cor2_transformed', 'P1_transformed',
       'P2_transformed'],
      dtype='object')

In [None]:
data_df

Unnamed: 0,Id,Score1,Score2,Active.1,W1,WX,W2,X1,X2,W12,TotalValue,Over,Under,Hand1Value,H1,H2,Result1,Result2,min_norm,Score1_norm,Score2_norm,Score_diff,Score_cat_1,Score_cat_2,Score_cat_3,Score_cat_4,Score_cat_5,Score_cat_6,Score_cat_7,Score_cat_8,Score_cat_9,A1_scaled,A2_scaled,A1perMIN,A2perMIN,DA1_scaled,DA2_scaled,DA1perMIN,DA2perMIN,Pos1_cleaned,Pos2_cleaned,Off1_norm,Off2_norm,On1_norm,On2_norm,YC1_transformed,YC2_transformed,RC1_transformed,RC2_transformed,Sub1_transformed,Sub2_transformed,Cor1_transformed,Cor2_transformed,P1_transformed,P2_transformed
0,6291479,0,0,1,2.050,3.70,3.06,1.33,1.680,1.230,3.0,1.900,1.88,0.0,1.580,2.33,0,1,0.02,0.00,0.0,0.00,0,0,0,0,1,0,0,0,0,0.013333,0.013333,1.000000,1.000000,0.00,0.02,0.000000,1.000000,0.20,0.20,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.000000,0.000000,0.346574,0.565701
1,6291479,0,0,1,2.060,3.68,3.05,1.33,1.670,1.240,3.0,1.930,1.85,0.0,1.580,2.33,0,1,0.04,0.00,0.0,0.00,0,0,0,0,1,0,0,0,0,0.040000,0.026667,1.500000,1.000000,0.00,0.02,0.000000,0.500000,0.20,0.20,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.000000,0.000000,0.346574,0.565701
2,6291479,0,0,1,2.070,3.60,3.08,1.32,1.666,1.240,3.0,2.070,1.74,0.0,1.580,2.33,0,1,0.08,0.00,0.0,0.00,0,0,0,0,1,0,0,0,0,0.053333,0.026667,1.000000,0.500000,0.02,0.02,0.250000,0.250000,0.20,0.20,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.000000,0.000000,0.346574,0.565701
3,6291479,0,0,1,2.080,3.58,3.08,1.32,1.660,1.250,3.0,2.130,1.70,0.0,1.580,2.33,0,1,0.10,0.00,0.0,0.00,0,0,0,0,1,0,0,0,0,0.066667,0.040000,1.000000,0.600000,0.02,0.04,0.200000,0.400000,0.20,0.20,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.000000,0.000000,0.346574,0.565701
4,6291479,0,0,1,2.080,3.55,3.10,1.32,1.660,1.250,2.5,1.700,2.13,0.0,1.590,2.31,0,1,0.12,0.00,0.0,0.00,0,0,0,0,1,0,0,0,0,0.066667,0.066667,0.833333,0.833333,0.02,0.08,0.166667,0.666667,0.20,0.20,0.0,0.0,0.0,0.2,0.0,0.0,0,0,0,0,0.000000,0.166667,0.346574,0.565701
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1224310,6160370,1,0,1,1.052,9.20,29.00,0.00,7.100,1.017,2.5,1.880,1.88,-2.0,2.130,1.68,1,0,0.82,0.25,0.0,0.25,0,0,0,0,0,1,0,0,0,0.746667,0.560000,1.365854,1.024390,0.52,0.36,0.634146,0.439024,0.58,0.42,0.3,0.2,1.0,0.6,0.0,0.0,0,0,0,0,0.166667,0.000000,0.154844,1.039721
1224311,6160370,1,0,1,1.053,9.10,29.00,0.00,7.000,1.018,2.5,1.940,1.82,-2.0,2.200,1.64,1,0,0.84,0.25,0.0,0.25,0,0,0,0,0,1,0,0,0,0.786667,0.573333,1.404762,1.023810,0.54,0.36,0.642857,0.428571,0.59,0.41,0.3,0.2,1.0,0.6,0.0,0.0,0,0,0,0,0.166667,0.000000,0.154844,1.039721
1224312,6160370,1,0,1,1.053,9.10,29.00,0.00,7.000,1.018,2.5,1.952,1.81,-2.0,2.240,1.62,1,0,0.86,0.25,0.0,0.25,0,0,0,0,0,1,0,0,0,0.786667,0.613333,1.372093,1.069767,0.54,0.42,0.627907,0.488372,0.57,0.43,0.3,0.2,1.0,0.6,0.0,0.0,0,0,0,0,0.166667,0.000000,0.154844,1.039721
1224313,6160370,1,0,1,1.053,9.10,29.00,0.00,7.000,1.018,2.5,1.990,1.78,-1.5,1.615,2.25,1,0,0.88,0.25,0.0,0.25,0,0,0,0,0,1,0,0,0,0.786667,0.613333,1.340909,1.045455,0.54,0.44,0.613636,0.500000,0.57,0.43,0.3,0.2,1.0,0.6,0.0,0.0,0,0,0,0,0.166667,0.000000,0.154844,1.039721


#### split data & prepare files for upload

In [None]:
#%%writefile save_discription_upd.txt
np.savez_compressed('./dataset_upd',
                    X = data_df[data_df.columns[18:]].values,
                    y_bin = np.array((data_df['Result1'] > data_df['Result2']) *1, dtype = np.int8),
                    y_multi = 1 - np.sign((data_df['Result1'] - data_df['Result2']).values),
                    y_diff = (data_df['Result1'] - data_df['Result2']).values,
                    y_regression1 = ((data_df['Result1'] - data_df['Score1']) / 21).values,
                    y_regression2 = ((data_df['Result2'] - data_df['Score2']) / 21).values,
                    score1 = data_df['Score1'].values,
                    score2 = data_df['Score2'].values,
                    result1 = data_df['Result1'].values,
                    result2 = data_df['Result2'].values,
                    K_train = data_df[data_df.columns[3:16]].values,
                )

Writing save_discription_upd.txt


In [None]:
np.savez_compressed('./additional_data_upd',
                    id = data_df['Id'].values,
                    min = data_df['min_norm'].values,
)

In [None]:
params = {}
params['description'] = 'холдаут часть link \n' + \
'4 класса задач, бинарная классификация и мультикласс, и под регрессию 2 варианта без разделения на трейн, тест и холдаут'
params['features'] = list(data_df.columns)

#### upload data to neptune.ai

In [None]:
data_version = 'football_live_upd_230510/'
project = neptune.init_project(
    project="scomesse/football",
    api_token = api_key
    )
project[data_version + 'dataset'].upload('./dataset_upd.npz')
project[data_version + 'description'].upload('./save_discription_upd.txt')
project[data_version + 'additional_data'].upload('./additional_data_upd.npz')
project[data_version + 'info'].upload('/content/Index2/info.csv')
project[data_version + 'time'].upload('./time_upd.csv')
project[data_version + 'params'] = stringify_unsupported(params)
project.stop()

https://app.neptune.ai/scomesse/football/
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 7 operations to synchronize with Neptune. Do not kill this process.


ERROR:neptune.internal.operation_processors.async_operation_processor:Error occurred during asynchronous operation processing: Cannot upload file /content/Index2/info.csv: Path not found or is a not a file.


All 7 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/scomesse/football/metadata
