<a href="https://colab.research.google.com/github/cappelchi/calcio_notebooks/blob/main/EDA/football_Dataset_preparation_for_TF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Data from 31.10.2022

### Installations

In [1]:
try:
    import neptune.new as neptune
except:
    !pip install neptune-client >> None
    import neptune.new as neptune

def get_credential(frmwork = 'neptune_team'):
    with open('credential.txt', 'r') as container:
        for line in container:
            if frmwork in line:
                login, psw = line.split(' ')[1], line.split(' ')[2].split('\n')[0]
                return login, psw

In [2]:
!pip install --upgrade gensim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gensim
  Downloading gensim-4.2.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[K     |████████████████████████████████| 24.1 MB 1.3 MB/s 
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.2.0


### Downloads

In [3]:
data_version = 'data_221101/'
username, api_key = get_credential()
project = neptune.init_project(
    name="scomesse/football", 
    api_token = api_key
    )
project[data_version + 'raw_data'].download('./results.rar')
project.stop()

https://app.neptune.ai/scomesse/football/
Remember to stop your project once you’ve finished logging your metadata (https://docs.neptune.ai/api/project#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
Shutting down background jobs, please wait a moment...
Done!
All 0 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/scomesse/football/metadata


### Imports

In [4]:
import pandas as pd
import numpy as np
pd.options.display.max_columns = 50
pd.options.display.max_rows = 100
print(pd.__version__)

from glob import glob
from tqdm import tqdm
import functools
import subprocess

1.3.5


In [6]:
import gensim
from gensim.models import Word2Vec
print(gensim.__version__)

4.2.0


### Code

#### Load Dataset in DataFrame

In [7]:
def run_bash(bashCommand:str, nameCommand = ''):
        process = subprocess.Popen([bashCommand], 
                           shell=True)
        _, error = process.communicate()
        if error:
            print(f'{nameCommand} error:\n', error)

In [9]:
bashCommand = f"""
mkdir -p ./calcio/results
unrar e ./results.rar ./calcio/results/
"""
run_bash(bashCommand, 'tar_wv') #word2vec_vs16_sg1.wordvectors.tar.gz

In [10]:
%%time
start_date = '2008-01-01'
end_date = '2022-09-30'
data_csv_list = ['./calcio/results/' + str(dd).replace('-', '') + '.csv' 
                 for dd in pd.date_range(start=start_date, end=end_date).date]
data_df = pd.concat(map(functools.partial(pd.read_csv, sep=';', compression=None),data_csv_list), ignore_index = True)

CPU times: user 29.3 s, sys: 3.81 s, total: 33.1 s
Wall time: 34.2 s


In [11]:
dups_list = list(data_df.Id.value_counts().index[data_df.Id.value_counts() > 1])

In [12]:
for dup in dups_list:
    data_df.Id[data_df.Id == dup] = [id + str(cnt) for cnt, id in enumerate(data_df.Id[data_df.Id == dup])]

In [13]:
data_df['timestamp'] = pd.to_datetime(data_df['BeginTime'], dayfirst = True).astype('int64') // 10**9

  data_df['timestamp'] = pd.to_datetime(data_df['BeginTime'], dayfirst = True).astype('int64') // 10**9


In [14]:
data_df[['date', 'times_ext']] = data_df['BeginTime'].str.split(expand = True)
data_df = data_df.drop(['BeginTime', 'times_ext'], axis = 'columns')

#### Refactoring DataFrame

In [15]:
data_df['sum_score'] = data_df['Result1'] + data_df['Result2'] # Сумма голов в матче
# Приводим все счета к сумме мячей в матче не больше 10 (adj)
data_df['sum_score_k'] = [1 if score_k < 11 else score_k / 10 for score_k in data_df['sum_score']]
data_df['home_score_adj'] = (data_df['Result1'] / data_df['sum_score_k']).astype(int)
data_df['away_score_adj'] = (data_df['Result2'] / data_df['sum_score_k']).astype(int)
data_df['score_adj'] = data_df['home_score_adj'].astype(str) + '-' + data_df['away_score_adj'].astype(str)

#### Refactoring DataFrame

In [16]:
winners_list = []
for hm, aw in tqdm(data_df[['Result1', 'Result2']].values):
    if hm > aw:
        winners_list.append('H')
    elif hm < aw:
        winners_list.append('A')
    else:
        winners_list.append('D')
data_df['Winner'] = winners_list

100%|██████████| 2703623/2703623 [00:03<00:00, 697868.20it/s]


In [17]:
data_df.HomeId = data_df.HomeId.astype(int)
data_df.AwayId = data_df.AwayId.astype(int)

In [18]:
data_df = data_df.reset_index(drop = True)

In [19]:
print(f'Разнообразие результатов: {len(data_df.score_adj.value_counts())}')
print(f'Количество лиг: {len(data_df.League.value_counts())}')
print(f'Количество сезонов: {len(data_df.Season.value_counts())}')
print(f'Разнообразие типов встреч: {len(data_df.Round.value_counts())}')
print(f'Количество стран: {len(data_df.Country.value_counts())}')
print(f'Повторно лиги: {len(data_df.League.value_counts())}')

Разнообразие результатов: 66
Количество лиг: 1968
Количество сезонов: 263
Разнообразие типов встреч: 2403
Количество стран: 202
Повторно лиги: 1968


#### Расчитываем время прошедшее от прошлого матча и делаем словарик связей матчей

In [20]:
data_df = data_df.sort_values(by = 'timestamp').reset_index(drop = True)
timestamp7days = 7 * 24 * 60 * 60

In [21]:
team_GId_dict = {}
zero_diff = 0
for info in tqdm(
    zip(
        data_df.timestamp,
        data_df.HomeId,
        data_df.AwayId,
        data_df.Id,
    ),
    total=len(data_df),
                    ):
    time_stamp = info[0]
    homeID = info[1]
    awayID = info[2]
    matchID = info[3]
    # 1. Проверить если ID команды в словаре, если нет перейти к добавлению
    if homeID in team_GId_dict:
        # 2. Проверить время, если время позднее последнего добаления,
        # то можно просто присоединить снизу, инфо о последнем матче команды,
        # иначе перейти во вставку матча между матчами
        if matchID not in team_GId_dict[homeID]:
            if time_stamp >= team_GId_dict[homeID]["last_time"]:
                # 3. Добавление матча для команды в словарь
                previous_num = team_GId_dict[homeID]["last_index"]
                previous_time = team_GId_dict[homeID]["last_time"]
                # 4. Обновление блока последнего матча для команды
                team_GId_dict[homeID]["last_index"] = matchID
                team_GId_dict[homeID]["last_time"] = time_stamp
                # 3. Добавление матча для команды в словарь
                team_GId_dict[homeID].update(
                    {
                        matchID: [
                            previous_num,
                            time_stamp,
                        ]
                    }
                )
            else:
                zero_diff += 1
                # 5. Поиск точки вхождение для матча, который оказался не новым
                current_index = team_GId_dict[homeID]["last_index"]
                previous_match_time = team_GId_dict[homeID][current_index][1]
                previous_index = team_GId_dict[homeID][current_index][0]
                while (info[0] < previous_match_time) & (previous_index != -1):
                    current_index = previous_index
                    previous_index = team_GId_dict[homeID][previous_index][0]
                    previous_match_time = team_GId_dict[homeID][current_index][1]
                # 6. Вставка матча и обновление соседних 2 матчей
                team_GId_dict[homeID].update(
                    {matchID: [team_GId_dict[homeID][current_index][0], time_stamp]}
                )
                team_GId_dict[homeID].update(
                    {current_index: [matchID, time_stamp]}
                )

    else:
        # 3. Добавление матча для команды в словарь. Новая команда
        team_GId_dict[homeID] = {"last_index": info[3]}
        team_GId_dict[homeID].update({"last_time": time_stamp})
        team_GId_dict[homeID].update(
            {
                matchID: [
                    -1,
                    time_stamp - timestamp7days
                ]
            }
        )

    #############################################################################
    #############################################################################

    if awayID in team_GId_dict:
        # 2. Проверить время, если время позднее последнего добаления,
        # то можно просто присоединить снизу, инфо о последнем матче команды,
        # иначе перейти во вставку матча между матчами
        if matchID not in team_GId_dict[awayID]:
            if time_stamp >= team_GId_dict[awayID]["last_time"]:
                # 3. Добавление матча для команды в словарь
                previous_num = team_GId_dict[awayID]["last_index"]
                previous_time = team_GId_dict[awayID]["last_time"]
                # 4. Обновление блока последнего матча для команды
                team_GId_dict[awayID]["last_index"] = matchID
                team_GId_dict[awayID]["last_time"] = time_stamp
                # 3. Добавление матча для команды в словарь
                team_GId_dict[awayID].update(
                    {
                        matchID: [
                            previous_num,
                            time_stamp,
                        ]
                    }
                )
            else:
                zero_diff += 1
                # 5. Поиск точки вхождение для матча, который оказался не новым
                current_index = team_GId_dict[awayID]["last_index"]
                previous_match_time = team_GId_dict[awayID][current_index][1]
                previous_index = team_GId_dict[awayID][current_index][0]
                while (time_stamp < previous_match_time) & (previous_index != -1):
                    current_index = previous_index
                    previous_index = team_GId_dict[awayID][previous_index][0]
                    previous_match_time = team_GId_dict[awayID][current_index][1]
                # 6. Вставка матча и обновление соседних 2 матчей
                team_GId_dict[awayID].update(
                    {matchID: [team_GId_dict[awayID][current_index][0], time_stamp]}
                )
                team_GId_dict[awayID].update(
                    {current_index: [matchID, time_stamp]}
                )

    else:
        # 3. Добавление матча для команды в словарь. Новая команда
        team_GId_dict[awayID] = {"last_index": info[3]}
        team_GId_dict[awayID].update({"last_time": time_stamp})
        team_GId_dict[awayID].update(
            {
                matchID: [
                    -1,
                    time_stamp - timestamp7days
                ]
            }
        )


100%|██████████| 2703623/2703623 [00:28<00:00, 93561.11it/s] 


In [22]:
data_df['team_rest_home'] = [team_GId_dict[team][idx][1] - team_GId_dict[team][team_GId_dict[team][idx][0]][1]
                             if team_GId_dict[team][idx][0] != -1 else timestamp7days
                             for team, idx in tqdm(zip(data_df.HomeId, data_df.Id), total = len(data_df))]
data_df['team_rest_away'] = [team_GId_dict[team][idx][1] - team_GId_dict[team][team_GId_dict[team][idx][0]][1]
                             if team_GId_dict[team][idx][0] != -1 else timestamp7days
                             for team, idx in tqdm(zip(data_df.AwayId, data_df.Id), total = len(data_df))]

100%|██████████| 2703623/2703623 [00:07<00:00, 378542.02it/s]
100%|██████████| 2703623/2703623 [00:07<00:00, 380970.16it/s]


In [23]:
# Отдых команд разделяем на 3 группы
data_df['team_rest_home_adj'] = [0 if tm < 500_000 else 1 if tm < 1_000_000 else 2 for tm in data_df['team_rest_home']]
data_df['team_rest_away_adj'] = [0 if tm < 500_000 else 1 if tm < 1_000_000 else 2 for tm in data_df['team_rest_away']]

In [24]:
non_regular_slice = \
(data_df.League.str.contains('copa')) | \
(data_df.League.str.contains('coppa')) | \
(data_df.League.str.contains('cup')) | \
(data_df.League.str.contains('final')) | \
(data_df.League.str.contains('friend')) | \
(data_df.League.str.contains('play-off')) | \
(data_df.League.str.contains('qual')) | \
(data_df.League.str.contains('tourn')) | \
(data_df.League.str.contains('pokal'))

In [25]:
data_df['local_match'] = 1
data_df['local_match'][non_regular_slice] = 0

In [59]:
match_cat_dict4 = {}
cnt4 = 1
for home_score in range(11):
    for away_score in range(11):
        for home_place in range(2):
            for regular_match in range(2):
                for rest_time in range(3):
                    if (home_score + away_score) < 11:
                        match_cat_dict4[
                            str(home_score) + '-' + \
                            str(away_score) + ':' + \
                            str(home_place) + ':' + \
                            str(regular_match) + ':' + \
                            str(rest_time)
                                    ] = cnt4
                        cnt4 += 1
none_idx4 = max(match_cat_dict4.values())
print('len_match_cat_dict4: ', len(match_cat_dict4))
print('none_idx4: ', none_idx4)

len_match_cat_dict4:  792
none_idx4:  792


In [27]:
input_list = []
for info in tqdm(zip(data_df['score_adj'], data_df['local_match'],   data_df['team_rest_home_adj']), total = len(data_df)):
    cat_key = info[0] + ':' + '0' + ':' + str(info[1]) + ':' + str(info[2])
    input_list.append(cat_key)
#data_df['home_token3'] = input_list
data_df['home_token4'] = input_list #data_df['home_token3'] + ':' +  data_df['team_rest_home_adj'].astype(str)

100%|██████████| 2703623/2703623 [00:04<00:00, 595079.23it/s]


In [28]:
input_list = []
for info in tqdm(zip(data_df['score_adj'], data_df['local_match'],  data_df['team_rest_away_adj']), total = len(data_df)):
    cat_key = info[0] + ':' + '1' + ':' + str(info[1]) + ':' + str(info[2])
    input_list.append(cat_key)
data_df['away_token4'] = input_list

100%|██████████| 2703623/2703623 [00:04<00:00, 615789.24it/s]


In [29]:
data_df['home_idx'] = [match_cat_dict4[idx] for idx in tqdm(data_df['home_token4'], total = len(data_df))]
data_df['away_idx'] = [match_cat_dict4[idx] for idx in tqdm(data_df['away_token4'], total = len(data_df))]

100%|██████████| 2703623/2703623 [00:01<00:00, 2238850.24it/s]
100%|██████████| 2703623/2703623 [00:01<00:00, 1579334.39it/s]


In [30]:
for info in tqdm(zip(data_df.HomeId, data_df.AwayId, data_df.Id, data_df.home_idx, data_df.away_idx), total = len(data_df)):
    homeID = info[0]
    awayID = info[1]
    matchID = info[2]
    homeIDX = info[3]
    awayIDX = info[4]
    team_GId_dict[homeID][matchID] += [homeIDX]
    team_GId_dict[awayID][matchID] += [awayIDX]

100%|██████████| 2703623/2703623 [00:10<00:00, 263158.31it/s]


#### Кодируем вектор истории произвольной глубины

In [31]:
def idx_recursive(current_team:int, 
                  current_index:int,
                  loop_back:int,
                  main_dict = team_GId_dict,
                  final_list = None) -> list:
    if final_list is None:
        final_list = []
    previous_index = main_dict[current_team][current_index][0]
    if previous_index == -1:
        final_list = [0] * loop_back
        return final_list
        
    previous_idx = main_dict[current_team][previous_index][2]
    loop_back -=1
    if loop_back > 0:    
        final_list = idx_recursive(current_team,
                                   previous_index,
                                   loop_back,
                                   main_dict = main_dict,
                                   final_list = final_list)
    final_list.append(previous_idx)
    return final_list

In [32]:
look_back = 10
input_list = []
for idx in tqdm(zip(data_df.Id, data_df['HomeId']),
                total = len(data_df)):

    input_list.append(  
        idx_recursive(
            idx[1], 
            idx[0],
            look_back,
                      )[::-1])
data_df[[f'home_input_{num}' for num in range(1, 1 + look_back)]] = input_list

100%|██████████| 2703623/2703623 [00:42<00:00, 64152.50it/s]


In [33]:
look_back = 10
input_list = []
for idx in tqdm(zip(data_df.Id, data_df['AwayId']),
                total = len(data_df)):

    input_list.append(  
        idx_recursive(
            idx[1], 
            idx[0],
            look_back,
                      )[::-1])
data_df[[f'away_input_{num}' for num in range(1, 1 + look_back)]] = input_list

100%|██████████| 2703623/2703623 [00:47<00:00, 56983.40it/s]


#### Encoding output

In [34]:
data_df['binary_output'] = 0
data_df['binary_output'][data_df['Winner'] == 'H'] = 1

In [35]:
data_df['class_H'] = 0
data_df['class_D'] = 0
data_df['class_A'] = 0
data_df['class_H'][data_df['Winner'] == 'H'] = 1
data_df['class_D'][data_df['Winner'] == 'D'] = 1
data_df['class_A'][data_df['Winner'] == 'A'] = 1

In [36]:
data_ready_columns = [
         'date',
         'timestamp',
         'Id',
         'local_match',
         'team_rest_home_adj',  
         'home_input_1', 
         'home_input_2', 
         'home_input_3', 
         'home_input_4', 
         'home_input_5', 
         'home_input_6', 
         'home_input_7', 
         'home_input_8', 
         'home_input_9', 
         'home_input_10', 
         'team_rest_away_adj', 
         'away_input_1', 
         'away_input_2', 
         'away_input_3', 
         'away_input_4', 
         'away_input_5', 
         'away_input_6', 
         'away_input_7', 
         'away_input_8', 
         'away_input_9', 
         'away_input_10',
         'binary_output', 
         'class_H',
         'class_D', 
         'class_A']

In [37]:
data_version = 'data_221101/'
username, api_key = get_credential()
upload = False
if upload:
    import pickle
    with open("./team_GId_dict.pickle", "wb") as f:
        pickle.dump(team_GId_dict, f, protocol=pickle.HIGHEST_PROTOCOL)
    with open("./match_cat_dict4.pickle", "wb") as f:
        pickle.dump(match_cat_dict4, f, protocol=pickle.HIGHEST_PROTOCOL)
    data_df[data_ready_columns].to_csv('./data_ready.csv.gz', compression ={'method':'gzip'}, index = None)
    dataset_params = {}
    dataset_params['description'] = ''
    project = neptune.init_project(
        name="scomesse/football", 
        api_token = api_key
        )
    project[data_version + 'team_GId_dict'].upload('./team_GId_dict.pickle')
    project[data_version + 'match_cat_dict4'].upload('./match_cat_dict4.pickle')
    project[data_version + 'data_ready'].upload('./data_ready.csv.gz')
    project[data_version + 'params'] = dataset_params
    project.stop()

####Create new Word2Vec model

In [38]:
word2vec_is_ready = True
if word2vec_is_ready:
    data_version = 'data_221101/'
    username, api_key = get_credential()
    project = neptune.init_project(
        name="scomesse/football", 
        api_token = api_key
        )
    project[data_version + 'word2vec'].download('./word2vec.wordvectors.tar.gz')
    word2vec_params = project[data_version + 'word2vec_params'].fetch()
    project.stop()

    def run_bash(bashCommand:str, nameCommand = ''):
            process = subprocess.Popen([bashCommand], 
                            shell=True)
            _, error = process.communicate()
            if error:
                print(f'{nameCommand} error:\n', error)
    bashCommand = f"""
    tar -zxvf ./word2vec.wordvectors.tar.gz 
    """
    run_bash(bashCommand, 'tar_wv')
    from gensim.models import KeyedVectors
    wv = KeyedVectors.load('./word2vec_vs16_sg1.wordvectors', mmap='r')

https://app.neptune.ai/scomesse/football/
Remember to stop your project once you’ve finished logging your metadata (https://docs.neptune.ai/api/project#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
Shutting down background jobs, please wait a moment...
Done!
All 0 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/scomesse/football/metadata


In [42]:
names =['home_input_1', 
         'home_input_2', 
         'home_input_3', 
         'home_input_4', 
         'home_input_5', 
         'home_input_6', 
         'home_input_7', 
         'home_input_8', 
         'home_input_9', 
         'home_input_10',
         'away_input_1', 
         'away_input_2', 
         'away_input_3', 
         'away_input_4', 
         'away_input_5', 
         'away_input_6', 
         'away_input_7', 
         'away_input_8', 
         'away_input_9', 
         'away_input_10']

In [43]:
# каждый и строку историй для каждой команды в матче переводим в list, условное строковое предложение
# далее для этих предложений или корпуса предложений создаём word2vec модель
corpus = []
for document in tqdm(data_df[names].values, total = len(data_df)):
    #words = [tokenizer[word] for word in document if word != 0]
    words = [word for word in document if word != 0]
    if words:
        corpus.append(words)

100%|██████████| 2703623/2703623 [00:30<00:00, 87248.39it/s] 


<img src="https://api.monosnap.com/file/download?id=lq8rws2WsGIDPxadAfE9VA5lqv5VvK"/>

In [None]:
#@title Create new Word2Vec model
create_model = False #@param {type:"boolean"}

if create_model:
    # Using params from Word2Vec_FastText_Comparison
    #EMBEDDING_DIM = 256
    word2vec_params = {
        #'alpha': 0.05,
        'vector_size': 16,
        'window': 20,
        'epochs': 5,
        'min_count': 1,
        #'sample': 1e-4,
        'sg': 1,
        'hs': 0,
        #'negative': 5,
    }
    model = Word2Vec(corpus, **word2vec_params)
    wv = model.wv
    print("Word2Vec trained model", wv)
    vectors_path = './word2vec.wordvectors'
    wv.save(vectors_path)
    !tar -zcvf {vectors_path}.tar.gz {vectors_path}*

Word2Vec trained model KeyedVectors<vector_size=16, 792 keys>
./word2vec.wordvectors


In [None]:
if create_model:
    data_version = 'data_221101/'
    vectors_path = './word2vec_vs16_sg1.wordvectors'

    username, api_key = get_credential()
    upload = True
    if upload:
        project = neptune.init_project(
            name="scomesse/football", 
            api_token = api_key
            )
        project[data_version + 'word2vec'].upload(f'{vectors_path}.tar.gz')
        project[data_version+ 'word2vec_params'] = word2vec_params
        project.stop()

https://app.neptune.ai/scomesse/football/
Remember to stop your project once you’ve finished logging your metadata (https://docs.neptune.ai/api/project#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 7 operations to synchronize with Neptune. Do not kill this process.
All 7 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/scomesse/football/metadata


In [44]:
# создаём матрицу векторов слов из word2vec модели
# а потом вектор слова будем индексировать по словарю из след. ячейки
# (wv.index_to_key) или word ->  слово у нас представлено цифрой
embedding_matrix = np.zeros((len(wv.index_to_key) + 1, word2vec_params['vector_size']))
for num in tqdm(range(len(wv.index_to_key)), total = len(wv.index_to_key)):
    embedding_matrix[num + 1] = wv[wv.index_to_key[num]]

100%|██████████| 792/792 [00:00<00:00, 123835.56it/s]


In [45]:
idx_arr = np.zeros(max(wv.key_to_index) + 1)
for key, value in wv.key_to_index.items():
    idx_arr[key] = value + 1

In [46]:
val_date = pd.to_datetime('2022-01-01').timestamp()
train_date = pd.to_datetime('2019-01-01').timestamp()

In [47]:
val_date = pd.to_datetime('2022-01-01').timestamp()
train_date = pd.to_datetime('2019-01-01').timestamp()
validation_vector = (data_df['timestamp'] > val_date).values
test_vector = ((data_df['timestamp'] < val_date) & (data_df['timestamp'] > train_date)).values
train_vector = (data_df['timestamp'] < train_date).values
print(train_vector.sum(), test_vector.sum(), validation_vector.sum())

1923326 582753 197544


In [48]:
# Делим трейн -тест - валидация
X_train = idx_arr[data_df[names][train_vector].values].astype(int)
X_test = idx_arr[data_df[names][test_vector].values].astype(int)
X_validation = idx_arr[data_df[names][validation_vector].values].astype(int)

# Our target variable
y_train = data_df['binary_output'][train_vector].values
y_test = data_df['binary_output'][test_vector].values
y_validation = data_df['binary_output'][validation_vector].values

# Our target variable
y_class_train = data_df[['class_H', 'class_D', 'class_A']][train_vector].values
y_class_test = data_df[['class_H', 'class_D', 'class_A']][test_vector].values
y_class_validation = data_df[['class_H', 'class_D', 'class_A']][validation_vector].values

In [50]:
# Обрабатываем кэфы, для последующих валидаций
data_version = 'data_221101/'
project = neptune.init_project(
    name="scomesse/football", 
    api_token = api_key
    )
project[data_version + 'SwCoefs'].download('./SwCoefs.csv')
project.stop()

https://app.neptune.ai/scomesse/football/
Remember to stop your project once you’ve finished logging your metadata (https://docs.neptune.ai/api/project#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
Shutting down background jobs, please wait a moment...
Done!
All 0 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/scomesse/football/metadata


In [51]:
file_path = './SwCoefs.csv'

In [52]:
perm_df = pd.read_csv(file_path , sep = ';')
perm_df = perm_df.drop_duplicates(subset = ['Id'], keep = False)
perm_df = perm_df[perm_df[['LW1', 'LX', 'LW2']].notna().all(axis = 1)]


In [53]:
production_vector = data_df.Id.isin(perm_df.Id)
X_production = idx_arr[data_df[names][production_vector].values].astype(int)
y_production = data_df['binary_output'][production_vector].values
y_class_production = data_df[['class_H', 'class_D', 'class_A']][production_vector].values

In [54]:
perm_df.set_index('Id').reindex(
                data_df['Id'][production_vector]
                                ).to_csv(f'{file_path.replace(".csv", "")}_filtered.csv')

In [55]:
line_array = perm_df.set_index('Id')[
    ['LW1', 'LX', 'LW2']
            ].reindex(
                data_df['Id'][production_vector]
                                ).values

In [56]:
npz_name = 'dataset'
np.savez_compressed('./' + npz_name, 
                    X_train = X_train, X_test = X_test, X_validation = X_validation,
                    X_production = X_production,
                    y_train = y_train, y_test = y_test, y_validation = y_validation,
                    y_production = y_production,
                    y_class_train = y_class_train, y_class_test = y_class_test,
                    y_class_validation = y_class_validation,
                    y_class_production = y_class_production,
                    Line_production = line_array,
                    embedding_matrix = embedding_matrix
                    )

In [None]:
npz_upload = False
if npz_upload:
    data_version = 'data_221212/'
    project = neptune.init_project(
        name="scomesse/football", 
        api_token = api_key
        )
    project[data_version + npz_name +'_npz'].upload('./' + npz_name + '.npz')
    project.stop()

https://app.neptune.ai/scomesse/football/
Remember to stop your project once you’ve finished logging your metadata (https://docs.neptune.ai/api/project#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 1 operations to synchronize with Neptune. Do not kill this process.
All 1 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/scomesse/football/metadata


In [57]:
perm_df = perm_df.set_index('Id').reindex(
                data_df['Id'][production_vector]
                                )
hasprem_vector = (perm_df.HasPrem.values == 'F')
tier2_vector = (perm_df.Tier.values == 2)
tier3_vector = (perm_df.Tier.values == 3)

In [58]:
vector_name = 'vectors'
np.savez_compressed('./' + vector_name, 
                    hasprem_vector = hasprem_vector,
                    tier2_vector = tier2_vector,
                    tier3_vector = tier3_vector
                    )

In [None]:
special_vectors_upload = False:
if special_vectors_upload:
    data_version = 'data_221212/'
    project = neptune.init_project(
        name="scomesse/football", 
        api_token = api_key
        )
    project[data_version + vector_name].upload('./' + vector_name + '.npz')
    project.stop()

https://app.neptune.ai/scomesse/football/
Remember to stop your project once you’ve finished logging your metadata (https://docs.neptune.ai/api/project#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 1 operations to synchronize with Neptune. Do not kill this process.
All 1 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/scomesse/football/metadata
