This notebook performs the final preparation of the filtered data, turning it into training data for the base models.

The notebook serves as a template: it produces training data for the range of parts specified by the variable `splits`. Each range yields a training dataset intended for the corresponding base model.

In [1]:
import numpy as np


# Passed to DataPreparator as `calc_tf_idf`.
CALC_TF_IDF = False

# Index of the base model this run prepares data for:
# 0 -> model-00 (parts 0-9), 1 -> model-01 (parts 10-19), ..., 6 -> model-06 (parts 60-69).
MODEL_INDEX = 0
# Number of consecutive parts assigned to each base model.
PARTS_PER_MODEL = 10

# One single-part chunk per part of the selected model's range.
splits = np.array_split(
    range(MODEL_INDEX * PARTS_PER_MODEL, (MODEL_INDEX + 1) * PARTS_PER_MODEL),
    PARTS_PER_MODEL,
)

splits

[array([0]),
 array([1]),
 array([2]),
 array([3]),
 array([4]),
 array([5]),
 array([6]),
 array([7]),
 array([8]),
 array([9])]

In [2]:
!pip install rapidfuzz -qq
!pip install transliterate
!pip install -U sentence-transformers

Collecting transliterate
  Downloading transliterate-1.10.2-py2.py3-none-any.whl (45 kB)
[K     |████████████████████████████████| 45 kB 571 kB/s 
Installing collected packages: transliterate
Successfully installed transliterate-1.10.2
Collecting sentence-transformers
  Downloading sentence-transformers-2.1.0.tar.gz (78 kB)
[K     |████████████████████████████████| 78 kB 341 kB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 562 kB/s 
Collecting huggingface-hub
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 353 kB/s 
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l- \ | done
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-2.1.0-py3-none-any.whl size=121000 sha256=1d6f452a7691c861e74ebf

In [3]:
import pandas as pd

from tqdm import tqdm

from collections import Counter

import gc
import cloudpickle

from sentence_transformers import util
import torch

import os, sys
import random

import warnings

import xgboost as xgb

# Report the XGBoost version in use; the ANSI escapes colour the text blue and then reset it.
print('Using \033[34mXGBoost', xgb.__version__, '\033[0m')

Using [34mXGBoost 1.5.0 [0m


## Installing wikimatcher

In [4]:
import sys


!git clone https://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@github.com/basic-go-ahead/wikimatcher.git
sys.path.append('./wikimatcher')

Cloning into 'wikimatcher'...
remote: Enumerating objects: 496, done.[K
remote: Counting objects: 100% (496/496), done.[K
remote: Compressing objects: 100% (342/342), done.[K
remote: Total 496 (delta 329), reused 315 (delta 151), pack-reused 0[K
Receiving objects: 100% (496/496), 70.46 KiB | 403.00 KiB/s, done.
Resolving deltas: 100% (329/329), done.


In [5]:
from wikimatcher.ranking2 import DataPreparator, basic_preprocessing_function

## Main Part

In [6]:
# Image table: default NaN markers are disabled, so empty fields stay as strings.
# Each row's index value is also stored in an `image_id` column.
images = pd.read_csv(
    '../input/traindataset-part0-4-count-5/images_part0_between4,5.csv',
    keep_default_na=False,
).assign(image_id=lambda frame: frame.index)

# Matchings table, read with pandas' default settings.
matchings = pd.read_csv('../input/combiner-target-dataset-0-of-5/final_matchings.csv')

In [7]:
# Drop a DataPreparator left over from a previous run of this cell (presumably to
# free its memory) before building a new one; on a fresh kernel there is nothing to drop.
try:
    del preparator
except NameError:
    pass
else:
    gc.collect()

preparator = DataPreparator(
    # In-memory tables loaded above.
    images=images,
    matchings=matchings,
    # Behaviour switches and sizes.
    calc_tf_idf=CALC_TF_IDF,
    part_amount=72,
    samples_per_part=2,
    # Templates/paths of the pre-computed inputs.
    part_format_string='../input/train-filter-part-{0:02d}-36/part-{1:02d}.parquet',
    frequency_format_string='../input/train-filter-part-{0:02d}-36/frequency-{1:02d}.pickle',
    prefinal_title_sentence_embeddings_path='../input/pagetitle-sentence-embeddings/prefinal_page_title_embeddings.data',
    prefinal_caption_sentence_embeddings_path='../input/caption-sentence-embeddings/prefinal_caption_embeddings.data',
)

## Preparing and Saving Features

In [9]:
def save_features(path, X, y, group, image_ids, target_ids):
    """Write one prepared dataset into the directory `path`.

    Files written, in this order:
      * features.parquet   -- `X`, via its own `to_parquet` method
      * targets.parquet    -- `y` wrapped in a DataFrame
      * group.pickle       -- `group`, serialized with cloudpickle
      * image_ids.parquet  -- `image_ids` wrapped in a DataFrame
      * target_ids.parquet -- `target_ids` wrapped in a DataFrame

    The function does not create `path`; the caller must make sure it exists.
    """
    def _inside(file_name):
        # Location of `file_name` inside the output directory.
        return os.path.join(path, file_name)

    X.to_parquet(_inside('features.parquet'))
    pd.DataFrame(y).to_parquet(_inside('targets.parquet'))

    with open(_inside('group.pickle'), 'wb') as sink:
        cloudpickle.dump(group, sink)

    id_tables = (
        ('image_ids.parquet', image_ids),
        ('target_ids.parquet', target_ids),
    )
    for file_name, values in id_tables:
        pd.DataFrame(values).to_parquet(_inside(file_name))

In [10]:
%%time

from pathlib import Path
import os


# Build and persist one training dataset per entry of `splits`.
for k, indices in enumerate(splits):
    print('\033[31m({0})\033[0m Handling indices: {1}'.format(k, indices))

    # Each split gets its own output directory: ./train-00, ./train-01, ...
    path2save = './train-{0:02d}'.format(k)
    Path(path2save).mkdir(exist_ok=True)

    X, y, group, image_ids, target_ids, sums = preparator.load_parts(
        indices, preprocessing_function=basic_preprocessing_function
    )

    # `sums` can be None (presumably when TF-IDF calculation is disabled -- TODO confirm);
    # otherwise print summary statistics of its TF_IDF_SUM column.
    if sums is not None:
        print(
            sums['TF_IDF_SUM'].mean(),
            sums['TF_IDF_SUM'].median(),
            sums['TF_IDF_SUM'].std(),
            sums['TF_IDF_SUM'].min(),
            sums['TF_IDF_SUM'].max(),
        )

    save_features(path2save, X, y, group, image_ids, target_ids)

    # Release everything load_parts returned before the next part is loaded.
    # `sums` is included: otherwise it stays referenced during the next load
    # and after the loop ends.
    del X, y, group, image_ids, target_ids, sums

    gc.collect()

[31m(0)[0m Handling indices: [0]
[31m(1)[0m Handling indices: [1]
[31m(2)[0m Handling indices: [2]
[31m(3)[0m Handling indices: [3]
[31m(4)[0m Handling indices: [4]
[31m(5)[0m Handling indices: [5]
[31m(6)[0m Handling indices: [6]
[31m(7)[0m Handling indices: [7]
[31m(8)[0m Handling indices: [8]
[31m(9)[0m Handling indices: [9]
CPU times: user 7min 29s, sys: 2min 32s, total: 10min 1s
Wall time: 9min 16s


In [11]:
%%time

# Build and persist the validation datasets from parts 70 and 71.
for k in [70, 71]:
    # Each validation part gets its own output directory: ./valid-70, ./valid-71.
    path2save = './valid-{}'.format(k)
    Path(path2save).mkdir(exist_ok=True)

    X, y, group, image_ids, target_ids, sums = preparator.load_parts(
        [k], preprocessing_function=basic_preprocessing_function
    )

    # `sums` can be None (presumably when TF-IDF calculation is disabled -- TODO confirm);
    # otherwise print summary statistics of its TF_IDF_SUM column.
    if sums is not None:
        print(
            sums['TF_IDF_SUM'].mean(),
            sums['TF_IDF_SUM'].median(),
            sums['TF_IDF_SUM'].std(),
            sums['TF_IDF_SUM'].min(),
            sums['TF_IDF_SUM'].max(),
        )

    save_features(path2save, X, y, group, image_ids, target_ids)

    # Release everything load_parts returned before the next part is loaded.
    # `sums` is included: otherwise it stays referenced during the next load
    # and after the loop ends.
    del X, y, group, image_ids, target_ids, sums

    gc.collect()

CPU times: user 1min 34s, sys: 35.1 s, total: 2min 9s
Wall time: 1min 58s


In [12]:
# Remove the ./wikimatcher directory recursively (presumably so the cloned sources
# are not kept alongside the generated datasets -- TODO confirm).
!rm -r ./wikimatcher