In [None]:
%run ./../utils/_logger.ipynb
%run ./../utils/_preprocess-utils.ipynb

In [None]:
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer

In [None]:
# Directory and file used to cache the tokenized corpus.
# NOTE(review): PROC_DATA_DIR is not defined in this notebook — presumably it
# comes from one of the %run utility notebooks; confirm.
TRF_DATA_DIR = f'{PROC_DATA_DIR}/trf'
# The array is written with np.save through an open file handle, so the
# '.np' suffix is kept exactly as spelled here.
TRF_DATA_PATH = f'{TRF_DATA_DIR}/trf_data.np'

# Checkpoint name handed to AutoTokenizer.from_pretrained below.
TRF_MODEL_NAME = 'distilbert-base-uncased'

# Passed as max_length to the tokenizer: every document is truncated or
# padded to this many tokens.
SAMPLE_LENGTH = 512

In [None]:
# Text-cleaning callables handed to apply_filters() for every document before
# tokenization.
# NOTE(review): the filter functions are defined outside this notebook and are
# presumably applied in tuple order — confirm against apply_filters.
DOCUMENT_FILTERS = (deaccent, lower_to_unicode, strip_tags, strip_multiple_whitespaces)

In [None]:
# Build the transformer-ready encoding of the corpus once and cache it on disk;
# later runs load the cached array instead of re-tokenizing.
#
# Result: `trf_data`, an array stacking input ids and attention masks along
# axis 1, i.e. trf_data[:, 0] holds the input ids and trf_data[:, 1] the
# attention masks (each row padded/truncated to SAMPLE_LENGTH tokens).
#
# NOTE(review): df, doc_col, logger, SILENT, SUB_PATTERN, Path and the
# apply_filters / sub_pattern / remove_unprintable helpers are not defined in
# this notebook — presumably provided by the %run utility notebooks; confirm.
if not Path(TRF_DATA_PATH).is_file():
    corpus = df[doc_col].values
    tokenizer = AutoTokenizer.from_pretrained(TRF_MODEL_NAME)

    logger.info("Preprocessing corpus...")
    corpus = [apply_filters(doc, filters=DOCUMENT_FILTERS) for doc in tqdm(corpus, disable=SILENT)]

    logger.info("Replacing special characters...")
    corpus = [sub_pattern(doc, pattern=SUB_PATTERN) for doc in tqdm(corpus, disable=SILENT)]

    logger.info("Removing unprintable characters...")
    corpus = [remove_unprintable(doc) for doc in tqdm(corpus, disable=SILENT)]

    logger.info("Tokenizing corpus...")
    # truncation + padding='max_length' give every sample exactly SAMPLE_LENGTH
    # tokens, which is what allows the per-document lists to be stacked below.
    corpus = [tokenizer.encode_plus(doc, max_length=SAMPLE_LENGTH, truncation=True, padding='max_length',
                                    add_special_tokens=True, return_attention_mask=True, return_token_type_ids=False)
              for doc in tqdm(corpus, disable=SILENT)]

    trf_input_ids = [sample['input_ids'] for sample in corpus]
    trf_attention_mask = [sample['attention_mask'] for sample in corpus]
    trf_data = np.stack((trf_input_ids, trf_attention_mask), axis=1)

    logger.info("Storing encoded corpus to disk...")
    Path(TRF_DATA_DIR).mkdir(parents=True, exist_ok=True)
    # Write to a sibling temp file first and rename it into place afterwards.
    # The cache check above only tests that TRF_DATA_PATH exists, so writing to
    # the final path directly would let an interrupted save leave a truncated
    # file that every later run would try to load. A leftover temp file from a
    # failed run is simply overwritten on the next attempt.
    trf_tmp_path = Path(f'{TRF_DATA_PATH}.tmp')
    with open(trf_tmp_path, 'wb') as f:
        np.save(f, trf_data)
    trf_tmp_path.replace(TRF_DATA_PATH)
else:
    # Cache hit: reuse the previously stored encoding.
    with open(TRF_DATA_PATH, 'rb') as f:
        trf_data = np.load(f)