In [1]:
"""Script for building a Masked Language Modeling vocabulary for CharacterBERT."""
import os
import logging
import argparse
from collections import Counter

from tqdm import tqdm

In [2]:
# Root directory of the project, read from the WORKDIR environment variable.
# The lookup still raises KeyError when the variable is missing, but with a
# message that says how to fix it instead of a bare "KeyError: 'WORKDIR'".
try:
    WORKDIR = os.environ['WORKDIR']
except KeyError:
    raise KeyError(
        "The WORKDIR environment variable is not set; define it "
        "(e.g. `%env WORKDIR=/path/to/project`) before running this cell."
    ) from None

In [3]:
# Vocabularies are written under <WORKDIR>/data/mlm_vocabularies. The
# directory is created if needed; an existing one is reused as-is.
MLM_VOCABULARY_DIRECTORY = os.path.join(
    WORKDIR,
    'data',
    'mlm_vocabularies',
)
os.makedirs(MLM_VOCABULARY_DIRECTORY, exist_ok=True)

In [9]:
# Location of the formatted corpus (a relative path, so it is resolved
# against the current working directory). The path is assembled with
# os.path.join so that the separators match the host OS: a literal with
# hard-coded backslashes is only split into directories on Windows.
formatted_corpus_path = os.path.join(
    "data",
    "formatted",
    "Bangla_Pretraining_Data.formatted.txt",
)

In [10]:
# Upper bound on the number of distinct tokens kept in the MLM vocabulary;
# only the most frequent tokens of the corpus are retained.
# NOTE(review): 30522 equals the size of the original BERT uncased WordPiece
# vocabulary - presumably chosen on purpose; confirm.
max_vocabulary_size = 30_522

In [11]:
# The output sub-directory is named after the directory that directly
# contains the formatted corpus file (i.e. the last component of the
# corpus file's parent path), and is created if it does not exist yet.
prefix = os.path.split(os.path.dirname(formatted_corpus_path))[1]
save_path = os.path.join(
    MLM_VOCABULARY_DIRECTORY,
    prefix,
)
os.makedirs(save_path, exist_ok=True)

In [12]:
# Full path of the vocabulary file written at the end of this notebook.
mlm_vocabulary_fpath = os.path.join(
    save_path,
    'mlm_vocab.txt',
)

In [13]:
# Layout of every log line:
#   timestamp | PID: process id | source file name | level - message
LOGGING_FORMAT = (
    "%(asctime)s | PID: %(process)d | "
    "%(filename)s | %(levelname)s - %(message)s"
)

# Configure the root logger to emit INFO and above, with day/month/year
# timestamps.
logging.basicConfig(
    level=logging.INFO,
    format=LOGGING_FORMAT,
    datefmt="%d/%m/%Y %H:%M:%S",
)

In [14]:
# Echo the run parameters so that they are recorded next to the results.
logging.info('Preparing to build a MLM vocabulary using parameters:')
logging.info(' * formatted_corpus_path: %s', formatted_corpus_path)
logging.info(' * max_vocabulary_size: %s', max_vocabulary_size)
# Only report a pre-existing vocabulary file; nothing is deleted or skipped
# here.
if os.path.exists(mlm_vocabulary_fpath):
    logging.warning('Found existing vocabulary file: %s', mlm_vocabulary_fpath)

22/07/2023 09:51:25 | PID: 19484 | 3816109827.py | INFO - Preparing to build a MLM vocabulary using parameters:
22/07/2023 09:51:25 | PID: 19484 | 3816109827.py | INFO -  * formatted_corpus_path: data\formatted\Bangla_Pretraining_Data.formatted.txt
22/07/2023 09:51:25 | PID: 19484 | 3816109827.py | INFO -  * max_vocabulary_size: 30522


In [15]:
 # Count all the tokens in the corpus
# Tally every whitespace-separated token of the corpus, streaming the file
# line by line so that the whole corpus is never held in memory.
counter = Counter()
logging.info('Reading corpus file: %s', formatted_corpus_path)
with open(formatted_corpus_path, 'r', encoding="utf-8") as stream:
    for line in tqdm(stream, desc='Reading lines...'):
        # str.split() without arguments ignores leading/trailing whitespace
        # and returns [] for blank lines, so no strip()/emptiness check is
        # needed before updating the counter.
        counter.update(line.split())

# Most frequent tokens. most_common(k) selects the k largest counts directly
# instead of sorting the full vocabulary and slicing it afterwards; tokens
# with equal counts keep their first-encountered order in both forms.
topk_tokens = counter.most_common(max_vocabulary_size)
if not topk_tokens:
    # Fail with an explicit message rather than an IndexError raised by the
    # topk_tokens[0] / topk_tokens[-1] lookups below.
    raise ValueError(
        f'No tokens were selected from {formatted_corpus_path!r}: the corpus '
        'contains no tokens or max_vocabulary_size is not positive.'
    )

logging.info('Final vocabulary size: %s', len(topk_tokens))
logging.info('Most frequent token: \'%s\' (%s)', topk_tokens[0][0], topk_tokens[0][1])
logging.info('Least frequent token: \'%s\' (%s)', topk_tokens[-1][0], topk_tokens[-1][1])

22/07/2023 09:51:26 | PID: 19484 | 1533991252.py | INFO - Reading corpus file: data\formatted\Bangla_Pretraining_Data.formatted.txt
Reading lines...: 6770742it [00:21, 321158.98it/s]
22/07/2023 09:51:47 | PID: 19484 | 1533991252.py | INFO - Final vocabulary size: 30522
22/07/2023 09:51:47 | PID: 19484 | 1533991252.py | INFO - Most frequent token: '।' (6395384)
22/07/2023 09:51:47 | PID: 19484 | 1533991252.py | INFO - Least frequent token: 'পানিতেই' (101)


In [16]:
# Write the selected tokens to the vocabulary file, one "<count> <token>"
# entry per line, in the order given by topk_tokens.
with open(mlm_vocabulary_fpath, 'w', encoding="utf-8") as vocabulary_file:
    for token, count in topk_tokens:
        # print() joins its arguments with a single space and appends "\n".
        print(count, token, file=vocabulary_file)