In [None]:
%run ./../utils/_logger.ipynb
%run ./../utils/_preprocess-utils.ipynb

In [None]:
from tqdm import tqdm
from spacy.lang.en import English

In [None]:
# Vocabulary pruning thresholds: these are forwarded to
# `dictionary.filter_extremes` as `no_below` and `no_above` respectively.
MIN_OCCURENCES = 5
NO_ABOVE = 1

# Text filters handed to `apply_filters` for every document of the corpus.
DOCUMENT_FILTERS = (
    deaccent,
    lower_to_unicode,
    strip_tags,
    strip_multiple_whitespaces,
)

In [None]:
# Initialise the processed-document column from the raw document column.
# The raw column (`doc_col`) is left untouched; the following cells overwrite
# `proc_doc_col` with cleaned and then tokenized documents.
df[proc_doc_col] = df[doc_col].values

In [None]:
# Clean and tokenize the corpus in four stages, each one rewriting
# `proc_doc_col` with the output of the previous stage.
#
# The first stage deliberately reads from the raw `doc_col` column rather than
# from `proc_doc_col`: this cell ends by storing token lists in `proc_doc_col`,
# so reading its own output would make a second execution of the cell feed
# lists into the string filters. Starting from the raw text keeps the cell
# re-runnable and yields the same result on a fresh top-to-bottom run.
logger.info("Preprocessing corpus...")
df[proc_doc_col] = [
    apply_filters(doc, filters=DOCUMENT_FILTERS)
    for doc in tqdm(df[doc_col], disable=SILENT)
]

logger.info("Replacing special characters...")
df[proc_doc_col] = [
    sub_pattern(doc, pattern=SUB_PATTERN)
    for doc in tqdm(df[proc_doc_col], disable=SILENT)
]

logger.info("Removing unprintable characters...")
df[proc_doc_col] = [
    remove_unprintable(doc)
    for doc in tqdm(df[proc_doc_col], disable=SILENT)
]

logger.info("Tokenizing corpus...")
# Only the tokenizer of a blank English pipeline is used here.
tokenizer = English().tokenizer
# Each document becomes a list of token strings; whitespace-only tokens are dropped.
df[proc_doc_col] = [
    [t.text for t in tokenizer(doc) if not t.is_space]
    for doc in tqdm(df[proc_doc_col], disable=SILENT)
]

In [None]:
# Build a temporary vocabulary over the tokenized corpus, then prune it with
# `filter_extremes` using the MIN_OCCURENCES / NO_ABOVE thresholds
# (`keep_n=None`: no cap on the number of tokens kept).
dictionary = Dictionary(df[proc_doc_col])
dictionary.filter_extremes(
    no_below=MIN_OCCURENCES,
    no_above=NO_ABOVE,
    keep_n=None,
)

logger.info("Replacing unfrequent tokens in corpus...")
# Any token that did not survive the pruning is swapped for REPLACEMENT_TOK;
# document length and token order are unchanged.
df[proc_doc_col] = [
    [REPLACEMENT_TOK if tok not in dictionary.token2id else tok for tok in tokens]
    for tokens in tqdm(df[proc_doc_col], disable=SILENT)
]

In [None]:
# Rebuild the dictionary from the current contents of `proc_doc_col`,
# replacing the auxiliary dictionary bound to the same name earlier.
# NOTE(review): no subsampling step is visible in this notebook; at this point
# the corpus has had its infrequent tokens replaced by REPLACEMENT_TOK.
dictionary = Dictionary(df[proc_doc_col])

In [None]:
# Persist the tokenized dataframe and the final dictionary.
# The two writes are separate statements (instead of a tuple expression) so
# each side effect is explicit and a failure points at the exact call.
logger.info("Saving tokenized dataset to disk...")
df.to_pickle(TOK_DATASET_PATH)
dictionary.save(TOK_DICTIONARY_PATH)
logger.info("Tokenized dataset saved to disk.")