In [None]:
%run ./../data/load-dataset.ipynb

In [None]:
import re
import pandas as pd
from string import printable
from tqdm import tqdm
from gensim.utils import deaccent
from gensim.parsing.preprocessing import lower_to_unicode, strip_tags, strip_punctuation, \
                                         strip_non_alphanum, split_alphanum, strip_numeric, strip_short, \
                                         remove_stopwords, stem_text, strip_multiple_whitespaces
from gensim.corpora import Dictionary
from spacy.lang.en import English

In [None]:
# On-disk cache of the tokenized DataFrame (written with DataFrame.to_pickle below).
# PROC_DATA_DIR is not defined in this notebook -- presumably provided by the %run'd loader.
TOKENIZED_DATASET_PATH = f'{PROC_DATA_DIR}/rnn_dataset.pkl'
# On-disk cache of the gensim Dictionary built from the tokenized corpus.
TOKENS_DICTIONARY_PATH = f'{PROC_DATA_DIR}/rnn_dictionary.dict'

# Placeholder token substituted for every token that is not kept in the dictionary.
REPLACEMENT_TOK = '@'

In [None]:
def apply_filters(x, filters):
    """Pipe a value through a sequence of single-argument callables.

    Args:
        x: The initial value (a raw document string in this notebook).
        filters: Iterable of callables; each receives the previous callable's
            output. They are applied in iteration order.

    Returns:
        The output of the last callable, or ``x`` unchanged when ``filters``
        is empty.
    """
    result = x
    for transform in filters:
        result = transform(result)
    return result


# Whole-document gensim filters, applied left to right (via apply_filters) to each
# raw document before special-character replacement and tokenization.
DOCUMENT_FILTERS = [deaccent, lower_to_unicode, strip_tags, strip_multiple_whitespaces]

In [None]:
# Typographic punctuation mapped to plain-ASCII look-alikes.
rep = {
    # low comma
    '‚': ',',
    # apostrophes
    'ʼ': "'", '’': "'",
    # curly double quotes and guillemets
    '“': '"', '”': '"',
    '«': '"', '»': '"',
    # hyphens, dashes and the minus sign
    '‐': '-', '‑': '-', '‒': '-', '–': '-', '—': '-', '―': '-', '−': '-',
    # dot leaders and ellipsis
    '․': '.', '‥': '..', '…': '...',
    # combined exclamation / question marks
    '‼': '!!', '⁇': '??', '⁈': '?!', '⁉': '!?',
    # tildes
    '⁓': '~', '∼': '~',
}
# Keys are stored regex-escaped, so a lookup must escape the matched text the same way.
rep = {re.escape(char): ascii_text for char, ascii_text in rep.items()}
# A single alternation that matches any one of the characters above.
pattern = re.compile('|'.join(rep))

In [None]:
# Build the tokenized corpus and its dictionary once, then reuse the on-disk copies.
# Loading needs BOTH cache files, so the cache only counts as valid when both exist;
# a partial cache (e.g. an interrupted save) triggers a rebuild instead of a load error.
if not (Path(TOKENIZED_DATASET_PATH).is_file() and Path(TOKENS_DICTIONARY_PATH).is_file()):
    df[proc_doc_col] = df[doc_col].values
    tokenizer = English().tokenizer
    # Set membership is O(1); testing against the `printable` string scans it per character.
    printable_chars = frozenset(printable)

    logger.info("Preprocessing corpus...")
    df[proc_doc_col] = [apply_filters(doc, filters=DOCUMENT_FILTERS)
                        for doc in tqdm(df[proc_doc_col], disable=SILENT)]

    logger.info("Replacing special characters...")
    # `rep` keys are regex-escaped, hence the re.escape() on the matched text.
    df[proc_doc_col] = [pattern.sub(lambda m: rep[re.escape(m.group(0))], doc)
                        for doc in tqdm(df[proc_doc_col], disable=SILENT)]

    logger.info("Removing unprintable characters...")
    df[proc_doc_col] = [''.join(c for c in doc if c in printable_chars)
                        for doc in tqdm(df[proc_doc_col], disable=SILENT)]

    logger.info("Tokenizing corpus...")
    # Whitespace-only tokens are dropped.
    df[proc_doc_col] = [[t.text for t in tokenizer(doc) if not t.is_space]
                        for doc in tqdm(df[proc_doc_col], disable=SILENT)]

    # First pass: keep only tokens found in at least MIN_OCCURENCES documents
    # (no_above=1 and keep_n=None leave the other two limits disabled).
    dictionary = Dictionary(df[proc_doc_col])
    dictionary.filter_extremes(no_below=MIN_OCCURENCES, no_above=1, keep_n=None)

    logger.info("Replacing unfrequent tokens in corpus...")
    df[proc_doc_col] = [[t if t in dictionary.token2id else REPLACEMENT_TOK for t in doc]
                        for doc in tqdm(df[proc_doc_col], disable=SILENT)]

    # Second pass: rebuild the dictionary from the final corpus so that it also
    # covers REPLACEMENT_TOK wherever it was substituted in.
    dictionary = Dictionary(df[proc_doc_col])

    logger.info("Saving tokenized dataset to disk...")
    df.to_pickle(TOKENIZED_DATASET_PATH)
    dictionary.save(TOKENS_DICTIONARY_PATH)
    logger.info("Tokenized dataset saved to disk.")
else:
    # NOTE(security): unpickling can execute arbitrary code -- only load cache files
    # that were produced by this notebook.
    df = pd.read_pickle(TOKENIZED_DATASET_PATH)
    dictionary = Dictionary.load(TOKENS_DICTIONARY_PATH)