In [None]:
%run ./../utils/_logger.ipynb
%run ./../utils/_preprocess-utils.ipynb

In [None]:
from tqdm import tqdm
from gensim.utils import tokenize

In [None]:
# Upper bound on the number of rows read from the source CSV.
N_SAMPLES_TO_LOAD = 300_000
# Upper bound on the number of documents kept by the subsampling step.
N_MAX_SAMPLES = 50_000

In [None]:
# Document-level thresholds: minimum number of tokens and minimum number of
# distinct tokens a processed document needs in order to be kept.
# Setting either one to None disables that check in the filtering cell.
MIN_LENGTH = 300
MIN_UNIQUE = 50
# Token-level thresholds, passed to dictionary.filter_extremes as
# no_below / no_above respectively.
# (The "OCCURENCES" spelling is kept on purpose: other cells use this name.)
MIN_OCCURENCES = 5
NO_ABOVE = 0.5
# Text filters handed to apply_filters in the preprocessing cell.
# The tuple order is preserved exactly as configured.
# NOTE(review): the callables are expected to come from _preprocess-utils.ipynb - confirm.
DOCUMENT_FILTERS = (
    deaccent,
    lower_to_unicode,
    strip_tags,
    strip_punctuation,
    strip_non_alphanum,
    split_alphanum,
    strip_numeric,
    strip_short,
    remove_stopwords,
    strip_multiple_whitespaces,
)

In [None]:
# Read at most N_SAMPLES_TO_LOAD rows of the raw dataset into memory.
df = pd.read_csv(
    SOURCE_DATASET_PATH,
    nrows=N_SAMPLES_TO_LOAD,
)

In [None]:
# Discard rows without a document body, then de-duplicate successively on the
# document text, the title and the URL (first occurrence wins each time).
# Every pass renumbers the index from 0, so it is unique afterwards.
# NOTE(review): only doc_col is checked for missing values; rows with a missing
# title or URL presumably count as duplicates of each other here - confirm intended.
df.dropna(subset=[doc_col], inplace=True)
for dedup_col in (doc_col, title_col, url_col):
    df.drop_duplicates(subset=[dedup_col], keep='first', inplace=True, ignore_index=True)

In [None]:
# Seed the processed-document column with the raw document values; the
# following preprocessing steps overwrite this column and leave doc_col untouched.
df[proc_doc_col] = df[doc_col].values

In [None]:
# Ordered preprocessing pipeline: each stage logs its message and then maps its
# function over every document, replacing the processed column with the result.
# The last stage turns each document string into a list of tokens.
preprocessing_stages = (
    ("Preprocessing corpus...", lambda doc: apply_filters(doc, filters=DOCUMENT_FILTERS)),
    ("Replacing special characters...", lambda doc: sub_pattern(doc, pattern=SUB_PATTERN)),
    ("Removing unprintable characters...", remove_unprintable),
    ("Tokenizing corpus...", lambda doc: list(tokenize(doc))),
)

for stage_message, stage_fn in preprocessing_stages:
    logger.info(stage_message)
    # one progress bar per stage, silenced when SILENT is set
    df[proc_doc_col] = [stage_fn(doc) for doc in tqdm(df[proc_doc_col], disable=SILENT)]

In [None]:
# Prune the corpus in three steps:
#   1. drop documents that are too short or contain too few distinct tokens,
#   2. drop tokens that filter_extremes removes from the auxiliary dictionary,
#   3. re-apply the document thresholds, since step 2 shortened the documents.


def _drop_sparse_documents(frame, column, min_length, min_unique):
    """Drop, in place, the rows of `frame` whose token list in `column` is too small.

    Parameters:
        frame: DataFrame holding one token list per row in `column`.
        column: name of the column with the token lists.
        min_length: minimum number of tokens required; None skips the check.
        min_unique: minimum number of distinct tokens required; None skips the check.

    The length check runs before the uniqueness check, and both use boolean
    masks computed over the whole column instead of iterating row by row.
    """
    if min_length is not None:
        too_short = (frame[column].map(len) < min_length).to_numpy()
        frame.drop(frame.index[too_short], inplace=True)
    if min_unique is not None:
        too_uniform = (frame[column].map(lambda doc: len(set(doc))) < min_unique).to_numpy()
        frame.drop(frame.index[too_uniform], inplace=True)


# preliminarily drop documents deemed too short and those that don't contain enough unique tokens
_drop_sparse_documents(df, proc_doc_col, MIN_LENGTH, MIN_UNIQUE)

# make auxiliary dictionary from processed corpus
dictionary = Dictionary(df[proc_doc_col])
# filter tokens out of the dictionary according to MIN_OCCURENCES / NO_ABOVE
dictionary.filter_extremes(no_below=MIN_OCCURENCES, no_above=NO_ABOVE, keep_n=None)

logger.info("Removing unfrequent tokens in corpus...")
# look the mapping up once instead of once per token
kept_tokens = dictionary.token2id
df[proc_doc_col] = [[t for t in doc if t in kept_tokens] for doc in tqdm(df[proc_doc_col], disable=SILENT)]

# Every remaining token is in the dictionary at this point, so the plain
# length / distinct-count thresholds are equivalent to re-filtering each
# document against the dictionary before counting.
_drop_sparse_documents(df, proc_doc_col, MIN_LENGTH, MIN_UNIQUE)

In [None]:
logger.info("Subsampling the dataset...")

# Overall budget, and the share of it that each class may contribute at most.
n_samples = min(len(df), N_MAX_SAMPLES)
per_class_quota = int(n_samples * .5)

# Split by label; rows whose label equals neither value end up in neither split
# and are therefore absent from the subsampled frame.
neg_label, pos_label = False, True
neg_df = df.loc[df[label_col] == neg_label]
pos_df = df.loc[df[label_col] == pos_label]

# A class with fewer rows than the quota contributes all of its rows, so the
# result can be smaller than n_samples and is not necessarily balanced.
n_neg = min(len(neg_df), per_class_quota)
n_pos = min(len(pos_df), per_class_quota)

neg_df = neg_df.sample(n=n_neg, random_state=RANDOM_SEED)
pos_df = pos_df.sample(n=n_pos, random_state=RANDOM_SEED)

# Merge both classes and shuffle the rows; the index is renumbered from 0.
df = pd.concat([neg_df, pos_df]).sample(frac=1, ignore_index=True, random_state=RANDOM_SEED)

logger.info("Dataset subsampled.")

In [None]:
# Renumber the rows 0..n-1 and discard the previous index, so that positions
# and labels agree after the earlier row removals.
df = df.reset_index(drop=True)

In [None]:
# Create the final dictionary from the processed and subsampled corpus.
# This rebinds `dictionary`, replacing the auxiliary one that was only used
# for token filtering.
dictionary = Dictionary(df[proc_doc_col])

In [None]:
logger.info("Saving preprocessed dataset to disk...")
# Make sure the output directory exists (no error if it already does).
# NOTE(review): assumes both output paths live under PROC_DATA_DIR - confirm.
Path(PROC_DATA_DIR).mkdir(parents=True, exist_ok=True)
# Persist the DataFrame first, then the dictionary.
df.to_pickle(PROC_DATASET_PATH)
dictionary.save(DICTIONARY_PATH)
logger.info("Preprocessed dataset saved to disk.")