In [None]:
import pandas as pd
from tqdm import tqdm
from gensim.parsing.preprocessing import preprocess_string

In [None]:
# Read the raw dataset from DATASET_PATH into a DataFrame.
logger.info("Loading dataset from disk...")
# nrows caps how many rows are read from the CSV (N_SAMPLES_TO_LOAD)
df = pd.read_csv(DATASET_PATH, nrows=N_SAMPLES_TO_LOAD)
logger.info("Dataset loaded.")

In [None]:
logger.info("Preprocessing the dataset...")


def _drop_short_documents(frame, column, min_length=None, min_unique=None):
    """Drop, in place, the rows of `frame` whose token list in `column` is too small.

    Args:
        frame: DataFrame to prune; it is modified in place.
        column: name of the column holding one list of tokens per row.
        min_length: minimum number of tokens a document must contain;
            None disables this check.
        min_unique: minimum number of distinct tokens a document must contain;
            None disables this check.
    """
    if min_length is not None:
        too_short = frame[column].map(len).lt(min_length)
        frame.drop(index=frame.index[too_short.to_numpy()], inplace=True)
    if min_unique is not None:
        # evaluated after the length filter, on the rows that survived it
        too_repetitive = frame[column].map(lambda tokens: len(set(tokens))).lt(min_unique)
        frame.drop(index=frame.index[too_repetitive.to_numpy()], inplace=True)


# drop empty and duplicate documents
df.dropna(subset=[doc_col], inplace=True)
df.drop_duplicates(subset=[doc_col], keep='first', inplace=True)
# preprocess documents using several filters, turning each document into a list of tokens
df[proc_doc_col] = [preprocess_string(doc, filters=DOCUMENT_FILTERS) for doc in tqdm(df[doc_col], disable=SILENT)]

# preliminarily drop documents deemed too short and those that don't contain enough unique words
_drop_short_documents(df, proc_doc_col, MIN_LENGTH, MIN_UNIQUE)

# make auxiliary dictionary from processed corpus
dictionary = Dictionary(df[proc_doc_col])
# filter tokens that appear in few documents from dictionary
if MIN_OCCURENCES is not None:
    # NOTE(review): only no_below is passed, so gensim's default no_above and keep_n
    # limits also apply here and may remove very frequent tokens as well -- confirm
    # that this is intended.
    dictionary.filter_extremes(no_below=MIN_OCCURENCES)
# remove from processed corpus words that were removed from dictionary
# (a token is kept exactly when the dictionary still has an id for it)
df[proc_doc_col] = [[token for token in doc if token in dictionary.token2id]
                    for doc in df[proc_doc_col]]

# drop documents left with too few (unique) words after the dictionary filtering;
# every remaining token is known to the dictionary, so plain lengths are enough
_drop_short_documents(df, proc_doc_col, MIN_LENGTH, MIN_UNIQUE)

# reset index to account for removed entries
df.reset_index(drop=True, inplace=True)

logger.info("Dataset preprocessed.")

In [None]:
logger.info("Subsampling the dataset...")

# overall sample budget: never more than the corpus holds, never more than N_MAX_SAMPLES
n_samples = min(len(df), N_MAX_SAMPLES)

# split the corpus into the two classes found in the label column
neg_label, pos_label = False, True
neg_df = df.loc[df[label_col] == neg_label]
pos_df = df.loc[df[label_col] == pos_label]

# each class receives at most half of the budget, bounded by its own size
n_neg = min(len(neg_df), int(n_samples * .5))
n_pos = min(len(pos_df), int(n_samples * .5))

# draw the per-class samples with a fixed seed
neg_df = neg_df.sample(n=n_neg, random_state=RANDOM_SEED)
pos_df = pos_df.sample(n=n_pos, random_state=RANDOM_SEED)

# stack both classes and shuffle the rows, discarding the old index
df = pd.concat([neg_df, pos_df]).sample(frac=1, ignore_index=True, random_state=RANDOM_SEED)

logger.info("Dataset subsampled.")

In [None]:
# create dictionary from processed and subsampled corpus
# (this rebinds `dictionary`, replacing any dictionary built earlier under that name;
# no token filtering is applied to the new one)
dictionary = Dictionary(df[proc_doc_col])

In [None]:
logger.info("Saving preprocessed dataset to disk...")
# make sure the output directory exists (creating missing parents as needed)
Path(PROC_DATA_DIR).mkdir(parents=True, exist_ok=True)
# persist the DataFrame first, then the dictionary
df.to_pickle(PROC_DATASET_PATH)
dictionary.save(DICTIONARY_PATH)
logger.info("Preprocessed dataset saved to disk.")