In [None]:
import pandas as pd
from gensim.parsing.preprocessing import preprocess_string
from gensim.corpora import Dictionary

In [None]:
def _sparse_doc_labels(df, proc_doc_col, min_len, min_unique):
    """Return index labels of rows whose token list is too short or has too few distinct tokens.

    A threshold of ``None`` disables the corresponding check.
    """
    labels = []
    # zip over the index and the column avoids the per-row Series that iterrows() builds
    for label, tokens in zip(df.index, df[proc_doc_col]):
        too_short = min_len is not None and len(tokens) < min_len
        too_repetitive = min_unique is not None and len(set(tokens)) < min_unique
        if too_short or too_repetitive:
            labels.append(label)
    return labels


def preprocess_data(df, doc_col='doc', proc_doc_col='proc_doc', label_col='label', filters=None,
                    drop_below_len=None, drop_below_unique=None, drop_below_occurences=None,
                    inplace=False):
    """Tokenise the documents of ``df`` and drop documents/tokens that are too sparse.

    Steps, in order:
      1. drop rows with a missing document and duplicated documents (first kept);
      2. tokenise every document with gensim's ``preprocess_string`` into ``proc_doc_col``;
      3. drop rows with fewer than ``drop_below_len`` tokens or fewer than
         ``drop_below_unique`` distinct tokens;
      4. if ``drop_below_occurences`` is given, build a gensim ``Dictionary``, call
         ``filter_extremes(no_below=drop_below_occurences)``, strip the removed tokens
         from every document and repeat step 3 on the shortened documents;
      5. reset the index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw documents in ``doc_col``.
    doc_col : str
        Name of the column with the raw documents.
    proc_doc_col : str
        Name of the column that receives the token lists.
    label_col : str
        Currently unused; accepted so existing callers keep working.
    filters : iterable of callables or None
        Filters forwarded to ``preprocess_string``. ``None`` means "use gensim's
        own default filters".
    drop_below_len, drop_below_unique : int or None
        Minimum token count / distinct-token count per document; ``None`` disables the check.
    drop_below_occurences : int or None
        Passed as ``no_below`` to ``Dictionary.filter_extremes``; ``None`` skips
        dictionary-based filtering entirely.
    inplace : bool
        When False (default) a deep copy is processed and ``df`` is left untouched;
        when True ``df`` itself is mutated.

    Returns
    -------
    pandas.DataFrame
        The processed frame (the same object as ``df`` when ``inplace`` is True).
    """
    if not inplace:
        df = df.copy(deep=True)

    # drop empty and duplicate documents
    df.dropna(subset=[doc_col], inplace=True)
    df.drop_duplicates(subset=[doc_col], keep='first', inplace=True)

    # Only forward `filters` when the caller supplied some: preprocess_string iterates
    # over its `filters` argument, so handing it None would fail instead of falling
    # back to the library defaults.
    filter_kwargs = {} if filters is None else {'filters': filters}
    df[proc_doc_col] = [preprocess_string(doc, **filter_kwargs) for doc in df[doc_col]]

    # preliminarily drop documents that are too short or not varied enough
    df.drop(index=_sparse_doc_labels(df, proc_doc_col, drop_below_len, drop_below_unique),
            inplace=True)

    if drop_below_occurences is not None:
        # auxiliary dictionary over the surviving corpus
        dictionary = Dictionary(df[proc_doc_col])
        # NOTE(review): only `no_below` is passed, so every other filter_extremes
        # parameter keeps its gensim default and may remove further tokens
        # (e.g. very frequent ones) -- confirm this is intended.
        dictionary.filter_extremes(no_below=drop_below_occurences)

        # keep only the tokens that are still part of the dictionary
        vocabulary = dictionary.token2id
        df[proc_doc_col] = [[token for token in doc if token in vocabulary]
                            for doc in df[proc_doc_col]]

        # documents may have shrunk below the thresholds; every remaining token is in
        # the dictionary, so plain lengths are enough for the second check
        df.drop(index=_sparse_doc_labels(df, proc_doc_col, drop_below_len, drop_below_unique),
                inplace=True)

    # reset index to account for removed entries
    df.reset_index(drop=True, inplace=True)
    return df

In [None]:
def subsample_data(df, positive_frac=None, max_samples=None,
                   label_col='label', label_vals=(False, True),
                   random_seed=None):
    """Return a shuffled subsample of ``df``, optionally with a target share of positives.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from.
    positive_frac : float or None
        Desired fraction of positive rows, in [0, 1]. ``None`` samples rows
        without looking at the labels.
    max_samples : int or None
        Upper bound on the number of returned rows; ``None`` means ``len(df)``.
    label_col : str
        Column holding the class labels.
    label_vals : tuple
        ``(negative_label, positive_label)`` as they appear in ``label_col``.
    random_seed : int or None
        Passed as ``random_state`` to every ``DataFrame.sample`` call.

    Returns
    -------
    pandas.DataFrame
        Shuffled rows with a fresh 0..n-1 index. If a class has fewer rows than its
        target, all of its rows are used and the result is smaller than requested
        (the other class is not topped up).

    Raises
    ------
    ValueError
        If ``positive_frac`` is given and lies outside [0, 1].
    """
    n_samples = len(df) if max_samples is None else min(len(df), max_samples)
    if positive_frac is None:
        return df.sample(n=n_samples, ignore_index=True, random_state=random_seed)

    if not 0.0 <= positive_frac <= 1.0:
        raise ValueError("positive_frac must be between 0 and 1, got %r" % (positive_frac,))

    neg_label, pos_label = label_vals
    neg_df = df.loc[df[label_col] == neg_label]
    pos_df = df.loc[df[label_col] == pos_label]

    # Round the positive target and give the remainder to the negative class.
    # Truncating n * (1.0 - positive_frac) loses a row to float error
    # (e.g. 10 * (1.0 - 0.9) == 0.9999999999999998 -> 0).
    target_pos = int(round(n_samples * positive_frac))
    target_neg = n_samples - target_pos

    # best effort: never ask a class for more rows than it has
    n_pos = min(len(pos_df), target_pos)
    n_neg = min(len(neg_df), target_neg)

    neg_sample = neg_df.sample(n=n_neg, random_state=random_seed)
    pos_sample = pos_df.sample(n=n_pos, random_state=random_seed)

    # frac=1 shuffles the concatenation so the classes are interleaved
    return pd.concat([neg_sample, pos_sample]).sample(frac=1, ignore_index=True,
                                                      random_state=random_seed)

In [None]:
# Read the raw CSV from DATASET_PATH, limited to N_SAMPLES_TO_LOAD rows.
logger.info("Loading dataset from disk...")
df = pd.read_csv(
    filepath_or_buffer=DATASET_PATH,
    nrows=N_SAMPLES_TO_LOAD,
)
logger.info("Dataset loaded.")

In [None]:
# Tokenise and filter the loaded frame; inplace=True means `df` itself is modified.
logger.info("Preprocessing the dataset...")
df = preprocess_data(
    df,
    doc_col=doc_col,
    proc_doc_col=proc_doc_col,
    label_col=label_col,
    filters=DOCUMENT_FILTERS,
    drop_below_len=MIN_LENGTH,
    drop_below_unique=MIN_UNIQUE,
    drop_below_occurences=MIN_OCCURENCES,
    inplace=True,
)
logger.info("Dataset preprocessed.")

In [None]:
# Cap the dataset at N_MAX_SAMPLES rows with POS_FRAC as the requested positive share.
logger.info("Subsampling the dataset...")
df = subsample_data(
    df,
    positive_frac=POS_FRAC,
    max_samples=N_MAX_SAMPLES,
    label_col=label_col,
    label_vals=(False, True),
    random_seed=RANDOM_SEED,
)
logger.info("Dataset subsampled.")

In [None]:
# Build the vocabulary from the processed corpus, with a one-token '<PAD>'
# pseudo-document placed in front of the real documents.
# NOTE(review): the statements below assume '<PAD>' ends up with id 0 -- confirm.
dictionary = Dictionary([['<PAD>'], *df[proc_doc_col].tolist()])

# '<PAD>' is not a real word: zero its document and collection frequencies ...
dictionary.dfs[0] = 0
dictionary.cfs[0] = 0
# ... and take the pseudo-document back out of the corpus-level counters.
dictionary.num_docs -= 1
dictionary.num_pos -= 1
dictionary.num_nnz -= 1

In [None]:
# Persist the processed frame (pickle) and the dictionary, creating the output
# directory first if it does not exist yet.
logger.info("Saving preprocessed dataset to disk...")
Path(PROC_DATA_DIR).mkdir(exist_ok=True, parents=True)
df.to_pickle(PROC_DATASET_PATH)
dictionary.save(DICTIONARY_PATH)
logger.info("Preprocessed dataset saved to disk.")