# Dataset preparation

This notebook prepares the original datasets for training and testing the text classifiers.

In [None]:
import json
import pandas as pd
import shutil
import os

In [None]:
# Random seed: passed as random_state when the filtered training sets are subsampled below.
seed = 23

In [None]:
# Download the original datasets by running download_dataset.py inside this kernel
# (presumably the script populates ./data, which the next cell lists; verify against the script).
%run 'download_dataset.py'

In [None]:
# Folder holding the raw dataset files.
data_path = "./data"
# Sorted names of the regular, non-hidden files located directly inside data_path
# (sub-directories and dot-files are skipped).
datasets = sorted(
    entry
    for entry in os.listdir(data_path)
    if not entry.startswith(".") and os.path.isfile(os.path.join(data_path, entry))
)

### Load dataset function

In [None]:
def load_dataset(dataset, data_path="./data"):
    """Read a JSON-lines dataset into a DataFrame with a normalised 'text' column.

    Args:
        dataset: File name of the dataset, relative to ``data_path``.
        data_path: Directory that contains the dataset file.

    Returns:
        The loaded DataFrame, or ``None`` when the file does not exist.
    """
    source = os.path.join(data_path, dataset)
    if not os.path.exists(source):
        return None

    frame = pd.read_json(source, lines=True)
    # Pick the column rename that matches the layout of the original file.
    if frame.shape[1] == 1:
        # JSON holds bare texts only (GPT3 samples): the lone column is labelled 0
        column_map = {0: 'text'}
    elif 'article' in frame.columns:
        # Grover datasets keep the text under 'article'
        column_map = {'article': 'text'}
    else:
        column_map = {}
    return frame.rename(columns=column_map)

### Preprocessing functions

Tokenization and filtering function used to preprocess the datasets (it removes digits-only tokens and non-English symbols).

In [None]:
import preprocessing as pp
from sklearn.pipeline import make_pipeline

def tokenize_corpus(corpus):
    """Tokenize and filter every document of ``corpus``.

    The documents go through a two-step pipeline: ``pp.WordTokenizer``
    followed by ``pp.WordsFilter`` with symbol and digit dropping enabled.

    Args:
        corpus: Collection of raw texts handed to the pipeline's ``fit_transform``.

    Returns:
        Whatever the pipeline's ``fit_transform`` yields for ``corpus``
        (iterated elsewhere in this notebook as one token sequence per document).
    """
    tokenizer = pp.WordTokenizer()
    token_filter = pp.WordsFilter(drop_symbols=True, drop_digits=True)
    pipeline = make_pipeline(tokenizer, token_filter)
    return pipeline.fit_transform(corpus)

## Prepare training data

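Select higher-quality samples from the training datasets by using an external [English vocabulary](https://github.com/dwyl/english-words) to measure the ratio of English words in each text. A text is kept when at least 90% of its tokens are English words and it contains more than 20 English words; 200,000 of the kept texts are then randomly sampled per dataset.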

In [None]:
# Directory that receives the filtered training samples written by the loop below.
output_path = "./data/training"

In [None]:
# Training files to filter; load_dataset reads them from its default ./data directory.
# Judging by the file names: WebText plus two GPT-2 XL (1542M) sample sets.
training_datasets = ['webtext.train.jsonl', 'GPT2-xl-1542M.train.jsonl', 'GPT2-xl-1542M-k40.train.jsonl']

In [None]:
# Load the English vocabulary used to score the training texts.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# locale default. set() keeps the top-level elements of the parsed JSON
# (its keys when the file is a JSON object), giving O(1) membership tests.
with open("words_dictionary.json", "r", encoding="utf-8") as vocab_file:
    eng_vocab = set(json.load(vocab_file))

In [None]:
# Score every training text against the English vocabulary, keep the
# high-quality ones and write a fixed-size random sample per dataset.

# DataFrame.to_json does not create missing directories.
os.makedirs(output_path, exist_ok=True)

for ds in training_datasets:
    df = load_dataset(ds)
    if df is None:
        # load_dataset returns None for a missing file; fail with a clear
        # message instead of a TypeError on the subscript below.
        raise FileNotFoundError(f"training dataset not found: {ds}")

    eng_vocab_overlap = []  # number of tokens found in eng_vocab, per document
    eng_ratio = []          # integer percentage of such tokens, per document
    for doc_tokens in tokenize_corpus(df['text'].to_list()):
        count = sum(1 for token in doc_tokens if token in eng_vocab)
        eng_vocab_overlap.append(count)
        total = len(doc_tokens)
        # documents left without tokens score 0 rather than dividing by zero
        eng_ratio.append(count * 100 // total if total else 0)

    # Explicit copy: adding columns to a bare column selection triggers
    # pandas' SettingWithCopyWarning.
    df = df[['id', 'text']].copy()
    df["english words"] = eng_vocab_overlap
    df["english %"] = eng_ratio

    # Keep texts that are at least 90% English and have more than 20 English words.
    keep = (df["english %"] >= 90) & (df["english words"] > 20)
    df_filtered = df[keep]

    # sample() raises ValueError when fewer than 200000 rows survive the filter.
    df_filtered.sample(n=200000, random_state=seed).to_json(
        os.path.join(output_path, f'{os.path.splitext(ds)[0]}.filtered.jsonl'),
        orient="records",
        lines=True)

## Prepare test data

Reorganize the test data by separating "machine" and "human" texts into distinct files.

In [None]:
# Directory that receives the reorganized test files. This rebinds the same
# output_path name that the training cell uses, so cell execution order matters.
output_path = "./data/test"

In [None]:
# Split every test file into per-source outputs under output_path:
# "<name>.machine.jsonl" for generated texts and "<name>.human.jsonl"
# (or the shared "Grover.human.jsonl") for human-written texts.

# DataFrame.to_json does not create missing directories.
os.makedirs(output_path, exist_ok=True)


def _save_texts(frame, filename):
    """Write the 'id' and 'text' columns of ``frame`` to output_path/filename as JSON lines."""
    frame[['id', 'text']].to_json(os.path.join(output_path, filename), orient="records", lines=True)


for ds in (d for d in datasets if 'test' in d):
    path = os.path.join(data_path, ds)
    if not os.path.exists(path):
        continue
    name = os.path.splitext(ds)[0]
    df = load_dataset(ds)
    # every output uses the row position as identifier, replacing any original 'id'
    df['id'] = df.index
    if 'GPT' in ds:
        _save_texts(df, f'{name}.machine.jsonl')
    elif 'webtext' in ds:
        _save_texts(df, f'{name}.human.jsonl')
    elif 'Grover' in ds:
        # need to unpack datasets to differentiate machine/human sources
        human_texts = df[df.label == "human"]
        machine_texts = df[df.label == "machine"]
        # Grover human samples are the same regardless of generator-size,
        # so they are written only once
        if not os.path.exists(os.path.join(output_path, 'Grover.human.jsonl')):
            _save_texts(human_texts, 'Grover.human.jsonl')
        _save_texts(machine_texts, f'{name}.machine.jsonl')