# Dataset exploration

This notebook simplifies the analysis and exploration of the datasets used for the human/machine classification task

In [None]:
import json
import pandas as pd
import os

In [None]:
# Download the datasets by running the helper script from within the notebook.
# NOTE(review): presumably the script populates ./data, which the next cell lists — confirm.
%run 'download_dataset.py'

In [None]:
# Directory holding the dataset files.
data_path = "./data"
# Sorted names of the regular, non-hidden files located directly inside data_path.
datasets = sorted(
    entry
    for entry in os.listdir(data_path)
    if not entry.startswith(".") and os.path.isfile(os.path.join(data_path, entry))
)

### Preprocessing functions

Tokenization and filtering function used to preprocess the datasets (it removes digit-only tokens and non-English symbols).

In [None]:
import preprocessing as pp
from sklearn.pipeline import make_pipeline

def tokenize_corpus(corpus):
    """Tokenize and filter every document of ``corpus``.

    The corpus is passed through a two-step scikit-learn pipeline:
    ``pp.WordTokenizer`` followed by ``pp.WordsFilter`` with both
    ``drop_symbols`` and ``drop_digits`` enabled.

    Returns whatever the pipeline's ``fit_transform`` produces for ``corpus``.
    """
    steps = (
        pp.WordTokenizer(),
        pp.WordsFilter(drop_symbols=True, drop_digits=True),
    )
    return make_pipeline(*steps).fit_transform(corpus)

### Vocabulary extraction

Get vocabulary from corpus using _CountVectorizer_

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def identity(x):
    """Return the argument unchanged.

    Handed to ``CountVectorizer`` as both preprocessor and tokenizer so that
    the documents it receives are used exactly as given.
    """
    return x

def get_features(corpus, min_df=1):
    """Return the vocabulary of a corpus as a set of terms.

    Parameters
    ----------
    corpus : iterable
        Documents to scan. Each document is given to the vectorizer untouched
        (``identity`` is used as both preprocessor and tokenizer).
    min_df : int or float, default 1
        Forwarded as-is to ``CountVectorizer(min_df=...)``.

    Returns
    -------
    set
        The feature names learned by the vectorizer.
    """
    vectorizer = CountVectorizer(
        min_df=min_df,
        preprocessor=identity,
        tokenizer=identity,
        # With a custom tokenizer the regex pattern is never used; passing None
        # makes that explicit and avoids scikit-learn's "token_pattern will not
        # be used" warning.
        token_pattern=None,
    )
    # Only the learned vocabulary is needed, so the document-term matrix is not kept.
    vectorizer.fit(corpus)
    return set(vectorizer.get_feature_names_out())

## Dataset analysis

### Word count

Compute, for each dataset, the number of entries and the minimum, maximum, and average number of tokens per document.

In [None]:
def load_dataset(dataset, data_path="./data"):
    """Read a JSON-lines dataset file into a DataFrame.

    Parameters
    ----------
    dataset : str
        File name of the dataset, relative to ``data_path``.
    data_path : str, default "./data"
        Directory containing the dataset file.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame, with the text column renamed to ``'text'`` for the
        layouts handled below; ``None`` when the file does not exist.
    """
    path = os.path.join(data_path, dataset)
    if not os.path.exists(path):
        return None

    df = pd.read_json(path, lines=True)

    # adjust DataFrame based on original data structure
    if len(df.columns) == 1:
        # json contains only texts (GPT3 samples)
        return df.rename(columns={0: 'text'})
    if 'article' in df.columns:
        # Grover datasets
        return df.rename(columns={'article': 'text'})
    return df

In [None]:
# Tokenize every dataset once and collect per-dataset token-count statistics.
# dataset_tokens maps a dataset name (file name without extension) to its
# tokenized documents; the vocabulary analysis below reuses it.
dataset_tokens = {}
stats = {}
for i, dataset in enumerate(datasets):
    df = load_dataset(dataset, data_path)
    if df is None:
        # load_dataset found no file at that path: leave it out of the statistics
        continue

    ds_name = os.path.splitext(dataset)[0]

    # store tokens for later analysis
    dataset_tokens[ds_name] = tokenize_corpus(list(df['text']))
    df['length'] = [len(doc) for doc in dataset_tokens[ds_name]]

    # aggregate the per-document token counts directly from the column
    lengths = df['length']
    n_entries = int(lengths.count())
    max_tokens = int(lengths.max())
    min_tokens = int(lengths.min())
    avg_tokens = lengths.mean()

    stats[i] = [ds_name, n_entries, max_tokens, min_tokens, avg_tokens]

df_stats = pd.DataFrame.from_dict(stats,
                                  orient='index',
                                  columns=["source", "n_entries", "max_tokens", "min_tokens", "avg_tokens"])

In [None]:
# Display the per-dataset token statistics table.
df_stats

### Vocabulary analysis

Extract the vocabulary of the larger datasets flagged as "training", i.e. those whose file name contains "train".

In [None]:
# Keep only the dataset files whose name contains "train".
selected_datasets = [name for name in datasets if "train" in name]

In [None]:
# Display the dataset files selected for the vocabulary analysis.
selected_datasets

In [None]:
# Build the vocabulary (set of feature names) of every selected dataset,
# keyed by dataset name (file name without extension).
dataset_features = {}
for dataset in selected_datasets:
    ds = os.path.splitext(dataset)[0]
    # check if matching tokenized dataset already present
    if ds not in dataset_tokens:
        # read from the configured data_path, like the word-count cell does
        df = load_dataset(dataset, data_path)
        if df is None:
            # load_dataset found no file at that path: nothing to tokenize
            continue
        dataset_tokens[ds] = tokenize_corpus(list(df['text']))
    dataset_features[ds] = get_features(dataset_tokens[ds], min_df=1)

Compare dataset features with an external [English vocabulary](https://github.com/dwyl/english-words) used as reference.

In [None]:
# Load the reference English vocabulary into a set so it can be intersected
# with each dataset vocabulary.
# NOTE(review): if the JSON document is an object, the set holds its keys — confirm the file layout.
with open("words_dictionary.json", "r") as vocab_file:
    eng_vocab = set(json.load(vocab_file))

In [None]:
# For every dataset vocabulary: its size, how many of its terms appear in the
# English reference vocabulary, and that share as a truncated percentage.
features_stats = {}
for i, (dataset, features) in enumerate(dataset_features.items()):
    vocab_size = len(features)
    eng_words = len(eng_vocab & features)
    ratio = int(eng_words * 100 / vocab_size)
    features_stats[i] = [dataset, vocab_size, eng_words, ratio]

df_features_stats = pd.DataFrame.from_dict(
    features_stats,
    orient='index',
    columns=["source", "vocabulary size", "english words", "eng%"],
)

In [None]:
# Display the vocabulary vs. English-reference comparison table.
df_features_stats

Compare the features of each synthetic dataset (GPT-2) with those of the human-written dataset (WebText).

In [None]:
# Compare each GPT-2 vocabulary with the WebText training vocabulary:
# size of the combined vocabulary, size of the shared vocabulary, and the
# shared share of the combined one as a truncated percentage.
human_machine_features_stats = []
ref_human_features = dataset_features['webtext.train']

for dataset, features in dataset_features.items():
    # only GPT-2 datasets are compared; the WebText reference itself is skipped
    if 'GPT2' not in dataset or 'webtext' in dataset:
        continue
    combined_size = len(features | ref_human_features)
    shared_size = len(features & ref_human_features)
    human_machine_features_stats.append(
        [
            f'{dataset} VS WebText',
            combined_size,
            shared_size,
            int(shared_size * 100 / combined_size),
        ]
    )

df_human_machine_features_stats = pd.DataFrame(
    human_machine_features_stats,
    columns=['source', 'combined vocab', 'shared vocab', 'shared ratio'],
)

In [None]:
# Display the GPT-2 vs. WebText vocabulary overlap table.
df_human_machine_features_stats