# Baseline text classification experiment

This notebook builds and tests the baseline "human"/"machine" text classifier proposed in [TBD-citation](link to publication) by first extracting the features from the training datasets and then testing multiple classifiers on the test data.

In [None]:
import json
import pandas as pd
import os

In [None]:
seed = 23

In [None]:
data_path = 'data'

## Feature Extraction

### Load training datasets

In [None]:
corpora = []
labels = []

In [None]:
training_data_path = os.path.join(data_path, 'training')

In [None]:
gpt2xl_df = pd.read_json(os.path.join(training_data_path, 'GPT2-xl-1542M.train.filtered.jsonl'), lines = True)
gpt2xl_k40_df = pd.read_json(os.path.join(training_data_path, 'GPT2-xl-1542M-k40.train.filtered.jsonl'), lines = True)
webtext_df = pd.read_json(os.path.join(training_data_path, 'webtext.train.filtered.jsonl'), lines = True)

In [None]:
samples = 100000

Training set 1: 200k samples, half drawn from WebText and the rest from GPT-2 (random) generations

In [None]:
# Human-written (WebText) texts come first, GPT-2 (random) generations second;
# the labels follow the same order: 0 for the first half, 1 for the second.
human_texts = webtext_df.sample(n=samples, random_state=seed)['text'].to_list()
machine_texts = gpt2xl_df.sample(n=samples, random_state=seed)['text'].to_list()
corpora.append(human_texts + machine_texts)
labels.append([0] * samples + [1] * samples)

Training set 2: 200k samples, half drawn from WebText and the rest from GPT-2 (k40) generations

In [None]:
# Human-written (WebText) texts come first, GPT-2 (k40) generations second;
# the labels follow the same order: 0 for the first half, 1 for the second.
human_texts = webtext_df.sample(n=samples, random_state=seed)['text'].to_list()
machine_texts = gpt2xl_k40_df.sample(n=samples, random_state=seed)['text'].to_list()
corpora.append(human_texts + machine_texts)
labels.append([0] * samples + [1] * samples)

Training set 3: 400k samples, half drawn from WebText, and the rest equally sampled from GPT-2 (random)
and GPT-2 (k40) generations

In [None]:
# 2*samples WebText texts first, then samples GPT-2 (random) and samples
# GPT-2 (k40) generations; labels are 0 for the first half, 1 for the second.
human_texts = webtext_df.sample(n=2*samples, random_state=seed)['text'].to_list()
machine_texts = (
    gpt2xl_df.sample(n=samples, random_state=seed)['text'].to_list()
    + gpt2xl_k40_df.sample(n=samples, random_state=seed)['text'].to_list()
)
corpora.append(human_texts + machine_texts)
labels.append([0] * (2*samples) + [1] * (2*samples))

### Build vocabularies

To prevent words exclusive to one dataset or the other from influencing the downstream classifiers, shared vocabularies that act as "whitelists" of the tokens to retain from the training sets are computed first.

In [None]:
import preprocessing as pp
from sklearn.pipeline import make_pipeline

def tokenize_corpus(corpus, whitelist=None):
    """Tokenize and filter a corpus with the project's preprocessing steps.

    The corpus is passed through a two-step scikit-learn pipeline:
    ``pp.WordTokenizer`` followed by ``pp.WordsFilter`` configured with
    ``drop_symbols=False`` and ``drop_digits=True``.

    Args:
        corpus: The texts to process (the notebook passes lists of strings).
        whitelist: Forwarded as-is to ``pp.WordsFilter``; ``None`` by default.
            NOTE(review): presumably restricts the retained tokens to this
            vocabulary - confirm against the ``preprocessing`` module.

    Returns:
        Whatever the pipeline's ``fit_transform`` returns for ``corpus``.
    """
    steps = (
        pp.WordTokenizer(),
        pp.WordsFilter(drop_symbols=False, drop_digits=True, whitelist=whitelist),
    )
    return make_pipeline(*steps).fit_transform(corpus)

In [None]:
tokenized_corpora = [tokenize_corpus(corpus) for corpus in corpora]

In [None]:
# One shared ("human" AND "machine") vocabulary per training set.
vocab_shared = []
for i, tc in enumerate(tokenized_corpora):
    # Every training corpus is built as human texts followed by the same
    # number of machine texts, so the midpoint separates the two sources.
    half = len(tc) // 2
    vocab_human = pp.get_vocabulary(tc[:half])
    vocab_machine = pp.get_vocabulary(tc[half:])
    vocab_shared.append(vocab_human.intersection(vocab_machine))
    print(f'Training set ({i+1}) vocabulary sizes:\n  "human":\t{len(vocab_human)}\n  "machine":\t{len(vocab_machine)}\n  "shared":\t{len(vocab_shared[i])}')

### Tokenize and filter datasets

Training datasets are tokenized and filtered again using the computed shared vocabularies as whitelists

In [None]:
tokenized_corpora = [tokenize_corpus(corpus, whitelist=vocab) for (corpus, vocab) in zip(corpora, vocab_shared)]

Test datasets are also preprocessed using the same approach

In [None]:
test_data_path = os.path.join(data_path, 'test')

In [None]:
test_data = []
test_labels = []

In [None]:
# Visible (non-dot) regular files of the test directory, sorted by name so that
# every per-file list built from here on shares the same ordering.
test_data_files = sorted(
    name
    for name in os.listdir(test_data_path)
    if os.path.isfile(os.path.join(test_data_path, name)) and not name.startswith(".")
)
for test_data_file in test_data_files:
    df = pd.read_json(os.path.join(test_data_path, test_data_file), lines=True)
    test_data.append(df['text'].to_list())
    # Every text of a file gets the same label: 0 when the file name contains
    # 'human', 1 otherwise.
    file_label = 0 if 'human' in test_data_file else 1
    test_labels.append([file_label] * len(df))

In [None]:
# NOTE(review): the training corpora are re-tokenized with their shared
# vocabulary as whitelist, whereas the test data is tokenized here without any
# whitelist - confirm that this asymmetry is intended.
tokenized_test_data = [tokenize_corpus(data) for data in test_data]

### POS tagging with Spacy

Extract POS tags using [Spacy](https://spacy.io/) (_en_core_web_sm_ model for English) from the corpus

In [None]:
import spacy

nlp = spacy.load("en_core_web_sm", enable=['tok2vec', 'tagger'])

In [None]:
import time
import sys

fmt = "  Progress: {:>3}% estimated {:>3}s remaining"

In [None]:
def get_pos_tags(datasets):
    """Extract the POS tags of every text in every dataset.

    Each text is processed with the module-level spaCy pipeline ``nlp`` and
    turned into the list of ``tag_`` values of its tokens, skipping tokens
    flagged as stop words. Progress and an estimate of the remaining time
    are written to stdout while a dataset is being processed.

    Args:
        datasets: A sequence of datasets, each one a sequence of texts.

    Returns:
        A list with one entry per dataset; each entry is a list with one
        list of tag strings per text, in the original order.
    """
    tagged_datasets = []
    for i, dataset in enumerate(datasets):
        start = time.perf_counter()
        print(f'Extracting POS tags from dataset {i+1} of {len(datasets)}')
        size = len(dataset)
        sys.stdout.write(f" processing text 1/{size}")
        tagged_dataset = []
        # nlp.pipe already yields fully processed Doc objects: use them
        # directly instead of running every document through nlp() again.
        for j, doc in enumerate(nlp.pipe(dataset), start=1):
            tagged_dataset.append([tk.tag_ for tk in doc if not tk.is_stop])
            elapsed = time.perf_counter() - start
            # Linear extrapolation of the time spent so far over the texts left.
            remaining = round(elapsed * (size / j - 1))
            sys.stdout.write(f"\r processing text {j}/{size} - {fmt.format(100 * j // size, remaining)}")
        tagged_datasets.append(tagged_dataset)
        print('\n')
    return tagged_datasets

In [None]:
tagged_corpora = get_pos_tags(corpora)

Extract POS tags from test datasets

In [None]:
tagged_test_data = get_pos_tags(test_data)

## TF-IDF with N-Grams

Build weighted document-term matrices to vectorize textual data from the training data

### Model fitting

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import numpy as np

In [None]:
def identity(x):
    """Return ``x`` unchanged.

    Passed as both ``tokenizer`` and ``preprocessor`` to ``TfidfVectorizer``
    in this notebook, so documents reach the vectorizer exactly as provided.
    """
    return x

Representations for N-grams of up to 3 words are fitted on the tokenized training corpora

In [None]:
words_pipelines = []

In [None]:
# Fit one words TF-IDF + SVD pipeline per tokenized training corpus.
for X in tokenized_corpora:
    pipeline = make_pipeline(
        TfidfVectorizer(
            ngram_range=(1,3),
            max_features=1000000,
            sublinear_tf=True,
            min_df=3,
            # Documents are passed already tokenized: bypass the vectorizer's
            # own preprocessing and tokenization.
            tokenizer=identity,
            preprocessor=identity),
        # Seed the randomized SVD solver so that the embeddings (and every
        # result derived from them) are reproducible across runs.
        TruncatedSVD(n_components=500, random_state=seed)
    )
    pipeline.fit(X)
    words_pipelines.append(pipeline)

Representations for N-grams of 3 to 5 POS tags are fitted on the training corpora preprocessed using Spacy

In [None]:
tags_pipelines = []

In [None]:
# Fit one POS-tags TF-IDF + SVD pipeline per tagged training corpus.
for X in tagged_corpora:
    pipeline = make_pipeline(
        TfidfVectorizer(
            ngram_range=(3,5),
            max_features=1000000,
            sublinear_tf=True,
            # Documents are passed as lists of tags: bypass the vectorizer's
            # own preprocessing and tokenization.
            tokenizer=identity,
            preprocessor=identity),
        # Seed the randomized SVD solver so that the embeddings (and every
        # result derived from them) are reproducible across runs.
        TruncatedSVD(n_components=300, random_state=seed)
    )
    pipeline.fit(X)
    tags_pipelines.append(pipeline)

### Dataset vectorization

The fitted TF-IDF model is used to vectorize training and testing datasets

In [None]:
# Each training set is vectorized with the pipelines fitted on that same set.
embed_corpora = [
    words_pipe.transform(tokens)
    for words_pipe, tokens in zip(words_pipelines, tokenized_corpora)
]
embed_tagged_corpora = [
    tags_pipe.transform(tags)
    for tags_pipe, tags in zip(tags_pipelines, tagged_corpora)
]
# Words and POS-tags embeddings of each training set, joined along axis 1.
embed_combined_corpora = [
    np.concatenate([words_emb, tags_emb], axis=1)
    for words_emb, tags_emb in zip(embed_corpora, embed_tagged_corpora)
]

In [None]:
# Outer list: one entry per fitted pipeline (i.e. per training set);
# inner list: one embedding per test dataset, in test_data order.
embed_test_data = [
    [words_pipe.transform(tokens) for tokens in tokenized_test_data]
    for words_pipe in words_pipelines
]
embed_tagged_test_data = [
    [tags_pipe.transform(tags) for tags in tagged_test_data]
    for tags_pipe in tags_pipelines
]
# Same nesting, with words and POS-tags embeddings joined along axis 1.
embed_combined_test_data = [
    [
        np.concatenate([words_emb, tags_emb], axis=1)
        for words_emb, tags_emb in zip(words_embs, tags_embs)
    ]
    for words_embs, tags_embs in zip(embed_test_data, embed_tagged_test_data)
]

## Classification

The classification task is addressed using [XGBoost](https://xgboost.readthedocs.io/en/latest/index.html)

In [None]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
import numpy as np
import multiprocessing

In [None]:
# classifiers trained on words-only TF-IDF embeddings
words_classifiers = []
# classifiers trained on POS tags-only TF-IDF embeddings
tags_classifiers = []
# classifiers trained on words and POS tags TF-IDF combined embeddings
combined_classifiers = []

### Training

In [None]:
# Hyper-parameter grid searched for every classifier.
param_grid = {"max_depth": [3, 4, 5], "n_estimators": [500, 1000, 1500]}
# param_grid = {"max_depth": [3], "n_estimators": [1500]}  # best recorded settings

# For every training set, three classifiers are tuned: one on the words
# embeddings, one on the POS-tags embeddings and one on their concatenation.
for (embed_corpus, embed_tagged_corpus, embed_combined_corpus, y) in zip(
        embed_corpora, embed_tagged_corpora, embed_combined_corpora, labels):
    feature_sets = [embed_corpus, embed_tagged_corpus, embed_combined_corpus]
    classifier_lists = [words_classifiers, tags_classifiers, combined_classifiers]
    for (X, classifiers) in zip(feature_sets, classifier_lists):
        # Two grid-search workers, each fitting a model that uses half of the
        # available CPUs.
        base_model = xgb.XGBClassifier(
            n_jobs=multiprocessing.cpu_count() // 2,
            tree_method="hist",
        )
        clf = GridSearchCV(base_model, param_grid, verbose=1, n_jobs=2)
        clf.fit(X, y)
        classifiers.append(clf)

### Testing

Compute prediction accuracy for each test dataset (binary classification)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# predictions of classifiers trained on words-only TF-IDF embeddings
words_clf_predictions = []
# predictions of classifiers trained on POS tags-only TF-IDF embeddings
tags_clf_predictions = []
# predictions of classifiers trained on words and POS tags TF-IDF combined embeddings
combined_clf_predictions = []

In [None]:
# i indexes the training set: embed_*_test_data[i] holds the test embeddings
# produced by the pipelines fitted on training set i, and *_classifiers[i] the
# classifiers trained on that same training set.
for i in range(len(embed_test_data)):
    # Per-test-dataset predictions of the words, tags and words+tags classifiers.
    w_preds = []
    t_preds = []
    wt_preds = []
    # One iteration per test dataset, with its three embeddings side by side.
    for (embed_test_set, embed_tag_test_set, embed_combined_test_set) in zip(
        embed_test_data[i], embed_tagged_test_data[i], embed_combined_test_data[i]):
        # Pair every embedding with the matching classifier and accumulator.
        for (X, clf, preds) in zip([embed_test_set, embed_tag_test_set, embed_combined_test_set],
                                   [words_classifiers[i], tags_classifiers[i], combined_classifiers[i]],
                                   [w_preds, t_preds, wt_preds]):
            preds.append([round(y_pred) for y_pred in clf.best_estimator_.predict(X)])
    # Resulting layout: <kind>_clf_predictions[training set][test dataset][text].
    for preds, i_preds in zip([words_clf_predictions, tags_clf_predictions, combined_clf_predictions], [w_preds, t_preds, wt_preds]):
        preds.append(i_preds)

In [None]:
# One row per test file: dataset name, source, size, then the accuracy of every
# (training set, feature type) classifier on that file.
results = []
for i, test_data_file in enumerate(test_data_files):
    # Dataset name: the part of the file name that precedes the first ".test".
    results_line = [test_data_file.split(".test")[0]]
    # Apply the same file-name rule used when the test labels were built
    # ('human' in the name -> 0, anything else -> 1), so the reported source
    # always agrees with the labels the accuracies are computed against.
    results_line.append("human" if "human" in test_data_file else "machine")
    results_line.append(len(test_labels[i]))
    # Outer loop: training sets; inner loop: words, tags, words+tags classifiers.
    for (words_ds_pred, tags_ds_pred, combined_ds_pred) in zip(words_clf_predictions, tags_clf_predictions, combined_clf_predictions):
        for clf_pred in [words_ds_pred, tags_ds_pred, combined_ds_pred]:
            results_line.append(accuracy_score(test_labels[i], clf_pred[i]))
    results.append(results_line)

In [None]:
cols = ["dataset", "source", "size"]
for i in range(len(words_clf_predictions)):
    cols += [f"(ds-{i+1}) words", f"(ds-{i+1}) tags", f"(ds-{i+1}) words+tags"]

In [None]:
results_df = pd.DataFrame(results, columns=cols)

In [None]:
results_df