In [18]:
import json
import pandas as pd
import os

In [19]:
# Folder that load_dataset() joins dataset file names to by default.
data_path = "./"
# Folder where the training-set embeddings are written.
embed_path = "./output/train/embed"

In [20]:
import pickle

def load_dataset(file_path, prefix_path=data_path):
    """Load a dataset, choosing the reader from the file extension.

    Parameters
    ----------
    file_path : str
        File name (or relative path) of the dataset. Its extension selects
        the reader: ``.csv``, ``.jsonl`` (JSON Lines) or ``.pickle``.
        The extension is matched case-insensitively.
    prefix_path : str, optional
        Folder that ``file_path`` is joined to. The default is the value
        ``data_path`` had when this function was defined; re-assigning
        ``data_path`` afterwards does not change it.

    Returns
    -------
    pandas.DataFrame or object
        A DataFrame for ``.csv`` / ``.jsonl`` files, or whatever object was
        stored in the file for ``.pickle``.

    Raises
    ------
    ValueError
        If the extension is not supported (rather than silently returning
        ``None`` and failing later with an unrelated error).
    """
    full_path = os.path.join(prefix_path, file_path)
    extension = os.path.splitext(file_path)[-1].lower()
    if extension == ".csv":
        return pd.read_csv(full_path)
    if extension == ".jsonl":
        return pd.read_json(full_path, lines=True)
    if extension == ".pickle":
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(full_path, "rb") as f:
            return pickle.load(f)
    raise ValueError(
        f"Unsupported dataset extension {extension!r} for file {file_path!r}"
    )

# Words-embedding

## Load training datasets

In [21]:
# Training sets, resolved by load_dataset(): "real" documents get label 0 and
# "fake" documents get label 1 further down.
real_data_file = 'webtext.train.jsonl.clean100k.csv'
fake_data_file = 'xl-1542M.train.jsonl.clean100k.csv'

In [22]:
# Read both training sets, then stack them with the real documents first.
df_real = load_dataset(real_data_file)
df_fake = load_dataset(fake_data_file)

# One text per row; labels follow the same order (0 = real, 1 = fake).
corpus = df_real["text"].tolist() + df_fake["text"].tolist()
labels = [0] * len(df_real) + [1] * len(df_fake)

## Build vocabulary

In [23]:
from nlp_engine.preprocessing import transformers as tfs
from sklearn.pipeline import make_pipeline

In [36]:
# Tokenizer without a vocabulary whitelist: used to build the vocabularies
# and, later, the "raw" token statistics of the test sets.
tokenizer = make_pipeline(
    tfs.WordTokenizer(), 
    tfs.WordsFilter(drop_symbols=False, drop_digits=True)
)

In [26]:
# Tokenize the whole training corpus (real documents first, then fake ones).
tokenized_corpus = tokenizer.transform(corpus)

In [27]:
from nlp_engine.analysis import vocabulary

In [28]:
# The first len(df_real) tokenized documents are the real ones; the rest are fake.
n_real_docs = len(df_real)
vocab_real = vocabulary.get_vocabulary(tokenized_corpus[:n_real_docs])
vocab_fake = vocabulary.get_vocabulary(tokenized_corpus[n_real_docs:])

In [29]:
# Words that occur in both the real and the fake training vocabularies.
vocab_shared = vocab_real.intersection(vocab_fake)

In [30]:
# Report the size of each vocabulary and of their overlap.
print(
    f'Size of "real" vocabulary: {len(vocab_real)}\n'
    f'Size of "fake" vocabulary: {len(vocab_fake)}\n'
    f'Size of intersection: {len(vocab_shared)}'
)

Size of "real" vocabulary: 290187
Size of "fake" vocabulary: 511432
Size of intersection: 160741


## Compute TF-IDF embedding

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [32]:
def identity(x):
    """Return ``x`` unchanged.

    Passed as both ``tokenizer`` and ``preprocessor`` to ``TfidfVectorizer``
    so that documents which are already tokenized are used as they are.
    """
    return x

In [33]:
# Same tokenizer settings as `tokenizer`, plus a whitelist set to the
# vocabulary shared by the real and fake training documents.
preprocessing_pipeline = make_pipeline(
    tfs.WordTokenizer(),
    tfs.WordsFilter(
        drop_symbols=False, 
        drop_digits=True, 
        whitelist=vocab_shared)
)

In [None]:
%%time
# Tokenize the training corpus again, this time with the shared-vocabulary whitelist applied.
corpus_tokenized = preprocessing_pipeline.transform(corpus)

In [20]:
# TF-IDF over word 1- to 3-grams, followed by a truncated SVD down to 500 components.
# tokenizer/preprocessor are the identity function because the pipeline is
# fed documents that have already been tokenized (corpus_tokenized).
words_pipeline = make_pipeline(
    TfidfVectorizer(
        ngram_range=(1,3), 
        max_features=1000000, 
        sublinear_tf=True, 
        tokenizer=identity, 
        preprocessor=identity),
    TruncatedSVD(n_components=500)
)

In [21]:
%%time
# Fit TF-IDF + SVD on the full tokenized training corpus. Both steps are
# unsupervised in scikit-learn, so `labels` should not influence the fit.
words_pipeline.fit(corpus_tokenized, labels)



CPU times: user 34min 47s, sys: 21min 20s, total: 56min 8s
Wall time: 47min 46s


Transform training data and store it

In [23]:
# Split the tokenized corpus back into its two parts by position: the first
# len(df_real) documents are real, everything after them is fake.
# Slicing from n_real (instead of with a negative length) stays correct even
# when the fake set is empty, where corpus_tokenized[-0:] would return the
# whole corpus. It also avoids building throwaway lists just to count rows.
n_real = len(df_real)
real_embed = words_pipeline.transform(corpus_tokenized[:n_real])
fake_embed = words_pipeline.transform(corpus_tokenized[n_real:])

In [24]:
import pickle

embed_path = "./output/train/embed"
# Create the output folder up front so the writes below do not fail when it is missing.
os.makedirs(embed_path, exist_ok=True)

# Each embedding is stored as "<data file without '.jsonl.clean100k.csv'>.<suffix>.pickle".
for data_file, suffix, embedding in (
    (real_data_file, "human_embed", real_embed),
    (fake_data_file, "machine_embed", fake_embed),
):
    base_name = data_file.replace('.jsonl.clean100k.csv', '')
    with open(os.path.join(embed_path, f"{base_name}.{suffix}.pickle"), "wb") as f:
        pickle.dump(embedding, f)

## Embed test datasets

In [37]:
test_path = "./data/test"
# Regular, non-hidden files in the test folder, in alphabetical order.
test_dataset_files = sorted(
    name
    for name in os.listdir(test_path)
    if os.path.isfile(os.path.join(test_path, name)) and name[0] != "."
)

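Compute and store per-document text statistics (character and token counts with and without the shared-vocabulary whitelist)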

In [47]:
stats_path = "./output/test/stats"
# Make sure the output folder exists before the first CSV is written.
os.makedirs(stats_path, exist_ok=True)
stats_columns = ["raw_chars", "raw_tokens", "chars", "tokens", "ratio"]

for test in test_dataset_files:
    with open(os.path.join(test_path, test), "rb") as f:
        X = pickle.load(f)
    # "raw" tokens come from the tokenizer without a whitelist; the other
    # tokens come from the pipeline with the shared-vocabulary whitelist.
    X_tokens_raw = tokenizer.transform(X)
    X_tokens = preprocessing_pipeline.transform(X)
    stats = []
    for tokens_raw, tokens in zip(X_tokens_raw, X_tokens):
        raw_text_tokens = len(tokens_raw)
        text_tokens = len(tokens)
        # Ratio of whitelisted tokens to raw tokens; 0 when a document has no raw tokens.
        ratio = text_tokens / raw_text_tokens if raw_text_tokens > 0 else 0
        stats.append([
            len("".join(tokens_raw)),  # characters in the raw tokens
            raw_text_tokens,
            len("".join(tokens)),      # characters in the whitelisted tokens
            text_tokens,
            ratio,
        ])
    df = pd.DataFrame(stats, columns=stats_columns)
    # One CSV per test file: "<test file without extension>_stats.csv".
    df.to_csv(os.path.join(stats_path, f"{os.path.splitext(test)[0]}_stats.csv"))

In [29]:
embed_path = "./output/test/embed"
# Create the output folder up front so the writes below do not fail when it is missing.
os.makedirs(embed_path, exist_ok=True)

for test in test_dataset_files:
    with open(os.path.join(test_path, test), "rb") as f:
        X = pickle.load(f)
    # Tokenize with the shared-vocabulary whitelist, then apply the fitted
    # TF-IDF + SVD pipeline (the original line had an unbalanced parenthesis).
    X_embed = words_pipeline.transform(preprocessing_pipeline.transform(X))
    out_name = f"{os.path.splitext(test)[0]}_embed.pickle"
    with open(os.path.join(embed_path, out_name), "wb") as f:
        pickle.dump(X_embed, f)

# TAG-embedding

In [30]:
# Folder holding the tag datasets; passed explicitly to load_dataset() below.
data_path = "./data/tag"
# NOTE(review): embed_path is assigned again in a later cell before it is read,
# so this value does not appear to be used.
embed_path = "./data/tag/embed"

## Load training datasets

In [31]:
# Pickled tag versions of the two training sets (real first, fake second).
real_data_file = 'webtext.train.jsonl.clean100k.csv.tag.pickle'
fake_data_file = 'xl-1542M.train.jsonl.clean100k.csv.tag.pickle'

In [32]:
# data_path is passed explicitly because load_dataset()'s default prefix was
# bound to the earlier value of data_path when the function was defined.
ds_real = load_dataset(real_data_file, data_path)
ds_fake = load_dataset(fake_data_file, data_path)

In [34]:
# Real documents first, then fake ones; labels follow the same order (0 = real, 1 = fake).
corpus = ds_real + ds_fake
labels = [0] * len(ds_real) + [1] * len(ds_fake)

## Compute TF-IDF embedding

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [36]:
def identity(x):
    """Return ``x`` unchanged.

    Passed as both ``tokenizer`` and ``preprocessor`` to ``TfidfVectorizer``
    so the vectorizer uses its input documents as they are.
    """
    return x

In [37]:
# TF-IDF over 3- to 5-grams, followed by a truncated SVD down to 300 components.
# tokenizer/preprocessor are the identity function, so each document is used
# exactly as stored in the tag datasets.
tags_pipeline = make_pipeline(
    TfidfVectorizer(
        ngram_range=(3,5), 
        max_features=1000000, 
        sublinear_tf=True, 
        tokenizer=identity, 
        preprocessor=identity),
    TruncatedSVD(n_components=300)
)

In [38]:
%%time
# Fit TF-IDF + SVD on the combined tag corpus. Both steps are unsupervised in
# scikit-learn, so `labels` should not influence the fit.
tags_pipeline.fit(corpus, labels)

CPU times: user 26min 42s, sys: 2min 19s, total: 29min 1s
Wall time: 21min 9s


Transform training data and store it

In [40]:
# Embed each training tag dataset separately with the fitted pipeline.
real_embed = tags_pipeline.transform(ds_real)
fake_embed = tags_pipeline.transform(ds_fake)

In [41]:
import pickle

embed_path = "./output/train/embed"

# Strip the data-file suffixes to get the base of each output name.
real_stem = real_data_file.replace('.jsonl.clean100k.csv','').replace('.pickle','')
fake_stem = fake_data_file.replace('.jsonl.clean100k.csv','').replace('.pickle','')
real_target = os.path.join(embed_path, f"{real_stem}.human_embed.pickle")
fake_target = os.path.join(embed_path, f"{fake_stem}.machine_embed.pickle")

with open(real_target, "wb") as f:
    pickle.dump(real_embed, f)
with open(fake_target, "wb") as f:
    pickle.dump(fake_embed, f)

## Embed test datasets

In [42]:
test_path = "./data/tag/test"
embed_path = "./output/test/embed"
# Regular, non-hidden files whose name contains "tag", in alphabetical order.
test_dataset_files = sorted(
    name
    for name in os.listdir(test_path)
    if os.path.isfile(os.path.join(test_path, name))
    and name[0] != "."
    and "tag" in name
)

In [43]:
for test in test_dataset_files:
    source_file = os.path.join(test_path, test)
    # Output name: the input name with ".pickle" replaced by "_embed.pickle".
    target_file = os.path.join(embed_path, test.replace('.pickle','_embed.pickle'))
    with open(source_file, "rb") as f:
        data = pickle.load(f)
    X_embed = tags_pipeline.transform(data)
    with open(target_file, "wb") as f:
        pickle.dump(X_embed, f)

# Classification

## Load training data

In [48]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import xgboost as xgb
import multiprocessing
import pickle
import numpy as np
import pandas as pd

In [49]:
# Pickled training embeddings: word TF-IDF ("tfidf_*") and tag TF-IDF ("tag_*"),
# each for the real (human) and fake (machine) training set.
tfidf_real_path = "./output/train/embed/webtext.train.human_embed.pickle"
tfidf_fake_path = "./output/train/embed/xl-1542M.train.machine_embed.pickle"
tag_real_path = "./output/train/embed/webtext.train.tag.human_embed.pickle"
tag_fake_path = "./output/train/embed/xl-1542M.train.tag.machine_embed.pickle"

In [50]:
def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, "rb") as f:
        return pickle.load(f)


X_tfidf_real = _read_pickle(tfidf_real_path)
X_tfidf_fake = _read_pickle(tfidf_fake_path)
X_tag_real = _read_pickle(tag_real_path)
X_tag_fake = _read_pickle(tag_fake_path)

# Stack rows real-first for both feature sets; labels follow that order (0 = real, 1 = fake).
X_tfidf = np.concatenate([X_tfidf_real, X_tfidf_fake])
X_tag = np.concatenate([X_tag_real, X_tag_fake])
y_train = [0] * len(X_tfidf_real) + [1] * len(X_tfidf_fake)

# Word and tag features side by side (column-wise) for the combined model.
X_tfidf_tag = np.concatenate([X_tfidf, X_tag], axis=1)

## Train models

In [51]:
def _make_grid_search(n_estimators):
    """Build a GridSearchCV around a fresh XGBClassifier with a one-point grid.

    Only ``n_estimators`` differs between the three classifiers; ``max_depth``
    is fixed to 3.
    """
    return GridSearchCV(
        xgb.XGBClassifier(n_jobs=multiprocessing.cpu_count() // 2, tree_method="hist"),
        # wider grid, currently disabled:
        #{"max_depth": [3, 4, 5], "n_estimators": [500, 1000, 1500]},
        {"max_depth": [3], "n_estimators": [n_estimators]},
        verbose=1,
        n_jobs=2,
    )


clf_tfidf = _make_grid_search(1500)
clf_tag = _make_grid_search(500)
clf_tfidf_tag = _make_grid_search(1500)

In [52]:
%%time
# Cross-validated fit of the classifier on the word TF-IDF embeddings.
clf_tfidf.fit(X_tfidf, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
CPU times: user 3min 47s, sys: 5.35 s, total: 3min 52s
Wall time: 3min 27s


In [53]:
# Cross-validation results of the word TF-IDF classifier as a table.
pd.DataFrame(clf_tfidf.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,55.747911,6.012172,0.096856,0.005631,3,1500,"{'max_depth': 3, 'n_estimators': 1500}",0.716675,0.719025,0.71525,0.716375,0.7159,0.716645,0.001284,1


In [54]:
%%time
# Cross-validated fit on the tag embeddings. y_train was built from the word
# embeddings, so this assumes both feature sets have the same number of real
# and fake rows in the same order. TODO confirm.
clf_tag.fit(X_tag, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits




CPU times: user 1min 3s, sys: 4.89 s, total: 1min 8s
Wall time: 53.1 s


In [55]:
# Cross-validation results of the tag classifier as a table.
pd.DataFrame(clf_tag.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,13.52491,1.538569,0.048421,0.010208,3,500,"{'max_depth': 3, 'n_estimators': 500}",0.68415,0.682025,0.6844,0.687175,0.68535,0.68462,0.001677,1


In [56]:
%%time
# Cross-validated fit on the concatenated word + tag embeddings.
clf_tfidf_tag.fit(X_tfidf_tag, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits




CPU times: user 6min 18s, sys: 8.4 s, total: 6min 26s
Wall time: 5min 41s


In [57]:
# Cross-validation results of the combined word + tag classifier as a table.
pd.DataFrame(clf_tfidf_tag.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,92.510934,13.71437,0.12882,0.018558,3,1500,"{'max_depth': 3, 'n_estimators': 1500}",0.736575,0.736175,0.7371,0.736225,0.738625,0.73694,0.000905,1


## Test models

In [58]:
from sklearn.metrics import accuracy_score

In [59]:
embed_path = "./output/test/embed"
# Regular, non-hidden embedding files whose name contains "tag", in alphabetical order.
test_dataset_tag_files = sorted(
    name
    for name in os.listdir(embed_path)
    if os.path.isfile(os.path.join(embed_path, name))
    and name[0] != "."
    and "tag" in name
)

In [60]:
# Name of the word-embedding file that matches each tag-embedding file.
test_dataset_files = [
    tag_name.replace(".tag","") for tag_name in test_dataset_tag_files
]

In [61]:
for test_dataset_tag_file in test_dataset_tag_files:
    # The word-embedding file has the same name as the tag file minus ".tag".
    test_dataset_file = test_dataset_tag_file.replace(".tag","")
    with open(os.path.join(embed_path, test_dataset_file), "rb") as f:
        X_test_tfidf = pickle.load(f)
    with open(os.path.join(embed_path, test_dataset_tag_file), "rb") as f:
        X_test_tag = pickle.load(f)
    X_test_tfidf_tag = np.concatenate([X_test_tfidf, X_test_tag], axis=1)

    # Every document of a file gets the same label: 1 when the file name
    # contains "machine", otherwise 0.
    label = int("machine" in test_dataset_file)
    y_test = [label] * len(X_test_tfidf)

    print(f'******\nTest dataset: {test_dataset_file.replace("_embed.pickle","")}')
    # Accuracy of each classifier on its own feature set, in a fixed order.
    for title, clf, features in (
        ('TF-IDF words:', clf_tfidf, X_test_tfidf),
        ('TF-IDF tags:', clf_tag, X_test_tag),
        ('TF-IDF words+tags:', clf_tfidf_tag, X_test_tfidf_tag),
    ):
        print(title)
        y_pred = [round(yy) for yy in clf.best_estimator_.predict(features)]
        print(accuracy_score(y_test, y_pred))

******
Test dataset: gpt2.large-762M-k40.test.machine
TF-IDF words:
0.0852
TF-IDF tags:
0.1004
TF-IDF words+tags:
0.0888
******
Test dataset: gpt2.large-762M.test.machine
TF-IDF words:
0.7784
TF-IDF tags:
0.7262
TF-IDF words+tags:
0.7956
******
Test dataset: gpt2.medium-345M-k40.test.machine
TF-IDF words:
0.1238
TF-IDF tags:
0.1426
TF-IDF words+tags:
0.137
******
Test dataset: gpt2.medium-345M.test.machine
TF-IDF words:
0.8806
TF-IDF tags:
0.8836
TF-IDF words+tags:
0.9008
******
Test dataset: gpt2.small-117M-k40.test.machine
TF-IDF words:
0.12
TF-IDF tags:
0.114
TF-IDF words+tags:
0.1274
******
Test dataset: gpt2.small-117M.test.machine
TF-IDF words:
0.8724
TF-IDF tags:
0.8308
TF-IDF words+tags:
0.8854
******
Test dataset: gpt2.xl-1542M-k40.test.machine
TF-IDF words:
0.096
TF-IDF tags:
0.1158
TF-IDF words+tags:
0.1002
******
Test dataset: gpt2.xl-1542M.test.machine
TF-IDF words:
0.7578
TF-IDF tags:
0.7062
TF-IDF words+tags:
0.7668
******
Test dataset: gpt3.175b_samples.machine
TF-IDF w

Analyze classification performance as a function of document statistics (minimum token count)

In [88]:
from sklearn.metrics import precision_recall_fscore_support

In [70]:
# Folders with the test embeddings and the per-document statistics.
embed_path = "./output/test/embed"
stats_path = "./output/test/stats"
# Test sets analysed together; a name containing "machine" is labelled 1, otherwise 0.
test_datasets = ["webtext.test.human", "gpt2.xl-1542M.test.machine"]

In [77]:
# Collect the pieces per dataset first, then concatenate each kind once.
tfidf_parts = []
tag_parts = []
stats_parts = []
y_test = []
for test_ds in test_datasets:
    with open(os.path.join(embed_path, f"{test_ds}_embed.pickle"), "rb") as f:
        ds_embed = pickle.load(f)
    tfidf_parts.append(ds_embed)
    # One label per document: 1 when the dataset name contains "machine", else 0.
    y_test += [int("machine" in test_ds)] * len(ds_embed)
    with open(os.path.join(embed_path, f"{test_ds}.tag_embed.pickle"), "rb") as f:
        tag_parts.append(pickle.load(f))
    stats_parts.append(
        pd.read_csv(os.path.join(stats_path, f"{test_ds}_stats.csv"), index_col=0)
    )

X_test_tfidf = np.concatenate(tfidf_parts)
X_test_tag = np.concatenate(tag_parts)
df_stats = pd.concat(stats_parts)
# Word and tag features side by side (column-wise) for the combined model.
X_test_tfidf_tag = np.concatenate([X_test_tfidf, X_test_tag], axis=1)

In [79]:
# Predicted labels of each best estimator on its own feature set, rounded to integers.
y_pred_tfidf = list(map(round, clf_tfidf.best_estimator_.predict(X_test_tfidf)))
y_pred_tag = list(map(round, clf_tag.best_estimator_.predict(X_test_tag)))
y_pred_tfidf_tag = list(map(round, clf_tfidf_tag.best_estimator_.predict(X_test_tfidf_tag)))

In [80]:
# Attach the true label and each model's predictions to the per-document statistics.
for column, values in (
    ("label", y_test),
    ("label_tfidf", y_pred_tfidf),
    ("label_tag", y_pred_tag),
    ("label_tfidf_tag", y_pred_tfidf_tag),
):
    df_stats[column] = values

In [114]:
# Per-class metrics are always requested for both labels (0 = human, 1 = machine).
# Without an explicit label list, a subset in which only one class occurs (in
# both the true and the predicted labels) yields 4 per-class values instead of
# 8, and the row no longer lines up with the 12 columns it is later written under.
class_labels = [0, 1]

class_stats = dict(tfidf=[], tag=[], tfidf_tag=[])
# For every observed token count i, score each method on documents with at least i tokens.
for i in sorted(set(df_stats.tokens.to_list())):
    df = df_stats[df_stats.tokens >= i]
    # stop if sample size below 50 elements
    if len(df) < 50:
        break
    y_true = df.label.to_list()
    for method in class_stats.keys():
        y_pred = df[f"label_{method}"].to_list()
        prec, rec, fsc, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
        per_class = precision_recall_fscore_support(
            y_true, y_pred, labels=class_labels, average=None
        )
        # Row layout: threshold, weighted P/R/F1, then per-class precision,
        # recall, F1 and support, each as (human, machine).
        st = [i, prec, rec, fsc] + [item for sublist in per_class for item in list(sublist)]
        class_stats[method].append(st)

In [116]:
# Column layout of each row collected in class_stats.
stat_columns = [
    "min_tokens",
    "precision_m", "recall_m", "f1_m",
    "precision_human", "precision_machine",
    "recall_human", "recall_machine",
    "f1_human", "f1_machine",
    "support_human", "support_machine",
]
# One CSV per method, written to the current working directory.
for method, rows in class_stats.items():
    pd.DataFrame(rows, columns=stat_columns).to_csv(f"{method}_stats.csv")