In [52]:
import json
import pandas as pd
import os
import ipywidgets as widgets

In [53]:
data_path = "./data"
datasets = sorted([f for f in os.listdir(data_path) if os.path.isfile(os.path.join(data_path, f))])

## Batch describe all datasets

In [7]:
def get_df_stats(df):
    description = df.describe(include='all')
    n_entries = description.loc['count']['text']
    max_tokens = int(description.loc['max']['length'])
    min_tokens = int(description.loc['min']['length'])
    avg_tokens = description.loc['mean']['length']
    return [n_entries, max_tokens, min_tokens, avg_tokens]

In [10]:
stats = {}
for i, dataset in enumerate(datasets):
    path = os.path.join(data_path, dataset)
    if os.path.exists(path):
        df = pd.read_json(path, lines = True)
        stats[i] = dataset.split('.')[0:2] + get_df_stats(df)

df_stats = pd.DataFrame.from_dict(stats, 
                                  orient='index', 
                                  columns=["source", "split", "n_entries", "max_tokens", "min_tokens", "avg_tokens"])

In [11]:
df_stats

Unnamed: 0,source,split,n_entries,max_tokens,min_tokens,avg_tokens
0,large-762M-k40,test,5000,1024,4,638.4196
1,large-762M-k40,train,250000,1030,1,639.871452
2,large-762M-k40,valid,5000,1024,5,640.9304
3,large-762M,test,5000,1029,2,652.2074
4,large-762M,train,250000,1032,0,654.371136
5,large-762M,valid,5000,1024,0,647.239
6,medium-345M-k40,test,5000,1024,5,656.3938
7,medium-345M-k40,train,250000,1036,1,658.713728
8,medium-345M-k40,valid,5000,1024,6,658.0228
9,medium-345M,test,5000,1032,1,670.952


In [None]:
import plotly.express as px

for y in list(df_stats.columns)[-3:]:
    fig = px.bar(df_stats, x='split', color='source', y=y, barmode='group', title=y)
    fig.show()

In [55]:
from sklearn.feature_extraction.text import CountVectorizer

In [56]:
from nlp_engine.preprocessing import transformers as tfs
from sklearn.pipeline import make_pipeline

def tokenize_corpus(corpus):
    tokenized_corpus = make_pipeline(
        tfs.WordTokenizer(), 
        tfs.WordsFilter(drop_symbols=True, drop_digits=True)
    ).fit_transform(corpus)
    return tokenized_corpus

In [57]:
def get_features(corpus, min_df=1):
    vectorizer = CountVectorizer(min_df=min_df)
    X = vectorizer.fit_transform(corpus)
    return set(vectorizer.get_feature_names_out())

In [9]:
datasets

['large-762M-k40.test.jsonl',
 'large-762M-k40.train.jsonl',
 'large-762M-k40.valid.jsonl',
 'large-762M.test.jsonl',
 'large-762M.train.jsonl',
 'large-762M.valid.jsonl',
 'medium-345M-k40.test.jsonl',
 'medium-345M-k40.train.jsonl',
 'medium-345M-k40.valid.jsonl',
 'medium-345M.test.jsonl',
 'medium-345M.train.jsonl',
 'medium-345M.valid.jsonl',
 'small-117M-k40.test.jsonl',
 'small-117M-k40.train.jsonl',
 'small-117M-k40.valid.jsonl',
 'small-117M.test.jsonl',
 'small-117M.train.jsonl',
 'small-117M.valid.jsonl',
 'webtext.test.jsonl',
 'webtext.train.jsonl',
 'webtext.valid.jsonl',
 'xl-1542M-k40.test.jsonl',
 'xl-1542M-k40.train.jsonl',
 'xl-1542M-k40.valid.jsonl',
 'xl-1542M.test.jsonl',
 'xl-1542M.train.jsonl',
 'xl-1542M.valid.jsonl']

In [15]:
selected_datasets = [x for x in datasets if "train" in x]

In [16]:
selected_datasets

['large-762M-k40.train.jsonl',
 'large-762M.train.jsonl',
 'medium-345M-k40.train.jsonl',
 'medium-345M.train.jsonl',
 'small-117M-k40.train.jsonl',
 'small-117M.train.jsonl',
 'webtext.train.jsonl',
 'xl-1542M-k40.train.jsonl',
 'xl-1542M.train.jsonl']

In [73]:
selected_datasets = ['xl-1542M-k40.train.jsonl']

In [74]:
tokens = {}
for dataset in selected_datasets:
    path = os.path.join(data_path, dataset)
    if os.path.exists(path):
        df = pd.read_json(path, lines = True)
        tokens[dataset] = tokenize_corpus(df['text'].to_list())

In [7]:
import pickle

with open("tokens.pl", "wb") as file:
    pickle.dump(tokens, file)

In [3]:
import pickle

with open("tokens.pl", "rb") as file:
    tokens = pickle.load(file)

In [19]:
features = {}
for dataset in datasets:
    path = os.path.join(data_path, dataset)
    if os.path.exists(path):
        df = pd.read_json(path, lines = True)
        features[dataset] = get_features(df['text'].to_list(), min_df=1)

English vocabulary:

https://github.com/dwyl/english-words

In [59]:
with open("words_dictionary.json", "r") as vocab_file:
    eng_vocab = set(json.loads(vocab_file.read()))

In [5]:
vocab_stats = {}
for i, dataset in enumerate(tokens):
    vocab = set([tk for tks in tokens[dataset] for tk in set(tks)])
    eng_words = len(eng_vocab.intersection(vocab))
    ratio = int(eng_words*100 / len(vocab))
    vocab_stats[i] = dataset.split(".")[0:2] + [len(vocab), eng_words, ratio]

df_features_stats = pd.DataFrame.from_dict(vocab_stats, 
                                           orient='index',
                                           columns=["source", "split", "vocabulary size", "english words", "eng%"])

In [30]:
features_stats = {}
for i, dataset in enumerate(features):
    vocab_size = len(features[dataset])
    eng_words = len(eng_vocab.intersection(features[dataset]))
    ratio = int(eng_words*100 / vocab_size)
    features_stats[i] = dataset.split(".")[0:2] + [vocab_size, eng_words, ratio]

df_features_stats = pd.DataFrame.from_dict(features_stats, 
                                           orient='index',
                                           columns=["source", "split", "vocabulary size", "english words", "eng%"])

In [6]:
df_features_stats#[df_features_stats["split"]=="train"]

Unnamed: 0,source,split,vocabulary size,english words,eng%
0,xl-1542M,train,1432488,133350,9
1,webtext,train,660492,115490,17


In [18]:
features_stats = {}
uncommon_tokens_gen = {}
uncommon_tokens_real = {}
i = 0
for split in ['train']:#, 'test', 'valid']:
    feats = features['.'.join(['webtext', split, 'jsonl'])]
    for dataset in features:
        if split in dataset:
            features_stats[i] = dataset.split(".")[0:2] + [len(features[dataset])]
            i += 1
            if 'webtext' not in dataset:
                uncommon_tokens_gen[dataset] = features[dataset] - feats
                uncommon_tokens_real[dataset] = feats - features[dataset]
                features_stats[i] = [dataset.split(".")[0] + " ∩ WebText", split, len(feats.intersection(features[dataset]))]
                i += 1
    
df_features_stats = pd.DataFrame.from_dict(features_stats, 
                                           orient='index',
                                           columns=["source", "split", "vocabulary size"])

In [77]:
features_stats = {
    0 : ['WebText', "min_df=2", len(features['webtext.train.jsonl'])],
    1 : ['GPT2-xl', "min_df=2", len(features['xl-1542M.train.jsonl'])],
    2 : ['WebText ∩ GPT2-xl', "min_df=2", len(features['webtext.train.jsonl'].intersection(features['xl-1542M.train.jsonl']))]
}
df_features_stats = pd.DataFrame.from_dict(features_stats, 
                                           orient='index',
                                           columns=["source", "minimum doc frequency", "vocabulary size"])

In [78]:
df_features_stats_1["minimum doc frequency"] = ["min_df=1", "min_df=1", "min_df=1"]

In [56]:
df_features_stats_1

Unnamed: 0,source,vocabulary size,minimum doc frequency
0,WebText,755966,df=1
1,GPT2-xl,1571962,df=1
2,WebText ∩ GPT2-xl,326689,df=1


In [57]:
df_features_stats

Unnamed: 0,source,minimum doc frequency,vocabulary size
0,WebText,df=2,265616
1,GPT2-xl,df=2,345980
2,WebText ∩ GPT2-xl,df=2,199511


In [79]:
df_features_stats_2 = pd.concat([df_features_stats_1, df_features_stats])

In [61]:
df_features_stats_2

Unnamed: 0,source,vocabulary size,minimum doc frequency
0,WebText,755966,df=1
1,GPT2-xl,1571962,df=1
2,WebText ∩ GPT2-xl,326689,df=1
0,WebText,265616,df=2
1,GPT2-xl,345980,df=2
2,WebText ∩ GPT2-xl,199511,df=2


In [37]:
df_features_stats_1 = df_features_stats

In [80]:
import plotly.express as px

px.defaults.template = "simple_white"
fig = px.bar(df_features_stats_2, x='minimum doc frequency', color='source', y='vocabulary size', barmode='group',
            color_discrete_sequence=px.colors.qualitative.D3, width=550, height=450)
fig.update_layout(legend=dict(
    yanchor="top",
    y=0.99,
    xanchor="right",
    x=0.99
))
fig.show()

In [47]:
import plotly.express as px

fig = px.bar(df_features_stats, x='source', color='source', y='vocabulary size',# title='Unique words in corpora',
            color_discrete_sequence=px.colors.qualitative.D3, width=550, height=450)
fig.update_layout(showlegend=False)
fig.show()

In [57]:
import random

print("\n".join(random.sample(uncommon_tokens_gen['large-762M.train.jsonl'], 20)))

dcplay
nacaid
hanlight
whereisgodaddy181
9n6hmm6jhua
legerly
acéloniques
pgps74
mechitarum
ragettebitch
izappel
concromer
izomin
caios
topontitelika
mistracker
neuralyzing
hyperfactual
gjorleifsson
foreknowledgenew



Sampling from a set deprecated
since Python 3.9 and will be removed in a subsequent version.



In [54]:
len(uncommon_tokens_gen['large-762M.train.jsonl'])

1422459

In [55]:
len(uncommon_tokens_real['large-762M.train.jsonl'])

432587

In [56]:
import random

print("\n".join(random.sample(uncommon_tokens_real['large-762M.train.jsonl'], 20)))

tinwe
colelction
senkungu
18740
74d1
opengrok
see5g
1331835584967
sensorclass
skypricker
charitar
hanci
amicizia
29449
needed10
shurlok
leafcrown
szblank
purecomputed
nannra



Sampling from a set deprecated
since Python 3.9 and will be removed in a subsequent version.



In [90]:
found = 0
for dataset in features:
    if 'nardelli' in features[dataset]:
        found += 1
print(f"Found in {found} dataset over {len(features)}")

Found in 11 dataset over 27


## Selective dataset analysis

In [11]:
ds_select = widgets.Dropdown(
    options = sorted([f for f in os.listdir(data_path) if os.path.isfile(os.path.join(data_path, f))]),
    #description='dataset to load:',
    disabled=False,
)
load_button = widgets.Button(
    description='load',
    disabled=False,
    button_style='', # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Click to load dataset',
    icon='download' # (FontAwesome names without the `fa-` prefix)
)
output = widgets.Output()

def load_ds(b):
    output.clear_output()
    with output:
        print("loading dataset...")
        path = os.path.join(data_path, ds_select.value)
        if os.path.exists(path):
            b.value = pd.read_json(path, lines = True)
            print(f"Dataset {ds_select.value} loaded")
        else:
            print(f"Path {path} for dataset {ds_select.value} does not exist!")

load_button.on_click(load_ds)


widgets.VBox([widgets.Label(value="Select dataset to load:"), widgets.HBox([ds_select, load_button]), output])

VBox(children=(Label(value='Select dataset to load:'), HBox(children=(Dropdown(options=('large-762M-k40.test.j…

In [42]:
df_dataset = load_button.value

In [75]:
dataset_name = "xl-1542M-k40.train.jsonl"

In [60]:
dataset_name = "webtext.train.jsonl"

In [76]:
df_dataset = pd.read_json(os.path.join(data_path, dataset_name), lines = True)

In [17]:
tokens.keys()

dict_keys(['large-762M-k40.test.jsonl', 'large-762M-k40.train.jsonl', 'large-762M-k40.valid.jsonl', 'large-762M.test.jsonl', 'large-762M.train.jsonl', 'large-762M.valid.jsonl', 'medium-345M-k40.test.jsonl', 'medium-345M-k40.train.jsonl', 'medium-345M-k40.valid.jsonl', 'medium-345M.test.jsonl', 'medium-345M.train.jsonl', 'medium-345M.valid.jsonl', 'small-117M-k40.test.jsonl', 'small-117M-k40.train.jsonl', 'small-117M-k40.valid.jsonl', 'small-117M.test.jsonl', 'small-117M.train.jsonl', 'small-117M.valid.jsonl', 'webtext.test.jsonl', 'webtext.train.jsonl', 'webtext.valid.jsonl', 'xl-1542M-k40.test.jsonl', 'xl-1542M-k40.train.jsonl', 'xl-1542M-k40.valid.jsonl', 'xl-1542M.test.jsonl', 'xl-1542M.train.jsonl', 'xl-1542M.valid.jsonl'])

In [15]:
df_dataset.iloc[5,1]

'The European railway sector is receiving a pummelling as British government eurosceptics seize on a report showing that the EU-IMF bailout crisis has hardened the continent\'s financial landscape. Because there is no EU budget obligation to fund train lines, EU funds have been diverted to other countries\' rail networks.\n\nAnd because there is no EU binding requirement to share cost better with cleaners, cleaning staff can now be filmed as they are sacked.\n\nTalks between lenders and national profiteering agencies are taking place in volume hoping to pick up the shortfall. Angry operators will protest against the unfairness of years of state handouts to companies that have secured rail contracts, and unions – already in crisis, though subsidised by taxpayers – will warn that their influence is pressured by politicians on all sides. December\'s European elections could change things.\n\nThe Olso Network in Finland – the nation\'s fourth major identity-regulated railway – narrowly esc

In [None]:
df_dataset.head()

In [77]:
eng_vocab_overlap = []
eng_ratio = []
non_eng_vocab_count = []
for i, tks in enumerate(tokens[dataset_name]):
    non_eng_vocabs = set()
    count = 0
    for tk in tks:
        if tk in eng_vocab:
            count += 1
        else:
            non_eng_vocabs.add(tk)
    eng_vocab_overlap.append(count)
    non_eng_vocab_count.append(len(non_eng_vocabs))
    if len(tks) == 0:
        eng_ratio.append(100)
        #print(df_dataset.iloc[i,1])
    else:
        eng_ratio.append(int(count*100/len(tks)))

#eng_vocab_overlap = [len(eng_vocab.intersection(set(word_tokenize(text)))) for text in texts]

In [35]:
from nltk.tokenize import word_tokenize

In [43]:
texts = df_dataset["text"].to_list()

In [56]:
eng_vocab_overlap = []
eng_ratio = []
zero_length_texts = []
for text in texts:
    tokenized = word_tokenize(text.lower())
    count = 0
    for tk in tokenized:
        if tk in eng_vocab:
            count += 1
    eng_vocab_overlap.append(count)
    if len(tokenized) == 0:
        eng_ratio.append(100)
        zero_length_texts.append(text)
    else:
        eng_ratio.append(int(count*100/len(tokenized)))

#eng_vocab_overlap = [len(eng_vocab.intersection(set(word_tokenize(text)))) for text in texts]

In [78]:
df_dataset["english words"] = eng_vocab_overlap
df_dataset["english %"] = eng_ratio
df_dataset["non-eng vocabs"] = non_eng_vocab_count #uniques

In [80]:
df_dataset[df_dataset["english %"]>95]

Unnamed: 0,id,text,length,ended,english words,english %,non-eng vocabs
0,1,"Cops will have to take ""extreme care"" to avoid...",433,True,334,97,7
1,2,The latest edition of the German Football Hall...,715,True,534,96,12
2,3,"Dangerous animals, especially snakes, in a car...",1024,False,852,99,1
3,4,I started my first MLP story way back in 2014 ...,1024,False,767,97,7
4,5,This article is a disambiguation page for The ...,84,True,69,100,0
...,...,...,...,...,...,...,...
249992,249993,"A Texas woman is suing Apple, claiming to have...",1024,False,807,96,7
249993,249994,Rise: The Dawn of Justice: $3-4 per copy\n\nRi...,874,True,322,100,0
249994,249995,"""You are like a child and you do not know what...",38,True,33,100,0
249995,249996,There are a lot of things that I don't like ab...,806,True,688,98,2


In [19]:
df_dataset[df_dataset["english %"]<10].to_csv("bad_samples.csv")

In [38]:
df_dataset.sort_values(by=["non-eng vocabs"], ascending=False)[:10000].to_csv("xl_vocabs_study.csv")

In [81]:
df_ge90eng = df_dataset[df_dataset["english %"]>=95]

In [82]:
df_ge90eng[df_ge90eng["english words"]>20]

Unnamed: 0,id,text,length,ended,english words,english %,non-eng vocabs
0,1,"Cops will have to take ""extreme care"" to avoid...",433,True,334,97,7
1,2,The latest edition of the German Football Hall...,715,True,534,96,12
2,3,"Dangerous animals, especially snakes, in a car...",1024,False,852,99,1
3,4,I started my first MLP story way back in 2014 ...,1024,False,767,97,7
4,5,This article is a disambiguation page for The ...,84,True,69,100,0
...,...,...,...,...,...,...,...
249993,249994,Rise: The Dawn of Justice: $3-4 per copy\n\nRi...,874,True,322,100,0
249994,249995,"""You are like a child and you do not know what...",38,True,33,100,0
249995,249996,There are a lot of things that I don't like ab...,806,True,688,98,2
249996,249997,A year after an unprecedented public outcry ag...,1024,False,776,95,6


In [71]:
df_ge90eng[df_ge90eng["english words"]>20].sample(n=100000).set_index("id")

Unnamed: 0_level_0,ended,length,text,english words,english %,non-eng vocabs
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
113106,True,409,"Hartals are a familiar ritual in Kerala, but M...",296,96,7
82299,True,204,About our Arsenal Football Site\n\nWelcome to ...,166,97,4
169383,True,782,Definition\n\nUnder supervision and according ...,597,99,2
142626,True,661,Attention stoners: keep your weed away from yo...,498,97,9
47827,True,516,1-3/4-in x 36-in Silver Drip Cap Door Bottom i...,391,98,4
...,...,...,...,...,...,...
60406,True,212,"LOS ANGELES — David Litt, known as President O...",150,92,8
218165,True,762,Predicting a sharp and somewhat bewildering sh...,584,93,13
208074,True,348,The European Commission could look to take som...,282,99,2
100457,False,1024,"Last week marked the 30th anniversary of ""1984...",753,98,14


In [83]:
df_ge90eng[df_ge90eng["english words"]>20].sample(n=100000).set_index("id").to_csv(f"{dataset_name}.clean100k.csv")

In [45]:
df_brutti = df_dataset.sort_values(by=["non-eng vocabs"], ascending=False)[:10000]

In [47]:
df_brutti[df_brutti["ended"]==False]

Unnamed: 0,id,text,length,ended,english words,english %,non-eng vocabs
113150,113151,Description\n\nWelcome to Adventotaurs MSSC!\n...,1024,False,59,17,269
202110,202111,MeI2 na ispatik?\n\nHan ate!\n\nSokol hackata....,1024,False,216,45,207
94742,94743,MEI\n\n[Muzei web de legwacie zende bujicza ur...,1024,False,217,52,186
182845,182846,Progress! It's finally here! Angel this week! ...,1024,False,300,61,181
78840,78841,cup 260b\n\nLarge Tumbler:\n\n\nLarger size Fe...,1024,False,366,67,172
...,...,...,...,...,...,...,...
64878,64879,1 Clean Episode L24: We're Going to Play Saw T...,1024,False,657,90,45
76541,76542,Decorative Makarov refunds Obama for compromis...,1024,False,677,93,45
96547,96548,Partial transcripts for 1-3 pages above may be...,1024,False,632,92,45
249581,249582,Back when Smash was in its games development i...,1024,False,738,91,45


In [5]:
df = pd.read_csv("webtext.train.jsonl.clean100k.csv")

In [6]:
df

Unnamed: 0,id,ended,length,text,english words,english %,non-eng vocabs
0,172092,True,634,Everton goalkeeper Tim Howard has signed a con...,488,95,7
1,208655,True,264,That addresses performance. As for battery lif...,202,97,1
2,25537,False,1024,"Posted on February 18, 2014 by Bryan Ball\n\nB...",819,97,6
3,81025,True,335,Jasper Jolly\n\nhe US dollar has reached its h...,266,97,5
4,6966,True,48,I received a package containing food contents ...,41,100,0
...,...,...,...,...,...,...,...
99995,22089,False,1024,Who Pays\n\nAmbulance Billing\n\nThe Ministry ...,785,100,0
99996,146668,True,466,Description\n\nAutomatic Bottle Opener\n\n\nTh...,366,98,5
99997,100597,True,89,"We have created a bunch of 'how-to' videos, to...",66,95,2
99998,64251,True,72,$14.99\n\nEasily & securely mount your iPhone ...,42,95,2


# Uniform test datasets

In [4]:
import os
import pandas as pd
import pickle

In [9]:
output_path = "./data/test"

## GPT-2 vs WebText

In [37]:
path_prefix = "./data"
files = ['large-762M-k40.test.jsonl', 'large-762M.test.jsonl', 'medium-345M-k40.test.jsonl', 'medium-345M.test.jsonl', 'small-117M-k40.test.jsonl', 'small-117M.test.jsonl', 'webtext.test.jsonl', 'xl-1542M-k40.test.jsonl', 'xl-1542M.test.jsonl']

In [39]:
for file in files:
    path = os.path.join(path_prefix, file)
    if os.path.exists(path):
        df = pd.read_json(path, lines = True)
        raw_texts = df.text.to_list()
        if "webtext" in file:
            prefix = ""
            tp = "human"
        else:
            prefix = "gpt2."
            tp = "machine"
        with open(os.path.join(output_path, f"{prefix}{os.path.splitext(file)[0]}.{tp}.pickle"), "wb") as raw_file:
            pickle.dump(raw_texts, raw_file)

## GROVER

In [43]:
path_prefix = "./data/grover"
files = sorted([f for f in os.listdir(path_prefix) if os.path.isfile(os.path.join(path_prefix, f))])

In [42]:
for file in files:
    path = os.path.join(path_prefix, file)
    if os.path.exists(path):
        df = pd.read_json(path, lines = True)
        human_texts = df[df.label == "human"].article.to_list()
        machine_texts = df[df.label == "machine"].article.to_list()
        target = os.path.splitext(file)[0].replace("generator=","grover-").replace("~dataset=","-")
        with open(os.path.join(output_path, target+".human.pickle"), "wb") as raw_file:
            pickle.dump(human_texts, raw_file)
        with open(os.path.join(output_path, target+".machine.pickle"), "wb") as raw_file:
            pickle.dump(machine_texts, raw_file)

In [44]:
for file in files:
    path = os.path.join(path_prefix, file)
    if os.path.exists(path):
        df = pd.read_json(path, lines = True)
        break

## GPT-3

In [27]:
path_prefix = "./data"
file = "175b_samples.jsonl"

In [35]:
path = os.path.join(path_prefix, file)
if os.path.exists(path):
    df = pd.read_json(path, lines = True)
    raw_texts = df[df.columns[0]].to_list()
    with open(os.path.join(output_path, f"gpt3.{os.path.splitext(file)[0]}.machine.pickle"), "wb") as raw_file:
        pickle.dump(raw_texts, raw_file)