In [1]:
import json
import pandas as pd
import os

In [30]:
# data_path is the root that load_dataset() joins dataset file names onto;
# model_path currently points at the same location.
data_path = model_path = "./"
# Folder that receives the pickled embeddings written further down.
embed_path = "./data/embed"

In [36]:
import pickle

def load_dataset(file_path):
    """Load a dataset file located under the module-level ``data_path``.

    The loader is selected from the file extension:

    * ``.csv``    -> ``pandas.read_csv``
    * ``.jsonl``  -> ``pandas.read_json(..., lines=True)``
    * ``.pickle`` -> ``pickle.load``

    Parameters
    ----------
    file_path : str
        File name (or relative path) that is joined onto ``data_path``.

    Returns
    -------
    object or None
        A DataFrame for ``.csv`` / ``.jsonl`` files, the unpickled object for
        ``.pickle`` files, or ``None`` when the extension is not recognised.
    """
    extension = os.path.splitext(file_path)[-1]
    if extension not in (".csv", ".jsonl", ".pickle"):
        # Unknown format: keep the historical "return None" contract.
        return None

    # Every format is resolved against data_path; previously the pickle branch
    # opened file_path as-is and silently ignored data_path.
    full_path = os.path.join(data_path, file_path)
    if extension == ".csv":
        return pd.read_csv(full_path)
    if extension == ".jsonl":
        return pd.read_json(full_path, lines=True)

    # SECURITY: pickle.load can execute arbitrary code; only load trusted files.
    with open(full_path, "rb") as f:
        return pickle.load(f)

# Word embedding

## Load training datasets

In [37]:
# Training sets, given as file names relative to data_path:
# the "real" texts (label 0) and the "fake" texts (label 1).
real_data_file, fake_data_file = (
    'webtext.train.jsonl.clean100k.csv',
    'xl-1542M.train.jsonl.clean100k.csv',
)

In [38]:
# Read both training sets with load_dataset().
df_real, df_fake = (load_dataset(name) for name in (real_data_file, fake_data_file))

# The corpus is the real texts followed by the fake texts; labels follow the
# same order (0 = real, 1 = fake), one label per row.
corpus = [*df_real["text"].to_list(), *df_fake["text"].to_list()]
labels = [0] * len(df_real) + [1] * len(df_fake)

## Build vocabulary

In [7]:
from nlp_engine.preprocessing import transformers as tfs
from sklearn.pipeline import make_pipeline

In [8]:
# Tokenize the whole corpus and filter the tokens (digits dropped, symbols
# kept, per the flags below) before building the vocabularies.
vocab_tokenizer = make_pipeline(
    tfs.WordTokenizer(),
    tfs.WordsFilter(drop_symbols=False, drop_digits=True),
)
tokenized_corpus = vocab_tokenizer.fit_transform(corpus)

In [9]:
from nlp_engine.analysis import vocabulary

In [10]:
# corpus was built as real texts followed by fake texts, so the first
# len(df_real) tokenized documents are the real ones.
split_at = len(df_real)
vocab_real = vocabulary.get_vocabulary(tokenized_corpus[:split_at])
vocab_fake = vocabulary.get_vocabulary(tokenized_corpus[split_at:])

In [11]:
# Tokens that occur in both the "real" and the "fake" vocabulary.
vocab_shared = vocab_real.intersection(vocab_fake)

In [12]:
# Report the size of each vocabulary and of their overlap.
print(
    f'Size of "real" vocabulary: {len(vocab_real)}\n'
    f'Size of "fake" vocabulary: {len(vocab_fake)}\n'
    f'Size of intersection: {len(vocab_shared)}'
)

Size of "real" vocabulary: 290187
Size of "fake" vocabulary: 511432
Size of intersection: 160741


## Compute TF-IDF embedding

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [14]:
def identity(x):
    """Return ``x`` unchanged.

    Passed to ``TfidfVectorizer`` as both ``tokenizer`` and ``preprocessor``
    so that the vectorizer applies neither step itself.
    """
    return x

In [15]:
# Embedding pipeline: tokenize -> keep shared-vocabulary words -> TF-IDF over
# 1- to 3-grams -> truncated SVD down to 300 components.
pipeline = make_pipeline(
    tfs.WordTokenizer(),
    # Only words present in both vocabularies are whitelisted.
    tfs.WordsFilter(
        drop_symbols=False,
        drop_digits=True,
        whitelist=vocab_shared),
    # tokenizer/preprocessor are pass-throughs: the input reaching this step
    # is presumably already a token list per document (TODO confirm against
    # the nlp_engine transformers).
    TfidfVectorizer(
        ngram_range=(1,3),
        max_features=1000000,
        sublinear_tf=True,
        tokenizer=identity,
        preprocessor=identity),
    # TruncatedSVD uses a randomized solver by default; pin the seed so that
    # re-fitting the pipeline reproduces the same embedding.
    TruncatedSVD(n_components=300, random_state=42)
)

In [16]:
%%time
# Fit every pipeline step on the full training corpus (real + fake texts).
# labels are forwarded as y; the scikit-learn steps are unsupervised, so they
# are presumably unused -- TODO confirm for the nlp_engine transformers.
pipeline.fit(corpus, labels)



CPU times: user 24min 13s, sys: 8min 50s, total: 33min 4s
Wall time: 29min 43s


## Transform training data and store it

In [18]:
# Run the training corpus through the fitted pipeline to obtain its embedding.
corpus_embed = pipeline.transform(corpus)

In [20]:
import pickle

# Create the output folder if needed; open() below would otherwise fail with
# FileNotFoundError on a fresh checkout.
os.makedirs(embed_path, exist_ok=True)

# Store the embedded training corpus together with its labels.
train_embed_file = os.path.join(embed_path, f"{real_data_file}+{fake_data_file}_embed.pickle")
with open(train_embed_file, "wb") as f:
    pickle.dump({"X": corpus_embed, "y": labels}, f)

## Embed test datasets

In [31]:
test_path = "./data/test"
# Names of the regular files (sub-directories skipped) directly inside
# test_path, in sorted order.
test_dataset_files = sorted(
    entry
    for entry in os.listdir(test_path)
    if os.path.isfile(os.path.join(test_path, entry))
)

In [34]:
# Create the output folder if needed so the writes below cannot fail on a
# missing directory.
os.makedirs(embed_path, exist_ok=True)

# Embed every test dataset with the fitted pipeline and store it next to the
# training embedding.
for test in test_dataset_files:
    # SECURITY: pickle.load can execute arbitrary code; only use trusted files.
    with open(os.path.join(test_path, test), "rb") as f:
        X = pickle.load(f)
    X_embed = pipeline.transform(X)
    # Files with "machine" in their name get label 1, all others label 0.
    y = 1 if "machine" in test else 0
    # Use a dedicated name: assigning to `labels` here would overwrite the
    # training labels that the fit/save cells above depend on.
    test_labels = [y] * len(X)
    with open(os.path.join(embed_path, f"{os.path.splitext(test)[0]}_embed.pickle"), "wb") as f:
        pickle.dump({"X": X_embed, "y": test_labels}, f)