In [None]:
import json
import pandas as pd
import os
import ipywidgets as widgets

In [None]:
# Directory the dataset files are read from.
data_path = "./data"

# Names of the regular files inside that directory, in sorted order.
datasets = sorted(
    filter(
        lambda name: os.path.isfile(os.path.join(data_path, name)),
        os.listdir(data_path),
    )
)

# Load dataset

In [None]:
def make_dataset_selector():
    """Return a dropdown whose options are the regular files in ``data_path``, sorted by name.

    The directory is listed on every call, so each selector reflects the
    files present at the time it is created.
    """
    file_names = [
        entry
        for entry in os.listdir(data_path)
        if os.path.isfile(os.path.join(data_path, entry))
    ]
    file_names.sort()
    return widgets.Dropdown(options=file_names, disabled=False)

# Two independent selectors, one per dataset to load.
ds_select_1, ds_select_2 = make_dataset_selector(), make_dataset_selector()

# Button whose click handler (registered further down) loads the selected files.
load_button = widgets.Button(
    icon='download',  # FontAwesome name without the `fa-` prefix
    description='load',
    tooltip='Click to load dataset',
    button_style='',  # one of 'success', 'info', 'warning', 'danger' or ''
    disabled=False,
)

# Area that collects the messages printed while loading.
output = widgets.Output()

def load_ds(b):
    """Load the two selected datasets and store them on the clicked button.

    Click handler for the load button. The files chosen in ``ds_select_1``
    and ``ds_select_2`` are resolved relative to ``data_path``, read as
    JSON-lines frames and stored as the tuple ``b.value`` (first selector
    first). Every message is printed inside the ``output`` widget, which is
    cleared at the start of each call. On any failure ``b.value`` is left
    untouched.

    Parameters
    ----------
    b
        The object passed to the handler by ``on_click``; it receives the
        ``value`` attribute on success.
    """
    output.clear_output()
    with output:
        name_1, name_2 = ds_select_1.value, ds_select_2.value

        # A dropdown built from an empty directory has no selection (None);
        # os.path.join would raise TypeError on it, so report it instead.
        if name_1 is None or name_2 is None:
            print("No dataset selected!")
            return

        path_1 = os.path.join(data_path, name_1)
        path_2 = os.path.join(data_path, name_2)
        if path_1 == path_2:
            print("Choose different datasets!")
            return

        print("loading datasets...")
        # The selectors were filled when they were created; the files may
        # have been removed since then.
        if not (os.path.exists(path_1) and os.path.exists(path_2)):
            print("Path does not exist!")
            return

        b.value = (pd.read_json(path_1, lines=True), pd.read_json(path_2, lines=True))
        print(f"Datasets {name_1} and {name_2} loaded")

# Call load_ds whenever the button is clicked.
load_button.on_click(load_ds)

# Assemble the form; it is the cell's last expression, so it is displayed as the cell output.
widgets.VBox(children=[
    widgets.Label(value="Select datasets to load:"),
    widgets.HBox(children=[widgets.Label(value="Real text:"), ds_select_1]),
    widgets.HBox(children=[widgets.Label(value="Fake text:"), ds_select_2]),
    load_button,
    output,
])

In [None]:
# The click handler stores the loaded frames as a (real, fake) tuple on the
# button; that attribute only exists after a successful load, so fail with a
# clear message instead of an opaque AttributeError.
loaded_frames = getattr(load_button, "value", None)
if loaded_frames is None:
    raise RuntimeError(
        "No datasets loaded yet: select two files in the widget above "
        "and click 'load' before running this cell."
    )
df_real, df_fake = loaded_frames

# Real texts come first, fake texts second; labels follow the same order
# (0 = real, 1 = fake).
corpus = df_real["text"].to_list() + df_fake["text"].to_list()
labels = [0] * len(df_real) + [1] * len(df_fake)

## Build vocabulary

In [None]:
from nlp_engine.preprocessing import transformers as tfs
from sklearn.pipeline import make_pipeline

In [None]:
# Tokenize every document, then run the word filter with the symbol and digit
# dropping flags enabled.
tokenization_steps = make_pipeline(
    tfs.WordTokenizer(),
    tfs.WordsFilter(drop_digits=True, drop_symbols=True),
)
tokenized_corpus = tokenization_steps.fit_transform(corpus)

In [None]:
from nlp_engine.analysis import vocabulary

In [None]:
# `corpus` lists the real documents first, so the first len(df_real) entries
# are treated as real and the remainder as fake
# (assumes the tokenizing pipeline keeps document order — TODO confirm).
real_count = len(df_real)
vocab_real = vocabulary.get_vocabulary(tokenized_corpus[:real_count])
vocab_fake = vocabulary.get_vocabulary(tokenized_corpus[real_count:])

In [None]:
# Vocabulary entries common to the real and the fake texts
# (assumes get_vocabulary returns a set-like object — TODO confirm).
vocab_shared = vocab_real.intersection(vocab_fake)

## Build baseline classifier with TF-IDF embedding

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Fixed seed: TruncatedSVD (randomized solver by default) and the random forest
# are both stochastic, so without it every run yields a different model.
RANDOM_STATE = 42

pipeline = make_pipeline(
    tfs.WordTokenizer(),
    # Same filtering as for the vocabulary, additionally passing the shared
    # vocabulary as whitelist (presumably restricts tokens to it — confirm
    # against WordsFilter).
    tfs.WordsFilter(drop_symbols=True, drop_digits=True, whitelist=vocab_shared),
    # The vectorizer receives the output of the previous steps as-is: both the
    # preprocessor and the tokenizer are identity functions. token_pattern is
    # unused with a custom tokenizer; None avoids sklearn's warning about it.
    TfidfVectorizer(
        ngram_range=(1, 3),
        max_features=1000000,
        sublinear_tf=True,
        tokenizer=lambda x: x,
        preprocessor=lambda x: x,
        token_pattern=None,
    ),
    TruncatedSVD(n_components=600, random_state=RANDOM_STATE),
    RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=RANDOM_STATE),
)

In [None]:
# Fit every step of the pipeline (tokenizing, filtering, TF-IDF, SVD, random
# forest) on the currently loaded corpus and its labels.
pipeline.fit(corpus, labels)

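Before running the following cell, load the test datasets with the widget above, then re-run the cell that builds `corpus` and `labels` so that both refer to the test data. Do not re-run the cell that fits the pipeline; otherwise the model is retrained on the test data.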

In [None]:
from sklearn.metrics import classification_report

# Predict on whatever `corpus` currently holds and compare against `labels`.
labels_pred = pipeline.predict(corpus)
report = classification_report(labels, labels_pred)
print(report)