In [None]:
import os
import pickle
import pandas as pd
import numpy as np

Save path for evaluation reports

In [None]:
# Directory that receives every CSV report written by this notebook.
results_path = "./results"
# Create it up front: DataFrame.to_csv does not create missing parent
# directories, so the save cells would otherwise fail on a fresh checkout.
os.makedirs(results_path, exist_ok=True)

# Load training data

Load pickled embedded training data

In [None]:
# Folder holding the pickled embedding files.
path_prefix = "./data/embed"
# File name of the training pickle (split across lines for readability only).
dataset_path = (
    "webtext.train.jsonl.clean100k.csv"
    "+xl-1542M.train.jsonl.clean100k.csv"
    "_embed.pickle"
)

In [None]:
# Read the pickled training payload and pull out the "X" and "y" entries.
# NOTE(review): pickle.load can execute arbitrary code — only open trusted files.
train_pickle = os.path.join(path_prefix, dataset_path)
with open(train_pickle, "rb") as fh:
    training_data = pickle.load(fh)

X, y = training_data["X"], training_data["y"]

# Load test data

Load pickled embedded test data

In [None]:
# Folder holding the pickled embedding files.
path_prefix = "./data/embed"
# Test pickles, loaded and concatenated in this order.
dataset_names = [
    "webtext.test.human_embed.pickle",
    "gpt2.xl-1542M.test.machine_embed.pickle",
]

In [None]:
# Load every test pickle listed in `dataset_names` and stack them into a single
# feature matrix / label vector, in list order.
# NOTE(review): pickle.load can execute arbitrary code — only open trusted files.
X_test_parts = []
y_test_parts = []
for dataset_name in dataset_names:
    with open(os.path.join(path_prefix, dataset_name), "rb") as f:
        test_data = pickle.load(f)
    X_test_parts.append(test_data["X"])
    y_test_parts.append(test_data["y"])

X_test = np.concatenate(X_test_parts)
# Concatenate explicitly instead of using `+=`: `+=` appends for Python lists but
# performs element-wise addition (or raises on a shape mismatch) for ndarrays,
# which would silently corrupt the labels.
y_test = np.concatenate(y_test_parts)

assert len(X_test) == len(y_test), "test features and labels are misaligned"

# Train classifiers

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

## Gradient boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
gboost = GradientBoostingClassifier(random_state=42)
# Reduced grid actually used for the search.
# The larger grid below used to be assigned to `parameters` and was then
# overwritten on the very next line, so it never took effect. It is kept here
# only as a reference for a full (much slower) run:
#   {'n_estimators':[100, 200, 300], 'max_depth':[3, 4, 5], 'learning_rate':[0.2, 0.5, 1.0]}
parameters = {'n_estimators':[20, 50], 'max_depth':[1, 2], 'learning_rate':[1.0]}

# Exhaustive cross-validated search over `parameters` (GridSearchCV defaults).
clf_gboost = GridSearchCV(gboost, parameters)

In [None]:
# Run the gradient-boosting grid search on the training embeddings.
clf_gboost.fit(X=X, y=y)

Save cross-validation results

In [None]:
# Tabulate the grid-search results and persist them as CSV.
df_clf_gboost = pd.DataFrame(clf_gboost.cv_results_)
gboost_cv_csv = os.path.join(results_path, "gboost_cv_results.csv")
df_clf_gboost.to_csv(gboost_cv_csv)

# Last expression: display the table in the notebook.
df_clf_gboost

Evaluate the best classifier on the test data

In [None]:
# Predict on the test set with the fitted search object, then build the
# per-class report as a DataFrame and save it.
y_predict = clf_gboost.predict(X_test)
gboost_report = classification_report(y_test, y_predict, output_dict=True)
df_cr = pd.DataFrame(gboost_report)
gboost_test_csv = os.path.join(results_path, "gboost_test_results.csv")
df_cr.to_csv(gboost_test_csv)

df_cr

## SVM

In [None]:
from sklearn.svm import SVC

In [None]:
svm = SVC(random_state=42)
# Per the SVC documentation, `degree` only affects the 'poly' kernel and `coef0`
# only affects 'poly' and 'sigmoid'. A single cross-product grid therefore refits
# identical 'linear'/'rbf' models for every degree/coef0 value (and identical
# 'sigmoid' models for every degree). Splitting the grid per kernel searches the
# same set of distinct models with 56 candidates instead of 144.
svm_C_values = [0.5, 1.0, 2.0, 3.0]
svm_coef0_values = [0.0, 0.5, 1.0]
parameters = [
    {'C': svm_C_values, 'kernel': ['linear', 'rbf']},
    {'C': svm_C_values, 'kernel': ['poly'], 'degree': [1, 2, 3], 'coef0': svm_coef0_values},
    {'C': svm_C_values, 'kernel': ['sigmoid'], 'coef0': svm_coef0_values},
]

# Exhaustive cross-validated search over the per-kernel grids.
clf_svm = GridSearchCV(svm, parameters)

In [None]:
# Run the SVM grid search on the training embeddings.
clf_svm.fit(X=X, y=y)

Save cross-validation results

In [None]:
# Tabulate the grid-search results and persist them as CSV.
df_clf_svm = pd.DataFrame(clf_svm.cv_results_)
svm_cv_csv = os.path.join(results_path, "svm_cv_results.csv")
df_clf_svm.to_csv(svm_cv_csv)

# Last expression: display the table in the notebook.
df_clf_svm

Evaluate the best classifier on the test data

In [None]:
# Predict on the test set with the fitted search object, then build the
# per-class report as a DataFrame and save it.
y_predict = clf_svm.predict(X_test)
svm_report = classification_report(y_test, y_predict, output_dict=True)
df_cr = pd.DataFrame(svm_report)
svm_test_csv = os.path.join(results_path, "svm_test_results.csv")
df_cr.to_csv(svm_test_csv)

df_cr

## Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
# 'log_loss' is intentionally not searched: the scikit-learn docs describe
# 'log_loss' and 'entropy' as the same Shannon-information-gain criterion, so it
# only duplicated the 'entropy' fits (and is rejected by scikit-learn < 1.1).
parameters = {'n_estimators':[100, 200, 300], 'criterion':['gini', 'entropy']}

# Exhaustive cross-validated search over `parameters` (GridSearchCV defaults).
clf_rf = GridSearchCV(rf, parameters)

In [None]:
# Run the random-forest grid search on the training embeddings.
clf_rf.fit(X=X, y=y)

Save cross-validation results

In [None]:
# Tabulate the grid-search results and persist them as CSV.
df_clf_rf = pd.DataFrame(clf_rf.cv_results_)
rf_cv_csv = os.path.join(results_path, "randomforest_cv_results.csv")
df_clf_rf.to_csv(rf_cv_csv)

# Last expression: display the table in the notebook.
df_clf_rf

Evaluate the best classifier on the test data

In [None]:
# Predict on the test set with the fitted search object, then build the
# per-class report as a DataFrame and save it.
y_predict = clf_rf.predict(X_test)
rf_report = classification_report(y_test, y_predict, output_dict=True)
df_cr = pd.DataFrame(rf_report)
rf_test_csv = os.path.join(results_path, "randomforest_test_results.csv")
df_cr.to_csv(rf_test_csv)

df_cr