In [44]:
import os
import pickle
import pandas as pd
import numpy as np

Save path for evaluation reports

In [45]:
# Directory that receives every CSV report written by this notebook.
results_path = "./results"
# DataFrame.to_csv does not create missing parent directories, so make sure the
# target exists before any report is saved (no-op when it is already there).
os.makedirs(results_path, exist_ok=True)

# Load training data

Load pickled embedded training data

In [54]:
# File name of the pickled training set (joined onto `path_prefix` by the loading cell).
dataset_path = "webtext.train.jsonl.clean100k.csv+xl-1542M.train.jsonl.clean100k.csv_embed.pickle"
# Folder holding the pickled embedding datasets, relative to this notebook.
path_prefix = "./../../get_text_detect_space/datasets/embed/"

In [55]:
# Read the pickled training set and expose its feature matrix and labels.
# NOTE: pickle.load can execute arbitrary code; only open pickles from a trusted source.
train_pickle_file = os.path.join(path_prefix, dataset_path)
with open(train_pickle_file, "rb") as handle:
    training_data = pickle.load(handle)

# The pickle is a mapping with the features under "X" and the labels under "y".
X, y = training_data["X"], training_data["y"]

In [59]:
# Display the training feature matrix loaded above (NumPy abbreviates the repr of large arrays).
X

array([[ 1.56034485e-01, -1.03202422e-03,  9.29313446e-02, ...,
        -6.20653304e-03,  1.78792963e-02, -1.16704679e-02],
       [ 1.19502099e-01,  1.64194560e-02, -1.96907246e-02, ...,
        -1.13290602e-02,  5.91688235e-03,  1.52714059e-03],
       [ 1.67067327e-01,  5.72166997e-02, -9.31411220e-03, ...,
        -6.56531715e-03, -1.25897509e-02,  2.12441978e-03],
       ...,
       [ 1.45809149e-01, -3.29254096e-02, -2.40912571e-02, ...,
         6.88818067e-03, -4.50552379e-03, -3.19229779e-03],
       [ 1.07385538e-01, -5.34839545e-02,  2.93454104e-02, ...,
        -1.70121954e-02, -6.23178832e-03,  8.76551081e-03],
       [ 1.83371955e-01,  9.32380781e-02, -1.46499623e-03, ...,
         1.69348226e-05, -6.23653770e-03,  8.20869394e-03]])

In [64]:
# Build the combined training matrix: the features from the "embed" pickle
# (presumably TF-IDF based, going by the variable name) with the POS features
# appended as extra columns.
# NOTE: pickle.load can execute arbitrary code; only open pickles from a trusted source.
base_prefix_path = "./../../get_text_detect_space/datasets"
tfidf_path = f"{base_prefix_path}/embed/webtext.train.jsonl.clean100k.csv+xl-1542M.train.jsonl.clean100k.csv_embed.pickle"
pos_path = f"{base_prefix_path}/pos/embed/webtext_xl-1542M.pos_embed.pickle"

# First pickle: supplies both the base feature matrix and the labels.
with open(tfidf_path, "rb") as tfidf_file:
    training_data = pickle.load(tfidf_file)
X, y = training_data["X"], training_data["y"]

# Second pickle: only its "X" entry is used. `training_data` is rebound here,
# so after this cell it refers to the POS pickle.
# NOTE(review): assumes the POS rows are in the same order as the rows above — confirm.
with open(pos_path, "rb") as pos_file:
    training_data = pickle.load(pos_file)

# Column-wise join; both matrices must have the same number of rows.
X = np.concatenate((X, training_data["X"]), axis=1)


In [102]:
# Check the dimensions of the combined training matrix as (rows, columns).
X.shape

(200000, 600)

# Load test data

Load pickled embedded test data

In [92]:
# Test splits to load, in the order they will be stacked row-wise.
dataset_names = [
    "webtext.test.human_embed.pickle",
    "gpt2.xl-1542M.test.machine_embed.pickle",
]
# Folder holding the pickled embedding datasets, relative to this notebook.
path_prefix = "./../../get_text_detect_space/datasets/embed/"

In [93]:
# Load every test split listed in `dataset_names` and stack them row-wise into
# `X_test` (feature matrix) and `y_test` (flat list of labels).
# NOTE: pickle.load can execute arbitrary code; only open pickles from a trusted source.
X_test_parts = []
y_test = []
for dataset_name in dataset_names:
    with open(os.path.join(path_prefix, dataset_name), "rb") as f:
        test_data = pickle.load(f)
    X_test_parts.append(test_data["X"])
    # extend() appends the labels to a fresh list. Unlike `+=` on the loaded
    # object, it never mutates the first split's label container and never
    # turns into element-wise addition when labels are stored as a NumPy array.
    y_test.extend(test_data["y"])

# A single concatenation along axis 0 avoids re-copying the matrix per split.
X_test = np.concatenate(X_test_parts)

In [94]:
# Append the POS features of the test set as extra columns of X_test.
# NOTE(review): this cell extends X_test in place, so running it twice appends
# the POS columns twice; it also rebinds `path_prefix`, which the test-loading
# cell before it reads. Re-run the test-data cells from their path-config cell.
# NOTE: pickle.load can execute arbitrary code; only open pickles from a trusted source.
path_prefix = "./../../get_text_detect_space/datasets/pos/embed/"
dataset_name = "webtext.test_xl-1542M.test.pos_embed.pickle"
pos_test_file = os.path.join(path_prefix, dataset_name)
with open(pos_test_file, "rb") as handle:
    test_data = pickle.load(handle)
# Column-wise join; the POS matrix must have the same number of rows as X_test.
X_test = np.concatenate((X_test, test_data["X"]), axis=1)

In [95]:
# Display the combined test feature matrix (NumPy abbreviates the repr of large arrays).
X_test

array([[ 0.036449  ,  0.01040318, -0.01484086, ...,  0.06093252,
        -0.01726509, -0.01650083],
       [ 0.1697334 ,  0.00916163,  0.08250503, ...,  0.00875937,
        -0.00962994, -0.00285187],
       [ 0.08371905, -0.0375891 ,  0.01756523, ...,  0.01225226,
         0.01454416, -0.02042341],
       ...,
       [ 0.16086878, -0.10212953, -0.00653402, ..., -0.00609007,
         0.00459895,  0.0048186 ],
       [ 0.14679124, -0.05596965,  0.00043219, ...,  0.01100336,
        -0.01325901,  0.01135516],
       [ 0.13941275, -0.04264188,  0.0272543 , ...,  0.00278548,
         0.00502804, -0.00166508]])

In [96]:
# Check the dimensions of the combined test matrix; the column count should match the training matrix.
X_test.shape

(10000, 600)

# Train classifiers

In [79]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

## GradientBoosting

In [80]:
from sklearn.ensemble import GradientBoostingClassifier

In [81]:
# Grid search over gradient-boosting hyper-parameters (3 x 3 = 9 candidates).
# A wider grid that also sweeps learning_rate over 0.2, 0.5 and 1.0 was
# considered but is not used here.
parameters = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
}

# Fixed seed so repeated fits of the same candidate are reproducible.
gboost = GradientBoostingClassifier(random_state=42)

# n_jobs=-1 lets the search use all available cores; cv and scoring stay at the
# scikit-learn defaults.
clf_gboost = GridSearchCV(estimator=gboost, param_grid=parameters, n_jobs=-1)

In [82]:
# Run the grid search on the training data. This is slow: the cross-validation
# results recorded further down report mean_fit_time values between roughly
# 5,000 and 21,000 (seconds, per scikit-learn's cv_results_ convention).
clf_gboost.fit(X, y)

Save the fitted grid search and its cross-validation results

In [97]:
# Persist the fitted grid-search object so it can be reloaded without refitting.
filename = 'clf_gboost_trained_on_tfidf-and-pos.sav'
# The context manager guarantees the file is flushed and closed even if
# pickling fails part-way through.
with open(filename, 'wb') as model_file:
    pickle.dump(clf_gboost, model_file)

In [100]:
# Tabulate the per-candidate cross-validation results of the gradient-boosting
# search, write them to the results folder, and show the table.
gboost_cv_csv = os.path.join(results_path, "gboost_cv_tfidf-and-pos_results.csv")
df_clf_gboost = pd.DataFrame(data=clf_gboost.cv_results_)
df_clf_gboost.to_csv(gboost_cv_csv)

df_clf_gboost

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,5251.148904,40.381404,0.333035,0.001426,3,100,"{'max_depth': 3, 'n_estimators': 100}",0.682625,0.682975,0.67975,0.6814,0.683775,0.682105,0.001404,9
1,10548.490164,97.220282,0.630662,0.018553,3,200,"{'max_depth': 3, 'n_estimators': 200}",0.700975,0.7007,0.697175,0.699025,0.702575,0.70009,0.001842,6
2,15381.641376,40.74176,0.506601,0.015179,3,300,"{'max_depth': 3, 'n_estimators': 300}",0.71045,0.710525,0.70595,0.70575,0.712925,0.70912,0.002815,4
3,7055.39769,37.817309,0.426139,0.000891,4,100,"{'max_depth': 4, 'n_estimators': 100}",0.692225,0.694675,0.6907,0.69025,0.693425,0.692255,0.001653,8
4,14067.782288,103.672379,0.783945,0.016438,4,200,"{'max_depth': 4, 'n_estimators': 200}",0.708925,0.707675,0.7058,0.706875,0.709575,0.70777,0.001363,5
5,19682.640307,75.392464,0.659557,0.016924,4,300,"{'max_depth': 4, 'n_estimators': 300}",0.716825,0.715125,0.713575,0.71645,0.71815,0.716025,0.001559,2
6,8799.906909,68.603176,0.527979,0.045454,5,100,"{'max_depth': 5, 'n_estimators': 100}",0.700025,0.698475,0.697125,0.6971,0.702425,0.69903,0.002009,7
7,15239.70991,452.647093,0.510447,0.026485,5,200,"{'max_depth': 5, 'n_estimators': 200}",0.7149,0.71325,0.71295,0.71375,0.71495,0.71396,0.000828,3
8,20994.867189,418.843968,0.722291,0.01363,5,300,"{'max_depth': 5, 'n_estimators': 300}",0.7235,0.720125,0.719425,0.720675,0.723175,0.72138,0.00165,1


Evaluate best-classifier on test data

In [101]:
# Predict the test set with the fitted grid search, then turn the
# per-class precision/recall/F1 report into a table, save it, and display it.
y_predict = clf_gboost.predict(X_test)
gboost_report = classification_report(y_test, y_predict, output_dict=True)
df_cr = pd.DataFrame(gboost_report)

gboost_test_csv = os.path.join(results_path, "gboost_test_tfidf-and-pos_results.csv")
df_cr.to_csv(gboost_test_csv)
df_cr

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.713852,0.709199,0.7115,0.711526,0.711526
recall,0.706,0.717,0.7115,0.7115,0.7115
f1-score,0.709904,0.713078,0.7115,0.711491,0.711491
support,5000.0,5000.0,0.7115,10000.0,10000.0


## SVM

In [61]:
from sklearn.svm import SVC

In [62]:
# Grid search over the SVM regularisation strength C only.
# A broader grid (kernel in linear/poly/rbf/sigmoid, degree 1-3, coef0 in
# 0.0/0.5/1.0) was considered but is not used here.
# Note: `parameters` is rebound here, replacing the grid of the previous model.
parameters = {
    'C': [0.5, 1.0, 2.0, 3.0],
}

svm = SVC(random_state=42)

# n_jobs=-1 lets the search use all available cores; cv and scoring stay at the
# scikit-learn defaults.
clf_svm = GridSearchCV(estimator=svm, param_grid=parameters, n_jobs=-1)

In [None]:
# Run the SVM grid search on the training data.
# NOTE(review): this cell has no recorded execution; SVC fitting time grows
# quickly with the number of samples, so confirm it is feasible on the full X.
clf_svm.fit(X, y)

Save cross-validation results

In [None]:
# Tabulate the per-candidate cross-validation results of the SVM search,
# write them to the results folder, and show the table.
df_clf_svm = pd.DataFrame(clf_svm.cv_results_)

# The file name carries the "tfidf-and-pos" feature-set tag, following the
# naming of the GradientBoosting reports in this notebook, so the report is
# not confused with (or written over) results from another feature set.
df_clf_svm.to_csv(os.path.join(results_path, "svm_cv_tfidf-and-pos_results.csv"))
df_clf_svm

Evaluate best-classifier on test data

In [None]:
# Predict the test set with the fitted SVM grid search, then turn the
# per-class precision/recall/F1 report into a table, save it, and display it.
# Note: `y_predict` and `df_cr` are rebound here, replacing the previous model's values.
y_predict = clf_svm.predict(X_test)
df_cr = pd.DataFrame(classification_report(y_test, y_predict, output_dict=True))

# The file name carries the "tfidf-and-pos" feature-set tag, following the
# naming of the GradientBoosting reports in this notebook.
df_cr.to_csv(os.path.join(results_path, "svm_test_tfidf-and-pos_results.csv"))
df_cr

## Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Grid search over forest size and split criterion (3 x 3 = 9 candidates).
# Note: `parameters` is rebound here, replacing the grid of the previous model.
parameters = {
    'n_estimators': [100, 200, 300],
    'criterion': ['gini', 'entropy', 'log_loss'],
}

# NOTE(review): both the forest and the grid search request n_jobs=-1 —
# confirm this nested parallelism behaves as intended on the target machine.
rf = RandomForestClassifier(random_state=42, n_jobs=-1)

clf_rf = GridSearchCV(estimator=rf, param_grid=parameters, n_jobs=-1)

In [None]:
# Run the random-forest grid search on the training data (this cell has no recorded execution).
clf_rf.fit(X, y)

Save cross-validation results

In [None]:
# Tabulate the per-candidate cross-validation results of the random-forest
# search, write them to the results folder, and show the table.
df_clf_rf = pd.DataFrame(clf_rf.cv_results_)

# The file name carries the "tfidf-and-pos" feature-set tag, following the
# naming of the GradientBoosting reports in this notebook, so the report is
# not confused with (or written over) results from another feature set.
df_clf_rf.to_csv(os.path.join(results_path, "randomforest_cv_tfidf-and-pos_results.csv"))
df_clf_rf

Evaluate best-classifier on test data

In [None]:
# Predict the test set with the fitted random-forest grid search, then turn the
# per-class precision/recall/F1 report into a table, save it, and display it.
# Note: `y_predict` and `df_cr` are rebound here, replacing the previous model's values.
y_predict = clf_rf.predict(X_test)
df_cr = pd.DataFrame(classification_report(y_test, y_predict, output_dict=True))

# The file name carries the "tfidf-and-pos" feature-set tag, following the
# naming of the GradientBoosting reports in this notebook.
df_cr.to_csv(os.path.join(results_path, "randomforest_test_tfidf-and-pos_results.csv"))
df_cr