In [1]:
import pandas as pd

# Class names; the position in this list is the integer id used as the target
# (0 = "human", 1 = "generated").
labels = ["human", "generated"]


def load_split(path):
    """Read one tab-separated split and build its modelling frame.

    Parameters
    ----------
    path : str
        Path to a TSV file whose first column is the row index and which
        contains at least a "text" and a "label" column.

    Returns
    -------
    (raw_df, split_df) : tuple of pandas.DataFrame
        ``raw_df`` is the file as read; ``split_df`` keeps the same index and
        has the "text" column plus an integer "label" column holding the
        position of each label string in ``labels``.

    Raises
    ------
    ValueError
        If the "label" column holds a value that is not listed in ``labels``
        (including missing values), instead of silently treating it as
        "generated".
    """
    # Explicit UTF-8 so the result does not depend on the platform's default
    # text encoding.
    raw_df = pd.read_csv(path, sep="\t", index_col=0, encoding="utf-8")

    label_ids = raw_df["label"].map({name: idx for idx, name in enumerate(labels)})
    unknown = raw_df.loc[label_ids.isna(), "label"]
    if not unknown.empty:
        raise ValueError(
            f"{path}: unexpected label value(s) {sorted(set(map(str, unknown)))}; "
            f"expected one of {labels}"
        )

    split_df = pd.DataFrame({"text": raw_df["text"], "label": label_ids.astype("int64")})
    return raw_df, split_df


_, train_df = load_split("autextification2023/data/train/subtask_1/en/train.tsv")

# `base_df` keeps the raw *test* frame, which is what the preview below shows.
base_df, test_df = load_split("autextification2023/data/test/subtask_1/en/test.tsv")

base_df.head()

Unnamed: 0,id,prompt,text,label,model,domain
0,15725,NO-PROMPT,It has remained one of my favorite country/swi...,human,NO-MODEL,reviews
1,17108,NO-PROMPT,Even with very light use (hard to get motivate...,human,NO-MODEL,reviews
2,383,"His mother, Ivy Close, who won the first ever ...",She died in 2015 at age 93. She is survived by...,generated,A,news
3,7809,NO-PROMPT,Londonderry Crown Court heard how Heaney false...,human,NO-MODEL,news
4,6215,NO-PROMPT,"Will Genia, Lachie Turner and Berrick Barnes e...",human,NO-MODEL,news


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Targets: the integer "label" column of each split
y_train = train_df["label"]
y_test = test_df["label"]

# TF-IDF features with English stop words removed; max_df=0.7 drops terms
# whose document frequency in the fitted corpus exceeds 70%
vectorizer = TfidfVectorizer(max_df=0.7, stop_words="english")

# Fit the vocabulary and IDF weights on the training text only, then apply
# the already-fitted vectorizer to the test text
train_texts, test_texts = train_df["text"], test_df["text"]
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC

# Shared seed for every estimator that draws random numbers, so the benchmark
# table is reproducible from one run to the next.
RANDOM_STATE = 42

# Classifiers to benchmark, as (estimator, display name) pairs
clfs = [
    (LogisticRegression(C=5, max_iter=1000), "Logistic Regression"),
    (RidgeClassifier(alpha=1.0, solver="sparse_cg"), "Ridge Classifier"),
    (KNeighborsClassifier(n_neighbors=100), "kNN"),
    # Seeded: bootstrap samples and feature sub-sampling are random
    (RandomForestClassifier(random_state=RANDOM_STATE), "Random Forest"),
    # L2 penalty Linear SVC
    (LinearSVC(C=0.1, dual=False, max_iter=1000), "Linear SVC"),
    # L2 penalty Linear SGD
    # Seeded: data shuffling and the early-stopping validation split are random
    (
        SGDClassifier(
            loss="log_loss",
            alpha=1e-4,
            n_iter_no_change=3,
            early_stopping=True,
            random_state=RANDOM_STATE,
        ),
        "Log-loss SGD",
    ),
    # NearestCentroid (aka Rocchio classifier)
    (NearestCentroid(), "Nearest Centroid"),
    # Sparse naive Bayes classifier
    (ComplementNB(alpha=0.1), "Complement Naive Bayes"),
    # Multi-layer perceptron classifier
    (
        MLPClassifier(
            solver="adam",
            alpha=1e-5,
            hidden_layer_sizes=(5, 2),
            random_state=RANDOM_STATE,
        ),
        "Multi-layer Perceptron",
    ),
]

In [5]:
import time

from sklearn.metrics import accuracy_score

# Fit each classifier on the training features, predict on the test features,
# and record the wall-clock duration of both steps plus the test accuracy
train_time, test_time, accuracy = [], [], []
for clf, name in clfs:
    start = time.perf_counter()
    clf.fit(X_train, y_train)
    fit_elapsed = time.perf_counter() - start
    train_time.append(fit_elapsed)

    start = time.perf_counter()
    y_pred = clf.predict(X_test)
    predict_elapsed = time.perf_counter() - start
    test_time.append(predict_elapsed)

    accuracy.append(accuracy_score(y_test, y_pred))

# Report table: one row per classifier, in the same order as `clfs`
report_columns = {
    "classifier": [label for _, label in clfs],
    "training (sec)": train_time,
    "testing (sec)": test_time,
    "accuracy (%)": [100 * score for score in accuracy],
}
results = pd.DataFrame(report_columns)

# Display the report ranked from most to least accurate (`results` itself is
# left in its original order)
results.sort_values(by=["accuracy (%)"], ascending=False, ignore_index=True)

Unnamed: 0,classifier,training (sec),testing (sec),accuracy (%)
0,Logistic Regression,0.513332,0.001229,59.806706
1,Ridge Classifier,0.129249,0.001156,59.683034
2,Multi-layer Perceptron,79.490787,0.003688,59.458593
3,Linear SVC,0.166747,0.001199,58.867717
4,Log-loss SGD,0.062692,0.001121,58.469219
5,Random Forest,77.403,1.626189,57.612679
6,Complement Naive Bayes,0.009102,0.003487,56.847746
7,kNN,0.003413,11.569419,52.940638
8,Nearest Centroid,0.00951,0.004731,51.250458
