In [1]:
import numpy as np
from datasets import load_dataset

# Dataset of victorian era sentences of different authors. Downloads from huggingface in parquet format 
data = load_dataset('contemmcm/victorian_authorship')

In [2]:
from sklearn.linear_model import RidgeClassifierCV, LogisticRegressionCV
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from copy import deepcopy
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, top_k_accuracy_score
from tqdm.auto import tqdm
from sklearn.ensemble import RandomForestClassifier
import pandas as pd


# Performs basic classification metrics for multi label classification. Return as dict to build dataframe later.
def perform_metric(y_true, y_pred):
    return {
        "acc": accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, zero_division=np.nan, average='macro'),
        'recall': recall_score(y_true, y_pred, zero_division=np.nan, average='macro'),
        'f1': f1_score(y_true, y_pred, zero_division=np.nan, average='macro')
    }


# Finds human-readable name for a model. KNN adds number of neighbours to name
def get_name(model):
    if isinstance(model, KNeighborsClassifier):
        return f'{model.__class__.__name__}({str(model.n_neighbors)})'
    return model.__class__.__name__


#Runs train, evaluate pipeline on model zoo
def train_model(X_train, y_train, X_test, y_test):
    # Building zoo. Trying to reduce dimension by PCA with different number of components.
    components_zoo = [None, 50, 400]
    classifiers_zoo = [
        RidgeClassifierCV(),
        # Ridge classifier with cross-validation. Replaces classification task with regression one.  
        make_pipeline(StandardScaler(), LogisticRegressionCV()),
        # logistic regression. Works well only with scaled data.
        KNeighborsClassifier(50),  #KNN
        KNeighborsClassifier(100),  #KNN
        RandomForestClassifier(n_estimators=50),  # Select 50 to try to avoid overfit
        RadiusNeighborsClassifier(metric='cosine')  # Non-cosine works bad 
    ]
    zoo = []
    handles = []  # Names for index for resulting DF.

    for component in components_zoo:
        for model in classifiers_zoo:
            if component is None:  # No PCA at all. 
                zoo.append(deepcopy(model))
                handles.append(get_name(model))
            else:
                zoo.append(make_pipeline(PCA(component), model))  # Chain with PCA
                handles.append('PCA+' + get_name(model))

    train_results = []
    test_results = []

    for model in tqdm(zoo[len(train_results):]):
        model.fit(X_train, y_train)  # Not use fit_predict because not all classifiers support. 
        y_train_hat = model.predict(X_train)  # train preds
        y_test_hat = model.predict(X_test)  # test preds

        train_results.append(perform_metric(y_train, y_train_hat))
        test_results.append(perform_metric(y_test, y_test_hat))

    train_df = pd.DataFrame(train_results,
                            index=handles[:len(train_results)])  # Build train df. For consistency truncate handles
    test_df = pd.DataFrame(test_results, index=handles[:len(test_results)])  # Build test df

    return train_df, test_df

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer


# Will use weighted inverse frequencies as main feature

def prepare_data(train_split, test_split):
    vectorizer = TfidfVectorizer(max_features=1000, strip_accents='ascii', stop_words='english')
    # max features for speed reasons. remove accents for different encodings. remove stop words to potentially make features more useful 
    X_train, y_train = vectorizer.fit_transform(train_split['text']).toarray(), train_split['author']
    # Will use already fitted vocab
    X_test, y_test = vectorizer.transform(test_split['text']).toarray(), test_split['author']
    # Assert shapes match
    assert X_train.shape[1] == X_test.shape[1]
    return X_train, y_train, X_test, y_test

In [4]:
X_train, y_train, X_test, y_test = prepare_data(data['train'].shuffle().select(range(10000)),
                                                data['test'].shuffle().select(range(10000)))
# Data is to big. Taking subset
victorian_train, victorian_test = train_model(X_train, y_train, X_test, y_test)

  0%|          | 0/18 [00:00<?, ?it/s]

In [5]:
victorian_train

Unnamed: 0,acc,precision,recall,f1
RidgeClassifierCV,0.8751,0.93905,0.762132,0.816268
Pipeline,1.0,1.0,1.0,1.0
KNeighborsClassifier(50),0.5983,0.772475,0.445178,0.510188
KNeighborsClassifier(100),0.5438,0.772335,0.364433,0.420983
RandomForestClassifier,1.0,1.0,1.0,1.0
RadiusNeighborsClassifier,0.1268,0.1268,0.022222,0.005001
PCA+RidgeClassifierCV,0.4568,0.571032,0.236902,0.231372
PCA+Pipeline,0.6895,0.660927,0.610432,0.630352
PCA+KNeighborsClassifier(50),0.5902,0.664767,0.427933,0.463397
PCA+KNeighborsClassifier(100),0.5481,0.653939,0.363843,0.39164


In [6]:
victorian_test

Unnamed: 0,acc,precision,recall,f1
RidgeClassifierCV,0.4051,0.497728,0.391363,0.338946
Pipeline,0.4708,0.491955,0.534204,0.45556
KNeighborsClassifier(50),0.2871,0.33373,0.253196,0.218277
KNeighborsClassifier(100),0.276,0.370974,0.229875,0.199281
RandomForestClassifier,0.2818,0.43457,0.21647,0.185415
RadiusNeighborsClassifier,0.0857,0.0857,0.02,0.003157
PCA+RidgeClassifierCV,0.2468,0.348774,0.174903,0.117576
PCA+Pipeline,0.3353,0.28926,0.34325,0.279005
PCA+KNeighborsClassifier(50),0.2826,0.278198,0.251264,0.20184
PCA+KNeighborsClassifier(100),0.2694,0.28921,0.225047,0.175016


In [14]:
print(f'Best test F1: {victorian_test["f1"].max()}')

Best test F1: 0.4555601859295214


In [7]:
# Blogs or comments. Should be harder 
blogs = load_dataset('night12/authorTextIdentification')
# Renaming for consistency
blogs = blogs.rename_column('author_id', 'author')

In [8]:
X_train, y_train, X_test, y_test = prepare_data(blogs['train'].shuffle().select(range(10000)),
                                                blogs['validation'].shuffle().select(range(6000)))

blogs_train, blogs_test = train_model(X_train, y_train, X_test, y_test)

  0%|          | 0/18 [00:00<?, ?it/s]

In [9]:
blogs_train

Unnamed: 0,acc,precision,recall,f1
RidgeClassifierCV,0.5759,0.688941,0.541132,0.561134
Pipeline,0.8679,0.890079,0.863907,0.875507
KNeighborsClassifier(50),0.3525,0.431709,0.318376,0.31586
KNeighborsClassifier(100),0.3378,0.418981,0.302403,0.284492
RandomForestClassifier,0.9987,0.99891,0.998503,0.998696
RadiusNeighborsClassifier,0.0626,0.0626,0.02,0.002356
PCA+RidgeClassifierCV,0.2617,0.32048,0.222891,0.176193
PCA+Pipeline,0.3598,0.356223,0.34071,0.338399
PCA+KNeighborsClassifier(50),0.2928,0.33511,0.258804,0.258873
PCA+KNeighborsClassifier(100),0.2672,0.314747,0.231351,0.229599


In [10]:
blogs_test

Unnamed: 0,acc,precision,recall,f1
RidgeClassifierCV,0.385333,0.399766,0.352311,0.333676
Pipeline,0.3975,0.41118,0.377664,0.387061
KNeighborsClassifier(50),0.2955,0.319091,0.267718,0.255073
KNeighborsClassifier(100),0.307833,0.344501,0.278031,0.255043
RandomForestClassifier,0.335167,0.326375,0.302687,0.297066
RadiusNeighborsClassifier,0.061333,0.061333,0.02,0.002312
PCA+RidgeClassifierCV,0.250333,0.265263,0.213098,0.164331
PCA+Pipeline,0.304667,0.288641,0.2843,0.276321
PCA+KNeighborsClassifier(50),0.240167,0.252337,0.210469,0.206915
PCA+KNeighborsClassifier(100),0.2415,0.281265,0.207025,0.204404


In [13]:
print(f'Best test F1: {blogs_test["f1"].max()}')

Best test F1: 0.38706098588229504
