In [3]:
from ruzicka.Order2Verifier import Order2Verifier
from ruzicka.utilities import *
from ruzicka.score_shifting import ScoreShifter, _auc_c_at_1
from ruzicka.evaluation import pan_metrics
from ruzicka.BDIVerifier import BDIVerifier

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn.base

import pan20_verif_evaluator

import ray

from typing import Union, Collection, Callable
import pandas as pd
import numpy as np

In [2]:
# Labelled texts of the threshold-fitting split. Further down this frame is
# passed (without `text`) to `evaluate` as `train_df`, which reads the
# `author` column as training labels.
fit_train_df = pd.read_csv("fit_train.csv", index_col=0)
fit_train_df

Unnamed: 0,author,text
127,357927,"""The Badaman Zero System is finished."" Armada ..."
133,357927,"""Hn, he still doesn""t know my true power."" Bai..."
134,357927,"""After being attacked by the Great Horn, she""s..."
135,357927,"""A male."" Chloe added, readying her transforma..."
136,357927,The Field Marshall merely smiled. SailorStar9:...
...,...,...
95263,445964,"He lays her down onto their bed, his body hove..."
28675,878883,While the preacher droned on in the background...
42726,3370103,"Julie: And we""re cornered. Tink: Now, which on..."
30026,880574,"""Your making this seem like a computer game. L..."


In [3]:
# Verification problems of the threshold-fitting split. `evaluate` reads
# `test_against` as the label each text is tested against and `gt` as the
# ground-truth answer.
fit_test_df = pd.read_csv("fit_test.csv", index_col=0)
fit_test_df

Unnamed: 0,author,text,test_against,gt
23617,1007124,tree or two as he landed with a thunderous cra...,1007124,1
11760,100947,Dilandau opened his mouth to say something alo...,100947,1
24166,101296,"""Oh good... You remembered..."" He said hateful...",101296,1
15928,101393,Six children stood in the middle of the street...,101393,1
22992,1018526,"Fenrich straightened as the man approached, hi...",1018526,1
...,...,...,...,...
98429,367948,Yet another bright flash followed. This time t...,5477371,0
104217,4731033,"""She came to me in a dream she said after I ta...",83043,0
100587,9506334,"""Ok, sure"" I said, walking back over to him, a...",2750705,0
35184,2561263,"For now, his only option was to try to manuall...",3370103,0


In [6]:
# Labelled texts of the verification split; passed (without `text`) to
# `evaluate` as `train_df` in the verification runs below.
verif_train_df = pd.read_csv("verif_train.csv", index_col=0)
verif_train_df

Unnamed: 0,author,text
1,1404943,"Zazuki nodded his head and got to his feet, kn..."
5,2263880,When Mello thought that his fingers are well c...
20,2940078,Whether Thranduil would recover from his drunk...
31,533081,A Strange Vampire Chapter One: Morning Convers...
32,734939,"""Hi,"" she said, equally as reservedly. This wa..."
...,...,...
25858,1291365,"""Hey! Where""d you go?"" said the boy, ""Come out..."
21453,5225909,But as she rounds a few trees and makes her wa...
11269,2086345,Ron parents were getting concerned Ron had spe...
9539,6353556,"""No, you won""t!"" Ashila then looked past the C..."


In [4]:
# Verification problems of the verification split, with the same
# `test_against` / `gt` columns that `evaluate` reads from its `test_df`.
verif_test_df = pd.read_csv("verif_test.csv", index_col=0)
verif_test_df

Unnamed: 0,author,text,test_against,gt
25943,1000010,"""I""ve, yeah, I mean-"" The hand rubbed on Kotet...",1000010,1
4755,1002886,"By five, I was turning off my search engines a...",1002886,1
21283,100296,throwing her arms around Spiderman. We hear th...,100296,1
17509,1003238,Me- Sighing with relief. And that is the end o...,1003238,1
39166,1003416,"The doctor calls his last action a ""gamble."" H...",1003416,1
...,...,...,...,...
2356,2777842,Ashley flew as silent as a ghost as she patrol...,187706,0
30724,4432140,"""Oh yes, they""re both here. The salarian is in...",3348374,0
32669,1027418,T: nope nothing at all Me: all done I stand up...,246908,0
36912,1257406,"good-bye and grasped Jenna""s hand and led her ...",1369478,0


In [9]:
# Texts-per-author distribution: the index is a number of texts, the value is
# how many authors have exactly that many texts in the training frame.
verif_train_df.author.value_counts().value_counts()

1     9143
5      905
11      94
19       7
29       3
Name: author, dtype: int64

In [10]:
# Same texts-per-author distribution, computed on the `author` column of the
# verification test frame.
verif_test_df.author.value_counts().value_counts()

1    10152
Name: author, dtype: int64

In [6]:
def pan21_overall(predicted, gt: Collection[float]) -> float:
    """Return the "overall" entry of the PAN verification metrics.

    Note the argument order: this function takes the predictions first,
    whereas `pan20_verif_evaluator.evaluate_all` is called with the
    ground truth first.
    """
    metrics = pan20_verif_evaluator.evaluate_all(gt, predicted)
    return metrics["overall"]

In [7]:
from sklearn.metrics import precision_score
from pan20_verif_evaluator import binarize


def precision(true_y, pred_y):
    """
    Precision of the answered verification problems.

    Every problem whose score is exactly `0.5` counts as a
    non-decision and is dropped. The remaining scores are binarized
    with `pan20_verif_evaluator.binarize` and compared against the
    gold labels with `sklearn.metrics.precision_score`.

    Parameters
    ----------
    true_y : array [n_problems]

        The gold annotations provided for each problem.
        Will typically be `0` or `1`.

    pred_y : array [n_problems]

        The predictions outputted by a verification system.
        Assumes `0 <= prediction <= 1`.

    Returns
    ----------
    prec = The value returned by `precision_score` for the
        problems that received an answer.

    References
    ----------
        E. Stamatatos, et al. Overview of the Author Identification
        Task at PAN 2014. CLEF (Working Notes) 2014: 877-897.
    """
    # Keep only the (gold, score) pairs for which the system gave an answer.
    answered = [(true, pred) for true, pred in zip(true_y, pred_y) if pred != 0.5]
    true_y_filtered = [true for true, _ in answered]
    pred_y_filtered = binarize([pred for _, pred in answered])

    return precision_score(true_y_filtered, pred_y_filtered)

In [8]:
@ray.remote
def evaluate(test_X, train_X, test_df, train_df, verifier, shifter, imposters=30):
    """Fit `verifier`, score the test problems and summarise the outcome.

    Declared as a Ray remote function, so callers invoke it through
    `evaluate.remote(...)`.

    Parameters
    ----------
    test_X, train_X
        Feature matrices handed to `verifier.predict_proba` and
        `verifier.fit` respectively.
    test_df
        Frame providing the `test_against` column (passed as `test_y`)
        and the `gt` column (ground truth).
    train_df
        Frame providing the `author` column used as training labels.
    verifier
        Object exposing `fit(X, y)` and
        `predict_proba(test_X=..., test_y=..., nb_imposters=...)`.
    shifter
        Either a ready-to-use score shifter exposing `optimal_p1`,
        `optimal_p2` and `transform`, or the string `"fit"`. In the latter
        case a `ScoreShifter` is fitted on the scores and ground truth of
        this very call, using `pan21_overall` as objective.
    imposters
        Forwarded to `predict_proba` as `nb_imposters`.

    Returns
    -------
    tuple
        `(summary, (shifted_scores, gt_scores))`, where `summary` holds the
        counts and thresholds computed here merged with the dictionary
        returned by `pan20_verif_evaluator.evaluate_all`.
    """
    author_labels = np.array(train_df["author"])
    tested_labels = np.array(test_df["test_against"])

    verifier.fit(train_X, author_labels)
    raw_scores = verifier.predict_proba(
        test_X=test_X, test_y=np.array(tested_labels), nb_imposters=imposters
    )
    gt_scores = np.array(test_df["gt"])

    if shifter == "fit":
        shifter = ScoreShifter(min=0.053, max=0.947, min_spread=0.0)
        shifter.fit(
            predicted_scores=raw_scores,
            ground_truth_scores=gt_scores,
            obj_func=pan21_overall,
        )

    # Both counts below are taken on the raw scores, before shifting.
    inside_band = (shifter.optimal_p1 < raw_scores) & (raw_scores < shifter.optimal_p2)
    unanswered = inside_band.sum()
    near_extremes = (raw_scores < 0.055) | (raw_scores > 0.945)
    high_conf = near_extremes.sum()

    shifted_scores = np.array(shifter.transform(raw_scores))
    # A shifted score more than 0.5 above the ground truth is counted as a false positive.
    fps = ((shifted_scores - gt_scores) > 0.5).sum()
    pan_dict = pan20_verif_evaluator.evaluate_all(gt_scores, shifted_scores)

    summary = {
        "tests": len(shifted_scores),
        "unanswed": unanswered,
        "high_conf": high_conf,
        "p1": shifter.optimal_p1,
        "p2": shifter.optimal_p2,
        "fp": fps,
        "prec": precision(gt_scores, shifted_scores),
    }
    return summary | pan_dict, (shifted_scores, gt_scores)

In [7]:
def _to_dense_array(matrix):
    """Return `matrix` (anything exposing `.todense()`) as a dense ndarray."""
    return np.asarray(matrix.todense())


def make_char_ngram_vectorizer(max_n, min_n=2, max_features=10000):
    """Build the character n-gram feature pipeline used in this notebook.

    The pipeline chains a `TfidfVectorizer` (sublinear tf, no idf, l2 norm,
    character analyzer), a `StandardScaler` without mean centring, and a
    final step that converts the sparse result to a dense array.

    Parameters
    ----------
    max_n : int
        Upper bound of the character n-gram range.
    min_n : int
        Lower bound of the character n-gram range.
    max_features : int
        Forwarded to `TfidfVectorizer(max_features=...)`.

    Returns
    -------
    sklearn.pipeline.Pipeline
        A new, unfitted pipeline.
    """
    return make_pipeline(
        TfidfVectorizer(
            sublinear_tf=True,
            use_idf=False,
            norm="l2",
            analyzer="char",
            ngram_range=(min_n, max_n),
            max_features=max_features,
        ),
        StandardScaler(with_mean=False),
        # A named function instead of a lambda: the standard `pickle` module
        # cannot serialise lambdas.
        FunctionTransformer(_to_dense_array, accept_sparse=True),
    )


# The two feature sets differ only in the upper n-gram bound.
vectorizer_24 = make_char_ngram_vectorizer(4)
vectorizer_25 = make_char_ngram_vectorizer(5)

#
# This takes a long time
#
# vectorizer_24.fit(fit_train_df["text"])
# vectorizer_25.fit(fit_train_df["text"])
# vecs_fit = [
#     (
#         "2,3,4-grams",
#         vectorizer_24.transform(fit_train_df["text"]),
#         vectorizer_24.transform(fit_test_df["text"]),
#     ),
#     (
#         "2,3,4,5-grams",
#         vectorizer_25.transform(fit_train_df["text"]),
#         vectorizer_25.transform(fit_test_df["text"]),
#     ),
# ]
# vecs_verif = [
#     (
#         "2,3,4-grams",
#         vectorizer_24.transform(verif_train_df["text"]),
#         vectorizer_24.transform(verif_test_df["text"]),
#     ),
#     (
#         "2,3,4,5-grams",
#         vectorizer_25.transform(verif_train_df["text"]),
#         vectorizer_25.transform(verif_test_df["text"]),
#     ),
# ]

In [8]:
# pd.DataFrame(vecs_fit[0][1]).to_csv("234_fit_train.csv")
# pd.DataFrame(vecs_fit[0][2]).to_csv("234_fit_test.csv")
# pd.DataFrame(vecs_verif[0][1]).to_csv("234_verif_train.csv")
# pd.DataFrame(vecs_verif[0][2]).to_csv("234_verif_test.csv")
# pd.DataFrame(vecs_fit[1][1]).to_csv("2345_fit_train.csv")
# pd.DataFrame(vecs_fit[1][2]).to_csv("2345_fit_test.csv")
# pd.DataFrame(vecs_verif[1][1]).to_csv("2345_verif_train.csv")
# pd.DataFrame(vecs_verif[1][2]).to_csv("2345_verif_test.csv")

In [17]:
# Keyword arguments shared by the two Order2Verifier configurations.
_O2V_SHARED = {
    "base": "instance",
    "rank": True,
    "nb_bootstrap_iter": 500,
    "rnd_prop": 0.5,
}
# Keyword arguments shared by the two BDIVerifier configurations.
_BDI_SHARED = {"nb_bootstrap_iter": 500, "rnd_prop": 0.33}

# Each verifier is configured once per distance metric.
o2v_mm_param = {"metric": "minmax", **_O2V_SHARED}
o2v_cs_param = {"metric": "cosine", **_O2V_SHARED}

bdi_mm_param = {"metric": "minmax", **_BDI_SHARED}
bdi_cs_param = {"metric": "cosine", **_BDI_SHARED}

# Each entry: (display name, verifier class, keyword arguments for the class).
# The submission loops add a `random_state` to the keyword arguments.
classifs = [
    ("Kestemont GI, Minmax", Order2Verifier, o2v_mm_param),
    ("Kestemont GI, Cosine", Order2Verifier, o2v_cs_param),
    ("BDI, Minmax", BDIVerifier, bdi_mm_param),
    ("BDI, Cosine", BDIVerifier, bdi_cs_param),
]

In [11]:
def _csv_to_object_store(csv_path):
    """Read a feature matrix from `csv_path` and put it in the Ray object store."""
    matrix = np.array(pd.read_csv(csv_path, index_col=0))
    return ray.put(matrix)


# Feature matrices of the threshold-fitting split, one train/test pair per
# n-gram range.
ftrx234 = _csv_to_object_store("234_fit_train.csv")
ftsx234 = _csv_to_object_store("234_fit_test.csv")

ftrx2345 = _csv_to_object_store("2345_fit_train.csv")
ftsx2345 = _csv_to_object_store("2345_fit_test.csv")

# vtrx234 = (ray.put(np.array(pd.read_csv("234_verif_train.csv", index_col=0))))
# vtsx234 = (ray.put(np.array(pd.read_csv("234_verif_test.csv", index_col=0))))

# vtrx2345 = (ray.put(np.array(pd.read_csv("2345_verif_train.csv", index_col=0))))
# vtsx2345 = (ray.put(np.array(pd.read_csv("2345_verif_test.csv", index_col=0))))

2024-05-22 12:16:05,269	INFO worker.py:1749 -- Started a local Ray instance.


In [12]:
# Each entry: (feature-set label, train matrix reference, test matrix reference).
vecs_fit = [
    ("2,3,4-grams", ftrx234, ftsx234),
    ("2,3,4,5-grams", ftrx2345, ftsx2345),
]

In [18]:
# Threshold-fitting experiment: every classifier/feature pair is submitted
# twice on the fit split, first with hand-set thresholds and then with
# thresholds fitted inside `evaluate`.
remotes, labels = [], []
rs = np.random.RandomState(42)

for classif_name, classifier, params in classifs:
    for vec_name, train_X, test_X in vecs_fit:
        # Seeds are drawn in submission order: manual first, then fitted.
        manual_seed = rs.randint(2**32 - 1)
        manual_task = evaluate.remote(
            test_X,
            train_X,
            fit_test_df.drop(columns="text"),
            fit_train_df.drop(columns="text"),
            classifier(**{**params, "random_state": manual_seed}),
            ScoreShifter().manual_fit(0.11, 0.89),
            imposters=30,
        )
        remotes.append(manual_task)
        labels.append({"classifier": f"{classif_name} + {vec_name} + manual"})

        fitted_seed = rs.randint(2**32 - 1)
        fitted_task = evaluate.remote(
            test_X,
            train_X,
            fit_test_df.drop(columns="text"),
            fit_train_df.drop(columns="text"),
            classifier(**{**params, "random_state": fitted_seed}),
            "fit",
            imposters=30,
        )
        remotes.append(fitted_task)
        labels.append({"classifier": f"{classif_name} + {vec_name} + fitted"})

# Blocks until every submitted task has finished.
remotes_res = ray.get(remotes)

In [19]:
# One row per run: the summary dict returned by `evaluate` merged with its label.
fit_rows = [
    summary | label for (summary, _scores), label in zip(remotes_res, labels)
]
fit_results = pd.DataFrame(fit_rows)
fit_results

Unnamed: 0,tests,unanswed,high_conf,p1,p2,fp,prec,auc,c@1,f_05_u,F1,brier,overall,classifier
0,1000,338,476,0.11,0.89,6,0.984043,0.924,0.858,0.831,0.972,0.876,0.892,"Kestemont GI, Minmax + 2,3,4-grams + manual"
1,1000,132,478,0.31226,0.74138,12,0.971292,0.95,0.924,0.902,0.94,0.914,0.926,"Kestemont GI, Minmax + 2,3,4-grams + fitted"
2,1000,328,482,0.11,0.89,6,0.983957,0.919,0.857,0.831,0.965,0.875,0.889,"Kestemont GI, Minmax + 2,3,4,5-grams + manual"
3,1000,104,484,0.39272,0.78608,11,0.972772,0.943,0.915,0.906,0.921,0.912,0.92,"Kestemont GI, Minmax + 2,3,4,5-grams + fitted"
4,1000,346,470,0.11,0.89,7,0.980663,0.916,0.847,0.819,0.966,0.873,0.884,"Kestemont GI, Cosine + 2,3,4-grams + manual"
5,1000,130,467,0.31226,0.74138,14,0.966102,0.947,0.921,0.898,0.936,0.911,0.923,"Kestemont GI, Cosine + 2,3,4-grams + fitted"
6,1000,328,482,0.11,0.89,7,0.981233,0.92,0.857,0.83,0.964,0.875,0.889,"Kestemont GI, Cosine + 2,3,4,5-grams + manual"
7,1000,119,480,0.31226,0.77714,14,0.965854,0.944,0.913,0.898,0.924,0.909,0.918,"Kestemont GI, Cosine + 2,3,4,5-grams + fitted"
8,1000,149,796,0.11,0.89,8,0.979167,0.948,0.919,0.894,0.936,0.914,0.922,"BDI, Minmax + 2,3,4-grams + manual"
9,1000,120,795,0.12452,0.7235,11,0.972973,0.951,0.924,0.905,0.935,0.918,0.927,"BDI, Minmax + 2,3,4-grams + fitted"


In [20]:
# Persist the fit-split results; a later cell reloads this file.
fit_results.to_csv("fit_results.csv")

In [9]:
# Reload the saved fit-split results; the verification runs below look up
# their `p1` / `p2` thresholds in this frame by classifier label.
fit_results = pd.read_csv("fit_results.csv", index_col=0)
fit_results

Unnamed: 0,tests,unanswed,high_conf,p1,p2,fp,prec,auc,c@1,f_05_u,F1,brier,overall,classifier
0,1000,338,476,0.11,0.89,6,0.984043,0.924,0.858,0.831,0.972,0.876,0.892,"Kestemont GI, Minmax + 2,3,4-grams + manual"
1,1000,132,478,0.31226,0.74138,12,0.971292,0.95,0.924,0.902,0.94,0.914,0.926,"Kestemont GI, Minmax + 2,3,4-grams + fitted"
2,1000,328,482,0.11,0.89,6,0.983957,0.919,0.857,0.831,0.965,0.875,0.889,"Kestemont GI, Minmax + 2,3,4,5-grams + manual"
3,1000,104,484,0.39272,0.78608,11,0.972772,0.943,0.915,0.906,0.921,0.912,0.92,"Kestemont GI, Minmax + 2,3,4,5-grams + fitted"
4,1000,346,470,0.11,0.89,7,0.980663,0.916,0.847,0.819,0.966,0.873,0.884,"Kestemont GI, Cosine + 2,3,4-grams + manual"
5,1000,130,467,0.31226,0.74138,14,0.966102,0.947,0.921,0.898,0.936,0.911,0.923,"Kestemont GI, Cosine + 2,3,4-grams + fitted"
6,1000,328,482,0.11,0.89,7,0.981233,0.92,0.857,0.83,0.964,0.875,0.889,"Kestemont GI, Cosine + 2,3,4,5-grams + manual"
7,1000,119,480,0.31226,0.77714,14,0.965854,0.944,0.913,0.898,0.924,0.909,0.918,"Kestemont GI, Cosine + 2,3,4,5-grams + fitted"
8,1000,149,796,0.11,0.89,8,0.979167,0.948,0.919,0.894,0.936,0.914,0.922,"BDI, Minmax + 2,3,4-grams + manual"
9,1000,120,795,0.12452,0.7235,11,0.972973,0.951,0.924,0.905,0.935,0.918,0.927,"BDI, Minmax + 2,3,4-grams + fitted"


In [11]:
# Load the 2-4 gram feature matrices of the verification split and place
# them in the Ray object store.
vtrx234 = ray.put(np.array(pd.read_csv("234_verif_train.csv", index_col=0)))
vtsx234 = ray.put(np.array(pd.read_csv("234_verif_test.csv", index_col=0)))

# The 2-5 gram matrices are left disabled for the verification split.
# vtrx2345 = (ray.put(np.array(pd.read_csv("2345_verif_train.csv", index_col=0))))
# vtsx2345 = (ray.put(np.array(pd.read_csv("2345_verif_test.csv", index_col=0))))

# Each entry: (feature-set label, train matrix reference, test matrix reference).
vecs_verif = [
    (
        "2,3,4-grams",
        vtrx234,
        vtsx234,
    ),
    # (
    #     "2,3,4,5-grams",
    #     vtrx2345,
    #     vtsx2345,
    # ),
]

2024-06-09 09:53:23,351	INFO worker.py:1749 -- Started a local Ray instance.


In [22]:
# cosine underperforms and the full verify is a largeboi
# Cosine underperforms, and the full verification run is large, so only the
# minmax configurations are kept here.
# Each entry: (display name, verifier class, keyword arguments for the class).
classifs = [
    ("Kestemont GI, Minmax", Order2Verifier, o2v_mm_param),
    # ("Kestemont GI, Cosine", Order2Verifier, o2v_cs_param),
    ("BDI, Minmax", BDIVerifier, bdi_mm_param),
    # ("BDI, Cosine", BDIVerifier, bdi_cs_param),
]

In [26]:
# Verification-split run: each classifier/feature pair is scored once with the
# hand-set thresholds and once with the thresholds stored in `fit_results`
# for the run carrying the same "... + fitted" label.
remotes, labels = [], []
rs = np.random.RandomState(42)

for classif_name, classifier, params in classifs:
    for vec_name, train_X, test_X in vecs_verif:
        # Seeds are drawn in submission order: manual first, then fitted.
        manual_seed = rs.randint(2**32 - 1)
        manual_task = evaluate.remote(
            test_X,
            train_X,
            verif_test_df.drop(columns="text"),
            verif_train_df.drop(columns="text"),
            classifier(**{**params, "random_state": manual_seed}),
            ScoreShifter().manual_fit(0.11, 0.89),
            imposters=30,
        )
        remotes.append(manual_task)
        labels.append({"classifier": f"{classif_name} + {vec_name} + manual"})

        # Reuse the thresholds found on the fit split for this configuration.
        fit_label = f"{classif_name} + {vec_name} + fitted"
        label_matches = fit_results.classifier == fit_label
        p1 = fit_results.loc[label_matches, "p1"].iloc[0]
        p2 = fit_results.loc[label_matches, "p2"].iloc[0]

        fitted_seed = rs.randint(2**32 - 1)
        fitted_task = evaluate.remote(
            test_X,
            train_X,
            verif_test_df.drop(columns="text"),
            verif_train_df.drop(columns="text"),
            classifier(**{**params, "random_state": fitted_seed}),
            ScoreShifter().manual_fit(p1, p2),
            imposters=30,
        )
        remotes.append(fitted_task)
        labels.append({"classifier": f"{classif_name} + {vec_name} + fitted"})

# Blocks until every submitted task has finished.
remotes_res = ray.get(remotes)

[36m(raylet)[0m Spilled 2717 MiB, 14 objects, write throughput 1965 MiB/s. Set RAY_verbose_spill_logs=0 to disable this message.


In [27]:
# One row per run: the summary dict returned by `evaluate` merged with its label.
result_rows = [
    summary | label for (summary, _scores), label in zip(remotes_res, labels)
]
result_df = pd.DataFrame(result_rows)

In [28]:
# Rank the runs by the `overall` column, best first.
result_df.sort_values(by="overall", ascending=False)

Unnamed: 0,tests,unanswed,high_conf,p1,p2,fp,prec,auc,c@1,f_05_u,F1,brier,overall,classifier
3,10152,1326,7741,0.12452,0.7235,89,0.974637,0.922,0.881,0.872,0.882,0.885,0.889,"BDI, Minmax + 2,3,4-grams + fitted"
1,10152,1385,4432,0.31226,0.74138,89,0.973992,0.923,0.874,0.865,0.874,0.888,0.885,"Kestemont GI, Minmax + 2,3,4-grams + fitted"
2,10152,1815,7750,0.11,0.89,50,0.983729,0.918,0.87,0.843,0.877,0.878,0.877,"BDI, Minmax + 2,3,4-grams + manual"
0,10152,3578,4415,0.11,0.89,37,0.98699,0.892,0.82,0.774,0.931,0.86,0.855,"Kestemont GI, Minmax + 2,3,4-grams + manual"


In [29]:
# Save the verification-split results (manual and fit-split thresholds).
result_df.to_csv("full_test.csv")

In [30]:
# Reference run: passing "fit" makes `evaluate` fit the thresholds on the
# verification test scores and ground truth themselves, hence the
# "cheat-fitted" label.
remotes, labels = [], []
rs = np.random.RandomState(42)

for classif_name, classifier, params in classifs:
    for vec_name, train_X, test_X in vecs_verif:
        run_seed = rs.randint(2**32 - 1)
        task = evaluate.remote(
            test_X,
            train_X,
            verif_test_df.drop(columns="text"),
            verif_train_df.drop(columns="text"),
            classifier(**{**params, "random_state": run_seed}),
            "fit",
            imposters=30,
        )
        remotes.append(task)
        labels.append({"classifier": f"{classif_name} + {vec_name} + cheat-fitted"})

# Blocks until every submitted task has finished.
remotes_res = ray.get(remotes)

In [31]:
# One row per run: the summary dict returned by `evaluate` merged with its label.
result_rows = [
    summary | label for (summary, _scores), label in zip(remotes_res, labels)
]
result_df = pd.DataFrame(result_rows)

In [32]:
# Rank the runs by the `overall` column, best first.
result_df.sort_values(by="overall", ascending=False)

Unnamed: 0,tests,unanswed,high_conf,p1,p2,fp,prec,auc,c@1,f_05_u,F1,brier,overall,classifier
1,10152,1128,7721,0.053,0.39272,236,0.942551,0.924,0.889,0.876,0.896,0.892,0.896,"BDI, Minmax + 2,3,4-grams + cheat-fitted"
0,10152,1433,4415,0.19604,0.52682,244,0.939694,0.924,0.883,0.863,0.898,0.891,0.892,"Kestemont GI, Minmax + 2,3,4-grams + cheat-fitted"


In [33]:
# Save the results of the runs whose thresholds were fitted on the test split.
result_df.to_csv("results_cheat.csv")

In [10]:
# BDIVerifier keyword arguments; passes an explicit `method="closest"`.
bdi_mmc_param = dict(
    metric="minmax",
    nb_bootstrap_iter=500,
    rnd_prop=0.33,
    method="closest",
)

# Order2Verifier keyword arguments with `rank` set to False.
o2v_mmu_param = dict(
    metric="minmax",
    base="instance",
    rank=False,
    nb_bootstrap_iter=500,
    rnd_prop=0.5,
)

# Each entry: (display name, verifier class, keyword arguments for the class).
# NOTE(review): the display names are the same as those used for the earlier
# parameter dicts, so any lookup in `fit_results` by name returns thresholds
# fitted with the earlier configurations - confirm this is intended.
classifs = [
    ("BDI, Minmax", BDIVerifier, bdi_mmc_param),
    ("Kestemont GI, Minmax", Order2Verifier, o2v_mmu_param),
]

In [12]:
# Verification-split run for the current `classifs`, using the thresholds
# stored in `fit_results` under the matching "... + fitted" label.
remotes, labels = [], []
rs = np.random.RandomState(42)

for classif_name, classifier, params in classifs:
    for vec_name, train_X, test_X in vecs_verif:
        fit_label = f"{classif_name} + {vec_name} + fitted"
        print(fit_label)
        label_matches = fit_results.classifier == fit_label
        p1 = fit_results.loc[label_matches, "p1"].iloc[0]
        p2 = fit_results.loc[label_matches, "p2"].iloc[0]

        run_seed = rs.randint(2**32 - 1)
        task = evaluate.remote(
            test_X,
            train_X,
            verif_test_df.drop(columns="text"),
            verif_train_df.drop(columns="text"),
            classifier(**{**params, "random_state": run_seed}),
            ScoreShifter().manual_fit(p1, p2),
            imposters=30,
        )
        remotes.append(task)
        labels.append({"classifier": f"{classif_name} + {vec_name}"})

# Blocks until every submitted task has finished.
remotes_res = ray.get(remotes)

BDI, Minmax + 2,3,4-grams + fitted
Kestemont GI, Minmax + 2,3,4-grams + fitted


In [13]:
# One row per run: the summary dict returned by `evaluate` merged with its label.
result_rows = [
    summary | label for (summary, _scores), label in zip(remotes_res, labels)
]
result_df = pd.DataFrame(result_rows)

In [14]:
# Rank the runs by the `overall` column, best first.
result_df.sort_values(by="overall", ascending=False)

Unnamed: 0,tests,unanswed,high_conf,p1,p2,fp,prec,auc,c@1,f_05_u,F1,brier,overall,classifier
0,10152,1469,7383,0.12452,0.7235,53,0.982497,0.915,0.859,0.847,0.848,0.868,0.867,"BDI, Minmax + 2,3,4-grams"
1,10152,940,7396,0.31226,0.74138,53,0.982351,0.917,0.841,0.855,0.809,0.858,0.856,"Kestemont GI, Minmax + 2,3,4-grams"


In [15]:
# Save the results of the runs configured in the preceding cells.
result_df.to_csv("unranked_test.csv")