In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupShuffleSplit

In [106]:
def build_corpus(truth_fn: str, text_fn: str) -> pd.DataFrame:
    """Build a long-format ``(author, text)`` corpus from a pair of JSON-lines files.

    Each line of ``truth_fn`` must carry an ``authors`` field holding two author
    ids, and each line of ``text_fn`` a ``pair`` field holding the two matching
    texts. The two authors and two texts of every line are unpacked so that the
    result has one row per (author, text) observation.

    Args:
        truth_fn: path of the JSON-lines ground-truth file.
        text_fn: path of the JSON-lines text file.

    Returns:
        A DataFrame with columns ``author`` and ``text``. Rows that repeat the
        same author *and* the same text are dropped; the surviving rows keep
        their pre-deduplication index labels, so the index may have gaps.

    Raises:
        ValueError: if either file cannot be read or unpacked (the underlying
            exception is attached as ``__cause__``), or if the rows of the two
            files cannot be matched up.
    """
    try:
        # A truth line looks like this:
        # {"id": "c28e8b03-c02a-5184-b58a-12dd28b8ca74", "same": true, "authors": ["2072507", "2072507"]}
        # The two authors are split into columns because they match the two texts below.
        truth_raw = pd.read_json(truth_fn, lines=True)
        truth_df = pd.DataFrame(
            truth_raw["authors"].to_list(), index=truth_raw.index, columns=["a1", "a2"]
        )
    except Exception as e:
        # Chain instead of printing: the original traceback travels with the error.
        raise ValueError(
            f"Couldn't build truth df from filename {truth_fn}: {e}"
        ) from e

    try:
        # A text line looks like this:
        # {"id": "c28e8b03-c02a-5184-b58a-12dd28b8ca74", "fandoms": ["Death Note", "Thor"], "pair": ["talk because they hadn\"t ...
        # Split the texts into individual columns as above. Not using the fandoms.
        text_raw = pd.read_json(text_fn, lines=True)
        pairs = text_raw["pair"]
        text_df = pd.DataFrame(
            pairs.to_list(), index=pairs.index, columns=["t1", "t2"]
        )
    except Exception as e:
        raise ValueError(
            f"Couldn't build text df from filename {text_fn}: {e}"
        ) from e

    # Authors and texts are glued together positionally below, so the two files
    # must describe the same problems in the same order. Use the ids to check
    # that (and to repair a pure reordering) when both files provide them.
    if "id" in truth_raw.columns and "id" in text_raw.columns:
        truth_ids = truth_raw["id"].astype(str)
        text_ids = text_raw["id"].astype(str)
        if truth_ids.tolist() != text_ids.tolist():
            if (
                truth_ids.duplicated().any()
                or text_ids.duplicated().any()
                or set(truth_ids) != set(text_ids)
            ):
                raise ValueError(
                    f"ids in {truth_fn} and {text_fn} do not match; "
                    "cannot pair authors with texts"
                )
            # Same ids, different order: put the texts into truth order.
            text_df.index = text_ids.to_numpy()
            text_df = text_df.loc[truth_ids.to_numpy()]
            text_df.index = truth_df.index
    elif len(truth_df) != len(text_df):
        raise ValueError(
            f"{truth_fn} has {len(truth_df)} rows but {text_fn} has "
            f"{len(text_df)}; cannot pair authors with texts"
        )

    # Melt (wide to long) the authors and the texts, then put them side by side:
    # all first authors/texts come first, followed by all second authors/texts.
    authors_long = truth_df.melt(value_name="author")["author"]
    texts_long = text_df.melt(value_name="text")["text"]
    final = pd.concat([authors_long, texts_long], axis=1)

    return final.drop_duplicates()

In [3]:
# Load the pan21 verification test pairs as a long (author, text) frame.
verif_df = build_corpus(
    truth_fn="pan21-authorship-verification-test-truth.jsonl",
    text_fn="pan21-authorship-verification-test.jsonl",
)
verif_df

Unnamed: 0,author,text
0,2072507,"talk because they hadn""t been exposed to commu..."
1,1404943,"Zazuki nodded his head and got to his feet, kn..."
2,2406271,"""Oh we did lots of special things. On Christma..."
3,189771,"""Hey now, at least Shido brings home some mone..."
4,965626,"It was a mere five minutes"" walk from third ye..."
...,...,...
39993,2264834,"He didn""t prevail. His brother was now in our ..."
39994,8582246,"You""re the fear, I don""t care""Cause I""ve never..."
39995,1421642,Authoress starts dancing around happily. Darkf...
39996,787585,"""Have you ever heard of the Turing test?"" ""No ..."


In [66]:
def split_by_authors(
    df: pd.DataFrame, n: int, rs: np.random.RandomState | int
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Split ``df`` in two so that no author has rows on both sides.

    Uses a single ``GroupShuffleSplit`` draw with ``df["author"]`` as the group
    label and ``test_size=n``.

    Args:
        df: frame with at least the columns ``author`` and ``text``.
        n: passed to ``GroupShuffleSplit`` as ``test_size``; an int selects that
            many author groups for the second frame.
        rs: seed or ``RandomState`` handed to ``GroupShuffleSplit``.

    Returns:
        ``(rest, selected)``: the rows of the authors that were not drawn and
        the rows of the authors that were drawn, both taken positionally from
        ``df`` (original index labels are preserved).
    """
    splitter = GroupShuffleSplit(n_splits=1, test_size=n, random_state=rs)
    rest_pos, selected_pos = next(
        splitter.split(X=df["text"], y=None, groups=df["author"])
    )
    return df.iloc[rest_pos], df.iloc[selected_pos]


def wrong_author(r, frm: pd.Series, rs: np.random.RandomState) -> int:
    """Draw a random author id from ``frm`` that differs from the row's own author.

    Candidates are drawn one at a time (rejection sampling) until one differs
    from ``r["author"]``.

    Args:
        r: a row-like object; only ``r["author"]`` is read.
        frm: pool of candidate author ids.
        rs: random state used for every draw. It should be a stateful
            ``RandomState``: a plain int seed would repeat the same draw forever.

    Returns:
        The drawn author id converted with ``int()``.

    Raises:
        ValueError: if ``frm`` contains no author other than ``r["author"]``
            (previously this looped forever). An empty ``frm`` also raises
            ``ValueError``, from ``Series.sample``.
    """
    own_author = r["author"]
    pool_checked = False
    while True:
        candidate = frm.sample(1, random_state=rs).iloc[0]
        if candidate != own_author:
            return int(candidate)
        # Only pay for the full scan after a collision, and only once.
        if not pool_checked:
            if not (frm != own_author).any():
                raise ValueError(
                    f"No author other than {own_author!r} available to sample from"
                )
            pool_checked = True


def make_test_train(
    df: pd.DataFrame,
    rs: np.random.RandomState | int,
    n_true_samples: int = -1,
) -> tuple[pd.DataFrame, pd.DataFrame]:

    # The general strategy is to build a test frame with n_true_samples (ground truth =1) and the
    # same number of false samples (ground truth =0). The false samples are split half into authors
    # that exist in the training corpus, but have wrong test_against labels, and half samples that
    # are from never-seen authors.
    #
    # To do this we need:
    # - n samples from authors with at least 2 texts, with correct 'test_against'
    # - n/2 samples from authors with at least 2 texts, with incorrect 'test_against'
    # - n/2 samples from authors with 1 text, to go into the training set as noise
    # - n/2 samples from authors with 1 text to go into the test set with incorrect 'test_against'

    try:
        rs = np.random.RandomState(seed=int(rs))
    except TypeError:
        # hope it's a random state, if not it will error elsewhere
        pass

    vc = df["author"].value_counts()
    # map uses the value counts as a dict from author to count, so this filter chooses indices where the
    # value count meets whatever criteria
    single_df = pd.DataFrame(df[df["author"].map(vc) == 1])
    multi_df = pd.DataFrame(df[df["author"].map(vc) > 1])

    if n_true_samples > 0:
        if len(multi_df.author.unique()) < (n_true_samples * 1.5):
            raise ValueError(
                f"Can't make a test set with {n_true_samples} true samples. "
                f"Need {int(n_true_samples*1.5)} authors with 2+ texts, "
                f"have {len(multi_df.author.unique())}"
            )
    else:
        n_true_samples = len(multi_df.author.unique()) // 3 * 2

    rest, multi_t = split_by_authors(multi_df, n_true_samples, rs)
    rest, multi_f = split_by_authors(rest, n_true_samples // 2, rs)

    assert (
        (set(multi_t.author).isdisjoint(set(rest.author)))
        and (set(multi_f.author).isdisjoint(set(rest.author)))
        and (set(multi_t.author).isdisjoint(set(multi_f.author)))
    )

    assert (
        (len(multi_t.author.unique()) == n_true_samples)
        and (len(multi_f.author.unique()) == n_true_samples // 2)
        and (
            n_true_samples + n_true_samples // 2 + len(rest.author.unique())
            == len(multi_df.author.unique())
        )
    )

    # One sample from each of the multi authors goes into the test set for true and false. Those
    # samples are dropped from the training sets.
    multi_t_test = multi_t.groupby("author").sample(1, random_state=rs)
    multi_t_train = multi_t.drop(multi_t_test.index)
    multi_f_test = multi_f.groupby("author").sample(1, random_state=rs)
    multi_f_train = multi_f.drop(multi_f_test.index)

    single_train = single_df.sample(n_true_samples // 2, random_state=rs)
    single_df = single_df.drop(single_train.index)
    single_test = single_df.sample(n_true_samples // 2, random_state=rs)

    assert set(single_test.author).isdisjoint(set(single_train.author))

    all_authors = pd.Series(
        np.concatenate(
            [
                multi_t.author.unique(),
                multi_f.author.unique(),
                single_train.author.unique(),
            ]
        )
    )

    # Now build the dataframes

    # ground truth = T, test against real author ID
    multi_t_test = multi_t_test.assign(test_against=multi_t_test["author"])
    multi_t_test["gt"] = 1

    # ground truth = F, test against a random false author ID
    multi_f_test["test_against"] = multi_f_test.apply(
        lambda r: wrong_author(r, all_authors, rs), axis=1
    )
    multi_f_test["gt"] = 0

    single_test["test_against"] = single_test.apply(
        lambda r: wrong_author(r, all_authors, rs), axis=1
    )
    single_test["gt"] = 0

    test_df = pd.concat(
        [multi_t_test, multi_f_test, single_test], verify_integrity=True
    )
    train_df = pd.concat(
        [multi_t_train, multi_f_train, single_train], verify_integrity=True
    )

    # everything uses the original indices, so if these are disjoint then no text appears in both
    # test and train.
    assert(set(test_df.index).isdisjoint(set(train_df.index)))

    return (test_df, train_df)

In [93]:
# n_true_samples=-1 lets make_test_train size the test set from the corpus itself.
verif_test, verif_train = make_test_train(
    df=verif_df,
    rs=42,
    n_true_samples=-1,
)

In [95]:
# Persist the verification split. to_csv also writes the index (the original
# corpus row labels) as the first column of each file.
verif_test.to_csv("verif_test.csv")
verif_train.to_csv("verif_train.csv")

In [107]:
# Load the pan20 "training-small" pairs as a long (author, text) frame.
fit_corpus = build_corpus(
    truth_fn="pan20-authorship-verification-training-small/pan20-authorship-verification-training-small-truth.jsonl",
    text_fn="pan20-authorship-verification-training-small/pan20-authorship-verification-training-small.jsonl",
)
fit_corpus

Unnamed: 0,author,text
0,1446633,"I shift a bit, warily letting my eyes dart fro..."
2,1446633,A single tear escaped me as I left. I did have...
3,1446633,"""Ja."" Ludwig kept his gaze upon her, solidly. ..."
4,1446633,"And he did. Slowly, hesitantly...but coming fr..."
5,1446633,Thunderclan cats whip around. I stare directly...
...,...,...
105197,4353578,"""That a challenge, dick?"" Austin smirks back a..."
105198,5085069,"This is a KNB fanfic. KNB doesn""t belong to me..."
105199,6983686,"At the moment, Chris"" position on the swing se..."
105200,4369540,As you can see a lot of things has happened si...


In [98]:
# Class balance of the verification test set: rows with gt == 1 versus gt == 0.
verif_test["gt"].value_counts()

gt
1    5076
0    5076
Name: count, dtype: int64

In [1]:
# For each k, the number of authors that contribute exactly k rows to the test set.
# NOTE(review): the saved output is a NameError at execution count 1, i.e. this cell
# was last run on a fresh kernel before verif_test existed — re-run it after the
# make_test_train cell so the stored output is meaningful.
verif_test.author.value_counts().value_counts()

NameError: name 'verif_test' is not defined

In [115]:
# For each k, the number of authors that have exactly k texts in the training set.
verif_train.author.value_counts().value_counts()

count
1     9143
5      905
11      94
19       7
29       3
Name: count, dtype: int64

In [116]:
# 9143 is the number of authors with exactly one training text (first row of the
# value_counts output above); 1692 is the result of the `5076 / 3` cell below.
# NOTE(review): what this difference is meant to measure is not stated — confirm,
# and prefer computing both operands from verif_train / verif_test over literals.
9143 - 1692

7451

In [113]:
# 5076 is the number of rows per gt class in verif_test (see the gt value_counts output).
# NOTE(review): scratch arithmetic on a hard-coded literal; the purpose of dividing
# by 3 is not stated — confirm.
5076 / 3

1692.0

In [117]:
# Fixed-size split of the fitting corpus: 500 ground-truth-1 test rows.
fit_test, fit_train = make_test_train(
    df=fit_corpus,
    rs=42,
    n_true_samples=500,
)

In [118]:
# Inspect the fitting test set: each row has the text's real author, the author id
# it is to be tested against, and gt (1 when the two are the same author, else 0).
fit_test

Unnamed: 0,author,text,test_against,gt
23617,1007124,tree or two as he landed with a thunderous cra...,1007124,1
11760,100947,Dilandau opened his mouth to say something alo...,100947,1
24166,101296,"""Oh good... You remembered..."" He said hateful...",101296,1
15928,101393,Six children stood in the middle of the street...,101393,1
22992,1018526,"Fenrich straightened as the man approached, hi...",1018526,1
...,...,...,...,...
98429,367948,Yet another bright flash followed. This time t...,5477371,0
104217,4731033,"""She came to me in a dream she said after I ta...",83043,0
100587,9506334,"""Ok, sure"" I said, walking back over to him, a...",2750705,0
35184,2561263,"For now, his only option was to try to manuall...",3370103,0


In [119]:
# Inspect the fitting training set (author, text); index labels are the original
# corpus row labels.
fit_train

Unnamed: 0,author,text
127,357927,"""The Badaman Zero System is finished."" Armada ..."
133,357927,"""Hn, he still doesn""t know my true power."" Bai..."
134,357927,"""After being attacked by the Great Horn, she""s..."
135,357927,"""A male."" Chloe added, readying her transforma..."
136,357927,The Field Marshall merely smiled. SailorStar9:...
...,...,...
95263,445964,"He lays her down onto their bed, his body hove..."
28675,878883,While the preacher droned on in the background...
42726,3370103,"Julie: And we""re cornered. Tink: Now, which on..."
30026,880574,"""Your making this seem like a computer game. L..."


In [120]:
# Persist the fitting split. to_csv also writes the index (the original corpus
# row labels) as the first column of each file.
fit_test.to_csv("fit_test.csv")
fit_train.to_csv("fit_train.csv")