# Cluster Analysis: Lexico-grammatical style (S. 5.2)

In [1]:
from mqdq import utils, babble, ngrams

import numpy as np
import pandas as pd
import scipy as sp

import glob

from sklearn.preprocessing import Normalizer, LabelEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedShuffleSplit

from ruzicka.Order2Verifier import Order2Verifier

In [2]:
import warnings

warnings.filterwarnings("ignore")
import logging

logging.basicConfig(level="INFO")

In [3]:
# Load both corpus tables; the first CSV column becomes the row index.
elegy_vecs = pd.read_csv("elegy_corpus.csv", index_col=0)
hexameter_vecs = pd.read_csv("non_elegy_corpus.csv", index_col=0)

# Keep only the elegy rows whose LEN value is at least 20.
elegy_corpus = elegy_vecs.loc[elegy_vecs["LEN"] >= 20]

In [4]:
# Stack the elegy rows whose Author is not "ps-Ovid" on top of the non-elegy
# rows, renumbering the combined frame 0..n-1.
test_corpus = pd.concat(
    [elegy_corpus.loc[elegy_corpus["Author"] != "ps-Ovid"], hexameter_vecs],
    ignore_index=True,
)
test_corpus

Unnamed: 0,Author,Work,Poem,LEN,Chunk
0,Ovid,Ep.,Ep. 1,116,hank tua penelope lento tibi mittit ulikse\nni...
1,Ovid,Ep.,Ep. 2,148,hospita demopoon tua te rodopeia pyllis\nultra...
2,Ovid,Ep.,Ep. 3,154,kwam legis a rapta briseide littera wenit\nwik...
3,Ovid,Ep.,Ep. 4,176,kwam nisi tu dederis karitura_st ipsa salutem\...
4,Ovid,Ep.,Ep. 5,158,perlegis an konjunks prohibet nowa perlege non...
...,...,...,...,...,...
465,V.Flaccus,195-Argonautica,195-Argonautica,98,si pelopis duros prior hippodamia labores\neks...
466,Lucretius,196-DRN,196-DRN,93,dekiderunt kwo_kwet in talis wenere meatus\nkw...
467,Horace,197-Hor.,197-Hor. Sat.,94,eksirem plures kalones atkwe kaballi\npaskendi...
468,Vergil,198-Aeneid,198-Aeneid,106,in lukem genito_ramyko dedit et fake praenjas\...


In [5]:
# Encode author names as integer labels. `lenc` is kept so that author names
# can be mapped to the same integers later in the notebook.
lenc = LabelEncoder().fit(test_corpus.Author)
labels = lenc.transform(test_corpus.Author)

In [6]:
from ruzicka.score_shifting import ScoreShifter
from ruzicka.evaluation import pan_metrics
from sklearn.preprocessing import MinMaxScaler

In [7]:
# Kestemont flavoured GI relies on 'fitting' the score shifting to optimise the
# combination of C@1 accuracy and the AUC score. However, to calculate the AUC
# we need an idea of true negatives (correctly identifying that the sample does
# not match the alleged label). This method simply takes the X, y data and
# appends a copy of X where the label is incorrect (uniformly random untrue
# label)


def make_up_lies(X, y, random_state=None):
    """Append a deliberately mislabelled copy of every sample.

    The first half of the result is the input with its true labels; the second
    half is a copy of the same rows where each label has been replaced by a
    uniformly random *different* label. This provides known negatives for
    metrics that need them.

    Parameters
    ----------
    X : array-like
        One row per sample. Must support ``.copy()`` and ``len()``.
    y : sequence of int
        True labels, assumed to be encoded as ``0 .. max(y)``.
    random_state : None, int or numpy.random.RandomState, optional
        ``None`` (default) draws from numpy's global random state, as before.
        An int or ``RandomState`` makes the false labels reproducible.

    Returns
    -------
    tuple
        ``(ret_X, ret_y, ground_truth)`` where ``ret_X`` is ``X`` stacked on
        its copy, ``ret_y`` is the true labels followed by the false ones, and
        ``ground_truth`` is ``1.0`` for each true pairing and ``0.0`` for each
        false pairing.

    Raises
    ------
    ValueError
        If ``y`` is empty, or if fewer than two label values are available
        (no wrong label exists to choose from).
    """
    y = np.asarray(y)
    n_labels = int(y.max()) + 1
    if n_labels < 2:
        # Rejection sampling would never terminate here: every draw equals the
        # only available label.
        raise ValueError("make_up_lies needs at least two distinct label values")

    if random_state is None:
        rng = np.random  # module-level functions share the global state
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Sample directly from the n_labels - 1 wrong labels: draw from
    # 0 .. n_labels - 2 and shift every draw at or above the true label up by
    # one, which skips the true label while staying uniform over the rest.
    draws = rng.randint(n_labels - 1, size=len(y))
    lies_labels = draws + (draws >= y)

    ret_X = np.concatenate([X, X.copy()])
    ret_y = np.concatenate([y, lies_labels])
    ground_truth = np.concatenate([[1.0] * len(X), [0.0] * len(X)])
    return (ret_X, ret_y, ground_truth)

In [8]:
# Handle on the "ruzicka" logger; the helper functions below log their progress through it.
logger = logging.getLogger("ruzicka")

In [9]:
# Raise to logging.WARNING or higher for less noise (logging.DEBUG gives more).
# Only handlers already attached to the "ruzicka" logger are affected.

for handler in logger.handlers:
    handler.setLevel(logging.INFO)

In [10]:
# Verifier options

# Two verifiers that differ only in their `metric` setting; every other
# option (base, nb_bootstrap_iter, rnd_prop) is shared.
verifier_nini, verifier_minmax = (
    Order2Verifier(
        metric=metric_name, base="instance", nb_bootstrap_iter=500, rnd_prop=0.35
    )
    for metric_name in ("nini", "minmax")
)

In [11]:
# Splitter

# Ten stratified shuffle splits, each holding out 10% of the rows for testing.
# random_state is fixed so the same splits are produced on every run.
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)

In [12]:
# Vectorizer options

# Character 2- to 4-grams (5000 most frequent), sublinear tf without idf,
# scaled to unit variance per feature (no centring, so the matrix can stay
# sparse up to that point), converted to dense and L2-normalised per row.
#
# The dense conversion uses .toarray() (plain ndarray) rather than .todense()
# (np.matrix): recent scikit-learn releases reject np.matrix input, which
# would make the Normalizer step fail.
vec_ngrams_std = make_pipeline(
    TfidfVectorizer(
        sublinear_tf=True,
        use_idf=False,
        norm="l2",
        analyzer="char",
        ngram_range=(2, 4),
        max_features=5000,
    ),
    StandardScaler(with_mean=False),
    FunctionTransformer(lambda x: x.toarray(), accept_sparse=True),
    Normalizer(),
)

# Character 5-grams only (5000 most frequent), sublinear tf without idf,
# L2-normalised by the vectorizer itself, then converted to a dense ndarray.
vec_5grams = make_pipeline(
    TfidfVectorizer(
        sublinear_tf=True,
        use_idf=False,
        norm="l2",
        analyzer="char",
        ngram_range=(5, 5),
        max_features=5000,
    ),
    FunctionTransformer(lambda x: x.toarray(), accept_sparse=True),
)

In [13]:
def fit_shifter(
    X,
    y,
    vectorizer,
    verifier,
    shifter,
    test_size=0.2,
    random_state=None,
    nb_imposters=30,
):
    """Fit a score shifter on verifier scores from a single held-out sample.

    One stratified split is drawn. The vectorizer and verifier are fitted on
    the training part; the held-out part is doubled with falsely labelled
    copies (``make_up_lies``), scored by the verifier, and those scores are
    used together with the known true/false flags to fit ``shifter``.

    Parameters
    ----------
    X : sequence of documents
        Raw inputs accepted by ``vectorizer``. A pandas object is converted to
        a numpy array first, so rows are always selected by position.
    y : sequence of int
        Encoded labels aligned with ``X``.
    vectorizer, verifier, shifter
        Objects that are fitted in place. ``vectorizer`` and ``verifier`` are
        left fitted on the training part of the split.
    test_size : float, optional
        Fraction of the data held out for fitting the shifter.
    random_state : None or int, optional
        Seed for the split; ``None`` (default) keeps it unseeded as before.
    nb_imposters : int, optional
        Passed through to ``verifier.predict_proba``.

    Returns
    -------
    The fitted ``shifter`` (the same object that was passed in).
    """
    logger.info(f"Fitting the provided score shifter on a {test_size*100}% sample")

    # The splitter yields positional indices. Indexing a pandas Series with
    # them is label-based and only correct for a default 0..n-1 index.
    if hasattr(X, "to_numpy"):
        X = X.to_numpy()
    y = np.asarray(y)

    splitter = StratifiedShuffleSplit(
        n_splits=1, test_size=test_size, random_state=random_state
    )
    train_index, test_index = next(splitter.split(X, y))

    train_X = vectorizer.fit_transform(X[train_index], y[train_index])
    verifier.fit(train_X, y[train_index])
    test_X_raw = vectorizer.transform(X[test_index])

    logger.info("Running verifier on sub-sample")
    test_X, test_y, test_gt = make_up_lies(test_X_raw, y[test_index])
    test_scores = verifier.predict_proba(test_X, test_y, nb_imposters=nb_imposters)

    logger.info("Actually fitting...")
    shifter.fit(predicted_scores=test_scores, ground_truth_scores=test_gt)
    return shifter

In [14]:
def benchmark_imposters(
    X, y, splitter, vectorizer, verifier, shifter, nb_imposters=30
):
    """Benchmark a vectorizer/verifier pair over every split of ``splitter``.

    For each split, the vectorizer and verifier are refitted on the training
    part. The test part is doubled with falsely labelled copies
    (``make_up_lies``), scored by the verifier, passed through the already
    fitted ``shifter``, and evaluated with ``pan_metrics``. The metrics of
    every split are logged.

    Parameters
    ----------
    X : sequence of documents
        Raw inputs accepted by ``vectorizer``. A pandas object is converted to
        a numpy array first, so rows are always selected by position.
    y : sequence of int
        Encoded labels aligned with ``X``.
    splitter
        Object whose ``split(X, y)`` yields ``(train_index, test_index)`` pairs.
    vectorizer, verifier
        Refitted in place on each split.
    shifter
        Already fitted score shifter; only ``transform`` is called on it.
    nb_imposters : int, optional
        Passed through to ``verifier.predict_proba``.

    Returns
    -------
    tuple of list
        ``(accs, c_at_1s)``: the accuracy and c@1 score of each split, in split
        order. AUC is logged but not returned.
    """
    # The splitter yields positional indices. Indexing a pandas Series with
    # them is label-based and only correct for a default 0..n-1 index.
    if hasattr(X, "to_numpy"):
        X = X.to_numpy()
    y = np.asarray(y)

    accs = []
    c_at_1s = []
    for train_index, test_index in splitter.split(X, y):
        train_X = vectorizer.fit_transform(X[train_index], y[train_index])
        verifier.fit(train_X, y[train_index])
        test_X_raw = vectorizer.transform(X[test_index])
        test_X, test_y, test_gt = make_up_lies(test_X_raw, y[test_index])
        test_scores = verifier.predict_proba(
            test_X, test_y, nb_imposters=nb_imposters
        )
        logger.info(f"Transforming {len(test_scores)} test scores...")
        test_scores = shifter.transform(test_scores)
        dev_acc_score, dev_auc_score, dev_c_at_1_score = pan_metrics(
            prediction_scores=test_scores, ground_truth_scores=test_gt
        )
        logger.info(f"Accuracy:  {dev_acc_score}")
        logger.info(f"AUC:  {dev_auc_score}")
        logger.info(f"c@1:  {dev_c_at_1_score}")
        logger.info(f"AUC x c@1:  {dev_auc_score * dev_c_at_1_score}")
        accs.append(dev_acc_score)
        c_at_1s.append(dev_c_at_1_score)
    return (accs, c_at_1s)

In [15]:
# Fit a fresh score shifter on a 20% hold-out using the 5-gram features and the
# "nini" verifier, then benchmark the same configuration over the `sss` splits.
shifter = fit_shifter(
    X=test_corpus.Chunk,
    y=labels,
    vectorizer=vec_5grams,
    verifier=verifier_nini,
    shifter=ScoreShifter(min_spread=0.2),
    test_size=0.2,
)
aa, cc = benchmark_imposters(
    X=test_corpus.Chunk,
    y=labels,
    splitter=sss,
    vectorizer=vec_5grams,
    verifier=verifier_nini,
    shifter=shifter,
)
# Summary statistics over the splits: accuracies first, then c@1 scores.
print(sp.stats.describe(aa))
print(sp.stats.describe(cc))

09/11/2023 05:05:56 [ruzicka:INFO] Fitting the provided score shifter on a 20.0% sample
09/11/2023 05:05:57 [ruzicka:INFO] Fitting on 376 documents in instance mode...
09/11/2023 05:05:57 [ruzicka:INFO] Running verifier on sub-sample
09/11/2023 05:05:57 [ruzicka:INFO] Predicting on 188 documents
09/11/2023 05:06:45 [ruzicka:INFO] Actually fitting...
09/11/2023 05:06:50 [ruzicka:INFO] p1 for optimal combo: 0.59
09/11/2023 05:06:50 [ruzicka:INFO] p2 for optimal combo: 0.794
09/11/2023 05:06:50 [ruzicka:INFO] AUC for optimal combo: 0.9955862381167949
09/11/2023 05:06:50 [ruzicka:INFO] c@1 for optimal combo: 0.9554096876414667
09/11/2023 05:06:51 [ruzicka:INFO] Fitting on 423 documents in instance mode...
09/11/2023 05:06:51 [ruzicka:INFO] Predicting on 94 documents
09/11/2023 05:07:17 [ruzicka:INFO] Transforming 94 test scores...
09/11/2023 05:07:17 [ruzicka:INFO] Accuracy:  0.8297872340425532
09/11/2023 05:07:17 [ruzicka:INFO] AUC:  0.9968311453146219
09/11/2023 05:07:17 [ruzicka:INFO] c

DescribeResult(nobs=10, minmax=(0.8297872340425532, 0.9148936170212766), mean=0.876595744680851, variance=0.0008349680599567421, skewness=-0.23565052105781978, kurtosis=-0.9938307446654093)
DescribeResult(nobs=10, minmax=(0.9208918062471706, 0.965142598460842), mean=0.9454504300588502, variance=0.00024022802128071785, skewness=-0.3218546731184984, kurtosis=-1.2880073646521866)


In [26]:
# Refit the score shifter with the same settings as the fit in the benchmark
# cell above (5-gram features, "nini" verifier, 20% hold-out). The split is
# unseeded, so the fitted shifter can differ from run to run.
shifter = fit_shifter(
    X=test_corpus.Chunk,
    y=labels,
    vectorizer=vec_5grams,
    verifier=verifier_nini,
    shifter=ScoreShifter(min_spread=0.2),
    test_size=0.2,
)

09/11/2023 04:55:37 [ruzicka:INFO] Fitting the provided score shifter on a 20.0% sample
09/11/2023 04:55:38 [ruzicka:INFO] Running verifier on sub-sample
09/11/2023 04:55:38 [ruzicka:INFO] Training on 188 documents
09/11/2023 04:56:27 [ruzicka:INFO] Actually fitting...
09/11/2023 04:56:31 [ruzicka:INFO] p1 for optimal combo: 0.59
09/11/2023 04:56:31 [ruzicka:INFO] p2 for optimal combo: 0.794
09/11/2023 04:56:31 [ruzicka:INFO] AUC for optimal combo: 0.997906292440018
09/11/2023 04:56:31 [ruzicka:INFO] c@1 for optimal combo: 0.9579560887279311


In [41]:
# Character 2- to 4-grams (5000 most frequent), sublinear tf without idf,
# scaled to unit variance per feature, converted to dense and L2-normalised
# per row. This has the same settings as `vec_ngrams_std` but is a separate
# pipeline object with its own fitted state.
#
# The dense conversion uses .toarray() (plain ndarray) rather than .todense()
# (np.matrix): recent scikit-learn releases reject np.matrix input, which
# would make the Normalizer step fail.
ngrams_std = make_pipeline(
    TfidfVectorizer(
        sublinear_tf=True,
        use_idf=False,
        norm="l2",
        analyzer="char",
        ngram_range=(2, 4),
        max_features=5000,
    ),
    StandardScaler(with_mean=False),
    FunctionTransformer(lambda x: x.toarray(), accept_sparse=True),
    Normalizer(),
)

In [42]:
# Benchmark the standardised 2-4-gram features over the `sss` splits.
# NOTE(review): this cell used a bare `verifier`, which is not defined anywhere
# in the notebook and raises NameError on a fresh kernel. `verifier_nini` is
# assumed because `shifter` was fitted against it -- confirm.
aa, cc = benchmark_imposters(
    test_corpus.Chunk, labels, sss, ngrams_std, verifier_nini, shifter
)
# Summary statistics over the splits: accuracies first, then c@1 scores.
print(sp.stats.describe(aa))
print(sp.stats.describe(cc))

08/07/2023 03:50:40 [ruzicka:INFO] Training on 423 training documents
08/07/2023 03:50:44 [ruzicka:INFO] # test documents processed: 10 out of 94
08/07/2023 03:50:47 [ruzicka:INFO] # test documents processed: 20 out of 94
08/07/2023 03:50:51 [ruzicka:INFO] # test documents processed: 30 out of 94
08/07/2023 03:50:54 [ruzicka:INFO] # test documents processed: 40 out of 94
08/07/2023 03:50:57 [ruzicka:INFO] # test documents processed: 50 out of 94
08/07/2023 03:50:59 [ruzicka:INFO] # test documents processed: 60 out of 94
08/07/2023 03:51:02 [ruzicka:INFO] # test documents processed: 70 out of 94
08/07/2023 03:51:04 [ruzicka:INFO] # test documents processed: 80 out of 94
08/07/2023 03:51:07 [ruzicka:INFO] # test documents processed: 90 out of 94
08/07/2023 03:51:08 [ruzicka:INFO] Transforming 94 test scores...
08/07/2023 03:51:08 [ruzicka:INFO] Accuracy:  0.925531914893617
08/07/2023 03:51:08 [ruzicka:INFO] AUC:  0.9950203712086917
08/07/2023 03:51:08 [ruzicka:INFO] c@1:  0.9255319148936

DescribeResult(nobs=10, minmax=(0.8829787234042553, 0.9361702127659575), mean=0.9170212765957448, variance=0.0002716161158895425, skewness=-0.7257747386024538, kurtosis=-0.18724279835387359)
DescribeResult(nobs=10, minmax=(0.9148936170212766, 0.946129470348574), mean=0.9273992756903574, variance=8.53233427440122e-05, skewness=0.7423540401258779, kurtosis=-0.22444426820789998)


In [43]:
# Verifier for the actual attribution problems: same options as
# `verifier_nini`, but with nb_bootstrap_iter raised from 500 to 1000.
real_verifier = Order2Verifier(
    metric="nini", base="instance", nb_bootstrap_iter=1000, rnd_prop=0.35
)

In [44]:
# NOTE(review): `vectorizer` was never defined in this notebook, so this cell
# and the later cells that call `vectorizer.transform` raise NameError on a
# fresh kernel. It is bound explicitly here; `ngrams_std` (the pipeline most
# recently benchmarked above) is an assumption -- confirm which one was meant.
vectorizer = ngrams_std

# Fit the vectorizer and the verifier on the whole test corpus.
real_verifier.fit(vectorizer.fit_transform(test_corpus.Chunk), labels)

In [45]:
# The poems to verify: the "ps-Ovid" rows that were left out of `test_corpus`.
# They are taken from `elegy_corpus`, the frame they were excluded from when
# `test_corpus` was built. The undefined name `corpus` used here before raises
# NameError on a fresh kernel.
problems = elegy_corpus[elegy_corpus.Author == "ps-Ovid"]
problems

Unnamed: 0,Author,Work,Poem,LEN,Chunk
278,ps-Ovid,Nux,Nux,182,nuks ego junkta wiae kum sim sine krimine wita...
279,ps-Ovid,Medicamina,Medicamina,100,diskite kwae fakiem kommendet kura puellae\net...
280,ps-Ovid,Consolatio,Consolatio 1,158,wisa diu feliks mater modo dikta neronum\njam ...
281,ps-Ovid,Consolatio,Consolatio 2,158,at_kwutinam drusi manus alte_ret altera fratri...
282,ps-Ovid,Consolatio,Consolatio 3,158,kwo raperis laniata komas similiskwe furenti\n...
283,ps-Ovid,Ibis,Ibis 1,64,tempus ad hok lustris bis jam mihi kwinkwe per...
284,ps-Ovid,Ibis,Ibis 2,200,di maris et terrae kwi_kwis meliora tenetis\ni...
285,ps-Ovid,Ibis,Ibis 3,200,kwi_kwokulis karuit per kwos male widerat auru...
286,ps-Ovid,Ibis,Ibis 4,178,aut te dewoweat kertis abdera diebus\nsaksakwe...


In [46]:
# Score every problem poem against the label "Ovid", then pass the verifier's
# scores through the fitted score shifter.
problem_vecs = vectorizer.transform(problems.Chunk)
ovid_claims = lenc.transform(["Ovid"] * len(problems))
raw_scores = real_verifier.predict_proba(problem_vecs, ovid_claims, nb_imposters=30)
shifter.transform(raw_scores)

[0.968197662791219,
 0.716,
 0.9999872841514559,
 0.8919025715266478,
 0.9554818142471244,
 0.9809135113353149,
 0.9364080414309806,
 0.9427659657030285,
 0.8919025715266492]

In [51]:
# The verifier's scores for the same problems, without the score shifter.
problem_vecs = vectorizer.transform(problems.Chunk)
ovid_claims = lenc.transform(["Ovid"] * len(problems))
real_verifier.predict_proba(problem_vecs, ovid_claims, nb_imposters=30)

array([1.        , 0.97566667, 1.        , 0.992     , 0.9975    ,
       0.9995    , 0.9995    , 0.99633333, 0.993     ])

In [48]:
# Distinct author names present in the test corpus.
test_corpus.Author.unique()

array(['Ovid', 'Tibullus', 'Propertius', 'Catullus', 'Vergil', 'Juvenal',
       'Silius', 'Statius', 'Lucan', 'V.Flaccus', 'Lucretius', 'Horace'],
      dtype=object)