# Feature-wise analysis (S. 6.2)

Here I examine the conformance of various Ovidian (and non-Ovidian) works to general Ovidian style. It is demonstrated that using the Mahalanobis distance at a 99% confidence level is a fairly reliable indicator of Ovidian vs non-Ovidian authorship, and that none of the _Heroides_ display any statistical reason to reject them in terms of poetic style.

In [1]:
from mqdq import mahalanobis as maha
import pandas as pd

In [2]:
vecs = pd.read_csv("elegy_poetic.csv", index_col=0)
corpus = vecs[vecs["LEN"] >= 20].reset_index(drop=True)
corpus = corpus.drop(["LEN"], axis=1)
test_corpus = corpus[corpus.Author != "ps-Ovid"].reset_index(drop=True)
test_corpus

Unnamed: 0,Author,Work,Poem,H1SP,H2SP,H3SP,H4SP,H1CF,H2CF,H3CF,...,P3SC,P4SC,P1WC,P2WC,P3WC,P4WC,ELC,RS,LEO,PFSD
0,Ovid,Ep.,Ep. 1,0.086207,0.500000,0.500000,0.448276,0.241379,0.706897,0.810345,...,0.120690,0.0,0.206897,0.068966,0.396552,1.000000,0.094828,4.393948,0.739842,0.000000
1,Ovid,Ep.,Ep. 2,0.189189,0.527027,0.581081,0.391892,0.283784,0.743243,0.878378,...,0.148649,0.0,0.202703,0.067568,0.337838,1.000000,0.114865,4.071062,1.027448,0.000000
2,Ovid,Ep.,Ep. 3,0.220779,0.493506,0.519481,0.480519,0.181818,0.597403,0.818182,...,0.155844,0.0,0.116883,0.025974,0.324675,1.000000,0.090909,3.845700,0.484285,0.000000
3,Ovid,Ep.,Ep. 4,0.102273,0.511364,0.545455,0.465909,0.147727,0.659091,0.829545,...,0.136364,0.0,0.215909,0.045455,0.329545,1.000000,0.073864,3.822098,0.893575,0.000000
4,Ovid,Ep.,Ep. 5,0.215190,0.455696,0.632911,0.417722,0.164557,0.658228,0.911392,...,0.164557,0.0,0.202532,0.037975,0.341772,1.000000,0.056962,3.727347,0.713715,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,Ovid,Pont.,Pont. 4 12,0.120000,0.520000,0.720000,0.440000,0.240000,0.560000,0.920000,...,0.200000,0.0,0.280000,0.040000,0.280000,1.000000,0.100000,3.073075,0.615696,0.000000
266,Ovid,Pont.,Pont. 4 13,0.160000,0.600000,0.680000,0.800000,0.280000,0.720000,0.920000,...,0.120000,0.0,0.240000,0.080000,0.360000,0.880000,0.100000,5.954342,1.723439,0.775629
267,Ovid,Pont.,Pont. 4 14,0.129032,0.580645,0.548387,0.645161,0.096774,0.774194,0.903226,...,0.387097,0.0,0.193548,0.032258,0.193548,0.903226,0.064516,3.314164,1.155196,0.591300
268,Ovid,Pont.,Pont. 4 15,0.095238,0.619048,0.666667,0.619048,0.285714,0.666667,0.714286,...,0.285714,0.0,0.285714,0.095238,0.333333,0.952381,0.119048,3.695211,0.919168,0.425918


In [3]:
problems = corpus[corpus.Author == "ps-Ovid"].reset_index(drop=True)
problems

Unnamed: 0,Author,Work,Poem,H1SP,H2SP,H3SP,H4SP,H1CF,H2CF,H3CF,...,P3SC,P4SC,P1WC,P2WC,P3WC,P4WC,ELC,RS,LEO,PFSD
0,ps-Ovid,Nux,Nux,0.153846,0.450549,0.626374,0.626374,0.175824,0.604396,0.868132,...,0.164835,0.0,0.197802,0.043956,0.285714,1.0,0.082418,3.09536,0.524756,0.0
1,ps-Ovid,Medicamina,Medicamina,0.28,0.48,0.52,0.54,0.18,0.62,0.88,...,0.1,0.0,0.2,0.04,0.3,1.0,0.08,4.901116,0.909967,0.0
2,ps-Ovid,Consolatio,Consolatio 1,0.240506,0.481013,0.64557,0.531646,0.164557,0.582278,0.924051,...,0.265823,0.0,0.088608,0.037975,0.278481,1.0,0.246835,4.619877,0.606677,0.0
3,ps-Ovid,Consolatio,Consolatio 2,0.253165,0.556962,0.556962,0.493671,0.240506,0.696203,0.810127,...,0.151899,0.0,0.088608,0.025316,0.240506,1.0,0.278481,3.608988,0.824542,0.0
4,ps-Ovid,Consolatio,Consolatio 3,0.329114,0.506329,0.658228,0.582278,0.291139,0.594937,0.772152,...,0.202532,0.0,0.151899,0.037975,0.240506,0.987342,0.202532,4.590044,1.062847,0.223589
5,ps-Ovid,Ibis,Ibis 1,0.15625,0.71875,0.5625,0.59375,0.15625,0.5625,0.90625,...,0.21875,0.0,0.1875,0.0,0.21875,1.0,0.109375,3.986751,1.05389,0.0
6,ps-Ovid,Ibis,Ibis 2,0.16,0.53,0.62,0.44,0.1,0.58,0.96,...,0.16,0.0,0.23,0.06,0.36,1.0,0.13,4.683774,0.994626,0.0
7,ps-Ovid,Ibis,Ibis 3,0.19,0.45,0.73,0.55,0.18,0.73,0.95,...,0.17,0.0,0.24,0.05,0.26,1.0,0.06,4.070276,0.787213,0.0
8,ps-Ovid,Ibis,Ibis 4,0.123596,0.438202,0.617978,0.52809,0.179775,0.685393,0.988764,...,0.382022,0.0,0.258427,0.05618,0.213483,0.977528,0.033708,4.358413,0.791811,0.469227


In [4]:
cd = dict(
    corpus.apply(
        lambda r: [r.Poem, corpus[corpus.Poem == r.Poem].iloc[:, 3:]],
        axis=1,
    ).to_numpy()
)

In [5]:
ovid_dist = (
    test_corpus[test_corpus.Author == "Ovid"]
    .drop(["Author", "Work", "Poem"], axis=1)
    .reset_index(drop=True)
)
ovid_late = (
    test_corpus[test_corpus.Work.isin(["Trist.", "Pont."])]
    .drop(["Author", "Work", "Poem"], axis=1)
    .reset_index(drop=True)
)
ovid_early = (
    test_corpus[test_corpus.Work.isin(["Ep.", "Am."])]
    .drop(["Author", "Work", "Poem"], axis=1)
    .reset_index(drop=True)
)

In [6]:
maha.compare_elegy(cd["Nux"], ovid_late, lim=10, shrinkage=0.05)

------------------------------------
  M-dist 27.84,  p-value: 0.9419
  Feat 	 Score 	   Samp      Dist
------------------------------------
P1DI     3.61     40.66%    53.19%
P3SC     3.52     16.48%    26.38%
H1SC     3.00     58.24%    49.00%
H4SP     2.95     62.64%    57.19%
H4DI     2.05     62.64%    53.78%
P2SP     1.95     71.43%    63.11%
PFSD     1.75      0.00      0.16
H2SP     1.54     45.05%    52.72%
 LEO     1.17      0.52      0.82
P1SC     1.07     34.07%    40.05%
  [truncating at limit = 10]
------------------------------------


In [7]:
maha.compare_elegy(cd["Nux"], ovid_dist, lim=10, shrinkage=0.0)

------------------------------------
  M-dist 18.85,  p-value: 0.9988
  Feat 	 Score 	   Samp      Dist
------------------------------------
  RS     3.69      3.10      3.98
P1DI     3.48     40.66%    52.04%
H4SP     2.20     62.64%    53.93%
P2SP     1.88     71.43%    61.36%
H4DI     1.85     62.64%    51.38%
PFSD     1.28      0.00      0.08
H1SC     1.17     58.24%    49.26%
P3SC     1.02     16.48%    22.00%
H3WC     0.99      7.69%     5.39%
H3CF     0.78     86.81%    89.50%
  [truncating at limit = 10]
------------------------------------


In [8]:
maha.compare_elegy(cd["Consolatio 1"], ovid_dist, lim=10, shrinkage=0.0)

------------------------------------
  M-dist 56.07,  p-value: 0.0586
  Feat 	 Score 	   Samp      Dist
------------------------------------
 ELC    20.84      0.25      0.09
H4WC     9.92     13.92%     5.41%
H2WC     6.45     24.05%    10.07%
P2CF     4.13     60.76%    73.56%
P3CF     2.41     21.52%    13.34%
H3CF     2.37     92.41%    89.50%
H3SC     2.27     93.67%    94.40%
  RS     2.14      4.62      3.98
H2SC     2.01     39.24%    58.88%
P4CF     1.91      0.00%     0.75%
  [truncating at limit = 10]
------------------------------------


In [9]:
maha.compare_elegy(cd["Consolatio 3"], ovid_dist, lim=10, shrinkage=0.0)

------------------------------------
  M-dist 81.24,  p-value: 0.0002
  Feat 	 Score 	   Samp      Dist
------------------------------------
H3SC   134.15     79.75%    94.40%
H3CF     9.68     77.22%    89.50%
PFSD     8.53      0.22      0.08
 ELC     7.93      0.20      0.09
H1SP     6.25     32.91%    15.72%
  RS     2.60      4.59      3.98
H1DI     2.28     44.30%    59.69%
H2CF     2.12     59.49%    64.95%
H1CF     2.04     29.11%    15.33%
 LEO     1.75      1.06      0.79
  [truncating at limit = 10]
------------------------------------


In [10]:
maha.compare_elegy(cd["Ibis 4"], ovid_dist, lim=10, shrinkage=0.0)

------------------------------------
  M-dist 88.52,  p-value: 0.0000
  Feat 	 Score 	   Samp      Dist
------------------------------------
PFSD    46.84      0.47      0.08
H3CF    10.72     98.88%    89.50%
H2SC    10.52     41.57%    58.88%
P3SC     8.66     38.20%    22.00%
H1DI     5.93     46.07%    59.69%
H1SC     5.46     30.34%    49.26%
P2SC     4.06     52.81%    65.21%
H4WC     3.67     11.24%     5.41%
P1CF     3.43     15.73%    26.87%
P1WC     2.68     25.84%    18.68%
  [truncating at limit = 10]
------------------------------------


In [11]:
from scipy.stats import chi2

v, m, p = maha.explain(cd["Ibis 4"], ovid_dist, shrinkage=0.0)
biggest = sorted(v.to_numpy()[0])[-1]
print(f"Trying with M-dist {m - biggest:.2f} instead of {m:.2f}")
new_p = 1 - chi2.cdf(m - biggest, len(cd["Ibis 4"].columns) - 1)
print(f"Counterfactual p-val: {new_p:.2f}")

Trying with M-dist 41.67 instead of 88.52
Counterfactual p-val: 0.44


# A note on the Mahalanobis distance

The redoutable wikipedia has a [quick primer](https://en.wikipedia.org/wiki/Mahalanobis_distance) on the Mahalanobis distance, but the intuition is not too difficult (at least for those with some undergraduate statistics!). It is more or less like the euclidean distance, except it takes into account correlations between features. For _m_ observations in an _n_ dimensional feature space, the _covariance matrix_ is an _n_ x _n_ matrix that describes all the pairwise correlations between the features. The inverse of this matrix is then used to "correct" for those correlations. Because of the way the vectors are multiplied, it is also possible to save the product vector to see exactly which features contribute the most distance to the overall score, which is a very useful tool for interpretability. Note that in all cases below I actually measure the _squared_ M-distance. This has no effect on any comparisons, but the squared M-distance is chi-square distributed, which makes it easy to calculate a _P_-value for any distance.

# Deconstructed Mahalanobis distance of _Ep._ 15 from Ovidian tendency

There are very few features that differ significantly from typical Ovidian style. The length shows up as a difference (all of the _Heroides_ are longer than most of Ovid's short elegy), as do some minor differences in the caesurae in the third and fourth feet of the hexameter, but that's about it. As for the _P_-value, there is clearly no statistical reason to reject the null hypothesis (ie no reason to reject the idea that it was written by Ovid).

# The method works, in general

A random poem by Propertius is, unsurprisingly, very not-Ovidian. Here I picked Propertius 3.10 more or less at random, to demonstrate that non-Ovidian works are usually easily detectable as non-Ovidian style.

The biggest differences (after correcting for feature covariance)
- Prop 3.10 is much more spondaic in the first and second feet
  of the hexameter than the Ovidian norm
- Prop. 3.10 is less likely to have a diaeresis in H1 (which happens when the
  first foot is a disyllable)
- the poem has much more elision than is typical for Ovid
- the poem has no ictus conflicts after the caesura in the pentameter, whereas
  Ovid apparently does this one line in seven or eight (although Prop 3.5 is only 32 lines)

# Testing the accuracy

_Heroides_ 15 reads as Ovidian, and Propertius 3.10 reads as non-Ovidian, but it is worth checking the general accuracy. Here I just look quickly at the number of false positives and negatives when working at the 99% confidence level. It seems that the method is not actually 99% accurate (which is not all that surprising) but nevertheless it does a very good job. 5 of 102 non-Ovidian works might be mistaken for Ovid (about 5%), and just 14 of 164 Ovidian works are sufficiently unusual as to read as non-Ovidian (2.4%), almost all of which are later works.


In [12]:
# A quick function we can apply to the dataframe to add the M-dist
# and p-value (compared to Ovidian style) for every work in the corpus


def maha_from_ovid(row, dist, shrinkage):
    _, m, p = maha.explain(
        corpus[corpus.Poem == row.Poem].iloc[:, 3:],
        dist,
        shrinkage,
    )
    return pd.Series([m, p])

In [13]:
dist_vecs = corpus.apply(maha_from_ovid, args=(ovid_dist, 0.0), axis=1)

In [14]:
dists = corpus.copy()
dists.insert(3, "OvDist", dist_vecs[0])
dists.insert(4, "pval", dist_vecs[1])

In [None]:
dists[dists.Author == "ps-Ovid"].sort_values(by="OvDist").iloc[:, :5]

Unnamed: 0,Author,Work,Poem,OvDist,pval
276,ps-Ovid,Ibis,Ibis 2,13.835231,0.999978
270,ps-Ovid,Nux,Nux,18.848283,0.998812
271,ps-Ovid,Medicamina,Medicamina,19.109011,0.998604
277,ps-Ovid,Ibis,Ibis 3,26.882836,0.956321
275,ps-Ovid,Ibis,Ibis 1,46.243903,0.264658
272,ps-Ovid,Consolatio,Consolatio 1,56.065557,0.05861
273,ps-Ovid,Consolatio,Consolatio 2,60.273737,0.026468
274,ps-Ovid,Consolatio,Consolatio 3,81.237912,0.000184
278,ps-Ovid,Ibis,Ibis 4,88.516539,2.4e-05


In [24]:
# false positives - non-Ovidian detected as Ovid

dists[~dists.Author.isin(["Ovid", "ps-Ovid"])].sort_values(by="OvDist").query(
    "pval > 0.01"
).iloc[:, :5]

Unnamed: 0,Author,Work,Poem,OvDist,pval
218,Propertius,Prop.,Prop. 4 11,37.8964,0.609343
213,Propertius,Prop.,Prop. 4 6,49.251443,0.176459
207,Propertius,Prop.,Prop. 3 24,56.335577,0.055833
211,Propertius,Prop.,Prop. 4 4,63.237763,0.014431
122,Tibullus,Tib.,Tib. 1 4,64.36,0.011362


In [22]:
# false negatives - Ovidian detected as non-Ovidian

dists[dists.Author == "Ovid"].sort_values(by="OvDist").query("pval < 0.01").iloc[:, :5]

Unnamed: 0,Author,Work,Poem,OvDist,pval
45,Ovid,Tr.,Tr. 3 13,65.035165,0.009816442
48,Ovid,Tr.,Tr. 4 2,66.528854,0.007059382
81,Ovid,Am.,Am. 1 11,67.797081,0.00530098
40,Ovid,Tr.,Tr. 3 8,70.2148,0.003021778
118,Ovid,Am.,Am. 3 15,71.322876,0.002319912
92,Ovid,Am.,Am. 2 8,73.623514,0.001322948
266,Ovid,Pont.,Pont. 4 13,77.271169,0.0005249784
250,Ovid,Pont.,Pont. 3 6,84.061254,8.500692e-05
69,Ovid,Tr.,Tr. 5 13,84.701512,7.115578e-05
24,Ovid,Tr.,Tr. 1 4,86.06109,4.861191e-05


# All of the _Heroides_ sorted by M-distance (larger scores less like 'typical' Ovidian style

Note that even the most different are nowhere near different enough to be in statistical doubt. Of course this doesn't prove that they are Ovidian, but it does show that they conform superbly well with every feature that was measured.

In [23]:
dists[dists.Work == "Ep."].sort_values(by="OvDist").iloc[:, :5]

Unnamed: 0,Author,Work,Poem,OvDist,pval
4,Ovid,Ep.,Ep. 5,12.441285,0.999995
3,Ovid,Ep.,Ep. 4,15.707609,0.999878
20,Ovid,Ep.,Ep. 21,16.409644,0.999785
15,Ovid,Ep.,Ep. 16,17.353912,0.999563
12,Ovid,Ep.,Ep. 13,17.82262,0.999393
19,Ovid,Ep.,Ep. 20,17.89685,0.999361
5,Ovid,Ep.,Ep. 6,18.167086,0.999234
18,Ovid,Ep.,Ep. 19,18.809297,0.99884
17,Ovid,Ep.,Ep. 18,23.141463,0.988925
11,Ovid,Ep.,Ep. 12,23.6342,0.98638
