# Feature-wise analysis (S. 6.2)

Here I examine the conformance of various Ovidian (and non-Ovidian) works to general Ovidian style. It is demonstrated that using the Mahalanobis distance at a 99% confidence level is a fairly reliable indicator of Ovidian vs non-Ovidian authorship, and that none of the _Heroides_ display any statistical reason to reject them in terms of poetic style.

In [1]:
from mqdq import utils, babble, elegy
from mqdq import line_analyzer as la
from mqdq import mahalanobis as maha

import bs4
import glob

import numpy as np
import pandas as pd
import scipy as sp

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy.stats import chi2



In [2]:
vecs = pd.read_csv('elegy_poetic.csv',index_col=0)
corpus = vecs[vecs['LEN']>=20].reset_index(drop=True)
corpus = corpus.drop(['LEN'],axis=1)
test_corpus = corpus[corpus.Author != 'ps-Ovid'].reset_index(drop=True)
test_corpus

Unnamed: 0,Author,Work,Poem,H1SP,H2SP,H3SP,H4SP,H1CF,H2CF,H3CF,...,P3SC,P4SC,P1WC,P2WC,P3WC,P4WC,ELC,RS,LEO,PFSD
0,Ovid,Ep.,Ep. 1,0.086207,0.500000,0.500000,0.448276,0.241379,0.706897,0.810345,...,0.120690,0.0,0.206897,0.068966,0.396552,1.000000,0.094828,4.393948,0.739842,0.000000
1,Ovid,Ep.,Ep. 2,0.189189,0.527027,0.581081,0.391892,0.283784,0.743243,0.878378,...,0.148649,0.0,0.202703,0.067568,0.337838,1.000000,0.114865,4.071062,1.027448,0.000000
2,Ovid,Ep.,Ep. 3,0.220779,0.493506,0.519481,0.480519,0.181818,0.597403,0.818182,...,0.155844,0.0,0.116883,0.025974,0.324675,1.000000,0.090909,3.845700,0.484285,0.000000
3,Ovid,Ep.,Ep. 4,0.102273,0.511364,0.545455,0.465909,0.147727,0.659091,0.829545,...,0.136364,0.0,0.215909,0.045455,0.329545,1.000000,0.073864,3.822098,0.893575,0.000000
4,Ovid,Ep.,Ep. 5,0.215190,0.455696,0.632911,0.417722,0.164557,0.658228,0.911392,...,0.164557,0.0,0.202532,0.037975,0.341772,1.000000,0.056962,3.727347,0.713715,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,Ovid,Pont.,Pont. 4 12,0.120000,0.520000,0.720000,0.440000,0.240000,0.560000,0.920000,...,0.200000,0.0,0.280000,0.040000,0.280000,1.000000,0.100000,3.073075,0.615696,0.000000
266,Ovid,Pont.,Pont. 4 13,0.160000,0.600000,0.680000,0.800000,0.280000,0.720000,0.920000,...,0.120000,0.0,0.240000,0.080000,0.360000,0.880000,0.100000,5.954342,1.723439,0.775629
267,Ovid,Pont.,Pont. 4 14,0.129032,0.580645,0.548387,0.645161,0.096774,0.774194,0.903226,...,0.387097,0.0,0.193548,0.032258,0.193548,0.903226,0.064516,3.314164,1.155196,0.591300
268,Ovid,Pont.,Pont. 4 15,0.095238,0.619048,0.666667,0.619048,0.285714,0.666667,0.714286,...,0.285714,0.0,0.285714,0.095238,0.333333,0.952381,0.119048,3.695211,0.919168,0.425918


In [3]:
problems = corpus[corpus.Author == 'ps-Ovid'].reset_index(drop=True)
problems

Unnamed: 0,Author,Work,Poem,H1SP,H2SP,H3SP,H4SP,H1CF,H2CF,H3CF,...,P3SC,P4SC,P1WC,P2WC,P3WC,P4WC,ELC,RS,LEO,PFSD
0,ps-Ovid,Nux,Nux,0.153846,0.450549,0.626374,0.626374,0.175824,0.604396,0.868132,...,0.164835,0.0,0.197802,0.043956,0.285714,1.0,0.082418,3.09536,0.524756,0.0
1,ps-Ovid,Medicamina,Medicamina,0.28,0.48,0.52,0.54,0.18,0.62,0.88,...,0.1,0.0,0.2,0.04,0.3,1.0,0.08,4.901116,0.909967,0.0
2,ps-Ovid,Consolatio,Consolatio 1,0.240506,0.481013,0.64557,0.531646,0.164557,0.582278,0.924051,...,0.265823,0.0,0.088608,0.037975,0.278481,1.0,0.246835,4.619877,0.606677,0.0
3,ps-Ovid,Consolatio,Consolatio 2,0.253165,0.556962,0.556962,0.493671,0.240506,0.696203,0.810127,...,0.151899,0.0,0.088608,0.025316,0.240506,1.0,0.278481,3.608988,0.824542,0.0
4,ps-Ovid,Consolatio,Consolatio 3,0.329114,0.506329,0.658228,0.582278,0.291139,0.594937,0.772152,...,0.202532,0.0,0.151899,0.037975,0.240506,0.987342,0.202532,4.590044,1.062847,0.223589
5,ps-Ovid,Ibis,Ibis 1,0.15625,0.71875,0.5625,0.59375,0.15625,0.5625,0.90625,...,0.21875,0.0,0.1875,0.0,0.21875,1.0,0.109375,3.986751,1.05389,0.0
6,ps-Ovid,Ibis,Ibis 2,0.16,0.53,0.62,0.44,0.1,0.58,0.96,...,0.16,0.0,0.23,0.06,0.36,1.0,0.13,4.683774,0.994626,0.0
7,ps-Ovid,Ibis,Ibis 3,0.19,0.45,0.73,0.55,0.18,0.73,0.95,...,0.17,0.0,0.24,0.05,0.26,1.0,0.06,4.070276,0.787213,0.0
8,ps-Ovid,Ibis,Ibis 4,0.123596,0.438202,0.617978,0.52809,0.179775,0.685393,0.988764,...,0.382022,0.0,0.258427,0.05618,0.213483,0.977528,0.033708,4.358413,0.791811,0.469227


In [4]:
nux = problems[problems.Work=='Nux'].drop(['Author','Work','Poem'],axis=1).reset_index(drop=True)
cons1 = problems[problems.Poem=='Consolatio 1'].drop(['Author','Work','Poem'],axis=1).reset_index(drop=True)

In [5]:
# Take every poem by Ovid and use that to form the comparison distribution.
# The mean of this dataframe is the Ovidian centroid, but the whole frame
# is used to build the covariance matrix.

ovid_dist = test_corpus[test_corpus.Author=='Ovid'].drop(['Author','Work','Poem'],axis=1).reset_index(drop=True)
ovid_dist

Unnamed: 0,H1SP,H2SP,H3SP,H4SP,H1CF,H2CF,H3CF,H4CF,H1DI,H2DI,...,P3SC,P4SC,P1WC,P2WC,P3WC,P4WC,ELC,RS,LEO,PFSD
0,0.086207,0.500000,0.500000,0.448276,0.241379,0.706897,0.810345,0.551724,0.586207,0.051724,...,0.120690,0.0,0.206897,0.068966,0.396552,1.000000,0.094828,4.393948,0.739842,0.000000
1,0.189189,0.527027,0.581081,0.391892,0.283784,0.743243,0.878378,0.594595,0.527027,0.081081,...,0.148649,0.0,0.202703,0.067568,0.337838,1.000000,0.114865,4.071062,1.027448,0.000000
2,0.220779,0.493506,0.519481,0.480519,0.181818,0.597403,0.818182,0.623377,0.519481,0.077922,...,0.155844,0.0,0.116883,0.025974,0.324675,1.000000,0.090909,3.845700,0.484285,0.000000
3,0.102273,0.511364,0.545455,0.465909,0.147727,0.659091,0.829545,0.636364,0.568182,0.045455,...,0.136364,0.0,0.215909,0.045455,0.329545,1.000000,0.073864,3.822098,0.893575,0.000000
4,0.215190,0.455696,0.632911,0.417722,0.164557,0.658228,0.911392,0.607595,0.607595,0.025316,...,0.164557,0.0,0.202532,0.037975,0.341772,1.000000,0.056962,3.727347,0.713715,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160,0.120000,0.520000,0.720000,0.440000,0.240000,0.560000,0.920000,0.280000,0.720000,0.040000,...,0.200000,0.0,0.280000,0.040000,0.280000,1.000000,0.100000,3.073075,0.615696,0.000000
161,0.160000,0.600000,0.680000,0.800000,0.280000,0.720000,0.920000,0.560000,0.520000,0.000000,...,0.120000,0.0,0.240000,0.080000,0.360000,0.880000,0.100000,5.954342,1.723439,0.775629
162,0.129032,0.580645,0.548387,0.645161,0.096774,0.774194,0.903226,0.709677,0.677419,0.032258,...,0.387097,0.0,0.193548,0.032258,0.193548,0.903226,0.064516,3.314164,1.155196,0.591300
163,0.095238,0.619048,0.666667,0.619048,0.285714,0.666667,0.714286,0.571429,0.476190,0.238095,...,0.285714,0.0,0.285714,0.095238,0.333333,0.952381,0.119048,3.695211,0.919168,0.425918


In [56]:
test_corpus.Work.unique()

array(['Ep.', 'Tr.', 'Am.', 'Tib.', 'Prop.', 'Cat.', 'Pont.'],
      dtype=object)

In [6]:
ovid_late = test_corpus[test_corpus.Work.isin(['Trist.','Pont.'])].drop(['Author','Work','Poem'],axis=1).reset_index(drop=True)
ovid_early = test_corpus[test_corpus.Work.isin(['Ep.','Am.'])].drop(['Author','Work','Poem'],axis=1).reset_index(drop=True)

In [212]:
def explain(x, dist, prec=[]):

    """Calculate the Mahalanobis distance of a vector x from
    the distribution dist. This also returns a contribution vector
    which shows how much each feature contributes to the distance.

    You may notice that some features in that vector have negative values.
    The vector itself will always sum to a non-negative value, because
    of the nature of the calculation (the co-variance matrix is positive
    semi-definite). The negative value features occur when there is
    correlation. For example, if feature A scores contributes 20 to the
    distance, but it is correlated with feature B, then B might have a
    negative value to compensate for 'double counting' those features.

    The interpretability of the contribution vector is open to debate. It
    is not clear what statistical meaning the individual values have, but
    in my tests they have reflected what appear to be 'real' feature effects.

    Args:
        x (ideally a pandas.DataFrame): The observation to consider
        dist (pandas.DataFrame) : The distribution to take the distance from

    Returns:
        f (pandas.DataFrame): Feature contribution vector
        m (float64): Mahalanobis distance squared. m is the sum of f.
        p: The p-value calculated from m, assuming it follows a chi-square
           distribution, and using the default degrees of freedom (dim(dist)-1) 
    """

    # hat-tip: https://www.machinelearningplus.com/statistics/mahalanobis-distance/
    # NB: this produces the SQUARE of the distance ([x-m].C^{-1}.[x-m]^T)
    # which is what we want if we're claiming that the chi-sq distribution applies

    x_minus_mu = x - np.mean(dist,axis=0)
    if len(prec)==0:
        prec = ShrunkCovariance().fit(dist).get_precision()
    left_term = np.dot(x_minus_mu, prec)

    # for the normal Mahalanobis distance we would take the dot product here
    # but instead we multiply the vectors pointwise (as in .dot) but don't add
    # up the entries. This lets us see how much each column contributes to the
    # distance. NB I have NO IDEA what the statistical meaning of this is, since
    # we've moved one vector into some weird space defined by the covariance matrix.
    # It seems to have explanatory meaning, though.

    v = left_term*x_minus_mu

    m = np.dot(left_term, x_minus_mu.T)[0]
    p = 1 - chi2.cdf(m, len(x.columns)-1)[0]

    return (v, m[0], p)

In [203]:
from sklearn.covariance import (
    MinCovDet,
    OAS,
    LedoitWolf,
    GraphicalLassoCV,
    GraphicalLasso,
    graphical_lasso,
    empirical_covariance,
    EmpiricalCovariance,
    ShrunkCovariance
)

In [130]:
ovid_late

Unnamed: 0,H1SP,H2SP,H3SP,H4SP,H1CF,H2CF,H3CF,H4CF,H1DI,H2DI,...,P3SC,P4SC,P1WC,P2WC,P3WC,P4WC,ELC,RS,LEO,PFSD
0,0.05,0.575,0.575,0.55,0.05,0.675,0.85,0.425,0.6,0.075,...,0.225,0.05,0.25,0.025,0.225,0.95,0.05,3.799265,0.501448,0.156125
1,0.22973,0.486486,0.648649,0.5,0.175676,0.581081,0.891892,0.527027,0.635135,0.094595,...,0.297297,0.0,0.283784,0.013514,0.22973,0.986486,0.087838,3.587619,0.798963,0.346379
2,0.12766,0.510638,0.765957,0.531915,0.106383,0.723404,0.893617,0.617021,0.595745,0.085106,...,0.170213,0.0,0.06383,0.021277,0.297872,1.0,0.053191,4.060497,0.890971,0.0
3,0.241379,0.344828,0.793103,0.586207,0.241379,0.448276,0.965517,0.689655,0.758621,0.0,...,0.137931,0.0,0.137931,0.034483,0.310345,1.0,0.086207,3.353129,1.009519,0.0
4,0.046512,0.534884,0.604651,0.55814,0.093023,0.627907,0.953488,0.55814,0.697674,0.0,...,0.209302,0.0,0.116279,0.069767,0.302326,1.0,0.034884,2.173764,0.382606,0.0
5,0.259259,0.555556,0.407407,0.703704,0.148148,0.518519,0.888889,0.666667,0.740741,0.037037,...,0.222222,0.037037,0.259259,0.037037,0.333333,0.962963,0.074074,3.862795,0.764844,0.188853
6,0.142857,0.542857,0.714286,0.571429,0.142857,0.657143,0.914286,0.628571,0.714286,0.057143,...,0.314286,0.0,0.228571,0.028571,0.342857,1.0,0.042857,3.282262,0.679044,0.0
7,0.083333,0.583333,0.611111,0.611111,0.138889,0.611111,0.944444,0.555556,0.722222,0.027778,...,0.194444,0.027778,0.222222,0.0,0.25,0.944444,0.097222,3.682622,0.761965,0.164336
8,0.071429,0.535714,0.607143,0.5,0.035714,0.607143,0.928571,0.535714,0.535714,0.035714,...,0.25,0.0,0.178571,0.0,0.142857,1.0,0.107143,3.92253,0.612321,0.0
9,0.136364,0.409091,0.5,0.590909,0.090909,0.636364,0.863636,0.545455,0.5,0.045455,...,0.363636,0.0,0.090909,0.0,0.454545,1.0,0.022727,3.68823,1.200574,0.0


In [164]:
graphical_lasso(empirical_covariance(ovid_early),0.5,eps=0.001,mode='lars')

FloatingPointError: The system is too ill-conditioned for this solver. The system is too ill-conditioned for this solver

In [185]:
pm = EmpiricalCovariance().fit(ovid_dist).get_precision()

In [204]:
e =ShrunkCovariance().fit(ovid_early).get_precision()
l = ShrunkCovariance().fit(ovid_late).get_precision()

In [230]:
cons1

Unnamed: 0,H1SP,H2SP,H3SP,H4SP,H1CF,H2CF,H3CF,H4CF,H1DI,H2DI,...,P3SC,P4SC,P1WC,P2WC,P3WC,P4WC,ELC,RS,LEO,PFSD


In [240]:
print_maha_res(nux, ovid_dist,lim=8, prec=pm)

------------------------------------
  M-dist 18.85,  p-value: 0.9988
  Feat 	 Score 	   Samp      Dist
------------------------------------
  RS     3.69      3.10      3.98
P1DI     3.48     40.66%    52.04%
H4SP     2.20     62.64%    53.93%
P2SP     1.88     71.43%    61.36%
H4DI     1.85     62.64%    51.38%
PFSD     1.28      0.00      0.08
H1SC     1.17     58.24%    49.26%
P3SC     1.02     16.48%    22.00%
  [truncating at limit = 8]
------------------------------------


In [34]:
print_maha_res(ep16, ovid_early,lim=8, shrinkage=0.01)

------------------------------------
  M-dist 30.51,  p-value: 0.8850
  Feat 	 Score 	   Samp      Dist
------------------------------------
PFSD    17.15      0.22      0.01
H1SP     3.06      8.51%    19.01%
H2SP     2.52     61.17%    49.75%
  RS     1.78      4.59      4.02
P1SC     1.32     31.91%    38.72%
P3WC     1.28     29.26%    35.74%
P2CF     0.95     69.68%    73.80%
 LEO     0.94      0.67      0.79
  [truncating at limit = 8]
------------------------------------


In [248]:
tst = ShrunkCovariance(shrinkage=0.0).fit(ovid_dist).get_precision()

In [193]:
explain(nux, ovid_late, pm)

(       H1SP     H2SP      H3SP      H4SP      H1CF     H2CF      H3CF   
 0 -0.101686  0.50752 -0.121477  1.147089 -0.015872 -0.04825  1.083216  \
 
        H4CF      H1DI      H2DI  ...     P3SC      P4SC      P1WC      P2WC   
 0  0.796956 -0.063409 -0.020973  ...  2.10796  0.361375 -0.015397 -0.002027  \
 
        P3WC      P4WC       ELC        RS       LEO      PFSD  
 0  0.031193  0.006928 -0.088135  3.062378  0.655394  2.849156  
 
 [1 rows x 42 columns],
 19.529571952403863,
 0.9982036939332487)

In [161]:
explain(nux, ovid_dist)

(       H1SP      H2SP      H3SP     H4SP      H1CF      H2CF      H3CF   
 0  0.047216 -0.184458  0.169806  0.63136  1.195193 -1.373672  2.378694  \
 
        H4CF     H1DI      H2DI  ...      P3SC      P4SC      P1WC      P2WC   
 0  0.229823  0.18267 -0.001894  ...  0.316136  5.009294  0.114316 -0.005013  \
 
        P3WC      P4WC       ELC        RS       LEO      PFSD  
 0  1.424606 -5.383233  0.039209  6.240332  0.558269 -0.453754  
 
 [1 rows x 42 columns],
 32.279090560719034,
 0.8330325428637224)

In [7]:
ep15 = test_corpus[test_corpus.Poem=='Ep. 15'].drop(['Author','Work','Poem'],axis=1)
ep15

Unnamed: 0,H1SP,H2SP,H3SP,H4SP,H1CF,H2CF,H3CF,H4CF,H1DI,H2DI,...,P3SC,P4SC,P1WC,P2WC,P3WC,P4WC,ELC,RS,LEO,PFSD
14,0.275229,0.458716,0.633028,0.495413,0.155963,0.678899,0.87156,0.53211,0.559633,0.091743,...,0.155963,0.0,0.174312,0.055046,0.385321,1.0,0.12844,3.004543,0.633201,0.0


In [8]:
ep16 = test_corpus[test_corpus.Poem=='Ep. 16'].drop(['Author','Work','Poem'],axis=1)
ep16

Unnamed: 0,H1SP,H2SP,H3SP,H4SP,H1CF,H2CF,H3CF,H4CF,H1DI,H2DI,...,P3SC,P4SC,P1WC,P2WC,P3WC,P4WC,ELC,RS,LEO,PFSD
15,0.085106,0.611702,0.659574,0.558511,0.143617,0.675532,0.914894,0.558511,0.558511,0.069149,...,0.228723,0.0,0.212766,0.047872,0.292553,0.994681,0.090426,4.589444,0.67045,0.218215


In [9]:
ep7 = test_corpus[test_corpus.Poem=='Ep. 7'].drop(['Author','Work','Poem'],axis=1)
ep7

Unnamed: 0,H1SP,H2SP,H3SP,H4SP,H1CF,H2CF,H3CF,H4CF,H1DI,H2DI,...,P3SC,P4SC,P1WC,P2WC,P3WC,P4WC,ELC,RS,LEO,PFSD
6,0.091837,0.5,0.663265,0.469388,0.132653,0.642857,0.816327,0.55102,0.591837,0.102041,...,0.112245,0.0,0.193878,0.0,0.336735,1.0,0.153061,3.974571,0.819895,0.0


In [10]:
# A random poem to compare to Ovid

prop3_10 = test_corpus[test_corpus.Poem=='Prop. 3 10'].drop(['Author','Work','Poem'],axis=1)
prop3_10

Unnamed: 0,H1SP,H2SP,H3SP,H4SP,H1CF,H2CF,H3CF,H4CF,H1DI,H2DI,...,P3SC,P4SC,P1WC,P2WC,P3WC,P4WC,ELC,RS,LEO,PFSD
193,0.4375,0.6875,0.625,0.6875,0.375,0.75,0.9375,0.3125,0.4375,0.0625,...,0.25,0.0,0.25,0.0625,0.3125,1.0,0.1875,4.598141,0.881995,0.0


# A note on the Mahalanobis distance

The redoutable wikipedia has a [quick primer](https://en.wikipedia.org/wiki/Mahalanobis_distance) on the Mahalanobis distance, but the intuition is not too difficult (at least for those with some undergraduate statistics!). It is more or less like the euclidean distance, except it takes into account correlations between features. For _m_ observations in an _n_ dimensional feature space, the _covariance matrix_ is an _n_ x _n_ matrix that describes all the pairwise correlations between the features. The inverse of this matrix is then used to "correct" for those correlations. Because of the way the vectors are multiplied, it is also possible to save the product vector to see exactly which features contribute the most distance to the overall score, which is a very useful tool for interpretability. Note that in all cases below I actually measure the _squared_ M-distance. This has no effect on any comparisons, but the squared M-distance is chi-square distributed, which makes it easy to calculate a _P_-value for any distance.

In [16]:
# The pretty printing method in my mahalanobis.py was written before
# any non-percentage features were added, so this is a hack :/


def print_maha_res(samp, dist, shrinkage=0.0, lim=-1):
    v, m, p = maha.explain(samp, dist, shrinkage)
    dist_cent = dist.mean(axis=0)
    print("-" * 36)
    print("  M-dist %.2f,  p-value: %.4f" % (m, p))
    print("  Feat \t Score \t   Samp      Dist")
    print("-" * 36)
    v = v.mean(axis=0).sort_values(ascending=False)
    i=0
    for feat, score in v.items():
        samp_n = samp[feat].iloc[0]
        dist_n = dist_cent[feat]
        if feat in ("LEO", "ELC", "RS", "LEN", "PFSD"):
            print(
                "%4.4s   %6.2f    %6.2f    %6.2f"
                % (feat, score, samp_n, dist_n)
            )
        else:
            print(
                "%4.4s   %6.2f    %6.2f%%   %6.2f%%"
                % (feat, score, samp_n * 100, dist_n * 100)
            )
        i += 1
        if i >= lim and lim > 0:
            print(f"  [truncating at limit = {lim}]")
            break
    print("-" * 36)

# Deconstructed Mahalanobis distance of _Ep._ 15 from Ovidian tendency

There are very few features that differ significantly from typical Ovidian style. The length shows up as a difference (all of the _Heroides_ are longer than most of Ovid's short elegy), as do some minor differences in the caesurae in the third and fourth feet of the hexameter, but that's about it. As for the _P_-value, there is clearly no statistical reason to reject the null hypothesis (ie no reason to reject the idea that it was written by Ovid).

In [19]:
# Nothing to see here

print_maha_res(prop3_10, ovid_dist, lim=8, shrinkage=0.1)

------------------------------------
  M-dist 64.66,  p-value: 0.0106
  Feat 	 Score 	   Samp      Dist
------------------------------------
H1SP    10.23     43.75%    15.72%
H4SC     5.83     37.50%    68.58%
H4CF     5.23     31.25%    56.28%
H1CF     5.22     37.50%    15.33%
H2SP     5.13     68.75%    52.11%
H1WC     4.24      6.25%    21.74%
H1DI     3.84     43.75%    59.69%
P3CF     3.65      0.00%    13.34%
  [truncating at limit = 8]
------------------------------------


In [147]:
problems_X = problems.drop(['Author','Work','Poem'],axis=1)
problems_X

Unnamed: 0,H1SP,H2SP,H3SP,H4SP,H1CF,H2CF,H3CF,H4CF,H1DI,H2DI,...,P3SC,P4SC,P1WC,P2WC,P3WC,P4WC,ELC,RS,LEO,PFSD
0,0.153846,0.450549,0.626374,0.626374,0.175824,0.604396,0.868132,0.538462,0.604396,0.065934,...,0.164835,0.0,0.197802,0.043956,0.285714,1.0,0.082418,3.09536,0.524756,0.0
1,0.28,0.48,0.52,0.54,0.18,0.62,0.88,0.6,0.54,0.04,...,0.1,0.0,0.2,0.04,0.3,1.0,0.08,4.901116,0.909967,0.0
2,0.240506,0.481013,0.64557,0.531646,0.164557,0.582278,0.924051,0.468354,0.531646,0.037975,...,0.265823,0.0,0.088608,0.037975,0.278481,1.0,0.246835,4.619877,0.606677,0.0
3,0.253165,0.556962,0.556962,0.493671,0.240506,0.696203,0.810127,0.392405,0.392405,0.088608,...,0.151899,0.0,0.088608,0.025316,0.240506,1.0,0.278481,3.608988,0.824542,0.0
4,0.329114,0.506329,0.658228,0.582278,0.291139,0.594937,0.772152,0.594937,0.443038,0.075949,...,0.202532,0.0,0.151899,0.037975,0.240506,0.987342,0.202532,4.590044,1.062847,0.223589
5,0.15625,0.71875,0.5625,0.59375,0.15625,0.5625,0.90625,0.5625,0.625,0.09375,...,0.21875,0.0,0.1875,0.0,0.21875,1.0,0.109375,3.986751,1.05389,0.0
6,0.16,0.53,0.62,0.44,0.1,0.58,0.96,0.54,0.61,0.02,...,0.16,0.0,0.23,0.06,0.36,1.0,0.13,4.683774,0.994626,0.0
7,0.19,0.45,0.73,0.55,0.18,0.73,0.95,0.59,0.54,0.04,...,0.17,0.0,0.24,0.05,0.26,1.0,0.06,4.070276,0.787213,0.0
8,0.123596,0.438202,0.617978,0.52809,0.179775,0.685393,0.988764,0.550562,0.460674,0.022472,...,0.382022,0.0,0.258427,0.05618,0.213483,0.977528,0.033708,4.358413,0.791811,0.469227


In [149]:
for x in problems.Poem:
    print(f"{x}:")
    print_maha_res(problems[problems.Poem==x].drop(['Author','Work','Poem'],axis=1), ovid_late,lim=8)

Nux:
------------------------------------
  M-dist 16.37,  p-value: 0.9998
  Feat 	 Score 	   Samp      Dist
------------------------------------
P1DI     1.36     40.66%    53.19%
  RS     1.12      3.10      3.91
H4WC     1.10      1.10%     5.23%
P2SP     1.03     71.43%    63.11%
H4DI     1.02     62.64%    53.78%
H1SC     0.88     58.24%    49.00%
H3WC     0.83      7.69%     4.18%
H3SC     0.83     92.31%    95.82%
  [truncating at limit = 8]
------------------------------------
Medicamina:
------------------------------------
  M-dist 25.44,  p-value: 0.9729
  Feat 	 Score 	   Samp      Dist
------------------------------------
H1SP     3.78     28.00%    14.20%
  RS     2.86      4.90      3.91
H3SC     2.27     90.00%    95.82%
H3WC     2.27     10.00%     4.18%
P3SC     2.25     10.00%    26.38%
H3SP     1.60     52.00%    64.35%
H4DI     1.24     44.00%    53.78%
H1SC     1.24     38.00%    49.00%
  [truncating at limit = 8]
------------------------------------
Consolatio 1:

# The method works, in general

A random poem by Propertius is, unsurprisingly, very not-Ovidian. Here I picked Propertius 3.10 more or less at random, to demonstrate that non-Ovidian works are usually easily detectable as non-Ovidian style.

The biggest differences (after correcting for feature covariance)
- Prop 3.10 is much more spondaic in the first and second feet
  of the hexameter than the Ovidian norm
- Prop. 3.10 is less likely to have a diaeresis in H1 (which happens when the
  first foot is a disyllable)
- the poem has much more elision than is typical for Ovid
- the poem has no ictus conflicts after the caesura in the pentameter, whereas
  Ovid apparently does this one line in seven or eight (although Prop 3.5 is only 32 lines)

In [14]:
print_maha_res(prop3_10, ovid_dist)

------------------------------------
  M-dist 108.84,  p-value: 0.0000
  Feat 	 Score 	   Samp      Dist
------------------------------------
H1SP    10.08     43.75%    15.72%
H1CF     8.91     37.50%    15.33%
H3CF     8.58     93.75%    89.50%
H1DI     7.73     43.75%    59.69%
H1WC     7.42      6.25%    21.74%
H4WC     7.27     12.50%     5.41%
P1CF     6.99     37.50%    26.87%
P3CF     6.97      0.00%    13.34%
H2SP     6.79     68.75%    52.11%
H4DI     5.49     62.50%    51.38%
P1WC     5.37     25.00%    18.68%
193    0.1875
Name: ELC, dtype: float64
 ELC     4.32      0.19      0.09
H3DI     3.72     12.50%    24.25%
H4CF     3.58     31.25%    56.28%
P2CF     3.52     81.25%    73.56%
H1SC     3.42     37.50%    49.26%
H4SP     3.26     68.75%    53.93%
H4SC     2.90     37.50%    68.58%
P2WC     2.06      6.25%     4.41%
P4CF     1.94      0.00%     0.75%
193    32
Name: LEN, dtype: int64
 LEN     1.08     32.00     79.45
P2SP     0.99     68.75%    61.36%
H3SC     0.85   

  print("%4.4s   %6.2f    %6.2f%%   %6.2f%%" % (feat, score, samp[feat]*100, dist_cent[feat]*100))
  print("%4.4s   %6.2f    %6.2f    %6.2f" % (feat, score, samp[feat], dist_cent[feat]))


# Testing the accuracy

_Heroides_ 15 reads as Ovidian, and Propertius 3.10 reads as non-Ovidian, but it is worth checking the general accuracy. Here I just look quickly at the number of false positives and negatives when working at the 99% confidence level. It seems that the method is not actually 99% accurate (which is not all that surprising) but nevertheless it does a very good job. 5 of 102 non-Ovidian works might be mistaken for Ovid (about 5%), and just 14 of 164 Ovidian works are sufficiently unusual as to read as non-Ovidian (2.4%), almost all of which are later works.


In [44]:
# A quick function we can apply to the dataframe to add the M-dist
# and p-value (compared to Ovidian style) for every work in the corpus

def maha_from_ovid(row):
    
    x = pd.DataFrame(row.drop(['Author','Work','Poem'])).reset_index(drop=True).T
    x.columns = ovid_dist.columns

    x_minus_mu = x - np.mean(ovid_dist, axis=0)
    cov = np.cov(ovid_dist.values.T)
    inv_covmat = sp.linalg.inv(cov)
    left_term = np.dot(x_minus_mu, inv_covmat)

    # for the normal Mahalanobis distance we would take the dot product here
    # but instead we multiply the vectors pointwise (as in .dot) but don't add
    # up the entries. This lets us see how much each column contributes to the
    # distance. 

    v = left_term*x_minus_mu

    m = np.array(np.dot(left_term, x_minus_mu.T)[0],dtype=np.float64)
    p = 1 - sp.stats.chi2.cdf(m, len(x.columns)-1)[0]
    return pd.Series([m[0],p])

In [21]:
dist_vecs = vecs_trim.apply(maha_from_ovid, axis=1)

In [22]:
dists = vecs_trim.copy()
dists.insert(3,'OvDist',dist_vecs[0])
dists.insert(4,'pval',dist_vecs[1])

In [45]:
# false positives - non-Ovidian detected as Ovid

dists[dists.Author != 'Ovid'].sort_values(by='OvDist').query('pval > 0.01')

Unnamed: 0,Author,Work,Poem,OvDist,pval,H1SP,H2SP,H3SP,H4SP,H1CF,...,P4SC,P1WC,P2WC,P3WC,P4WC,ELC,RS,LEO,LEN,PFSD
214,Propertius,Prop.,Prop. 4 11,38.302102,0.634049,0.313725,0.588235,0.627451,0.666667,0.313725,...,0.0,0.254902,0.019608,0.294118,1.0,0.127451,4.526738,1.39452,102,0.0
209,Propertius,Prop.,Prop. 4 6,48.673408,0.222227,0.27907,0.627907,0.72093,0.604651,0.139535,...,0.0,0.27907,0.093023,0.27907,1.0,0.162791,4.538231,1.191339,86,0.0
203,Propertius,Prop.,Prop. 3 24,57.081025,0.060268,0.315789,0.473684,0.578947,0.526316,0.210526,...,0.0,0.157895,0.0,0.421053,1.0,0.210526,4.517161,1.05325,38,0.0
207,Propertius,Prop.,Prop. 4 4,63.732554,0.016845,0.319149,0.553191,0.595745,0.638298,0.276596,...,0.0,0.06383,0.085106,0.404255,0.978723,0.234043,3.7969,1.176762,94,0.28861
121,Tibullus,Tib.,Tib. 1 4,64.833786,0.013395,0.309524,0.452381,0.666667,0.666667,0.214286,...,0.0,0.142857,0.02381,0.214286,0.952381,0.059524,4.867841,0.867571,84,0.543462


In [46]:
# false negatives - Ovidian detected as non-Ovidian

dists[dists.Author == 'Ovid'].sort_values(by='OvDist').query('pval < 0.01')

Unnamed: 0,Author,Work,Poem,OvDist,pval,H1SP,H2SP,H3SP,H4SP,H1CF,...,P4SC,P1WC,P2WC,P3WC,P4WC,ELC,RS,LEO,LEN,PFSD
248,Ovid,Pont.,Pont. 3 8,66.488886,0.009407274,0.166667,0.583333,0.416667,0.416667,0.0,...,0.0,0.416667,0.083333,0.166667,1.0,0.0,3.800638,0.341329,24,0.0
48,Ovid,Tr.,Tr. 4 2,66.50922,0.009365903,0.216216,0.540541,0.702703,0.702703,0.189189,...,0.0,0.162162,0.081081,0.243243,0.972973,0.135135,3.803756,0.986833,74,0.0
32,Ovid,Tr.,Tr. 2 1,66.778613,0.00883337,0.138408,0.570934,0.619377,0.525952,0.138408,...,0.0,0.217993,0.044983,0.273356,0.979239,0.086505,3.872211,0.845058,578,0.386243
81,Ovid,Am.,Am. 1 11,67.388686,0.007728845,0.142857,0.428571,0.642857,0.571429,0.214286,...,0.0,0.214286,0.0,0.5,1.0,0.285714,4.260925,0.588626,28,0.0
40,Ovid,Tr.,Tr. 3 8,70.400461,0.003918242,0.095238,0.52381,0.666667,0.666667,0.095238,...,0.0,0.190476,0.047619,0.380952,1.0,0.071429,2.727267,0.176037,42,0.0
92,Ovid,Am.,Am. 2 8,72.913442,0.002169994,0.214286,0.642857,0.428571,0.785714,0.214286,...,0.0,0.142857,0.0,0.428571,1.0,0.107143,4.158458,1.479839,28,0.0
262,Ovid,Pont.,Pont. 4 13,77.930369,0.0006278599,0.16,0.6,0.68,0.8,0.28,...,0.0,0.24,0.08,0.36,0.88,0.1,5.954342,1.723439,50,0.775629
246,Ovid,Pont.,Pont. 3 6,83.136986,0.0001604331,0.166667,0.433333,0.666667,0.533333,0.233333,...,0.033333,0.2,0.033333,0.333333,0.966667,0.083333,4.160358,1.065353,60,0.179505
69,Ovid,Tr.,Tr. 5 13,84.153643,0.0001218745,0.0,0.588235,0.588235,0.411765,0.058824,...,0.058824,0.294118,0.0,0.176471,0.941176,0.029412,3.437513,0.824276,34,0.0
24,Ovid,Tr.,Tr. 1 4,85.246547,9.042909e-05,0.071429,0.285714,0.857143,0.571429,0.071429,...,0.0,0.071429,0.214286,0.285714,0.928571,0.035714,3.982128,1.291192,28,0.515079


In [29]:
dists[dists.Author == 'Ovid'].shape[0]

164

In [30]:
dists[dists.Author != 'Ovid'].shape[0]

102

# All of the _Heroides_ sorted by M-distance (larger scores less like 'typical' Ovidian style

Note that even the most different are nowhere near different enough to be in statistical doubt. Of course this doesn't prove that they are Ovidian, but it does show that they conform superbly well with every feature that was measured.

In [210]:
dists[dists.Work == 'Ep.'].sort_values(by='OvDist')

Unnamed: 0,Author,Work,Poem,OvDist,pval,H1SP,H2SP,H3SP,H4SP,H1CF,...,P4SC,P1WC,P2WC,P3WC,P4WC,ELC,RS,LEO,LEN,PFSD
4,Ovid,Ep.,Ep. 5,13.390092,0.999992,0.21519,0.455696,0.632911,0.417722,0.164557,...,0.0,0.202532,0.037975,0.341772,1.0,0.056962,3.727347,0.713715,158,0.0
3,Ovid,Ep.,Ep. 4,17.860155,0.999602,0.102273,0.511364,0.545455,0.465909,0.147727,...,0.0,0.215909,0.045455,0.329545,1.0,0.073864,3.822098,0.893575,176,0.0
5,Ovid,Ep.,Ep. 6,19.191914,0.999029,0.168675,0.445783,0.554217,0.445783,0.180723,...,0.0,0.228916,0.036145,0.325301,1.0,0.090361,4.489172,0.689778,166,0.0
12,Ovid,Ep.,Ep. 13,19.308738,0.998955,0.17284,0.493827,0.54321,0.518519,0.160494,...,0.0,0.259259,0.049383,0.345679,1.0,0.098765,4.916439,0.755515,162,0.0
18,Ovid,Ep.,Ep. 19,19.625232,0.99873,0.104762,0.514286,0.647619,0.514286,0.171429,...,0.0,0.161905,0.028571,0.342857,0.990476,0.071429,4.11678,0.799599,210,0.194248
20,Ovid,Ep.,Ep. 21,22.911907,0.992792,0.211382,0.512195,0.682927,0.552846,0.219512,...,0.0,0.243902,0.02439,0.300813,1.0,0.065041,4.251277,0.895832,246,0.0
10,Ovid,Ep.,Ep. 11,23.64569,0.990019,0.078125,0.5625,0.5,0.5625,0.140625,...,0.0,0.140625,0.046875,0.390625,1.0,0.109375,4.608592,0.868259,128,0.0
19,Ovid,Ep.,Ep. 20,23.775092,0.989451,0.159664,0.529412,0.672269,0.470588,0.201681,...,0.008403,0.142857,0.02521,0.302521,0.991597,0.05042,3.802721,0.516711,238,0.0
2,Ovid,Ep.,Ep. 3,24.904209,0.983319,0.220779,0.493506,0.519481,0.480519,0.181818,...,0.0,0.116883,0.025974,0.324675,1.0,0.090909,3.8457,0.484285,154,0.0
17,Ovid,Ep.,Ep. 18,26.441128,0.970857,0.12844,0.633028,0.568807,0.412844,0.211009,...,0.009174,0.220183,0.036697,0.174312,0.990826,0.06422,3.807675,0.724245,218,0.0


# A quick note on Am. 3.5

_Amores_ 3.5 is not accepted by some editors. According to the M-distance, there is insufficient statistical reason to reject it on the grounds of poetic style. This is included as a fairly quick aside, but it may be of interest to some.

In [31]:
dists[dists.Poem=='Am. 3 5']

Unnamed: 0,Author,Work,Poem,OvDist,pval,H1SP,H2SP,H3SP,H4SP,H1CF,...,P4SC,P1WC,P2WC,P3WC,P4WC,ELC,RS,LEO,LEN,PFSD
108,Ovid,Am.,Am. 3 5,45.208894,0.339455,0.130435,0.521739,0.652174,0.521739,0.086957,...,0.0,0.304348,0.0,0.304348,1.0,0.086957,4.548982,0.481772,46,0.0
