# Cluster Analysis: Lexico-grammatical style (S. 5.2)

In [2]:
from mqdq import babble, ngrams, elegy

import numpy as np
import pandas as pd

import glob



In [2]:
import logging
import warnings

# Keep the notebook output readable: suppress all Python warnings and set up
# logging at INFO level.
warnings.filterwarnings("ignore")
logging.basicConfig(level="INFO")

In [3]:
# Load every elegiac work. Each call returns a list of Babblers, one per poem.
ep = babble.bookbabs("corpus/OV-epis.xml", name="Ep.")
tr = babble.multi_bookbabs(sorted(glob.glob("corpus/OV-tri*.xml")), name="Tr.")
am = babble.multi_bookbabs(sorted(glob.glob("corpus/OV-amo*.xml")), name="Am.")
tib = babble.multi_bookbabs(sorted(glob.glob("corpus/TIB-ele*.xml")), name="Tib.")
prop = babble.multi_bookbabs(sorted(glob.glob("corpus/PROP-ele*.xml")), name="Prop.")
cat = babble.bookbabs("corpus/CATVLL-carm.xml", name="Cat.")
pon = babble.multi_bookbabs(sorted(glob.glob("corpus/OV-pon*.xml")), name="Pont.")

# From Catullus, keep only the elegiac poems with more than 20 lines.
cat_ele = [poem for poem in cat if poem.elegiac and len(poem) > 20]

# Several lines have to be deleted by hand. When the wide vectors are made,
# couplets are treated as a unit, so every poem must have a matching number
# of H and P. Some poems contain corrupt lines; for those, the H that matches
# a corrupt P (or vice versa) is removed here.
del tib[1].raw_source[24]
del prop[55].raw_source[28]
del cat_ele[3].raw_source[46]
del pon[1].raw_source[8]
del pon[7].raw_source[18]

# Tag every poem with its author and gather the works into one flat list.
# The order of this table is the order of the poems in `collection`.
collection = []
for poems, poet in (
    (ep, "Ovid"),
    (tr, "Ovid"),
    (am, "Ovid"),
    (tib, "Tibullus"),
    (prop, "Propertius"),
    (cat_ele, "Catullus"),
    (pon, "Ovid"),
):
    for poem in poems:
        poem.author = poet
    collection.extend(poems)

In [4]:
# Load the four works that are labelled "ps-Ovid" in this analysis, each as a
# single Babbler.
nux = babble.Babbler.from_file("./corpus/OV-nux.xml", name="Nux")
ibis = babble.Babbler.from_file("./corpus/OV-ibis.xml", name="Ibis")
medi = babble.Babbler.from_file("./corpus/OV-medi.xml", name="Medicamina")
cons = babble.Babbler.from_file("./corpus/OV-cons.xml", name="consolatio")
for work in (nux, ibis, medi, cons):
    work.author = "ps-Ovid"

# Only the Nux and the Medicamina are added whole; `ibis` and `cons` are not
# appended to the collection here.
collection.extend([nux, medi])

In [5]:
# Cut the Consolatio into consecutive chunks of `sz` lines and add each chunk
# to the collection as its own "ps-Ovid" Babbler, numbered from 1.
ll = cons.raw_source
sz = 158
cons_chunks = [ll[start : start + sz] for start in range(0, len(ll), sz)]
cons_ary = []
for number, chunk in enumerate(cons_chunks, start=1):
    b = babble.Babbler(chunk, name=f"Consolatio {number}")
    b.author = "ps-Ovid"
    cons_ary.append(b)
collection.extend(cons_ary)
# Displayed as the cell result: the name and length of every chunk.
[f"{x.name} - {len(x)}" for x in cons_ary]

['Consolatio 1 - 158', 'Consolatio 2 - 158', 'Consolatio 3 - 158']

In [6]:
ll = ibis.raw_source
sz = 200
# The Ibis is split into an "intro" and the actual 'curse', which begins at
# 'di maris et terrae...' (declared above as line 67, with the intro ending
# at line 66). They are separated because the style of the intro is
# noticeably different, whereas the bulk of the rest follows standard tropes
# for invocations (huge amounts of polysyndeton, for example).
# NOTE(review): the code splits at index 64 of raw_source rather than 66 --
# presumably some lines are absent from the source, so that index 64 is the
# 'di maris et terrae' line; confirm against the corpus file.
ibis_chunks = [ll[:64]]
ibis_chunks += [ll[start : start + sz] for start in range(64, len(ll), sz)]
ibis_ary = []
for number, chunk in enumerate(ibis_chunks, start=1):
    b = babble.Babbler(chunk, name=f"Ibis {number}")
    b.author = "ps-Ovid"
    ibis_ary.append(b)
collection.extend(ibis_ary)
# Displayed as the cell result: the name and length of every chunk.
[f"{x.name} - {len(x)}" for x in ibis_ary]

['Ibis 1 - 64', 'Ibis 2 - 200', 'Ibis 3 - 200', 'Ibis 4 - 178']

In [7]:
# Comparison material from outside the elegiac corpus: each work is loaded as
# one single Babbler.
aen_single_bab = babble.Babbler.from_file(
    "corpus/VERG-aene.xml", name="Aeneid", author="Vergil"
)
geo_single_bab = babble.Babbler.from_file(
    "corpus/VERG-geor.xml", name="Georgics", author="Vergil"
)
sat_single_bab = babble.Babbler.from_file(
    "corpus/IVV-satu.xml", name="Juv. Sat.", author="Juvenal"
)
puni_single_bab = babble.Babbler.from_file(
    "corpus/SIL-puni.xml", name="Punica", author="Silius"
)
theb_single_bab = babble.Babbler.from_file(
    "corpus/STAT-theb.xml", name="Thebaid", author="Statius"
)
phars_single_bab = babble.Babbler.from_file(
    "corpus/LVCAN-phar.xml", name="Pharsalia", author="Lucan"
)
arg_single_bab = babble.Babbler.from_file(
    "corpus/VAL_FL-argo.xml", name="Argonautica", author="V.Flaccus"
)
rena_single_bab = babble.Babbler.from_file(
    "corpus/LVCR-rena.xml", name="DRN", author="Lucretius"
)
# Horace's Satires are spread over several files, passed together in sorted
# order.
horsat_single_bab = babble.Babbler.from_file(
    *sorted(glob.glob("corpus/HOR-sat*.xml")), name="Hor. Sat.", author="Horace"
)

# The list order matters: subsample() cycles through it round-robin.
non_elegy = [
    aen_single_bab,
    geo_single_bab,
    sat_single_bab,
    puni_single_bab,
    theb_single_bab,
    phars_single_bab,
    arg_single_bab,
    rena_single_bab,
    horsat_single_bab,
]

In [8]:
def subsample(
    ary: list[babble.Babbler], mu: float, sd: float, n: int, min_length: int = 0
) -> list[babble.Babbler]:
    """Cut ``n`` random contiguous excerpts out of the works in ``ary``.

    The works are used round-robin: sample ``i`` is taken from
    ``ary[i % len(ary)]``. Excerpt lengths are drawn from a normal
    distribution with mean ``mu`` and standard deviation ``sd`` and truncated
    to integers; draws that are not greater than ``min_length`` are rejected.
    The start of each excerpt is chosen uniformly among all positions at which
    the excerpt still fits inside the work.

    Args:
        ary: The source works to sample from.
        mu: Mean excerpt length.
        sd: Standard deviation of the excerpt length.
        n: Number of excerpts to produce.
        min_length: Drawn lengths must be strictly greater than this value.

    Returns:
        ``n`` new Babblers, named ``"{i}-{work.name}"`` and carrying the
        author of the work they were cut from. An excerpt is shortened to the
        whole work if the drawn length exceeds the work's length (in which
        case it may end up no longer than ``min_length``).

    Raises:
        ValueError: If ``ary`` is empty while ``n`` is positive, or if no
            acceptable lengths can be drawn for the given ``mu``, ``sd`` and
            ``min_length``.
    """
    if n > 0 and not ary:
        raise ValueError("cannot subsample from an empty list of works")

    # Rejection sampling may discard any number of draws, so keep drawing in
    # batches of 2n until n usable lengths are available. The batch count is
    # capped so that impossible parameters raise instead of looping forever.
    max_batches = 100
    lengths: list[int] = []
    batches = 0
    while len(lengths) < n:
        if batches == max_batches:
            raise ValueError(
                f"could not draw {n} lengths greater than {min_length} "
                f"from N({mu}, {sd})"
            )
        batches += 1
        draws = np.random.normal(mu, sd, n * 2).astype("int")
        lengths.extend(int(x) for x in draws if x > min_length)

    samps: list[babble.Babbler] = []
    for i in range(n):
        work = ary[i % len(ary)]
        # An excerpt cannot be longer than the work it is cut from.
        length = min(lengths[i], len(work))
        # randint's upper bound is exclusive; the +1 makes the last valid
        # start (an excerpt ending on the work's final line) reachable and
        # keeps the bound positive when the excerpt is the whole work.
        start = np.random.randint(len(work) - length + 1)
        b = babble.Babbler(
            work.raw_source[start : start + length],
            name=f"{i}-{work.name}",
            author=work.author,
        )
        samps.append(b)
    return samps

In [9]:
# Draw 200 excerpts from the non-elegiac works; their lengths follow a normal
# distribution with a mean of 80 lines and a standard deviation of 20.

non_elegy_samples = subsample(non_elegy, mu=80, sd=20, n=200)

# Preprocessing - Text Conversion

For this analysis the vectorisation is a little different
to the poetic analysis. The first step is to convert each
poem into a string and perform phonetic conversion. Named
Entity Removal is not done. In the case of the Heroides,
each poem (or each pair, in the Double letters) uses a different
set of characters, so there is little risk that poems might be artificially
clustered with the Heroides simply because of Named Entities.

In [11]:
# The `_just_stringify` function is from my MQDQParser library (`mqdq.ngrams`).

def vectorise_babs(babs):
    """Turn an iterable of Babblers into a one-row-per-poem text DataFrame.

    Each row holds:

    * ``Author`` -- ``b.author``, only for Babblers that have that attribute;
    * ``Work``   -- the part of ``b.name`` before the first space (so
      ``"Ep. 1"`` gives ``"Ep."``; note that a name such as ``"2-Juv. Sat."``
      gives ``"2-Juv."``);
    * ``Poem``   -- ``b.name``;
    * ``Chunk``  -- the whole poem as one string, produced by joining the
      output of ``ngrams._just_stringify(b.raw_source, type="phon")``.

    Args:
        babs: Iterable of Babblers (objects with ``name`` and ``raw_source``,
            and optionally ``author``).

    Returns:
        A DataFrame with a fresh 0..n-1 index and the columns listed above, in
        that order. For empty input an empty DataFrame with all four columns
        is returned.
    """
    rows = []
    for b in babs:
        row = {}
        # Author comes first so that it ends up as the leftmost column.
        if hasattr(b, "author"):
            row["Author"] = b.author
        row["Work"] = b.name.split(" ")[0]
        row["Poem"] = b.name
        row["Chunk"] = "".join(ngrams._just_stringify(b.raw_source, type="phon"))
        rows.append(row)

    if not rows:
        # pd.concat / DataFrame cannot infer columns from nothing; return an
        # explicitly shaped empty frame instead of raising.
        return pd.DataFrame(columns=["Author", "Work", "Poem", "Chunk"])

    # Building one frame from all rows avoids creating and concatenating a
    # separate single-row DataFrame per poem.
    return pd.DataFrame(rows)

In [12]:
# After this step every poem is a single lowercased string with punctuation
# stripped but linebreaks retained; phonetic conversion has been done too.

vecs = vectorise_babs(collection)
# Add the length of every Babbler as a "LEN" column at position 3.
poem_lengths = list(map(len, collection))
vecs.insert(3, "LEN", poem_lengths)
vecs

Unnamed: 0,Author,Work,Poem,LEN,Chunk
0,Ovid,Ep.,Ep. 1,116,hank tua penelope lento tibi mittit ulikse\nni...
1,Ovid,Ep.,Ep. 2,148,hospita demopoon tua te rodopeia pyllis\nultra...
2,Ovid,Ep.,Ep. 3,154,kwam legis a rapta briseide littera wenit\nwik...
3,Ovid,Ep.,Ep. 4,176,kwam nisi tu dederis karitura_st ipsa salutem\...
4,Ovid,Ep.,Ep. 5,158,perlegis an konjunks prohibet nowa perlege non...
...,...,...,...,...,...
282,ps-Ovid,Consolatio,Consolatio 3,158,kwo raperis laniata komas similiskwe furenti\n...
283,ps-Ovid,Ibis,Ibis 1,64,tempus ad hok lustris bis jam mihi kwinkwe per...
284,ps-Ovid,Ibis,Ibis 2,200,di maris et terrae kwi_kwis meliora tenetis\ni...
285,ps-Ovid,Ibis,Ibis 3,200,kwi_kwokulis karuit per kwos male widerat auru...


In [13]:
# Same conversion for the non-elegiac samples, again with the length of each
# sample added as a "LEN" column at position 3.
non_elegy_vecs = vectorise_babs(non_elegy_samples)
sample_lengths = list(map(len, non_elegy_samples))
non_elegy_vecs.insert(3, "LEN", sample_lengths)

non_elegy_vecs

Unnamed: 0,Author,Work,Poem,LEN,Chunk
0,Vergil,0-Aeneid,0-Aeneid,75,haek ait et sokii kesserunt aekwore jusso\nat ...
1,Vergil,1-Georgics,1-Georgics,88,kontemplator item kum se nuks plurima silwis\n...
2,Juvenal,2-Juv.,2-Juv. Sat.,54,delikias hominis tarpeium limen adora\npronus ...
3,Silius,3-Punica,3-Punica,87,pergame_indinjum solymo _sewadere detur\nhuik ...
4,Statius,4-Thebaid,4-Thebaid,42,ira_ret motos kapulis adstringeret enses\nhink...
...,...,...,...,...,...
195,V.Flaccus,195-Argonautica,195-Argonautica,98,si pelopis duros prior hippodamia labores\neks...
196,Lucretius,196-DRN,196-DRN,93,dekiderunt kwo_kwet in talis wenere meatus\nkw...
197,Horace,197-Hor.,197-Hor. Sat.,94,eksirem plures kalones atkwe kaballi\npaskendi...
198,Vergil,198-Aeneid,198-Aeneid,106,in lukem genito_ramyko dedit et fake praenjas\...


In [139]:
# Save both text corpora as CSV files in the working directory.
for frame, filename in (
    (vecs, "elegy_corpus.csv"),
    (non_elegy_vecs, "non_elegy_corpus.csv"),
):
    frame.to_csv(filename)

In [7]:
# Vectorise the same collection with the `elegy` module's own
# vectorise_babs (distinct from the text vectoriser defined in this
# notebook) and save the result as CSV.
# NOTE(review): presumably these are the "poetic" feature vectors referred to
# in the prose above -- confirm against mqdq.elegy.
elegy_poetic = elegy.vectorise_babs(collection)
elegy_poetic.to_csv('elegy_poetic.csv')