In [22]:
from MyCapytain.resources.texts.local.capitains.cts import CapitainsCtsText
from lxml import etree
from MyCapytain.common.constants import Mimetypes
import pandas as pd

def _read_cts_passages(path, urn):
    """Load a CapiTainS TEI edition and collect one record per citable passage.

    Parameters
    ----------
    path : str
        Location of the TEI XML file to open.
    urn : str
        CTS URN handed to ``CapitainsCtsText`` for this edition.

    Returns
    -------
    tuple
        ``(text, urns, raw_xmls, plain_strings)``: the ``CapitainsCtsText``
        object, followed by three parallel lists with one entry per
        reference returned by ``getReffs`` -- the passage URN, the passage
        exported as ``Mimetypes.XML.TEI``, and the passage text content
        serialised without markup.
    """
    # Explicit encoding so decoding does not depend on the platform default
    # (assumes the TEI files are UTF-8 -- TODO confirm).
    with open(path, encoding="utf-8") as f:
        text = CapitainsCtsText(urn=urn, resource=f)

    urns, raw_xmls, plain_strings = [], [], []
    # The requested level equals the number of citation levels the text
    # declares (presumably the most fine-grained passages -- confirm).
    for ref in text.getReffs(level=len(text.citation)):
        node = text.getTextualNode(ref)
        urns.append(f"{text.urn}:{ref}")
        raw_xmls.append(node.export(Mimetypes.XML.TEI))
        # method="text" serialises only the text content of the passage tree.
        plain_strings.append(
            etree.tostring(
                node.export(Mimetypes.PYTHON.ETREE),
                encoding="unicode",
                method="text",
            )
        )
    return text, urns, raw_xmls, plain_strings


def _passages_frame(urns, raw_xmls, plain_strings):
    """Build the per-passage DataFrame from three parallel lists.

    The ``urn`` and ``unannotated_strings`` columns use the pandas
    ``"string"`` dtype; ``raw_xml`` is stored as given.
    """
    return pd.DataFrame({
        "urn": pd.Series(urns, dtype="string"),
        "raw_xml": raw_xmls,
        "unannotated_strings": pd.Series(plain_strings, dtype="string"),
    })


# Read Pausanias
(
    textPausanias,
    urns_pausanias,
    raw_xmls_pausanias,
    unannotated_strings_pausanias,
) = _read_cts_passages(
    "../tei/tlg0525.tlg001.perseus-eng2.xml",
    "urn:cts:greekLit:tlg0525.tlg001.perseus-eng2",
)

# Read Iliad
# NOTE(review): tlg0012.tlg002 appears to be the CTS identifier of the
# Odyssey (the Iliad would be tlg0012.tlg001), and the word frequencies
# recorded further down this notebook are dominated by Odysseus / Telemachus.
# The *_iliad names are kept so the later cells keep working -- confirm
# whether the file or the naming should change.
(
    textIliad,
    urns_iliad,
    raw_xmls_iliad,
    unannotated_strings_iliad,
) = _read_cts_passages(
    "../tei/tlg0012.tlg002.perseus-eng3.xml",
    "urn:cts:greekLit:tlg0012.tlg002.perseus-eng3",
)

# One DataFrame per work, one row per passage
pausanias_df = _passages_frame(
    urns_pausanias, raw_xmls_pausanias, unannotated_strings_pausanias
)
iliad_df = _passages_frame(
    urns_iliad, raw_xmls_iliad, unannotated_strings_iliad
)


In [23]:
import spacy

# English pipeline with the "ner" component switched off
nlp = spacy.load("en_core_web_sm", disable=["ner"])

# Plain passage strings of each frame as ordinary Python lists
raw_texts_pausanias = list(pausanias_df['unannotated_strings'])
raw_texts_iliad = list(iliad_df['unannotated_strings'])

# nlp.pipe returns lazy generators; nothing is processed until they are consumed below
annotated_texts_pausanias = nlp.pipe(raw_texts_pausanias, batch_size=100)
annotated_texts_iliad = nlp.pipe(raw_texts_iliad, batch_size=100)

# Materialise the annotated docs and store them next to their source rows
pausanias_df['nlp_docs'] = list(annotated_texts_pausanias)
iliad_df['nlp_docs'] = list(annotated_texts_iliad)


In [24]:
# Gather, doc by doc, every token in pausanias_df['nlp_docs'] whose lemma is "the"
definite_article_pausanias = []
for doc in pausanias_df['nlp_docs']:
    definite_article_pausanias += [tok for tok in doc if tok.lemma_ == "the"]

# How many such tokens were found
len_definite_article_pausanias = len(definite_article_pausanias)

len_definite_article_pausanias


26932

In [25]:
# Gather, doc by doc, every token in iliad_df['nlp_docs'] whose lemma is "the"
definite_article_iliad = []
for doc in iliad_df['nlp_docs']:
    definite_article_iliad += [tok for tok in doc if tok.lemma_ == "the"]

# How many such tokens were found
len_definite_article_iliad = len(definite_article_iliad)

len_definite_article_iliad


8159

In [28]:
# Naive baseline tokenisation: split each passage string on runs of whitespace
passage_strings = pausanias_df['unannotated_strings']
pausanias_df['whitespaced_tokens'] = passage_strings.str.split()

pausanias_df

Unnamed: 0,urn,raw_xml,unannotated_strings,nlp_docs,whitespaced_tokens
0,urn:cts:greekLit:tlg0525.tlg001.perseus-eng2:1...,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xmlns...",On the Greek mainland facing the Cyclades Isla...,"(On, the, Greek, mainland, facing, the, Cyclad...","[On, the, Greek, mainland, facing, the, Cyclad..."
1,urn:cts:greekLit:tlg0525.tlg001.perseus-eng2:1...,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xmlns...","The Peiraeus was a parish from early times, th...","(The, Peiraeus, was, a, parish, from, early, t...","[The, Peiraeus, was, a, parish, from, early, t..."
2,urn:cts:greekLit:tlg0525.tlg001.perseus-eng2:1...,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xmlns...",The most noteworthy sight in the Peiraeus is a...,"(The, most, noteworthy, sight, in, the, Peirae...","[The, most, noteworthy, sight, in, the, Peirae..."
3,urn:cts:greekLit:tlg0525.tlg001.perseus-eng2:1...,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xmlns...","The Athenians have also another harbor, at Mun...","(The, Athenians, have, also, another, harbor, ...","[The, Athenians, have, also, another, harbor,,..."
4,urn:cts:greekLit:tlg0525.tlg001.perseus-eng2:1...,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xmlns...",Twenty stades away is the Coliad promontory; o...,"(Twenty, stades, away, is, the, Coliad, promon...","[Twenty, stades, away, is, the, Coliad, promon..."
...,...,...,...,...,...
3165,urn:cts:greekLit:tlg0525.tlg001.perseus-eng2:1...,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xmlns...","These, then, live above Amphissa. On the coast...","(These, ,, then, ,, live, above, Amphissa, ., ...","[These,, then,, live, above, Amphissa., On, th..."
3166,urn:cts:greekLit:tlg0525.tlg001.perseus-eng2:1...,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xmlns...",I gather that the city got its name from a wom...,"(I, gather, that, the, city, got, its, name, f...","[I, gather, that, the, city, got, its, name, f..."
3167,urn:cts:greekLit:tlg0525.tlg001.perseus-eng2:1...,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xmlns...",The epic poem called the Naupactia by the Gree...,"(The, epic, poem, called, the, Naupactia, by, ...","[The, epic, poem, called, the, Naupactia, by, ..."
3168,urn:cts:greekLit:tlg0525.tlg001.perseus-eng2:1...,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xmlns...",Here there is on the coast a temple of Poseido...,"(Here, there, is, on, the, coast, a, temple, o...","[Here, there, is, on, the, coast, a, temple, o..."


In [29]:
from collections import Counter

# Flatten the per-passage token lists into one long Series, then tally each surface form
flat_whitespace_tokens = pausanias_df['whitespaced_tokens'].explode()
type_counts = Counter(flat_whitespace_tokens)

type_counts.most_common(50)

[('the', 24911),
 ('of', 14285),
 ('and', 8371),
 ('to', 7400),
 ('a', 5701),
 ('in', 4514),
 ('is', 4258),
 ('was', 3382),
 ('that', 3374),
 ('by', 2818),
 ('they', 2323),
 ('from', 2216),
 ('his', 1983),
 ('The', 1938),
 ('at', 1860),
 ('on', 1859),
 ('he', 1849),
 ('with', 1827),
 ('for', 1642),
 ('it', 1624),
 ('who', 1533),
 ('their', 1527),
 ('are', 1461),
 ('as', 1447),
 ('were', 1419),
 ('but', 1331),
 ('son', 1301),
 ('this', 1182),
 ('had', 1120),
 ('not', 1113),
 ('which', 1045),
 ('have', 962),
 ('an', 949),
 ('I', 930),
 ('also', 844),
 ('called', 827),
 ('them', 819),
 ('when', 791),
 ('be', 765),
 ('one', 739),
 ('made', 725),
 ('him', 633),
 ('after', 618),
 ('sanctuary', 605),
 ('there', 589),
 ('all', 587),
 ('say', 580),
 ('been', 555),
 ('her', 539),
 ('other', 516)]

In [34]:
# Naive baseline tokenisation of the iliad_df passages: split on runs of whitespace.
iliad_df['whitespaced_tokens'] = iliad_df['unannotated_strings'].str.split()

# Tally every whitespace-delimited surface form across all passages.
# Note: this rebinds `type_counts`, replacing any earlier value held under that name.
type_counts = Counter(iliad_df['whitespaced_tokens'].explode())

# Only the last expression of a cell is displayed, so this is the cell's output.
type_counts.most_common(50)

[('the', 8100),
 ('and', 6287),
 ('of', 4091),
 ('to', 3190),
 ('in', 2266),
 ('a', 1824),
 ('he', 1679),
 ('I', 1592),
 ('his', 1539),
 ('that', 1378),
 ('for', 1236),
 ('with', 1203),
 ('my', 986),
 ('thou', 968),
 ('as', 860),
 ('from', 847),
 ('all', 822),
 ('they', 792),
 ('is', 776),
 ('him', 768),
 ('was', 700),
 ('it', 699),
 ('on', 680),
 ('me', 655),
 ('had', 636),
 ('but', 626),
 ('thy', 620),
 ('when', 575),
 ('her', 562),
 ('But', 559),
 ('not', 547),
 ('their', 529),
 ('she', 518),
 ('them', 489),
 ('Then', 462),
 ('who', 459),
 ('will', 455),
 ('be', 442),
 ('at', 429),
 ('Odysseus', 426),
 ('this', 425),
 ('And', 415),
 ('by', 396),
 ('we', 395),
 ('have', 394),
 ('one', 370),
 ('thee', 367),
 ('no', 362),
 ('So', 361),
 ('upon', 355)]

In [35]:
# Run the `spacy` module with the arguments `download en_core_web_sm`
# (fetches and installs the model package).
# NOTE(review): an earlier cell already calls spacy.load("en_core_web_sm");
# on a fresh kernel this download presumably has to happen before that cell --
# consider moving it to the top of the notebook.
%run -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m42.2 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [36]:
import spacy

# Load the en_core_web_sm pipeline with the "ner" component disabled.
# NOTE(review): this repeats the spacy.load call made in an earlier cell;
# presumably it is here because of the download / kernel restart in between --
# confirm whether both loads are needed.
nlp = spacy.load("en_core_web_sm", disable=["ner"])

In [37]:
# Use only the tokenizer of the loaded pipeline (the other components are not invoked here)
tokenizer = nlp.tokenizer

# Apply it to every passage string of both frames and keep the result in a 'tokens' column
for frame in (pausanias_df, iliad_df):
    frame['tokens'] = frame['unannotated_strings'].apply(tokenizer)

In [38]:
# One row per token across all pausanias_df passages
flat_tokens = pausanias_df['tokens'].explode()

# Keep the text of tokens that are purely alphabetic and not flagged as stop words
types = [tok.text for tok in flat_tokens if tok.is_alpha and not tok.is_stop]

# Frequency of each remaining surface form
type_counts = Counter()
type_counts.update(types)

type_counts.most_common(200)

[('son', 1363),
 ('called', 897),
 ('sanctuary', 655),
 ('city', 598),
 ('image', 518),
 ('temple', 475),
 ('said', 474),
 ('time', 473),
 ('Lacedaemonians', 465),
 ('men', 427),
 ('place', 427),
 ('came', 395),
 ('Zeus', 391),
 ('Apollo', 366),
 ('land', 360),
 ('Athenians', 357),
 ('war', 350),
 ('man', 343),
 ('Heracles', 341),
 ('statue', 337),
 ('daughter', 328),
 ('people', 316),
 ('river', 306),
 ('dedicated', 298),
 ('Greeks', 293),
 ('god', 288),
 ('sea', 272),
 ('king', 270),
 ('won', 265),
 ('water', 263),
 ('Athena', 260),
 ('Artemis', 260),
 ('stades', 257),
 ('battle', 249),
 ('day', 236),
 ('took', 235),
 ('images', 235),
 ('bronze', 235),
 ('Messenians', 235),
 ('road', 233),
 ('Achaeans', 231),
 ('death', 230),
 ('story', 230),
 ('sons', 227),
 ('left', 226),
 ('account', 225),
 ('altar', 221),
 ('set', 220),
 ('old', 217),
 ('race', 212),
 ('killed', 210),
 ('brought', 207),
 ('gave', 203),
 ('come', 199),
 ('Olympia', 197),
 ('far', 192),
 ('built', 190),
 ('Athens',

In [39]:
# One row per token across all iliad_df passages
flat_tokens = iliad_df['tokens'].explode()

# Keep the text of tokens that are purely alphabetic and not flagged as stop words
types = [tok.text for tok in flat_tokens if tok.is_alpha and not tok.is_stop]

# Frequency of each remaining surface form
type_counts = Counter()
type_counts.update(types)

type_counts.most_common(200)

[('thou', 1007),
 ('Odysseus', 623),
 ('thy', 623),
 ('thee', 600),
 ('spoke', 569),
 ('men', 520),
 ('man', 478),
 ('come', 478),
 ('heart', 437),
 ('son', 384),
 ('house', 368),
 ('went', 302),
 ('ship', 297),
 ('land', 292),
 ('forth', 281),
 ('came', 274),
 ('Zeus', 266),
 ('shall', 260),
 ('gods', 257),
 ('wise', 257),
 ('Telemachus', 255),
 ('sea', 243),
 ('great', 243),
 ('wooers', 238),
 ('answered', 235),
 ('set', 229),
 ('let', 212),
 ('tell', 209),
 ('father', 204),
 ('long', 202),
 ('said', 200),
 ('fair', 199),
 ('goodly', 183),
 ('hands', 182),
 ('verily', 181),
 ('way', 181),
 ('comrades', 175),
 ('halls', 172),
 ('Athena', 158),
 ('words', 157),
 ('dear', 151),
 ('brought', 151),
 ('evil', 150),
 ('old', 150),
 ('god', 149),
 ('took', 148),
 ('far', 147),
 ('ships', 142),
 ('wine', 137),
 ('like', 137),
 ('lay', 135),
 ('hast', 135),
 ('art', 132),
 ('city', 132),
 ('saying', 131),
 ('day', 130),
 ('stranger', 130),
 ('straightway', 128),
 ('home', 126),
 ('sat', 125),
