In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from tf.app import use
from tf.convert.tei import TEI
from tf.convert.addnlp import NLPipeline
from tf.advanced.helpers import dm

In [3]:
# Organisation and repository name of the corpus; they are combined into the
# corpus locator that is passed to `use()` further down.
ORG = "annotation"
REPO = "mondriaan"

# Inspecting - exploring - testing

After the conversion, we inspect the corpus, explore what is in it and what you can do with it,
and test whether things have been converted correctly.

# Use the new dataset

We can now use the resulting dataset in the usual way.
Because we have adapted the TF app, the version without the `pre` will now be loaded.

In [12]:
# Load the local clone of the corpus and hoist the TF API names into this notebook's globals.
corpusLocation = f"{ORG}/{REPO}:clone"
A = use(corpusLocation, checkout="clone", hoist=globals())

**Locating corpus resources ...**

Name,# of nodes,# slots/node,% coverage
folder,2,8855.5,100
bibliolist,1,3144.0,18
listBibl,2,1546.5,17
file,16,1106.94,100
letter,14,1001.5,79
body,16,664.69,60
text,16,664.69,60
artworklist,1,546.0,3
listObject,1,501.0,3
standOff,14,358.71,28


# Exploration

We walk around a bit more in the corpus.

## All titles:

In [13]:
# Print every `titleStmt` node number followed by its text.
for titleStmt in F.otype.s("titleStmt"):
    print(titleStmt, T.text(titleStmt))

20159 Brief aan Aletta de Iongh. Amsterdam, dinsdag 16 februari, dinsdag 2 maart of dinsdag 9 maart 1909.
Wietse Coppes
Leo Jansen
Mondriaan Editieproject

20160 Brief aan Aletta de Iongh. Amsterdam, woensdag 7 april 1909.
Wietse Coppes
Leo Jansen
Mondriaan Editieproject

20161 Brief aan Aletta de Iongh. Amsterdam, tussen maandag 19 en vrijdag 23 april 1909.
Wietse Coppes
Leo Jansen
Mondriaan Editieproject

20162 Brief aan Aletta de Iongh. Amsterdam, maandag 26 april 1909.
Wietse Coppes
Leo Jansen
Mondriaan Editieproject

20163 Brief aan Aletta de Iongh. Amsterdam, donderdag 13 mei 1909.
Wietse Coppes
Leo Jansen
Mondriaan Editieproject

20164 Brief aan Aletta de Iongh. Amsterdam, donderdag 24 juni 1909.
Wietse Coppes
Leo Jansen
Mondriaan Editieproject

20165 Brief aan Aletta de Iongh. Amsterdam, eerste helft augustus 1909.
Wietse Coppes
Leo Jansen
Mondriaan Editieproject

20166  Briefkaart aan Gerrit Willem Knap. Zoutelande, c. dinsdag 24 augustus 1909.
Wietse Coppes
Leo Jansen
Mondria

# Tokens

Show all the tokens that are split into atomic tokens (`t` nodes).

In [14]:
# Search template matching a `token` together with two `t` nodes; the `=:` and `<:`
# operators constrain their relative positions (see the Text-Fabric search docs
# for the exact meaning of these relational operators).
query = """
token
  =: t
  <: t
"""

# `results` is displayed as a table in the next cell.
results = A.search(query)

  0.03s 12 results


In [15]:
# Tabulate the search results, using `token` as the condense type.
A.table(results, condenseType="token")

n,p,token,t,t.1
1,proeftuin@19090407y_IONG_1739:5,S t,S,t
2,proeftuin@19090426y_IONG_1738:5,1 e,1,e
3,proeftuin@19090513y_IONG_1293:5,S t,S,t
4,proeftuin@19090624_IONG_1294:6,kaon,k,a
5,proeftuin@19090824y_KNAP_1747:5,42II,42,II
6,proeftuin@19090824y_KNAP_1747:6,Vr.nVrienden,Vr.,n
7,proeftuin@19090905y_IONG_1295:5,4.a,4,.a
8,proeftuin@19091024y_IONG_1297:5,Nov.ber,Nov.,ber
9,proeftuin@19100131_SAAL_ARNO_0018:5,onjuist,on,juist
10,proeftuin@19100131_SAAL_ARNO_0018:5,een,e,en


## Sentences

In [16]:
# Print the plain text of the third and fourth sentence nodes.
for sentence in F.otype.s("sentence")[2:4]:
    print(T.text(sentence))

Wietse Coppes
Leo Jansen


In [17]:
# Pretty-display the same two sentences, this time with node numbers shown.
for sentence in F.otype.s("sentence")[2:4]:
    A.pretty(sentence, withNodes=True)

In [18]:
# List the first 100 sentences, numbered from 1.
for (seq, sentence) in enumerate(F.otype.s("sentence")[0:100], start=1):
    print(f"SENTENCE {seq}: {T.text(sentence)}")

SENTENCE 1: Brief aan Aletta de Iongh. 
SENTENCE 2: Amsterdam, dinsdag 16 februari, dinsdag 2 maart of dinsdag 9 maart 1909.
SENTENCE 3: Wietse Coppes
SENTENCE 4: Leo Jansen
SENTENCE 5: Mondriaan Editieproject
SENTENCE 6: Nederland
SENTENCE 7: Otterlo
SENTENCE 8: Kröller Müller Museum
SENTENCE 9: KM 123.397
SENTENCE 10: 19090216y_IONG_1303
SENTENCE 11: ​
SENTENCE 12: ​
SENTENCE 13: ​
SENTENCE 14: Piet Mondriaan
SENTENCE 15: dinsdag 16 februari, dinsdag 2 maart of dinsdag 9 maart 1909
SENTENCE 16: Amsterdam
SENTENCE 17: Aletta de Iongh
SENTENCE 18: transcriptie: voltooid 20.7.15
SENTENCE 19: collatie bron: 6.6.16
SENTENCE 20: tweede collatie aan het origineel: voltooid 26.11.19
SENTENCE 21: invoer tweede collatie: voltooid 5.8.16
SENTENCE 22: bespreking eindversie: gb
SENTENCE 23: markeren annotaties: in bewerking / voltooid
SENTENCE 24: gereed 17.4.2019
SENTENCE 25: titel gecontroleerd 21.09.2020
SENTENCE 26: personen getagd 12.10.2020
SENTENCE 27: vertaling ingevoerd 16.2.2021
SENTENC

# Illustrations

In [19]:
# Search for `rs` nodes with type `artwork-m` whose `ref` feature matches "artwork".
artworkQuery = """
rs type=artwork-m ref~artwork
"""
results = A.search(artworkQuery)

  0.00s 14 results


In [20]:
# Tabulate the artwork results with node numbers; `end=1` limits the table to the first result.
A.table(results, withNodes=True, end=1)

n,p,rs
1,proeftuin@19090421y_IONG_1304:7,"19590 Piet Mondriaan, Leo Gestel, Meisjeskop, 1910 Alkmaar, Stedelijk Museum Alkmaar, inv./cat.nr. [..]. conté on papier. RKD 277201►Meisjeskop"


## The first letter

In [21]:
# Pretty-display the first letter in full, without node numbers.
firstLetter = F.otype.s("letter")[0]
A.pretty(firstLetter, full=True, withNodes=False)

## Pages

In [22]:
# Find all `page` nodes, then show the results up to `end=2` in non-full display.
pages = A.search("""
page
""")
A.show(pages, end=2, full=False)

  0.00s 51 results


## Check the extra features

In [23]:
# Select the loaded node features whose metadata carries conversionCode "tt" ...
features = tuple(
    name for name in Fall() if Fs(name).meta.get("conversionCode") == "tt"
)

# ... and report the conversion code and conversion method of each of them.
for name in features:
    info = Fs(name).meta
    print(f"{name:<15}: {info['conversionCode']}: {info['conversionMethod']}")

artmondriaanref: tt: derived
correspondent  : tt: derived
country        : tt: derived
exhibitionref  : tt: derived
institution    : tt: derived
letterid       : tt: derived
location       : tt: derived
msid           : tt: derived
period         : tt: derived
periodlong     : tt: derived
personref      : tt: derived
sender         : tt: derived


In [25]:
# Pretty-display every letter, additionally showing the features selected above.
for letterNode in F.otype.s("letter"):
    A.pretty(letterNode, extraFeatures=features)

In [28]:
# For each reference feature: report how many nodes carry it and pretty-display
# the first five of those nodes. Features that cannot be obtained count as 0 items.
for feature in ("personref", "artmondriaanref", "artref", "exhibitionref"):
    fObj = Fs(feature)
    if not fObj:
        dm(f"### {feature} with 0 items\n\n")
        continue

    items = list(fObj.items())
    dm(f"### {feature} with {len(items)} items\n\n")
    for (node, _value) in items[0:5]:
        A.pretty(node, extraFeatures=f"ref key {feature}")
        

### personref with 121 items



### artmondriaanref with 20 items



 1m 23s Node feature "artref" not loaded


### artref with 0 items



### exhibitionref with 16 items



# Link attributes

In [29]:
# Collect the names of all loaded edge features that start with "link_".
linkFeatures = [edgeName for edgeName in Eall() if edgeName.startswith("link_")]
linkFeatures

['link_ref', 'link_target']

In [55]:
# Preview the first five edges of every link feature as markdown:
# the source node with its attribute value, followed by the target nodes with their ids.
for feature in linkFeatures:
    att = feature[len("link_"):]
    eObj = Es(feature)
    if not eObj:
        dm(f"### {feature} with 0 items\n\n")
        continue

    items = list(eObj.items())
    pieces = [f"### {feature} with {len(items)} items\n\n"]

    for (nodeFrom, nodesTo) in items[0:5]:
        fromType = F.otype.v(nodeFrom)
        attValue = Fs(att).v(nodeFrom)
        pieces.append(f"{fromType} `{nodeFrom}` links via *{att}*=`{attValue}` to\n\n")
        pieces.extend(
            f"* {F.otype.v(nodeTo)} `{nodeTo}` *id*=`{F.id.v(nodeTo)}`\n"
            for nodeTo in nodesTo
        )
        pieces.append("\n")

    dm("".join(pieces))

### link_ref with 10 items

rs `19598` links via *ref*=`artwork.xml#a68554` to

* artwork `17764` *id*=`a68554`

rs `19654` links via *ref*=`artwork.xml#a62319` to

* artwork `17769` *id*=`a62319`

rs `19655` links via *ref*=`artwork.xml#a68733` to

* artwork `17761` *id*=`a68733`

rs `19661` links via *ref*=`artwork.xml#a277201` to

* artwork `17762` *id*=`a277201`

rs `19687` links via *ref*=`artwork.xml#a194515` to

* artwork `17768` *id*=`a194515`



### link_target with 68 items

ptr `19352` links via *target*=`#Zus` to

* note `18991` *id*=`Zus`

ptr `19357` links via *target*=`#gekreukteEnveloppe` to

* note `19002` *id*=`gekreukteEnveloppe`

ptr `19358` links via *target*=`#jurySintLucas` to

* note `19003` *id*=`jurySintLucas`

ptr `19359` links via *target*=`#PaaschdagenArnhem` to

* note `19004` *id*=`PaaschdagenArnhem`

ptr `19363` links via *target*=`#theosofischeLezingen` to

* note `19017` *id*=`theosofischeLezingen`



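Now a rigorous test: for every link edge we check that the value(s) of the linking attribute on the source node
correspond exactly to the file names and ids of the target nodes.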

In [68]:
def _fileNameOf(node):
    """Return the `file` feature value of the file node that is, or contains, `node`."""
    fileNode = node if F.otype.v(node) == "file" else L.u(node, otype="file")[0]
    return F.file.v(fileNode)


# For every link edge feature, verify that the references found in the linking
# attribute of the source node are exactly the `file.xml#id` addresses of the
# target nodes. Mismatches are collected and reported per feature.
for feature in linkFeatures:
    att = feature[5:]  # drop the "link_" prefix to get the attribute name
    eObj = Es(feature)

    if not eObj:
        dm(f"### {feature} with 0 items\n\n")
        continue

    items = list(eObj.items())
    nItems = len(items)
    dm(f"### {feature} with {nItems} items linked by attribute {att}\n\n")

    attF = Fs(att)  # the same for all items, so look it up once
    oks = 0
    nonOks = []

    for (nodeFrom, nodesTo) in items:
        nameFrom = _fileNameOf(nodeFrom)

        # Normalise each reference to `file.xml#id`; a reference that starts
        # with "#" points into the file of the source node itself.
        # A missing attribute value yields no references instead of a crash,
        # so that the node is reported as a problem below.
        refVals = set()
        for refVal in set((attF.v(nodeFrom) or "").split()):
            (filePart, sep, idPart) = refVal.partition("#")
            if sep and not filePart:
                filePart = f"{nameFrom}.xml"
            refVals.add(f"{filePart}#{idPart}")

        # The addresses of the nodes that the edge actually leads to.
        idVals = {
            "#".join((f"{_fileNameOf(nodeTo)}.xml", F.id.v(nodeTo) or ""))
            for nodeTo in nodesTo
        }

        if refVals == idVals:
            oks += 1
        else:
            nonOks.append((nodeFrom, refVals, idVals))

    if len(nonOks) == 0:
        dm(f"All {oks} correctly linked\n")
    else:
        dm(f"{oks} correctly linked\n\n")
        dm(f"{len(nonOks)} problems:\n\n")
        for (nodeFrom, refVals, idVals) in nonOks:
            dm(f"* `{nodeFrom}`: `{refVals}` not equal to `{idVals}`\n")


### link_ref with 10 items linked by attribute ref



All 10 correctly linked


### link_target with 68 items linked by attribute target



All 68 correctly linked


## Overlapping divs

There may be divs nested inside other divs.
Let's find them all.

First the total number of divs:

In [69]:
# Count all nodes of type `div`.
divNodes = F.otype.s("div")
len(divNodes)

31

In [70]:
# Search for pairs of divs related by `&&`, with `d1 < d2` so that each pair is
# reported in one order only.
# NOTE(review): `&&` is presumably the Text-Fabric "overlap" relation — confirm
# in the search docs.
query = """
d1:div
&& d2:div

d1 < d2
"""

# Kept under its own name because it is compared with `resultsD` further down.
resultsA = A.search(query)

  0.01s 0 results


We can also find the divs that are directly under another div by means of the `parent` edges:

In [71]:
# Search for pairs of divs that are connected by a `parent` edge.
query = """
div
<parent- div
"""

# Kept under its own name because it is compared with `resultsA` further down.
resultsD = A.search(query)

  0.00s 0 results


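If the first query yields results that the second one does not, then some divs are nested, but not directly below each other.
(In the run shown here both queries yield 0 results, so no divs are nested at all.)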

Let's see which they are.

In [72]:
# Turn both result lists into sets so that they can be compared with set difference.
arbitrarily = set(resultsA)  # div pairs found by the `&&` query
directly = set(resultsD)  # div pairs found by the `parent` edge query

It is to be expected that the arbitrarily nested divs are a superset of the directly nested divs.

In [73]:
# Pairs that are directly nested but were not found by the overlap query.
directly.difference(arbitrarily)

set()

Now the other way round:

In [74]:
# Pairs found by the overlap query that are not directly nested.
results = arbitrarily.difference(directly)
results

set()

In [75]:
# Tabulate the remaining pairs in sorted order, limited by `end=2`.
orderedResults = sorted(results)
A.table(orderedResults, end=2)

In [76]:
# Search for chains of three divs in which consecutive divs are connected by `parent` edges.
query = """
div
<parent- div
<parent- div
"""
results = A.search(query)

  0.00s 0 results


In [77]:
from textwrap import dedent

In [78]:
# Probe the `sibling` edges between divs for the values 1 through 4.
# NOTE(review): in Text-Fabric search templates `-sibling>N>` may be read as
# "edge value greater than N"; if "exactly N" is intended (as the printed label
# suggests), the template should probably use `-sibling=N>` — confirm in the docs.
for distance in range(1, 5):
    print(f"div siblings at distance {distance}")
    query = dedent(
        f"""
        div
        -sibling>{distance}> div
        """
    )
    results = A.search(query)

div siblings at distance 1
  0.00s 2 results
div siblings at distance 2
  0.00s 0 results
div siblings at distance 3
  0.00s 0 results
div siblings at distance 4
  0.00s 0 results


### Notes

In [79]:
# Show one sample note (the slice selects the fifth note node) in full, followed
# by the chunk that contains the first token of the note.
# The heading counts within the slice, so it reads "Note 1".
for (i, nn) in enumerate(F.otype.s("note")[4:5]):
    A.dm(f"### Note {i + 1}\n\n")
    A.pretty(nn, withNodes=True, full=True)

    # Reuse the token lookup instead of calling `L.d` twice, and skip the chunk
    # display when the note has no tokens or no enclosing chunk.
    tokens = L.d(nn, otype="token")
    chunks = L.u(tokens[0], otype="chunk") if tokens else ()
    if chunks:
        A.pretty(chunks[0], withNodes=True, full=True)

### Note 1

