In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from tf.app import use
from tf.convert.tei import TEI
from tf.convert.addnlp import NLPipeline
from tf.advanced.helpers import dm

In [3]:
# Organisation and repository name of the corpus; further down they are
# combined as f"{ORG}/{REPO}:clone" when the dataset is loaded with `use()`.
ORG = "annotation"
REPO = "mondriaan"

# Convert from TEI to TF

We show how to convert a TEI data source into TF.

This has two stages:

1. make a preliminary TF dataset with the character as slot type
1. feed the plain text to a tokenizer, and add tokens and sentences to the dataset,
   while removing its character and word nodes;
   the new slot type is token.
   
A dataset based on characters is precise, but rather inefficient.
The second step makes the dataset much more efficient.

**More ways to do it!**

* [convertExpress](convertExpress.ipynb) : as few commands/feedback/interaction as possible, 
* [convertSteps](convertSteps.ipynb): broken down in a few command line commands, more feedback
* *convertDetails*: run from Python with full control

## Preliminary conversion

Same as in [convertSteps](convertSteps.ipynb) but now with even more feedback.

### Step 1: Check

Check the input: validity of the TEI-XML.

Make a report of the elements and attributes used.

Use the declared schemas in the XML source to determine which elements have
pure content and which ones mixed content.

In [4]:
# Set up the TEI converter; the TF data it produces gets version "0.8.9pre".
# NOTE(review): the exact meaning of `verbose=-1` and `tei=0` is not visible
# in this notebook — check the TEI converter documentation.
Tei = TEI(verbose=-1, tei=0, tf="0.8.9pre")

In [5]:
# Validate the TEI-XML input and report on the elements and attributes used.
Tei.task(check=True)

Start folder proeftuin:
  14 19100131_SAAL_ARNO_0018.xml                       
End   folder proeftuin

Validation OK
14 processing instructions encountered.
Namespaces OK


True

### Step 2: Convert

Run the actual conversion and produce TF output.

In [6]:
# Run the conversion proper and produce the TF output.
Tei.task(convert=True)

Start folder proeftuin:
  14 19100131_SAAL_ARNO_0018.xml                       
End   folder proeftuin



True

### Step 3: Load the TF data

The final proof that the conversion has worked is to load the data.
On first-time loading several checks and precomputations are performed.
Next time the loading will be much quicker.

In [7]:
# Load the freshly generated TF data; the first load performs checks and
# precomputations, later loads are quicker.
Tei.task(load=True)

True

### Step 4: Configure a TF app

The TF app has configuration settings, a bit of custom code, and documentation.

Most of it will be generated now, but there are ways to keep custom additions intact.

In [8]:
# Generate the TF app: configuration settings, custom code and documentation.
Tei.task(app=True)

App updated


True

## View the preliminary result

In [9]:
# Load the preliminary dataset; `Apre` is the app handle for it.
Apre = use(f"{ORG}/{REPO}:clone", checkout="clone")

**Locating corpus resources ...**

Name,# of nodes,# slots/node,% coverage
folder,1,62386.0,100
letter,14,4456.14,100
body,14,2169.43,49
text,14,2169.43,49
teiHeader,14,702.64,16
chunk,100,623.73,100
div,30,1011.47,49
standOff,14,1570.86,35
page,51,544.61,45
listAnnotation,46,478.09,35


We hoist the API handles of this dataset to the global scope.

In [10]:
# Put the API handles of the preliminary dataset (such as `F`) in the global scope.
Apre.hoist(globals())

### Show a fragment

In [11]:
# Take the chunk at index 4 and show it as plain text.
chunkNodes = F.otype.s("chunk")
chunk = chunkNodes[4]
Apre.plain(chunk)

## Show the processing instructions

In [12]:
# Collect the node types whose name starts with "?" and display every node
# of those types with all of its features.
piTypes = [nodeType for nodeType in F.otype.all if nodeType.startswith("?")]

for piType in piTypes:
    for piNode in F.otype.s(piType):
        Apre.pretty(piNode, multiFeatures=True)

## Check the extra features

In [13]:
def hasConversionCode(feat, code="tt"):
    """Tell whether the metadata of feature `feat` has this `conversionCode`."""
    return Fs(feat).meta.get("conversionCode", None) == code


# All loaded features that are marked with conversion code "tt".
features = tuple(filter(hasConversionCode, Fall()))

# List them with their conversion code and conversion method.
for feat in features:
    meta = Fs(feat).meta
    print(f"{feat:<15}: {meta['conversionCode']}: {meta['conversionMethod']}")

artmondriaanref: tt: derived
correspondent  : tt: derived
country        : tt: derived
exhibitionref  : tt: derived
institution    : tt: derived
letterid       : tt: derived
location       : tt: derived
msid           : tt: derived
period         : tt: derived
periodlong     : tt: derived
personref      : tt: derived
sender         : tt: derived


In [14]:
# Display every letter, with the extra features added to the display.
letterNodes = F.otype.s("letter")

for letterNode in letterNodes:
    Apre.pretty(letterNode, extraFeatures=features)

In [15]:
# For each reference feature: print a heading with the number of items and
# display the nodes of at most the first five items.
for feature in ("personref", "artmondriaanref", "artref", "exhibitionref"):
    fObj = Fs(feature)
    # A feature that is not available counts as having no items at all.
    items = list(fObj.items()) if fObj else []
    nItems = len(items)
    dm(f"### {feature} with {nItems} items\n\n")

    for (node, _value) in items[:5]:
        Apre.pretty(node, extraFeatures=f"ref key {feature}", baseTypes={"word"})
        

### personref with 112 items



### artmondriaanref with 18 items



  6.85s Node feature "artref" not loaded


### artref with 0 items



### exhibitionref with 15 items



## Memory resources

Check the memory usage per feature.

Keep an eye on the footprint of the `sibling` feature, because it might become too large
in a bigger corpus.

In [16]:
# Report the memory usage per feature of the preliminary dataset.
Apre.footprint()

                                                


# 70 features

feature | members | size in bytes
--- | --- | ---
__levUp__ | 75,109 | 4,962,896
ch | 62,386 | 4,373,868
oslots | 3 | 3,370,288
__boundary__ | 2 | 2,822,488
__order__ | 75,109 | 2,703,964
is_note | 25,415 | 2,022,470
__levDown__ | 12,723 | 1,528,432
is_meta | 11,356 | 907,930
str | 10,846 | 740,014
after | 10,846 | 605,101
sibling | 949 | 366,804
__rank__ | 75,109 | 319,304
parent | 1,641 | 288,604
otype | 4 | 105,793
extraspace | 804 | 59,514
empty | 464 | 31,504
type | 386 | 31,214
id | 320 | 22,884
rend_italics | 308 | 17,956
n | 277 | 17,766
rend_underline | 212 | 15,268
__levels__ | 69 | 14,897
__characters__ | 1 | 14,559
target | 139 | 14,164
__sections__ | 2 | 10,785
personref | 112 | 9,575
ref | 112 | 9,575
who | 127 | 8,550
chunk | 100 | 7,712
url | 63 | 7,279
rend_upsidedown | 90 | 7,236
rend | 72 | 5,152
lang | 74 | 4,438
facs | 48 | 4,036
f | 46 | 3,654
key | 34 | 3,575
rend_spaced | 38 | 2,260
rend_blockletter | 34 | 2,148
periodlong | 14 | 2,113
letter | 14 | 1,977
letterid | 14 | 1,977
msid | 14 | 1,972
artmondriaanref | 18 | 1,950
period | 14 | 1,720
when | 14 | 1,720
exhibitionref | 15 | 1,397
institution | 14 | 1,397
correspondent | 14 | 1,369
location | 14 | 1,258
form | 14 | 1,206
country | 14 | 1,189
sender | 14 | 1,087
template | 14 | 1,079
adaptation | 14 | 1,075
rend_super | 12 | 996
unit | 9 | 712
quantity | 9 | 704
dim | 9 | 661
rend_above | 8 | 604
rend_center | 6 | 548
rend_right | 6 | 548
rend_right_underline | 6 | 548
place | 3 | 471
rend_underline2 | 5 | 392
rend_norend | 4 | 364
reason | 2 | 339
folder | 1 | 310
rend_overwritten | 2 | 308
rend_super_underline2 | 2 | 308
rend_super_underline | 1 | 280
TOTAL | 365,744 | 25,476,236

## Add tokens and sentences

We add tokens and sentences to the TF dataset.

We do this by the following steps:

1. Generate a plain text plus mapping between character positions and nodes
2. Use Spacy to tokenize the text and to determine sentence boundaries
3. translate the Spacy results back to extra nodes and features for the TF set
4. replace the character slots in the TF set by tokens

### Step by step from Python

We carry out the steps from within Python.

In that way we get access to all intermediate results, and we can play and explore between the steps.

We load the data we have so far, and pass it on to an `NLPipeline` object, defined by Text-Fabric.

### Back to the previous state

When we have added the data to the dataset, we will tweak the TF app.

But if we want to redo the pipeline, we have to restore the app to the situation before
the tokens and sentences were added.

That's the reason we have the next cell.

In [17]:
# Regenerate the TF app, so that it is back in the state it had before
# tokens and sentences were added; this makes the pipeline below re-runnable.
Tei.task(app=True)

App updated


True

In [18]:
# Reload the preliminary dataset and hoist its API handles to the global scope in one go.
Apre = use(f"{ORG}/{REPO}:clone", checkout="clone", hoist=globals())
# NOTE(review): the pipeline is set up with lang="en", while the sample output
# further down contains Dutch as well as English text — confirm this is intended.
NLP = NLPipeline(lang="en", verbose=0, write=True)
# Hand the loaded dataset to the pipeline.
NLP.loadApp(Apre)

**Locating corpus resources ...**

Name,# of nodes,# slots/node,% coverage
folder,1,62386.0,100
letter,14,4456.14,100
body,14,2169.43,49
text,14,2169.43,49
teiHeader,14,702.64,16
chunk,100,623.73,100
div,30,1011.47,49
standOff,14,1570.86,35
page,51,544.61,45
listAnnotation,46,478.09,35


Input data has version 0.8.9pre
Compute element boundaries
  1210 start postions
  1437 end postions


### Before the steps

We can set the verbosity as we like.

Generate plain text (add `verbose=-1` or `0` or `1` and/or `write=True` if you like).

* `verbose=-1` is the same as `-verbose`
* `verbose=0` is the same as `+verbose`
* `verbose=1` is the same as `++verbose`

### Step 1: Generate a plain text of the whole corpus

The function delivers the text in a variable, and it has recorded which character positions correspond
to which slots in the TF dataset.

We receive both items of data.

In [19]:
# Generate the plain text of the corpus, together with the record of which
# character positions correspond to which slots in the TF dataset.
(text, positions) = NLP.task(plaintext=True)

Input data has version 0.8.9pre
Compute element boundaries
  1210 start postions
  1437 end postions
  0.00s Generating a plain text with positions ...
Analysing ~/github/annotation/text-fabric/tf/tools/tei/tei_all.xsd
   |   Found 232 empty slots
   |   recorded flow main       with 122399 items
   |   recorded flow del        with    159 items
   |   recorded flow note       with  66421 items
   |   recorded flow orig       with    104 items
  0.18s Done. Generated text and positions written to ~/github/annotation/mondriaan/_temp/txt/plain.txt


### Step 2: Run Spacy to get tokens and sentences

Now we feed the text from step 1 into the NLP pipeline, which is Spacy.

We get a list of tokens and a list of sentences back.

In [20]:
# Feed the plain text to the NLP pipeline (Spacy); we get a list of tokens
# and a list of sentences back.
(tokens, sentences) = NLP.task(lingo=True, text=text)

Input data has version 0.8.9pre
Compute element boundaries
  1210 start postions
  1437 end postions
  0.00s Using NLP pipeline Spacy (en) ...
  2.95s Atomic tokens written to ~/github/annotation/mondriaan/_temp/txt/tokens.tsv
  2.95s Sentences written to ~/github/annotation/mondriaan/_temp/txt/sentences.tsv
  2.95s NLP done


Let's examine a few tokens and sentences:

In [21]:
# Peek at ten consecutive token entries.
tokenSample = tokens[1022:1032]

for entry in tokenSample:
    print(entry)

(3995, 4000, 'groot', ' ')
(4001, 4005, 'deel', ' ')
(4006, 4009, 'van', ' ')
(4010, 4012, 'de', ' ')
(4013, 4017, 'week', ' ')
(4018, 4025, 'geweest', '')
(4025, 4026, '.', ' ')
(4027, 4029, 'Ik', ' ')
(4030, 4033, 'woû', ' ')
(4034, 4036, 'je', ' ')


In [22]:
# Peek at another run of ten consecutive token entries.
tokenSample = tokens[1211:1221]

for entry in tokenSample:
    print(entry)

(4707, 4709, 'to', ' ')
(4710, 4714, 'come', ' ')
(4715, 4718, 'and', ' ')
(4719, 4722, 'see', ' ')
(4723, 4726, 'you', ' ')
(4727, 4734, 'between', ' ')
(4735, 4736, '4', ' ')
(4737, 4740, 'and', ' ')
(4741, 4742, '5', '')
(4742, 4743, ',', ' ')


Each token entry specifies the start and end position in the plain text file,
the string value of the token, and the whitespace after the token, if any.

In [23]:
# Peek at a run of consecutive sentence entries.
sentenceSample = sentences[136:155]

for entry in sentenceSample:
    print(entry)

(3978, 4026, 'Ik ben er al een groot deel van de week geweest.')
(4027, 4231, 'Ik woû je wel graag tusschen 4 - 5 even komen opzoeken, anders zie ik je weer in een week denkelijk niet, want ik ga met de Paaschdagen naar Arnhem.￮ Ik kom dus morgen even zien of je thuis ben.  xxx. ')
(4231, 4239, 'Aa bb. ')
(4239, 4248, 'End p. ')
(4248, 4252, 'Dag!')
(4253, 4290, 'Veel beste groeten je Piet.  xxx. ')
(4290, 4298, 'Aa bb. ')
(4298, 4319, 'End div.   xxx. ')
(4319, 4327, 'Aa bb. ')
(4327, 4342, 'End chunk.  ')
(4342, 4350, 'Aa bb. ')
(4350, 4628, 'Begin chunk. Dear Zus,I’m sorry I’m sending you such a crumpled envelope,￮ but I didn’t have any more in the house, and I had to tell you that unfortunately I have to be in the museum tomorrow afternoon when the paintings are hung because I’m on the St Lucas jury.￮  xxx. ')
(4628, 4636, 'Aa bb. ')
(4636, 4645, 'End p. ')
(4645, 4697, 'I’ve already been there for a good part of the week.')
(4698, 4900, 'I’d like to come and see you between 4 and 

Sentence entries have the same fields, except for the last whitespace field.

Actually, the program will not use the texts of tokens and sentences for display, only for determining
where the boundaries are.

With those boundaries in hand, the texts of tokens and sentences are read off from the original corpus.

### Step 3: Ingest the results in the data set

A lot of critical things happen when we ingest the token and sentence streams into our dataset.

We calculate slot positions, retrieve text, split some tokens, and last but not least,
we replace the character-by-character basis of the preliminary dataset by a token-by-token basis.

In [24]:
# Display the first word node that lies above node 2957, with all its features,
# as it is in the preliminary dataset before the ingest step.
wordNode = L.u(2957, otype="word")[0]
Apre.pretty(wordNode, multiFeatures=True)

In [25]:
# Ingest the results of the previous steps into the dataset.
ingestInput = dict(positions=positions, tokens=tokens, sentences=sentences)
newVersion = NLP.task(ingest=True, **ingestInput)

Input data has version 0.8.9pre
Compute element boundaries
  1210 start postions
  1437 end postions
  0.00s Ingesting tokens, and sentences into the dataset ...
   |     8.69s Mapping NLP data to nodes and features ...
   |      |     0.00s generating t-nodes with features str, after, empty
   |      |      |     0.00s 13405 t nodes have values assigned for str, after
   |      |      |     0.00s 232 empty slots have surrounding split tokens
   |      |      |     0.00s 68 space slots have split into chars
   |      |      |     0.00s 1600 slots have split around an element boundary
   |      |      |     0.00s  6497x Items contained in extra generated text
   |      |     0.06s 13017 tokens
   |      |     0.06s 13405 ts
   |      |     0.06s generating sentence-nodes with features nsent
   |      |      |     0.02s 1050 sentence nodes have values assigned for nsent
   |      |      |     0.02s   977x Items contained in extra generated text
   |      |      |     0.02s    61x Items w

### Step 4: Adjust the app to the modified dataset

Various things in the `config.yaml` and `app.py` of the TF app should be updated, as well
as the documentation file that gives the ins and outs of the resulting features.

In [26]:
# Update the TF app (`config.yaml`, `app.py` and the feature documentation)
# for the dataset with tokens and sentences.
Tei.task(apptoken=True)

App updated with tokens and sentences 


True

# Use the new dataset

We can now use the resulting dataset in the usual way.
Because we have adapted the TF app, the version without the `pre` will now be loaded.

In [27]:
# Load the new dataset; because the TF app has been adapted, the version
# without `pre` is loaded now.
A = use(f"{ORG}/{REPO}:clone", checkout="clone", silent="verbose")

**Locating corpus resources ...**

This is Text-Fabric 11.4.16
62 features found and 0 ignored
   |     0.02s T otype                from ~/github/annotation/mondriaan/tf/0.8.9
   |     0.19s T oslots               from ~/github/annotation/mondriaan/tf/0.8.9
  0.21s Dataset without structure sections in otext:no structure functions in the T-API
   |     0.00s T chunk                from ~/github/annotation/mondriaan/tf/0.8.9
   |     0.05s T after                from ~/github/annotation/mondriaan/tf/0.8.9
   |     0.00s T letter               from ~/github/annotation/mondriaan/tf/0.8.9
   |     0.06s T str                  from ~/github/annotation/mondriaan/tf/0.8.9
   |     0.00s T folder               from ~/github/annotation/mondriaan/tf/0.8.9
   |      |     0.00s C __levels__           from otype, oslots, otext
   |      |     0.13s C __order__            from otype, oslots, __levels__
   |      |     0.01s C __rank__             from otype, __order__
   |      |     0.26s C __levUp__            from otype, oslots,

Name,# of nodes,# slots/node,% coverage
folder,1,13405.0,100
letter,14,957.5,100
body,14,499.5,52
text,14,499.5,52
chunk,100,133.92,100
div,30,232.17,52
standOff,14,314.0,33
teiHeader,14,132.57,14
page,51,125.73,48
listAnnotation,46,95.57,33


We hoist the API handles of this dataset to the global scope.

In [28]:
# Put the API handles of the new dataset in the global scope.
A.hoist(globals())

## Memory resources (revisited)

We now have a leaner dataset, because the granularity has become coarser: from character
to token.

In [29]:
# Report the memory usage per feature of the token-based dataset.
A.footprint()

                                                


# 69 features

feature | members | size in bytes
--- | --- | ---
__levUp__ | 29,349 | 5,495,104
str | 26,422 | 2,196,373
after | 26,042 | 2,040,075
__boundary__ | 2 | 1,982,352
__levDown__ | 15,944 | 1,909,040
oslots | 3 | 1,893,612
__order__ | 29,349 | 1,056,604
sibling | 949 | 366,804
parent | 1,641 | 288,604
is_note | 4,382 | 270,290
otype | 4 | 131,616
is_meta | 1,856 | 125,826
__rank__ | 29,349 | 121,180
nsent | 1,050 | 95,752
extraspace | 783 | 58,926
type | 386 | 31,214
id | 320 | 22,884
n | 277 | 17,766
empty | 232 | 15,800
__levels__ | 70 | 15,056
__characters__ | 1 | 14,655
target | 139 | 14,164
__sections__ | 2 | 10,757
personref | 112 | 9,575
ref | 112 | 9,575
who | 127 | 8,550
chunk | 100 | 7,712
url | 63 | 7,279
rend | 72 | 5,152
lang | 74 | 4,438
facs | 48 | 4,036
rend_underline | 59 | 3,944
f | 46 | 3,654
rend_italics | 47 | 3,608
key | 34 | 3,575
periodlong | 14 | 2,113
msid | 14 | 2,001
letter | 14 | 1,977
letterid | 14 | 1,977
artmondriaanref | 18 | 1,950
period | 14 | 1,720
when | 14 | 1,720
institution | 14 | 1,473
exhibitionref | 15 | 1,397
correspondent | 14 | 1,385
location | 14 | 1,258
form | 14 | 1,206
country | 14 | 1,189
rend_upsidedown | 17 | 1,136
sender | 14 | 1,087
template | 14 | 1,079
adaptation | 14 | 1,075
unit | 9 | 712
quantity | 9 | 704
dim | 9 | 661
rend_blockletter | 6 | 548
place | 3 | 471
rend_super | 5 | 392
rend_spaced | 4 | 364
reason | 2 | 339
folder | 1 | 310
rend_above | 2 | 308
rend_center | 2 | 308
rend_overwritten | 2 | 308
rend_right | 2 | 308
rend_right_underline | 2 | 308
rend_super_underline2 | 2 | 308
rend_super_underline | 1 | 280
rend_underline2 | 1 | 280
TOTAL | 169,754 | 18,278,204

# Exploration

We walk around a bit more in the corpus.

## All titles:

In [30]:
# Print each titleStmt node followed by its text.
titleNodes = F.otype.s("titleStmt")

for titleNode in titleNodes:
    print(titleNode, T.text(titleNode))

15251 Brief aan Aletta de Iongh. Amsterdam, dinsdag 16 februari, dinsdag 2 maart of dinsdag 9 maart 1909.
Wietse Coppes
Leo Jansen
Mondriaan Editieproject

15252 Brief aan Aletta de Iongh. Amsterdam, woensdag 7 april 1909.
Wietse Coppes
Leo Jansen
Mondriaan Editieproject

15253 Brief aan Aletta de Iongh. Amsterdam, tussen maandag 19 en vrijdag 23 april 1909.
Wietse Coppes
Leo Jansen
Mondriaan Editieproject

15254 Brief aan Aletta de Iongh. Amsterdam, maandag 26 april 1909.
Wietse Coppes
Leo Jansen
Mondriaan Editieproject

15255 Brief aan Aletta de Iongh. Amsterdam, donderdag 13 mei 1909.
Wietse Coppes
Leo Jansen
Mondriaan Editieproject

15256 Brief aan Aletta de Iongh. Amsterdam, donderdag 24 juni 1909.
Wietse Coppes
Leo Jansen
Mondriaan Editieproject

15257 Brief aan Aletta de Iongh. Amsterdam, eerste helft augustus 1909.
Wietse Coppes
Leo Jansen
Mondriaan Editieproject

15258  Briefkaart aan Gerrit Willem Knap. Zoutelande, c. dinsdag 24 augustus 1909.
Wietse Coppes
Leo Jansen
Mondria

# Tokens

Show all the tokens that are split into atomic tokens.

In [34]:
# Search for tokens that consist of (at least) two `t` nodes.
# NOTE(review): presumably `=:` requires the first `t` to start where the token
# starts and `<:` requires the second `t` to follow the first one immediately —
# confirm against the TF search documentation.
query = """
token
  =: t
  <: t
"""

results = A.search(query)

  0.02s 10 results


In [35]:
# Tabulate the results, with `token` as the condense type.
A.table(results, condenseType="token")

n,p,token,t,t.1
1,proeftuin@19090407y_IONG_1739:5,S t,S,t
2,proeftuin@19090426y_IONG_1738:5,1 e,1,e
3,proeftuin@19090513y_IONG_1293:5,S t,S,t
4,proeftuin@19090624_IONG_1294:6,k a,k,a
5,proeftuin@19090824y_KNAP_1747:5,42II,42,II
6,proeftuin@19090824y_KNAP_1747:6,Vr. n,Vr.,n
7,proeftuin@19090905y_IONG_1295:5,4.a,4,.a
8,proeftuin@19091024y_IONG_1297:5,Nov.ber,Nov.,ber
9,proeftuin@19100131_SAAL_ARNO_0018:5,onjuist,on,juist
10,proeftuin@19100131_SAAL_ARNO_0018:5,een,e,en


## Sentences

In [36]:
# Print the text of the sentences at index 2 and 3.
sentenceNodes = F.otype.s("sentence")

for sentenceNode in sentenceNodes[2:4]:
    print(T.text(sentenceNode))

Amsterdam, dinsdag 16 februari, dinsdag 2 maart of dinsdag 9 maart 1909.
Wietse Coppes


In [37]:
# Pretty-display the sentences at index 2 and 3, with node numbers.
sentenceNodes = F.otype.s("sentence")

for sentenceNode in sentenceNodes[2:4]:
    A.pretty(sentenceNode, withNodes=True)

In [38]:
# Print the text of the first 100 sentences, numbered from 1.
firstSentences = F.otype.s("sentence")[0:100]

for (i, s) in enumerate(firstSentences):
    print(f"SENTENCE {i + 1}: {T.text(s)}")

SENTENCE 1: ​
SENTENCE 2: Brief aan Aletta de Iongh. 
SENTENCE 3: Amsterdam, dinsdag 16 februari, dinsdag 2 maart of dinsdag 9 maart 1909.
SENTENCE 4: Wietse Coppes
SENTENCE 5: Leo Jansen
SENTENCE 6: Mondriaan Editieproject
SENTENCE 7: Nederland
SENTENCE 8: Otterlo
SENTENCE 9: Kröller Müller Museum
SENTENCE 10: KM 123.397
SENTENCE 11: 19090216y_IONG_1303
SENTENCE 12: ​
SENTENCE 13: ​
SENTENCE 14: ​
SENTENCE 15: Piet Mondriaan
dinsdag 16 februari, dinsdag 2 maart of dinsdag 9 maart 1909
Amsterdam
Aletta de Iongh
SENTENCE 16: transcriptie: voltooid 20.7.15
collatie bron: 6.6.16
tweede collatie aan het origineel: voltooid 26.11.19
invoer tweede collatie: voltooid 5.8.16
bespreking eindversie: gb
markeren annotaties: in bewerking / voltooid
gereed 17.4.2019
titel gecontroleerd 21.09.2020
personen getagd 12.10.2020
vertaling ingevoerd 16.2.2021
codering personen aangepast 16.2.2022
controle/aanpassing afkortingen en emendaties 21.4.2023
SENTENCE 17: ​​
​
​
​
SENTENCE 18: Beste Zus,
​kom je 

# Illustrations

In [39]:
# Search for `rs` nodes with type `artwork-m` whose `key` matches the pattern `[0-9]`.
results = A.search("""
rs type=artwork-m key~[0-9]
""")

  0.00s 12 results


In [40]:
# Show the results up to `end=1`, with node numbers.
A.show(results, withNodes=True, end=1)

## The first letter

In [41]:
# Display the first letter node in full, without node numbers.
A.pretty(F.otype.s("letter")[0], full=True, withNodes=False)

## Pages

In [42]:
# Find all page nodes and tabulate the first two of them.
pages = A.search("""
page
""")
A.table(pages, end=2)

  0.00s 51 results


n,p,page
1,proeftuin@19090216y_IONG_1303:5,"Beste Zus, ​kom je morgenavond (Woensdag) om kwart voor acht ingang kleine zaal Concertgebouw​, dan heb ik een plaats voor v ​. Bulhlig voor je.​En dan kunnen we een andere dan Donderdagmiddag afspreken want dan kan ik niet goed. Met vele beste groeten je Piet."
2,proeftuin@19090216y_IONG_1303:6,"Dear Zus,​ ​If you come to the entrance to the small auditorium in the Concertgebouw at a quarter to eight tomorrow (Wednesday) evening, I have a ticket for van Buhlig for you.​And then we can arrange a time other than Thursday afternoon because I can’t manage that. With my very best wishes, your Piet."


## Overlapping divs

There may be divs nested in other divs.
Let's find them all.

First the total number of divs:

In [43]:
# Count the div nodes.
len(F.otype.s("div"))

30

In [44]:
# Pairs of divs (d1, d2) that satisfy `d1 && d2`, with d1 before d2.
# NOTE(review): `&&` is presumably the overlap relation of TF search, and
# `d1 < d2` makes sure each pair is reported in one order only — confirm.
query = """
d1:div
&& d2:div

d1 < d2
"""

resultsA = A.search(query)

  0.01s 0 results


We can also find the divs that are directly under another div by means of the `parent` edges:

In [45]:
# Pairs of divs that are connected by a `parent` edge, i.e. a div directly
# under another div.
query = """
div
<parent- div
"""

resultsD = A.search(query)

  0.00s 0 results


Some divs may be nested without being directly below each other (in the run shown here, both queries return no results).

Let's see which they are.

In [46]:
# Turn both result lists into sets, so that they can be compared.
arbitrarily = set(resultsA)
directly = set(resultsD)

It is to be expected that the arbitrarily nested divs are a superset of the directly nested divs.

In [47]:
# Directly nested pairs that are not among the arbitrarily nested pairs;
# this is expected to be empty.
directly - arbitrarily

set()

Now the other way round:

In [48]:
# Pairs that are among the arbitrarily nested ones, but not among the directly nested ones.
results = arbitrarily - directly
results

set()

In [49]:
# Tabulate the first two of these pairs, in sorted order.
A.table(sorted(results), end=2)

In [50]:
# Three divs that are chained to each other by `parent` edges.
query = """
div
<parent- div
<parent- div
"""
results = A.search(query)

  0.00s 0 results


In [51]:
from textwrap import dedent

In [52]:
# Look for pairs of divs that are connected by a `sibling` edge, for
# increasing thresholds on the value (the distance) of that edge.
#
# In a TF search template the value spec of an edge feature follows the
# feature name, so `-sibling>{i}>` means: a sibling edge whose value is
# GREATER THAN i. The printed label says so explicitly; use `-sibling={i}>`
# instead if you want the pairs at exactly distance i.
for i in range(1, 5):
    query = dedent(
        f"""
        div
        -sibling>{i}> div
        """
    )

    print(f"div siblings at distance > {i}")
    results = A.search(query)

div siblings at distance 1
  0.00s 2 results
div siblings at distance 2
  0.00s 0 results
div siblings at distance 3
  0.00s 0 results
div siblings at distance 4
  0.00s 0 results


### Notes

In [53]:
# Display a note (the one at index 4) together with the chunk that contains
# its first token.
#
# This section works on the token-based dataset, so everything goes through
# the app `A`, not the preliminary app `Apre`. The tokens of the note get
# their own name, so that the global `tokens` (the output of the NLP step,
# needed when the ingest step is rerun) is not overwritten.
for (i, nn) in enumerate(F.otype.s("note")[4:5]):
    A.dm(f"### Note {i + 1}\n\n")
    noteTokens = L.d(nn, otype="token")
    # the chunk in which the note starts
    noteChunk = L.u(noteTokens[0], otype="chunk")[0]
    A.pretty(nn, withNodes=True, full=True)
    A.pretty(noteChunk, withNodes=True, full=True)

### Note 1

