"""
# TEI import
You can convert any TEI source into TF by specifying a few details about the source.
TF then invokes the `tf.convert.walker` machinery to produce a TF
dataset out of the source.
TF knows the TEI elements, because it will read and parse the complete
TEI schema. From this the set of complex, mixed elements is distilled.
If the TEI source conforms to a customised TEI schema, it will be detected and
the importer will read it and override the generic information of the TEI elements.
It is also possible to pass a choice of template and adaptation in a processing
instruction. This does not influence validation, but it may influence further
processing.
If the TEI consists of multiple source files, it is possible to specify different
templates and adaptations for different files.
The possible values for models, templates, and adaptations should be declared in
the configuration file.
For each model there should be a corresponding schema in the `schema` directory,
either an RNG or an XSD file.
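For example, a hypothetical `tei.yaml` could declare them like this (all names
are illustrative only; the individual keys are documented below):
``` yaml
models:
  - mymodel
templates:
  - letter
  - bibliolist
adaptations:
  - lettertweaks
```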
The converter goes the extra mile: it generates a TF app and documentation
(an *about.md* file and a *transcription.md* file), in such a way that the TF
browser is instantly usable.
The TEI conversion is rather straightforward because of some conventions
that cannot be changed.
# Configuration and customization
We assume that you have a `programs` directory at the top-level of your repo.
In this directory we'll look for two optional files:
* a file `tei.yaml` in which you specify a bunch of values that help to
get the conversion off the ground.
* a file `tei.py` in which you define custom functions that are executed at certain
specific hooks:
* `transform(text)` which takes a text string argument and delivers a
text string as result. The converter will call this on every TEI input
file it reads *before* feeding it to the XML parser.
* `beforeTag`: just before the walker starts processing the start tag of
a TEI element;
* `beforeChildren`: just after processing the start tag, but before processing
the element content (text and child elements);
* `afterChildren`: just after processing the complete element content
(text and child elements), but before processing the end tag of the
TEI element;
* `afterTag`: just after processing the end tag of a TEI element.
The `before` and `after` functions should take the following arguments:
* `cv`: the walker converter object;
* `cur`: the dictionary with information that has been gathered during the
conversion so far and that can be used to dump new information
into; it is nonlocal, i.e. all invocations of the hooks get the same
dictionary object passed to them;
* `xnode`: the LXML node corresponding to the TEI element;
* `tag`: the tag name of the element, without namespaces;
this is a bit redundant, because it can also be extracted from
the `xnode`, but it is convenient.
* `atts`: the attributes (names and values) of the element,
without namespaces;
this is a bit redundant, because it can also be extracted from
the `xnode`, but it is convenient.
These functions should not return anything, but they can write things to
the `cur` dictionary.
They can also create slots and nodes, and terminate them; in short, they
can do every `cv`-based action that is needed.
You can define these functions out of this context, but it is good to know
what information in `cur` is guaranteed to be available:
* `xnest`: the stack of XML tag names seen at this point;
* `tnest`: the stack of TF nodes built at this point;
* `tsiblings` (only if sibling nodes are being recorded): the list of
preceding TF nodes corresponding to the TEI sibling elements of the
current TEI element.
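To make this concrete, here is a minimal sketch of what `programs/tei.py` could
look like. The hook names and signatures are as described above; the bodies
(and the `inDel` bookkeeping key) are purely illustrative:
``` python
# programs/tei.py: optional custom hooks (a sketch, not a real customisation)


def transform(text):
    # called on the raw file contents, before XML parsing
    return text.replace("\u00a0", " ")


def beforeChildren(cv, cur, xnode, tag, atts):
    # called after the start tag has been processed, but before the
    # element content; we can stash our own bookkeeping info in cur
    if tag == "del":
        cur["inDel"] = True


def afterChildren(cv, cur, xnode, tag, atts):
    # called after the element content, before the end tag
    if tag == "del":
        cur["inDel"] = False
```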
## Keys and values of the `tei.yaml` file
### `generic`
dict, optional `{}`
Metadata for all generated TF features.
The actual source version of the TEI files does not have to be stated here;
it will be inserted based on the version that the converter actually uses.
That version depends on the `tei` argument passed to the program.
The key under which the source version will be inserted is `teiVersion`.
### `extra`
dict, optional `{}`
Instructions and metadata for specific generated TF features, namely those that
have not been generated by the vanilla TEI conversion, but by extra code in one
of the customised hooks.
The dict is keyed by feature name, the values are again dictionaries.
These value dictionaries have a key `meta` under which you can put any number of
metadata key-value pairs, such as `description="xxx"`.
If you put the string «base» in such a field, it will be expanded on the
basis of the contents of the `path` key, see below.
You must provide the key `valueType` and pass `int` or `str` there, depending on the
values of the feature.
You may provide extra keys, such as `conversionMethod="derived"`, so that other programs
can determine what to do with these features.
The information in this dict will also end up in the generated feature docs.
Besides the `meta` key, there may also be the keys `path` and `nodeType`.
Together they contain an instruction to produce a feature value from element content
that can be found on the current stack of XML nodes and attributes.
The value found will be put in the feature in question
for the most recently constructed node of the type specified in `nodeType`.
Example:
``` yaml
extra:
letterid:
meta:
description: The identifier of a letter; «base»
valueType: str
conversionMethod: derived
conversionCode: tt
path:
- idno:
type: letterId
- altIdentifier
- msIdentifier
- msDesc
- sourceDesc
nodeType: letter
feature: letterid
```
The meaning is:
* if, while parsing the XML, I encounter an element `idno`,
* and if that element has an attribute `type` with value `letterId`,
* and if it has parent `altIdentifier`,
* and grandparent `msIdentifier`,
* and great-grandparent `msDesc`,
* and great-great-grandparent `sourceDesc`,
* then look up the last created node of type `letter`
* and get the text content of the current XML node (the `idno` one),
* and put it in the feature `letterid` for that node.
* Moreover, the feature `letterid` gets metadata as specified under the key `meta`,
where the `description` will be filled with the text:
The identifier of a letter; the content is taken from sourceDesc/msDesc/msIdentifier/altIdentifier/idno[type=letterId]
### `models`
list, optional `[]`
Which TEI-based schemas are to be used.
For each model there should be an XSD or RNG file with that name in the `schema`
directory. The `tei_all` schema is known to TF; no need to specify that one.
We'll try a RelaxNG schema (`.rng`) first. If that exists, we use it for validation
with JING, and we also convert it with TRANG to an XSD schema, which we use for
analysing the schema: we want to know which elements are mixed and pure.
If there is no RelaxNG schema, we try an XSD schema (`.xsd`). If that exists,
we can do the analysis, and we will use it also for validation.
!!! note "Problems with RelaxNG validation"
RelaxNG validation is not always reliable when performed with LXML, or any tool
based on `libxml`, for that matter. That's why we try to avoid it. Even if we
translate the RelaxNG schema to an XSD schema by means of TRANG, the resulting
validation is not always reliable. So we use JING to validate the RelaxNG schema.
See also [JING-TRANG](https://code.google.com/archive/p/jing-trang/downloads).
### `templates`
list, optional `[]`
Which template(s) are to be used.
A template is just a keyword, associated with an XML file, that can be used to switch
to a specific kind of processing, such as `letter`, `bibliolist`, `artworklist`.
You may specify an element or processing instruction with an attribute
that triggers the template for the file in which it is found.
This will be retrieved from the file before XML parsing starts.
For example,
``` python
templateTrigger="?editem@template"
```
will read the file and extract the value of the `template` attribute of the `editem`
processing instruction and use that as the template for this file.
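For instance, a file that should be processed with a (hypothetical) template
`letter` would then contain the processing instruction
``` xml
<?editem template="letter"?>
```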
If no template is found in this way, the empty template is assumed.
### `adaptations`
list, optional `[]`
Which adaptation(s) are to be used.
An adaptation is just a keyword, associated with an XML file, that can be used to switch
to a specific kind of processing.
It is meant to trigger tweaks on top of the behaviour of a template.
You may specify an element or processing instruction with an attribute
that triggers the adaptation for the file in which it is found.
This will be retrieved from the file before XML parsing starts.
For example,
``` python
adaptationTrigger="?editem@adaptation"
```
will read the file and extract the value of the `adaptation` attribute of the `editem`
processing instruction and use that as the adaptation for this file.
If no adaptation is found in this way, the empty adaptation is assumed.
### `prelim`
boolean, optional `True`
Whether to work with the `pre` TF versions.
Use this if you convert TEI to a preliminary TF dataset, which will
receive NLP additions later on. That version will then lose the `pre`.
### `wordAsSlot`
boolean, optional `False`
Whether to take words as the basic entities (slots).
If not, the characters are taken as basic entities.
If you use an NLP pipeline to detect tokens, use the value `False`.
The preliminary dataset is then based on characters, but the final dataset that we build
from there is based on tokens, which are mostly words and non-word characters.
### `parentEdges`
boolean, optional `True`
Whether to create edges between nodes that correspond to XML elements and their parents.
### `siblingEdges`
boolean, optional `False`
Whether to create edges between nodes that correspond to XML elements and siblings.
Edges will be created between each sibling and its *preceding* siblings.
If you use these edges in the binary way, you can also find the following siblings.
The edges are labeled with the distance between the siblings, adjacent siblings
get distance 1.
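As an illustration, once such a dataset has been loaded, these edges could be
traversed as follows (a sketch; it assumes the generated edge feature is called
`sibling` and that `yourOrg/yourRepo` stands for your own corpus):
``` python
from tf.app import use

A = use("yourOrg/yourRepo")  # hypothetical corpus location
E = A.api.E

n = 1234  # hypothetical node corresponding to a TEI element

# the edges point from an element to its preceding siblings,
# labelled with the distance between the two
for (m, distance) in E.sibling.f(n):
    print(f"{m} precedes {n} at distance {distance}")

# used in the binary way, the same edges yield the following siblings
for (m, distance) in E.sibling.b(n):
    print(f"{m} follows {n} at distance {distance}")
```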
!!! caution "Overwhelming space requirement"
If the corpus is divided into relatively few elements that each have very many
direct children, the number of sibling edges is comparable to the size of the
corpus squared. That means that the TF dataset will consist for 50-99% of
sibling edges!
An example is [`ETCBC/nestle1904`](https://github.com/ETCBC/nestle1904) (Greek New
Testament) where each book element has all of its sentences as direct children.
In that dataset, the siblings would occupy 40% of the size, and we have taken care
not to produce sibling edges for sentences.
### `pageModel`
dict, optional `{}`
If not passed, or an empty dict, page model I is assumed.
A page model must be specified with the parameters relevant for the
model:
``` python
dict(
model="I",
)
```
(model I does not require any parameters)
or
``` python
dict(
model="II",
element="div",
attributes=dict(type=["original", "translation"]),
pbAtTop=True,
nodeType="page",
)
```
For model II, the default parameters are:
``` python
element="div",
pbAtTop=True,
nodeType="page",
attributes={},
```
Model I is the default, and nothing special happens to the `<pb>` elements.
In model II the `<pb>` elements translate to nodes of type `page`, which span
content, whereas the original `pb` elements just mark positions.
Instead of `page`, you can also specify another node type by the parameter `nodeType`.
We assume that the material that the `<pb>` elements divide up is the material
that corresponds to their `<div>` parent element. Instead of `<div>`,
you can also specify another element in the parameter `element`.
If you want to restrict the parent elements of pages, you can do so by specifying
attributes, like `type="original"`. Then only parents that carry those attributes
will be chopped up into pages.
You can specify multiple values for each attribute. Elements that carry one of these
values are candidates for having their content divided into pages.
We assume that the material to be divided starts with a `<pb>` (as the TEI
guidelines prescribe) and we translate it to a page element that we close either
at the next `<pb>` or at the end of the `div`.
But if you specify `pbAtTop=False`, we assume that the `<pb>` marks the end of
the corresponding page element. We start the first page at the start of the
enclosing element. If there is material between the last `<pb>` and the end of
the enclosing element, we generate an extra page node without features.
### `procins`
boolean, optional `False`
If `True`, processing instructions will be converted.
Processing instruction `<?foo bar="xxx"?>` will be converted as if it were an empty
element named `foo` with attribute `bar` with value `xxx`.
### `sectionModel`
dict, optional `{}`
If not passed, or an empty dict, section model I is assumed.
A section model must be specified with the parameters relevant for the
model:
``` python
dict(
model="II",
levels=["chapter", "chunk"],
element="head",
attributes=dict(rend="h3"),
)
```
(model I does not require the *element* and *attribute* parameters)
or
``` python
dict(
model="I",
levels=["folder", "file", "chunk"],
)
```
This section model (I) accepts a few other parameters:
``` python
backMatter="backmatter"
```
This is the name of the folder that should not be treated as an ordinary folder, but
as the folder with the sources for the back-matter, such as references, lists, indices,
bibliography, biographies, etc.
``` python
drillDownDivs=True
```
Whether the chunks are the immediate children of `body` elements, or whether
we should drill through all intervening `div` levels.
For model II, the default parameters are:
``` python
element="head"
levels=["chapter", "chunk"],
attributes={}
```
In model I, there are three section levels in total.
The corpus is divided in folders (section level 1), files (section level 2),
and chunks within files. The parameter `levels` allows you to choose names for the
node types of these section levels.
In model II, there are two section levels in total.
The corpus consists of a single file, and section nodes will be added
for nodes at various levels, mainly outermost `<div>` and `<p>` elements and their
siblings of other element types.
The section heading for the second level is taken from elements in the neighbourhood,
whose name is given in the parameter `element`, but only if they carry some attributes,
which can be specified in the `attributes` parameter.
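For instance, with the example parameters above (`element="head"`,
`attributes=dict(rend="h3")`), a second-level section heading could be supplied
by an element such as
``` xml
<head rend="h3">On the nature of things</head>
```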
# Usage
## Command-line
``` sh
tf-fromtei tasks flags
```
## From Python
``` python
from tf.convert.tei import TEI
T = TEI()
T.task(**tasks, **flags)
```
For a short overview of the tasks and flags, see `HELP`.
## Tasks
We have the following conversion tasks:
1. `check`: makes an inventory of all XML elements and attributes used.
1. `convert`: produces actual TF files by converting XML files.
1. `load`: loads the generated TF for the first time, by which the pre-computation
step is triggered. During pre-computation some checks are performed. Once this
has succeeded, we have a workable TF dataset.
1. `app`: creates or updates a corpus specific TF app with minimal sensible settings,
plus basic documentation.
1. `apptoken`: updates a corpus specific TF app from a character-based dataset
to a token-based dataset.
1. `browse`: starts the TF browser on the newly created dataset.
Tasks can be run by passing any choice of task keywords to the
`TEI.task()` method.
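For example, a run that makes the inventory, converts the sources, and loads the
result could look like this (a sketch; it assumes that the task keywords are
passed as booleans, analogous to the command-line flags):
``` python
from tf.convert.tei import TEI

T = TEI(verbose=0)

# run the check, convert, and load tasks in one go
T.task(check=True, convert=True, load=True)
```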
## Note on versions
The TEI source files come in versions, indicated with a date.
The converter picks the most recent one, unless you specify another one:
``` sh
tf-fromtei tei=-2  # version before the previous one
tf-fromtei tei=0  # latest version
tf-fromtei tei=3  # third version
tf-fromtei tei=2019-12-23  # explicit version
```
The resulting TF data is independently versioned, like `1.2.3` or `1.2.3pre`.
When the converter runs, by default it overwrites the most recent version,
unless you specify another one.
It looks at the latest version and then bumps a part of the version number.
``` sh
tf-fromtei tf=3 # minor version, 1.2.3 becomes 1.2.4; 1.2.3pre becomes 1.2.4pre
tf-fromtei tf=2 # intermediate version, 1.2.3 becomes 1.3.0
tf-fromtei tf=1 # major version, 1.2.3 becomes 2.0.0
tf-fromtei tf=1.8.3 # explicit version
```
## Examples
Exactly how you can call the methods of this module is demonstrated in the small
corpus of 14 letters by the Dutch artist Piet Mondriaan:
* [Mondriaan](https://nbviewer.org/github/annotation/mondriaan/blob/master/programs/convertExpress.ipynb).
"""
import sys
import collections
import re
from textwrap import dedent, wrap
from io import BytesIO
from subprocess import run
from importlib import util

from ..capable import CheckImport
from .helpers import (
    setUp,
    tweakTrans,
    checkModel,
    matchModel,
    lookupSource,
    NODE,
    FILE,
    PRE,
    ZWSP,
    XNEST,
    TNEST,
    TSIB,
    WORD,
    CHAR,
    CONVERSION_METHODS,
    CM_LIT,
    CM_LITP,
    CM_LITC,
    CM_PROV,
)
from ..parameters import BRANCH_DEFAULT_NEW
from ..fabric import Fabric
from ..core.helpers import console, versionSort, mergeDict
from ..convert.walker import CV
from ..core.timestamp import AUTO, DEEP, TERSE
from ..core.command import readArgs
from ..core.files import (
    fileOpen,
    abspath,
    expanduser as ex,
    unexpanduser as ux,
    getLocation,
    initTree,
    fileNm,
    dirNm,
    dirExists,
    dirContents,
    fileExists,
    fileCopy,
    scanDir,
    readYaml,
    writeYaml,
)
from ..tools.xmlschema import Analysis

(HELP, TASKS, TASKS_EXCLUDED, PARAMS, FLAGS) = setUp("TEI")
CSS_REND = dict(
    h1=(
        "heading of level 1",
        dedent(
            """
            font-size: xx-large;
            font-weight: bold;
            margin-top: 3rem;
            margin-bottom: 1rem;
            """
        ),
    ),
    h2=(
        "heading of level 2",
        dedent(
            """
            font-size: x-large;
            font-weight: bold;
            margin-top: 2rem;
            margin-bottom: 1rem;
            """
        ),
    ),
    h3=(
        "heading of level 3",
        dedent(
            """
            font-size: large;
            font-weight: bold;
            margin-top: 1rem;
            margin-bottom: 0.5rem;
            """
        ),
    ),
    h4=(
        "heading of level 4",
        dedent(
            """
            font-size: large;
            font-style: italic;
            margin-top: 1rem;
            margin-bottom: 0.5rem;
            """
        ),
    ),
    h5=(
        "heading of level 5",
        dedent(
            """
            font-size: medium;
            font-weight: bold;
            font-variant: small-caps;
            margin-top: 0.5rem;
            margin-bottom: 0.25rem;
            """
        ),
    ),
    h6=(
        "heading of level 6",
        dedent(
            """
            font-size: medium;
            font-weight: normal;
            font-variant: small-caps;
            margin-top: 0.25rem;
            margin-bottom: 0.125rem;
            """
        ),
    ),
    italic=(
        "cursive font style",
        dedent(
            """
            font-style: italic;
            """
        ),
    ),
    bold=(
        "bold font weight",
        dedent(
            """
            font-weight: bold;
            """
        ),
    ),
    underline=(
        "underlined text",
        dedent(
            """
            text-decoration: underline;
            """
        ),
    ),
    center=(
        "horizontally centered text",
        dedent(
            """
            text-align: center;
            """
        ),
    ),
    large=(
        "large font size",
        dedent(
            """
            font-size: large;
            """
        ),
    ),
    spaced=(
        "widely spaced between characters",
        dedent(
            """
            letter-spacing: .2rem;
            """
        ),
    ),
    margin=(
        "in the margin",
        dedent(
            """
            position: relative;
            top: -0.3em;
            font-weight: bold;
            color: #0000ee;
            """
        ),
    ),
    above=(
        "above the line",
        dedent(
            """
            position: relative;
            top: -0.3em;
            """
        ),
    ),
    below=(
        "below the line",
        dedent(
            """
            position: relative;
            top: 0.3em;
            """
        ),
    ),
    small_caps=(
        "small-caps font variation",
        dedent(
            """
            font-variant: small-caps;
            """
        ),
    ),
    sub=(
        "as subscript",
        dedent(
            """
            vertical-align: sub;
            font-size: small;
            """
        ),
    ),
    super=(
        "as superscript",
        dedent(
            """
            vertical-align: super;
            font-size: small;
            """
        ),
    ),
)

CSS_REND_ALIAS = dict(
    italic="italics i",
    bold="b",
    underline="ul",
    spaced="spat",
    small_caps="smallcaps sc",
    super="sup",
)

KNOWN_RENDS = set()
REND_DESC = {}

REFERENCING = dict(
    ptr="target",
    ref="target",
    rs="ref",
)

def makeCssInfo():
    """Make the CSS info for the style sheet."""
    rends = ""

    for rend, (description, css) in sorted(CSS_REND.items()):
        aliases = CSS_REND_ALIAS.get(rend, "")
        aliases = sorted(set(aliases.split()) | {rend})

        for alias in aliases:
            KNOWN_RENDS.add(alias)
            REND_DESC[alias] = description

        selector = ",".join(f".r_{alias}" for alias in aliases)
        contribution = f"\n{selector} {{{css}}}\n"
        rends += contribution

    return rends

def getRefs(tag, atts, xmlFile):
    """Collects the references made by a referencing element.

    The value of the referencing attribute (see `REFERENCING`) is split into
    targets of the form `file#id`; an empty file part refers to the current
    XML file. External (`http...`) targets are skipped.
    """
    refAtt = REFERENCING.get(tag, None)
    result = []

    if refAtt is not None:
        refVal = atts.get(refAtt, None)

        if refVal is not None and not refVal.startswith("http"):
            for refv in refVal.split():
                parts = refv.split("#", 1)

                if len(parts) == 1:
                    targetFile = refv
                    targetId = ""
                else:
                    (targetFile, targetId) = parts

                if targetFile == "":
                    targetFile = xmlFile

                result.append((refAtt, targetFile, targetId))

    return result

class TEI(CheckImport):
    def __init__(
        self,
        tei=PARAMS["tei"][1],
        tf=PARAMS["tf"][1],
        validate=PARAMS["validate"][1],
        verbose=FLAGS["verbose"][1],
    ):
"""Converts TEI to TF.
For documentation of the resulting encoding, read the
[transcription template](https://github.com/annotation/text-fabric/blob/master/tf/convert/app/transcription.md).
Below we describe how to control the conversion machinery.
We adopt a fair bit of "convention over configuration" here, in order to lessen
the burden for the user of specifying so many details.
Based on current directory from where the script is called,
it defines all the ingredients to carry out
a `tf.convert.walker` conversion of the TEI input.
This function is assumed to work in the context of a repository,
i.e. a directory on your computer relative to which the input directory exists,
and various output directories: `tf`, `app`, `docs`.
Your current directory must be at
```
~/backend/org/repo/relative
```
where
* `~` is your home directory;
* `backend` is an online back-end name,
like `github`, `gitlab`, `git.huc.knaw.nl`;
* `org` is an organization, person, or group in the back-end;
* `repo` is a repository in the `org`.
* `relative` is a directory path within the repo (0 or more components)
This is only about the directory structure on your local computer;
it is not required that you have online incarnations of your repository
in that back-end.
Even your local repository does not have to be a git repository.
The only thing that matters is that the full path to your repo can be parsed
as a sequence of `home/backend/org/repo/relative`.
Relative to this directory the program expects and creates
input / output directories.
## Input directories
### `tei`
*Location of the TEI-XML sources.*
**If it does not exist, the program aborts with an error.**
Several levels of subdirectories are assumed:
1. the version of the source (this could be a date string).
1. volumes / collections of documents. The subdirectory `__ignore__` is ignored.
1. the TEI documents themselves, conforming to the TEI schema or some
customization of it.
### `schema`
*TEI or other XML schemas against which the sources can be validated.*
They should be XSD or RNG files.
!!! note "Multiple XSD files"
When you started with a RNG file and used `tf.tools.xmlschema` to
convert it to XSD, you may have got multiple XSD files.
One of them has the same base name as the original RNG file,
and you should pass that name. It will import the remaining XSD files,
so do not throw them away.
We use these files as custom TEI schemas,
but to be sure, we still analyse the full TEI schema and
use the schemas here as a set of overriding element definitions.
## Output directories
### `report`
Directory to write the results of the `check` task to: an inventory
of elements / attributes encountered, and possible validation errors.
If the directory does not exist, it will be created.
The default value is `.` (i.e. the current directory in which
the script is invoked).
### `tf`
The directory under which the TF output file (with extension `.tf`)
are placed.
If it does not exist, it will be created.
The TF files will be generated in a folder named by a version number,
passed as `tfVersion`.
### `app` and `docs`
Location of additional TF app configuration and documentation files.
If they do not exist, they will be created with some sensible default
settings and generated documentation.
These settings can be overridden in the `app/config_custom.yaml` file.
Also a default `display.css` file and a logo are added.
Custom content for these files can be provided in files
with `_custom` appended to their base name.
### `docs`
Location of additional documentation.
This can be generated or hand-written material, or a mixture of the two.
Parameters
----------
tei: string, optional ""
If empty, use the latest version under the `tei` directory with sources.
Otherwise it should be a valid integer, and it is the index in the
sorted list of versions there.
* `0` or `latest`: latest version;
* `-1`, `-2`, ... : previous version, version before previous, ...;
* `1`, `2`, ...: first version, second version, ....
* everything else that is not a number is an explicit version
If the value cannot be parsed as an integer, it is used as the exact
version name.
tf: string, optional ""
If empty, the TF version used will be the latest one under the `tf`
directory. If the parameter `prelim` was used in the initialization of
the TEI object, only versions ending in `pre` will be taken into account.
If it can be parsed as the integers 1, 2, or 3 it will bump the latest
relevant TF version:
* `0` or `latest`: overwrite the latest version
* `1` will bump the major version
* `2` will bump the intermediate version
* `3` will bump the minor version
* everything else is an explicit version
Otherwise, the value is taken as the exact version name.
verbose: integer, optional -1
Produce no (-1), some (0) or many (1) progress and reporting messages
"""
        super().__init__("lxml")
        if self.importOK(hint=True):
            self.etree = self.importGet()
        else:
            return

        self.good = True

        (backend, org, repo, relative) = getLocation()

        if any(s is None for s in (backend, org, repo, relative)):
            console(
                (
                    "Not working in a repo: "
                    f"backend={backend} org={org} repo={repo} relative={relative}"
                ),
                error=True,
            )
            self.good = False
            return

        if verbose == 1:
            console(
                f"Working in repository {org}/{repo}{relative} in back-end {backend}"
            )

        base = ex(f"~/{backend}")
        repoDir = f"{base}/{org}/{repo}"
        refDir = f"{repoDir}{relative}"
        programDir = f"{refDir}/programs"
        schemaDir = f"{refDir}/schema"
        convertSpec = f"{programDir}/tei.yaml"
        convertCustom = f"{programDir}/tei.py"

        self.schemaDir = schemaDir

        settings = readYaml(asFile=convertSpec, plain=True)

        customKeys = set(
            """
            transform
            beforeTag
            beforeChildren
            afterChildren
            afterTag
            """.strip().split()
        )

        functionType = type(lambda x: x)

        if fileExists(convertCustom):
            hooked = []

            try:
                spec = util.spec_from_file_location("teicustom", convertCustom)
                code = util.module_from_spec(spec)
                sys.path.insert(0, dirNm(convertCustom))
                spec.loader.exec_module(code)
                sys.path.pop(0)

                for method in customKeys:
                    if not hasattr(code, method):
                        continue

                    func = getattr(code, method)
                    typeFunc = type(func)

                    if typeFunc is not functionType:
                        console(
                            (
                                f"custom member {method} should be a function, "
                                f"but it is a {typeFunc.__name__}"
                            ),
                            error=True,
                        )
                        continue

                    methodC = f"{method}Custom"