In [1]:
import glob,os,re
import xml.etree.ElementTree as ET
import collections
from tf.fabric import Fabric
from tf.timestamp import Timestamp

In [2]:
# Timer/logger object; the later cells call tm.indent() and tm.info() on it.
tm = Timestamp()

# Text-Fabric handle on the dataset directory that TF.save() writes to.
TF_LOCATION = '~/github/text-fabric-data/greek/sblgnt'
TF = Fabric(TF_LOCATION)

This is Text-Fabric 2.0.0
Api reference : https://github.com/ETCBC/text-fabric/wiki/Api
Tutorial      : https://github.com/ETCBC/text-fabric/blob/master/docs/tutorial.ipynb
Data sources  : https://github.com/ETCBC/text-fabric-data
Data docs     : https://etcbc.github.io/text-fabric-data/features/hebrew/etcbc4c/0_overview.html
Shebanq docs  : https://shebanq.ancient-data.org/text
Slack team    : https://shebanq.slack.com/signup
Questions? Ask shebanq@ancient-data.org for an invite to Slack
0 features found and 0 ignored


  0.00s Grid feature "otype" not found in

  0.00s Grid feature "oslots" not found in



  0.01s Grid feature "otext" not found. Working without Text-API



In [3]:
# Directory with the per-book syntax-tree XML files.  The leading '~' is
# replaced by the user's home directory, with backslashes turned into forward
# slashes so the result can be concatenated with a '*.xml' glob pattern.
_homeDir = os.path.expanduser('~').replace('\\', '/')
DIR_PATH = '~/github/greek-new-testament/syntax-trees/sblgnt/xml/'.replace('~', _homeDir)

# Maps the value of the XML 'Cat' attribute of a non-leaf element to the
# Text-Fabric node type (otype) it is stored under.  Key order is kept as in
# the original listing.
otypeFromCat = {
    'np': 'phrase',
    'CL': 'clause',
    'vp': 'phrase',
    'noun': 'word',
    'verb': 'word',
    'V': 'clause_atom',
    'det': 'word',
    'ADV': 'clause_atom',
    'S': 'clause_atom',
    'conj': 'conjunction',
    'pron': 'word',
    'pp': 'phrase',
    'prep': 'word',
    'O': 'clause_atom',
    'adjp': 'phrase',
    'adj': 'word',
    'advp': 'phrase',
    'adv': 'word',
    'P': 'clause_atom',
    'IO': 'clause_atom',
    'VC': 'clause_atom',
    'ptcl': 'word',
    'nump': 'phrase',
    'num': 'word',
    'intj': 'word',
    'O2': 'clause_atom',
}

# Feature names that are declared with valueType 'int' when the metadata is
# assembled; every other feature is declared as 'str'.
numberFeatures = {
    'chapter',
    'End',
    'Head',
    'Start',
    'verse',
    'booknum',
}

In [4]:
class Data:
    """Mutable container for everything collected while walking the XML trees.

    Attributes
    ----------
    tfFromXml : dict
        XML ``nodeId`` -> node number.
    xmlFromTf : dict
        node number -> XML ``nodeId``.
    paths : dict
        node number -> tuple of ancestor node numbers.
    nodeNum : int
        Next node number to hand out; starts at 1.
    maxSlot, maxNode : int
        Counters, both starting at 0.
    nodeFeatures, edgeFeatures : collections.defaultdict
        feature name -> {node: value}; missing feature names yield a new
        empty dict on first access.
    """

    def __init__(self):
        # Counters.
        self.nodeNum = 1
        self.maxSlot = 0
        self.maxNode = 0

        # Identifier bookkeeping.
        self.tfFromXml = dict()
        self.xmlFromTf = dict()
        self.paths = dict()

        # Feature tables, created lazily per feature name.
        self.nodeFeatures = collections.defaultdict(dict)
        self.edgeFeatures = collections.defaultdict(dict)

# Set of XML attribute names.
# NOTE(review): no code visible in this notebook reads ignoreAtts.
ignoreAtts = set(('Cat',))

# Word-node lists per section, filled while walking the XML:
#   books:    book number                  -> [word nodes]
#   chapters: (book, chapter)              -> [word nodes]
#   verses:   (book, chapter, verse)       -> [word nodes]
books, chapters, verses = {}, {}, {}

def walkNode(node, path):
    """Register an XML element and, recursively, all its descendants.

    Elements whose tag is not ``Node`` get no node number: their children are
    visited with the unchanged ``path``.

    For a ``Node`` element the function:

    * takes the next number from ``data.nodeNum`` and records the two-way
      mapping with the element's ``nodeId`` attribute;
    * stores an ``otype``:
        - an element without children is a ``word``; its ``Cat`` goes into
          ``psp``, it is appended to the ``oslots`` list of every ancestor in
          ``path``, and it is filed in ``books``/``chapters``/``verses``
          under numbers sliced out of ``nodeId``;
        - an element with children is a ``sentence`` when ``path`` is empty,
          otherwise ``otypeFromCat[Cat]`` (with ``word`` renamed to
          ``wordx``); unless it is a clause or sentence, ``Cat`` also goes
          into ``function``;
    * copies every XML attribute into the node feature of the same name;
    * adds a ``child`` edge from the last node in ``path`` (if any) and
      stores ``path`` in ``data.paths``.

    Parameters
    ----------
    node : xml.etree.ElementTree.Element
        Element to process.
    path : tuple of int
        Node numbers of the enclosing ``Node`` elements, outermost first.

    Raises
    ------
    KeyError
        If a ``Node`` element lacks ``nodeId`` or ``Cat``, or a nested
        non-leaf element has a ``Cat`` that is not in ``otypeFromCat``.
    """
    if node.tag != 'Node':
        # Transparent wrapper element: descend without extending the path.
        for kid in node:
            walkNode(kid, path)
        return

    n = data.nodeNum
    attribs = node.attrib
    xmlId = attribs['nodeId']
    data.tfFromXml[xmlId] = n
    data.xmlFromTf[n] = xmlId
    cat = attribs['Cat']
    nodeFeatures = data.nodeFeatures

    if len(node) > 0:
        # Inner node of the syntax tree.
        otype = otypeFromCat[cat] if path else 'sentence'
        if otype == 'word':
            # Distinguish word-category elements that still have children.
            otype = 'wordx'
        nodeFeatures['otype'][n] = otype
        if otype != 'clause' and otype != 'sentence':
            nodeFeatures['function'][n] = cat
    else:
        # Leaf: a word.
        nodeFeatures['otype'][n] = 'word'
        nodeFeatures['psp'][n] = cat
        for ancestor in path:
            data.edgeFeatures['oslots'].setdefault(ancestor, []).append(n)
        # Section numbers are read from fixed positions of the id:
        # characters 0-1 -> book, 3-4 -> chapter, 5-7 -> verse.
        # NOTE(review): presumably the chapter field is 3 digits wide ([2:5])
        # with a leading 0 for every chapter below 100 -- TODO confirm.
        bookNum = int(xmlId[:2])
        chapterNum = int(xmlId[3:5])
        verseNum = int(xmlId[5:8])
        books.setdefault(bookNum, []).append(n)
        chapters.setdefault((bookNum, chapterNum), []).append(n)
        verses.setdefault((bookNum, chapterNum, verseNum), []).append(n)

    # NOTE(review): ignoreAtts is not consulted here, so 'Cat' is copied too.
    for att in attribs:
        nodeFeatures[att][n] = attribs[att]

    if path:
        data.edgeFeatures['child'].setdefault(path[-1], []).append(n)
    data.paths[n] = path
    data.nodeNum += 1

    deeper = path + (n,)
    for kid in node:
        walkNode(kid, deeper)

def getNode(root):
    """Walk one parsed XML tree from its root and update ``data.maxNode``.

    Parameters
    ----------
    root : xml.etree.ElementTree.Element
        Root element of a book file; it is visited with an empty path.
    """
    emptyPath = ()
    walkNode(root, emptyPath)
    # data.nodeNum is the next unused number, so the last used one is one less.
    lastUsed = data.nodeNum - 1
    data.maxNode = lastUsed
        
def reorder():
    """Renumber all nodes so that the word nodes come first.

    Nodes ``1 .. data.maxNode`` are sorted by ``(rank, old number)`` where the
    rank of a ``word`` is a single space and the rank of any other node is its
    otype string; a space sorts before the otype names used here, so words
    get the lowest numbers and the remaining nodes follow grouped by otype.
    Within one otype the original relative order is preserved.

    All node-keyed tables in ``data`` are rewritten with the new numbers:

    * ``data.nodeFeatures`` and ``data.edgeFeatures`` (replaced by plain
      dicts, as before);
    * ``data.tfFromXml``, ``data.xmlFromTf`` and ``data.paths``.  These were
      previously left with the pre-reorder numbers, which made any lookup
      through them after this call point at the wrong nodes.

    Raises
    ------
    KeyError
        If some node in ``1 .. data.maxNode`` has no ``otype``.
    """
    otypeOf = data.nodeFeatures['otype']

    def sortKey(n):
        otype = otypeOf[n]
        return (' ' if otype == 'word' else otype, n)

    newIds = sorted(range(1, data.maxNode + 1), key=sortKey)
    # old node number -> new node number (1-based position in the sorted list)
    mapping = {old: new for (new, old) in enumerate(newIds, start=1)}

    data.nodeFeatures = {
        name: {mapping[n]: v for (n, v) in dat.items()}
        for (name, dat) in data.nodeFeatures.items()
    }
    data.edgeFeatures = {
        name: {mapping[n]: [mapping[m] for m in ms] for (n, ms) in dat.items()}
        for (name, dat) in data.edgeFeatures.items()
    }

    # Keep the identifier bookkeeping in step with the new numbering.
    data.tfFromXml = {xmlId: mapping[n] for (xmlId, n) in data.tfFromXml.items()}
    data.xmlFromTf = {mapping[n]: xmlId for (n, xmlId) in data.xmlFromTf.items()}
    data.paths = {
        mapping[n]: tuple(mapping[p] for p in path)
        for (n, path) in data.paths.items()
    }

def sections():
    """Add one node per book, chapter and verse after the existing nodes.

    New node numbers continue from ``data.maxNode``.  Each section node gets:

    * ``otype``: ``book``, ``chapter`` or ``verse``;
    * a heading feature: ``book`` (name from ``bookNames``) and ``booknum``
      (book number minus ``HEBREW_BOOKS``) for books, ``chapter`` for
      chapters, ``verse`` for verses -- numbers are stored as strings;
    * an ``oslots`` entry listing the word nodes collected for that section
      in ``books`` / ``chapters`` / ``verses``.

    Sections are visited in sorted key order, so the numbering of section
    nodes follows book/chapter/verse order and does not depend on the order
    in which the XML files happened to be scanned.

    ``data.maxNode`` is updated to the last node number handed out.

    Raises
    ------
    KeyError
        If a book number in ``books`` has no entry in ``bookNames``.
    """
    n = data.maxNode
    data.nodeFeatures['book'] = {}
    # NOTE(review): 'num' is created empty and never filled in this function;
    # chapter and verse numbers go to the 'chapter' and 'verse' features.
    data.nodeFeatures['num'] = {}

    for book in sorted(books):
        n += 1
        data.nodeFeatures['otype'][n] = 'book'
        data.nodeFeatures['book'][n] = bookNames[book]
        data.nodeFeatures['booknum'][n] = str(book - HEBREW_BOOKS)
        data.edgeFeatures['oslots'][n] = books[book]

    for (book, chapter) in sorted(chapters):
        n += 1
        data.nodeFeatures['otype'][n] = 'chapter'
        data.nodeFeatures['chapter'][n] = str(chapter)
        data.edgeFeatures['oslots'][n] = chapters[(book, chapter)]

    for (book, chapter, verse) in sorted(verses):
        n += 1
        data.nodeFeatures['otype'][n] = 'verse'
        data.nodeFeatures['verse'][n] = str(verse)
        data.edgeFeatures['oslots'][n] = verses[(book, chapter, verse)]

    data.maxNode = n

In [5]:
# Offset added to the two-digit numeral in each file name to obtain the book
# number that is used as key in bookNames.
HEBREW_BOOKS = 39

# book number -> book name, both taken from the XML file names
bookNames = {}

nodes_with_ID = collections.OrderedDict()
nodes_without_ID = []

# File names look like '<two digits>-<book name>'.
filenamepat = re.compile('^([0-9]{2})-(.*)$')

# Start from a clean slate so that re-running this cell does not pile new
# word nodes on top of the section lists left over from a previous run.
data = Data()
books.clear()
chapters.clear()
verses.clear()

tm.indent(reset=True)
tm.info('Scanning XML sources of all books')
# glob.glob() returns files in arbitrary order, but node numbers are handed
# out in scan order, so the files are visited in sorted order; the
# two-digit prefix makes that the order of the file numerals.
for xmlfile in sorted(glob.glob(DIR_PATH+'*.xml')):
    tm.indent(level=1, reset=True)
    (dirName, baseName) = os.path.split(xmlfile)
    (fileName, extension) = os.path.splitext(baseName)
    match = filenamepat.findall(fileName)
    if len(match) == 0: continue  # not a book file
    (numeral, bookName) = match[0]
    numeral = int(numeral) + HEBREW_BOOKS
    bookNames[numeral] = bookName
    tree = ET.parse(xmlfile)
    root = tree.getroot()
    getNode(root)
    tm.info(bookName)
tm.indent(level=0)
tm.info('Processing data ...')
# Section nodes must be added before reorder() so that they are renumbered
# together with the nodes that came from the XML.
sections()
reorder()
tm.info('Done')

  0.00s Scanning XML sources of all books
   |     0.86s matthew
   |     0.57s mark
   |     0.99s luke
   |     0.88s john
   |     0.98s acts
   |     0.45s romans
   |     0.28s 1corinthians
   |     0.34s 2corinthians
   |     0.10s galatians
   |     0.10s ephesians
   |     0.07s philippians
   |     0.07s colossians
   |     0.22s 1thessalonians
   |     0.08s 2thessalonians
   |     0.07s 1timothy
   |     0.14s 2timothy
   |     0.03s titus
   |     0.02s philemon
   |     0.29s hebrews
   |     0.74s james
   |     0.07s 1peter
   |     0.05s 2peter
   |     0.09s 1john
   |     0.02s 2john
   |     0.02s 3john
   |     0.03s jude
   |     0.62s revelation
  8.21s Processing data ...
    11s Done


In [6]:
# Distinct node types present in the data, and the sort rank of each: a single
# space for 'word', the otype string itself for everything else.
# NOTE(review): the same two values are computed locally inside reorder().
otypeValues = set(data.nodeFeatures['otype'].values())
otypeRank = {}
for val in otypeValues:
    if val == 'word':
        otypeRank[val] = ' '
    else:
        otypeRank[val] = val

In [8]:
# Metadata per feature name; the '' entry applies to all features and 'otext'
# configures the sections and text formats.
metaData = {
    '': dict(
        createdBy='Cody Kingham and Dirk Roorda',
    ),
    'otext': {
        # One heading feature per section type.  sections() stores the
        # chapter and verse numbers in the 'chapter' and 'verse' features;
        # the 'num' feature that was listed here before is never filled, so
        # section lookups through it could not find anything.
        'sectionFeatures': 'book,chapter,verse',
        'sectionTypes': 'book,chapter,verse',
        'fmt:text-orig-full': '{Unicode} ',
        'fmt:lex-orig-full': '{UnicodeLemma} ',
    },
}
# Declare a value type for every feature: 'int' for the names listed in
# numberFeatures, 'str' for all others.
for nf in data.nodeFeatures:
    metaData.setdefault(nf, {})['valueType'] = 'int' if nf in numberFeatures else 'str'
for ef in data.edgeFeatures:
    metaData.setdefault(ef, {})['valueType'] = 'int' if ef in numberFeatures else 'str'

# Write all node and edge features plus the otext config to the TF location.
TF.save(nodeFeatures=data.nodeFeatures, edgeFeatures=data.edgeFeatures, metaData=metaData)

  0.00s Exporting 28 node and 2 edge and 1 config features to /Users/dirk/github/text-fabric-data/greek/sblgnt:
   |     0.19s T Case                 to /Users/dirk/github/text-fabric-data/greek/sblgnt
   |     0.78s T Cat                  to /Users/dirk/github/text-fabric-data/greek/sblgnt
   |     0.01s T ClType               to /Users/dirk/github/text-fabric-data/greek/sblgnt
   |     0.00s T Degree               to /Users/dirk/github/text-fabric-data/greek/sblgnt
   |     0.72s T End                  to /Users/dirk/github/text-fabric-data/greek/sblgnt
   |     0.15s T Gender               to /Users/dirk/github/text-fabric-data/greek/sblgnt
   |     0.09s T HasDet               to /Users/dirk/github/text-fabric-data/greek/sblgnt
   |     0.47s T Head                 to /Users/dirk/github/text-fabric-data/greek/sblgnt
   |     0.08s T Mood                 to /Users/dirk/github/text-fabric-data/greek/sblgnt
   |     0.17s T Number               to /Users/dirk/github/text-fabric-data/g