In [1]:
import glob,os,re
import xml.etree.ElementTree as ET
import collections
from tf.fabric import Fabric
from tf.timestamp import Timestamp

In [2]:
# Progress/timing reporter; every tm.info(...) call further down logs through it.
tm = Timestamp()
# Text-Fabric handle bound to the dataset directory. The features built in this
# notebook are written to that location by the TF.save call at the end.
TF = Fabric('~/github/text-fabric-data/greek/sblgnt')

This is Text-Fabric 2.2.1
Api reference : https://github.com/ETCBC/text-fabric/wiki/Api
Tutorial      : https://github.com/ETCBC/text-fabric/blob/master/docs/tutorial.ipynb
Data sources  : https://github.com/ETCBC/text-fabric-data
Data docs     : https://etcbc.github.io/text-fabric-data
Shebanq docs  : https://shebanq.ancient-data.org/text
Slack team    : https://shebanq.slack.com/signup
Questions? Ask shebanq@ancient-data.org for an invite to Slack
0 features found and 0 ignored


  0.01s Grid feature "otype" not found in

  0.01s Grid feature "oslots" not found in



  0.01s Grid feature "otext" not found. Working without Text-API



In [24]:
# Directory with the syntax-tree XML files (one file per book).
# The leading '~' is expanded by hand so that the result uses forward slashes only.
DIR_PATH = '~/github/greek-new-testament/syntax-trees/sblgnt/xml/'.replace(
    '~',
    os.path.expanduser('~').replace('\\', '/'),
)

# Maps the 'Cat' attribute of an XML node to the node type (otype) it gets.
# Written as otype -> categories and inverted here; only lookups by category are needed.
otypeFromCat = {
    cat: otype
    for (otype, cats) in (
        ('phrase', 'np vp pp adjp advp nump'),
        ('clause', 'CL'),
        ('clause_atom', 'V ADV S O P IO VC O2'),
        ('conjunction', 'conj'),
        ('word', 'noun verb det pron prep adj adv ptcl num intj'),
    )
    for cat in cats.split()
}

# Features whose values are declared as integers in the metadata; all others are strings.
numberFeatures = {'chapter', 'End', 'Head', 'Start', 'verse', 'booknum'}

In [25]:
# Read the book-name translations from blang.txt.
#
# Layout of the file, as consumed by the loop below:
#   * first part: one line per language, "acro=englishName=nativeName"
#   * a separator line consisting of exactly '---'
#   * second part: a line "[acro]" opens a language, and every following line
#     is one book name in that language.
#
# Results:
#   bookLangs: acro -> (language name in English, language name in that language)
#   bookNames: acro -> list of book names, in the order they occur in the file

# Raw string: the pattern contains backslash escapes that are not valid
# escapes in an ordinary string literal.
langHead = re.compile(r'\[([^\]]+)\]')
bookNames = collections.defaultdict(list)
bookLangs = {}
# The file holds names in many languages, so decode it explicitly instead of
# relying on the platform's default encoding.
# NOTE(review): assumes blang.txt is stored as UTF-8 - confirm.
with open('blang.txt', encoding='utf-8') as fb:
    specsDone = False
    for line in fb:
        line = line.rstrip('\n')
        if line == '---':
            # end of the language specs, start of the per-language name lists
            specsDone = True
            curLang = None
            continue
        if specsDone:
            match = langHead.findall(line)
            if match:
                curLang = match[0]
            else:
                bookNames[curLang].append(line)
        else:
            (acro, langEn, lang) = line.split('=', 2)
            bookLangs[acro] = (langEn, lang)

In [26]:
class Data:
    """Mutable container for everything collected while converting the XML trees."""

    def __init__(self):
        # counters
        self.nodeNum = 1   # node number that the next XML 'Node' element receives
        self.maxNode = 0   # highest node number handed out so far
        self.maxSlot = 0   # initialised here; not updated by the code in this notebook section

        # bookkeeping between XML identifiers and node numbers
        self.xmlFromTf = dict()   # node number -> XML nodeId
        self.tfFromXml = dict()   # XML nodeId -> node number
        self.paths = dict()       # node number -> tuple of ancestor node numbers

        # feature data: feature name -> {node: value} / {node: [nodes]}
        self.edgeFeatures = collections.defaultdict(dict)
        self.nodeFeatures = collections.defaultdict(dict)

# Attribute names set aside for skipping.
# NOTE(review): nothing in the visible code consults this set - 'Cat' is still
# copied into the node features by walkNode.
ignoreAtts = set(('Cat',))

# Word nodes per section, filled while the XML is walked:
#   books:    book number             -> [word nodes]
#   chapters: (book, chapter)         -> [word nodes]
#   verses:   (book, chapter, verse)  -> [word nodes]
books, chapters, verses = dict(), dict(), dict()

def walkNode(node, path):
    """Recursively turn an XML element and its descendants into nodes and features.

    Each element with tag 'Node' receives the next free node number from
    ``data.nodeNum``. Elements with any other tag get no node; their children
    are visited with the unchanged path.

    Parameters
    ----------
    node:
        The XML element to process.
    path: tuple
        Node numbers of the ancestors of this element, outermost first.

    Side effects: fills ``data`` (id maps, node/edge features, paths) and the
    module-level ``books``, ``chapters`` and ``verses`` dictionaries.
    """
    if node.tag != 'Node':
        childPath = path
    else:
        tfNode = data.nodeNum
        xmlId = node.attrib['nodeId']
        data.tfFromXml[xmlId] = tfNode
        data.xmlFromTf[tfNode] = xmlId
        cat = node.attrib['Cat']

        if len(node) > 0:
            # Inner node: the outermost one is a sentence, all others get
            # their type from the category table.
            otype = otypeFromCat[cat] if len(path) != 0 else 'sentence'
            if otype == 'word':
                # a word category on a node that has children: keep it apart
                # from the real (leaf) words
                otype = 'wordx'
            data.nodeFeatures['otype'][tfNode] = otype
            if otype not in {'clause', 'sentence'}:
                data.nodeFeatures['function'][tfNode] = cat
        else:
            # Leaf: a word. It is added to the slots of every ancestor.
            data.nodeFeatures['otype'][tfNode] = 'word'
            data.nodeFeatures['psp'][tfNode] = cat
            for ancestor in path:
                data.edgeFeatures['oslots'].setdefault(ancestor, []).append(tfNode)
            # Section numbers are read from fixed positions in the nodeId.
            # NOTE(review): character 2 is skipped for the chapter; presumably
            # the chapter field is three characters wide with a leading zero -
            # confirm against the nodeId format.
            (bookNum, chapterNum, verseNum) = (
                int(xmlId[0:2]),
                int(xmlId[3:5]),
                int(xmlId[5:8]),
            )
            books.setdefault(bookNum, []).append(tfNode)
            chapters.setdefault((bookNum, chapterNum), []).append(tfNode)
            verses.setdefault((bookNum, chapterNum, verseNum), []).append(tfNode)

        # Every XML attribute becomes a node feature of the same name.
        for (attName, attValue) in node.attrib.items():
            data.nodeFeatures[attName][tfNode] = attValue

        # Link the node to its direct parent, if it has one.
        if len(path) != 0:
            data.edgeFeatures['child'].setdefault(path[-1], []).append(tfNode)

        data.paths[tfNode] = path
        childPath = path + (tfNode,)
        data.nodeNum += 1

    for child in node:
        walkNode(child, childPath)

def getNode(root):
    """Walk one XML tree from its root and record the highest node number used.

    ``data.nodeNum`` always holds the next free number, so the last number
    handed out is one less.
    """
    walkNode(root, path=())
    data.maxNode = data.nodeNum - 1
        
def reorder():
    """Renumber all nodes so that words come first, followed by the other types.

    Nodes are sorted by (node type, old number); the type 'word' is given the
    sort key ' ' so that it precedes every other type name. The new number of
    a node is its 1-based position in that order. ``data.nodeFeatures`` and
    ``data.edgeFeatures`` are replaced by plain dictionaries keyed by the new
    numbers.

    Note: ``data.paths``, ``data.tfFromXml`` and ``data.xmlFromTf`` are not
    touched here and keep the old numbers.
    """
    def sortKey(n):
        otype = data.nodeFeatures['otype'][n]
        return (' ' if otype == 'word' else otype, n)

    oldInNewOrder = sorted(range(1, data.maxNode + 1), key=sortKey)
    mapping = {old: new for (new, old) in enumerate(oldInNewOrder, 1)}

    data.nodeFeatures = {
        name: {mapping[n]: value for (n, value) in values.items()}
        for (name, values) in data.nodeFeatures.items()
    }
    data.edgeFeatures = {
        name: {
            mapping[n]: [mapping[m] for m in targets]
            for (n, targets) in links.items()
        }
        for (name, links) in data.edgeFeatures.items()
    }

def sections():
    """Add book, chapter and verse nodes on top of the nodes found in the XML.

    New nodes are numbered consecutively after ``data.maxNode``. Each section
    node gets the word nodes collected for it in ``books``, ``chapters`` or
    ``verses`` as its slots, and ``data.maxNode`` is updated at the end.

    The sections are visited in sorted key order rather than in dictionary
    order. The translated book name is picked by position
    (``bookNames[ll][i]``), so the position must not depend on the order in
    which the XML files happened to be read.
    NOTE(review): assumes every list in ``bookNames`` gives the names in
    ascending book-number order - confirm against blang.txt.
    """
    n = data.maxNode
    data.nodeFeatures['book'] = {}
    for (i, book) in enumerate(sorted(books)):
        n += 1
        data.nodeFeatures['otype'][n] = 'book'
        data.nodeFeatures['book'][n] = bookNamesOrig[book]
        for ll in bookNames:
            data.nodeFeatures['book@{}'.format(ll)][n] = bookNames[ll][i]
        # book numbers carry the HEBREW_BOOKS offset; booknum is stored without it
        data.nodeFeatures['booknum'][n] = str(book - HEBREW_BOOKS)
        data.edgeFeatures['oslots'][n] = books[book]
    for (book, chapter) in sorted(chapters):
        n += 1
        data.nodeFeatures['otype'][n] = 'chapter'
        data.nodeFeatures['chapter'][n] = str(chapter)
        data.edgeFeatures['oslots'][n] = chapters[(book, chapter)]
    for (book, chapter, verse) in sorted(verses):
        n += 1
        data.nodeFeatures['otype'][n] = 'verse'
        data.nodeFeatures['verse'][n] = str(verse)
        data.edgeFeatures['oslots'][n] = verses[(book, chapter, verse)]

    data.maxNode = n

In [27]:
# Offset between the two-digit prefix of an XML file name and the book number
# used as key in bookNamesOrig; sections() subtracts it again for 'booknum'.
# NOTE(review): presumably the number of books that precede the New Testament - confirm.
HEBREW_BOOKS = 39

# book number (file prefix + HEBREW_BOOKS) -> book name as it appears in the file name
bookNamesOrig = {}

nodes_with_ID = collections.OrderedDict()
nodes_without_ID = []

# file names look like "NN-bookname"
filenamepat = re.compile('^([0-9]{2})-(.*)$')

# Start from a clean slate, so that running this cell again does not pile new
# nodes on top of the results of an earlier run.
data = Data()
books.clear()
chapters.clear()
verses.clear()

tm.indent(reset=True)
tm.info('Scanning XML sources of all books')
# glob returns files in arbitrary order; sort them so that node numbers and
# the order of the books are the same on every run and every platform.
for xmlfile in sorted(glob.glob(DIR_PATH+'*.xml')):
    tm.indent(level=1, reset=True)
    (dirName, baseName) = os.path.split(xmlfile)
    (fileName, extension) = os.path.splitext(baseName)
    match = filenamepat.findall(fileName)
    # skip XML files that do not follow the "NN-bookname" naming scheme
    if len(match) == 0: continue
    (numeral, bookName) = match[0]
    numeral = int(numeral) + HEBREW_BOOKS
    bookNamesOrig[numeral] = bookName
    tree = ET.parse(xmlfile)
    root = tree.getroot()
    getNode(root)
    tm.info(bookName)
tm.indent(level=0)
tm.info('Processing data ...')
# section nodes must be added before renumbering, so that they take part in it
sections()
reorder()
tm.info('Done')

  0.00s Scanning XML sources of all books
   |     1.01s matthew
   |     0.70s mark
   |     1.14s luke
   |     0.93s john
   |     1.10s acts
   |     0.52s romans
   |     0.27s 1corinthians
   |     0.42s 2corinthians
   |     0.11s galatians
   |     0.10s ephesians
   |     0.07s philippians
   |     0.07s colossians
   |     0.30s 1thessalonians
   |     0.04s 2thessalonians
   |     0.07s 1timothy
   |     0.15s 2timothy
   |     0.03s titus
   |     0.02s philemon
   |     0.23s hebrews
   |     0.32s james
   |     0.07s 1peter
   |     0.05s 2peter
   |     0.09s 1john
   |     0.02s 2john
   |     0.02s 3john
   |     0.03s jude
   |     0.70s revelation
  8.64s Processing data ...
    12s Done


In [28]:
# Metadata per feature name; the empty key holds metadata shared by all features.
metaData = {}
metaData[''] = {'createdBy': 'Cody Kingham and Dirk Roorda'}
# configuration of the text API: section levels and text formats
metaData['otext'] = {
    'sectionFeatures': 'book,chapter,verse',
    'sectionTypes': 'book,chapter,verse',
    'fmt:text-orig-full': '{Unicode} ',
    'fmt:lex-orig-full': '{UnicodeLemma} ',
}
metaData['book@en'] = dict(
    valueType='str',
    language='English',
    languageCode='en',
    languageEnglish='english',
)
# one book-name feature per language for which names were read
for ll in bookNames:
    (langEnglish, langNative) = bookLangs[ll]
    metaData['book@{}'.format(ll)] = dict(
        valueType='str',
        language=langNative,
        languageCode=ll,
        languageEnglish=langEnglish,
    )

## Statistical features
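We add frequency and rank features for every word, computed both over surface forms (`freq_occ`, `rank_occ`) and over lemmas (`freq_lex`, `rank_lex`).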

In [29]:
tm.info('Computing statistics')

nodeFeatures = data.nodeFeatures

# all nodes of type 'word'
words = [node for (node, otype) in nodeFeatures['otype'].items() if otype == 'word']

# wstats[kind][tp][item]: kind is 'freq' or 'rank', tp is 'lex' (lemma) or 'occ' (surface form)
wstats = {
    'freq': {
        'lex': collections.Counter(nodeFeatures['UnicodeLemma'][w] for w in words),
        'occ': collections.Counter(nodeFeatures['Unicode'][w] for w in words),
    },
    'rank': {
        'lex': {},
        'occ': {},
    },
}

# Ranking: most frequent first, ties broken alphabetically for the visiting
# order only. Items with equal frequency share a rank; the next distinct
# frequency gets the shared rank plus the size of the tie. The first rank is 0.
for tp in ('lex', 'occ'):
    byFrequency = sorted(wstats['freq'][tp].items(), key=lambda item: (-item[1], item[0]))
    rank = -1
    tieSize = 1
    lastFreq = -1
    for (item, freq) in byFrequency:
        if freq != lastFreq:
            rank += tieSize
            tieSize = 1
        else:
            tieSize += 1
        lastFreq = freq
        wstats['rank'][tp][item] = rank
tm.info('Done')

tm.info('Adding statistics as features')
# feature names: freq_occ, rank_occ, freq_lex, rank_lex
occFeatures = {}
for tp in ('occ', 'lex'):
    for kn in ('freq', 'rank'):
        ft = '{}_{}'.format(kn, tp)
        occFeatures[ft] = {}
        metaData.setdefault(ft, {})['valueType'] = 'int'

for w in words:
    refs = dict(occ=nodeFeatures['Unicode'][w], lex=nodeFeatures['UnicodeLemma'][w])
    for tp in ('occ', 'lex'):
        for kn in ('freq', 'rank'):
            # values are stored as strings, like all other feature values here
            occFeatures['{}_{}'.format(kn, tp)][w] = str(wstats[kn][tp][refs[tp]])

nodeFeatures.update(occFeatures)

tm.info('Done')

    16s Computing statistics
    17s Done
    17s Adding statistics as features
    18s Done


In [30]:
# The distinct node types present, and for each its sort key:
# 'word' sorts before everything else, every other type sorts by its own name.
otypeValues = {otype for otype in data.nodeFeatures['otype'].values()}
otypeRank = {val: (' ' if val == 'word' else val) for val in otypeValues}

In [31]:
# Give every feature a value type, unless one has been declared already.
# Features listed in numberFeatures are integers, everything else is a string.
# An existing declaration is kept on purpose: the statistics features
# (freq_*, rank_*) were declared 'int' when they were created and are not in
# numberFeatures, so assigning unconditionally would turn them into 'str'.
for nf in data.nodeFeatures:
    metaData.setdefault(nf, {}).setdefault('valueType', 'int' if nf in numberFeatures else 'str')
for ef in data.edgeFeatures:
    metaData.setdefault(ef, {}).setdefault('valueType', 'int' if ef in numberFeatures else 'str')

# Write all node and edge features, with their metadata, to the TF data directory.
TF.save(nodeFeatures=data.nodeFeatures, edgeFeatures=data.edgeFeatures, metaData=metaData)

  0.00s Exporting 57 node and 2 edge and 1 config features to /Users/dirk/github/text-fabric-data/greek/sblgnt:
   |     0.20s T Case                 to /Users/dirk/github/text-fabric-data/greek/sblgnt
   |     0.74s T Cat                  to /Users/dirk/github/text-fabric-data/greek/sblgnt
   |     0.01s T ClType               to /Users/dirk/github/text-fabric-data/greek/sblgnt
   |     0.00s T Degree               to /Users/dirk/github/text-fabric-data/greek/sblgnt
   |     0.74s T End                  to /Users/dirk/github/text-fabric-data/greek/sblgnt
   |     0.14s T Gender               to /Users/dirk/github/text-fabric-data/greek/sblgnt
   |     0.09s T HasDet               to /Users/dirk/github/text-fabric-data/greek/sblgnt
   |     0.57s T Head                 to /Users/dirk/github/text-fabric-data/greek/sblgnt
   |     0.12s T Mood                 to /Users/dirk/github/text-fabric-data/greek/sblgnt
   |     0.24s T Number               to /Users/dirk/github/text-fabric-data/g