# Reading a node feature in Python

We read the BHSA feature `g_word_utf8`, which maps nearly half a million integers to Hebrew word occurrences
in the Hebrew Bible.

We measure the execution time of a second run of the last cell, so that warm-up effects are not counted.

In [2]:
import os
import sys

In [3]:
def error(msg):
    """Write ``msg``, formatted as text and followed by a newline, to standard error."""
    stream = sys.stderr
    stream.write(f'{msg}\n')

In [4]:
def valueFromTf(tf):
  """Decode the escape sequences in a value string taken from a TF file.

  A doubled backslash stands for one literal backslash, backslash-t for a
  TAB character and backslash-n for a newline. The string is first split on
  doubled backslashes so that an escaped backslash followed by ``t`` or ``n``
  is not mistaken for a TAB or newline escape.
  """
  decoded = []
  for piece in tf.split('\\\\'):
    piece = piece.replace('\\t', '\t')
    piece = piece.replace('\\n', '\n')
    decoded.append(piece)
  # Re-joining with a single backslash restores the escaped backslashes.
  return '\\'.join(decoded)

def setFromSpec(spec):
  """Expand a node specification such as ``'1-3,5'`` into a set of integers.

  The specification is a comma-separated list of items. An item is either a
  single integer or a range ``b-e``, which contributes every integer between
  the two bounds inclusive; the bounds may be given in either order. If an
  item contains more than one ``-``, only the first two parts are used.

  Raises ``ValueError`` (from ``int``) when a part is not an integer literal.
  """
  result = set()
  for part in spec.split(','):
    ends = part.split('-')
    if len(ends) == 1:
      result.add(int(part))
      continue
    first = int(ends[0])
    last = int(ends[1])
    lo, hi = (first, last) if first <= last else (last, first)
    result.update(range(lo, hi + 1))
  return result

In [5]:
def readTf(path):
  """Read a TF feature file and return its data.

  The file is expected to start with metadata lines, each beginning with
  ``@``, followed by one blank line; everything after that blank line is
  handed to ``readDataTf``.

  Returns ``False`` (after reporting through ``error``) when the file does
  not exist or when the first non-metadata line is not blank. Otherwise
  returns whatever ``readDataTf`` returns for the remaining lines.
  """
  if not os.path.exists(path):
    error('TF reading: feature file "{}" does not exist'.format(path))
    return False
  # The context manager closes the file on every exit path, including when
  # decoding the file or parsing the data lines raises an exception.
  with open(path, encoding='utf8') as fh:
    i = 0  # number of lines consumed so far, used for error reporting
    for line in fh:
      i += 1
      text = line.rstrip()
      if text.startswith('@'):
        continue
      if text != '':
        error('Line {}: missing blank line after metadata'.format(i))
        return False
      # Blank separator line reached: the data section starts on the next line.
      break
    return readDataTf(fh, i)

In [6]:
def readDataTf(fh, firstI, isNum=False):
  """Parse the data lines of a TF feature file into a node -> value dict.

  Each line holds one or two TAB-separated fields:

  * two fields: a node specification (expanded by ``setFromSpec``) and a value;
  * one field: just a value, assigned to the implicit node, which is one
    past the highest node of the previous accepted line (starting at 1).

  Lines with more than two fields are reported through ``error``, counted,
  and skipped; they do not advance the implicit node.

  Parameters:
    fh: iterable yielding the data lines (e.g. an open file positioned
      just after the header).
    firstI: number of lines consumed before the data section; only used to
      compute the line numbers shown in error messages.
    isNum: when true, non-empty values are converted with ``int`` and empty
      values are not stored; when false (the default), values are decoded
      with ``valueFromTf`` and empty values are stored as ``''``.

  Returns ``False`` if any line had too many fields, otherwise the dict
  (which may be empty, and is then falsy as well).

  Raises ``ValueError`` if ``isNum`` is true and a value is not an integer
  literal, or if a node specification is malformed.
  """
  i = firstI
  implicit_node = 1
  data = {}
  normFields = 2
  errors = 0
  for line in fh:
    i += 1
    fields = line.rstrip('\n').split('\t')
    lfields = len(fields)
    if lfields > normFields:
      error(f'{i}: wrongFields')
      errors += 1
      continue
    if lfields == normFields:
      nodes = setFromSpec(fields[0])
      valTf = fields[1]
    else:
      # str.split with a separator always yields at least one field, so
      # this branch is exactly the single-field (value only) case.
      nodes = {implicit_node}
      valTf = fields[0]
    # The implicit node advances even when the line ends up storing nothing.
    implicit_node = max(nodes) + 1
    if isNum:
      if valTf == '':
        # An empty numeric value means "no value": nothing to store.
        continue
      value = int(valTf)
    elif valTf == '':
      value = ''
    else:
      value = valueFromTf(valTf)
    for n in nodes:
      data[n] = value
  return not errors and data

In [8]:
# Build the path of the feature file below the user's home directory.
feature = 'g_word_utf8'
base = f'~/text-fabric-data/etcbc/bhsa/tf/c'
featurePath = f'{os.path.expanduser(base)}/{feature}.tf'

# Load the feature; a falsy result means reading failed or produced no data.
result = readTf(featurePath)
if not result:
    print('no results')
else:
    # Show the number of nodes with a value and, as a spot check, the value of node 2.
    print(len(result))
    print(result[2])

426584
רֵאשִׁ֖ית


Execution time: around 1.2s