# Reading a node feature in Julia

We read the BHSA feature `g_word_utf8`, which maps nearly half a million integers to Hebrew word occurrences
in the Hebrew Bible.

We measure the execution time of the second run of the last cell, so that warm-up effects (such as compilation on first use) are not counted.

In [5]:
"""
    error(msg)

Write `msg` followed by a newline to standard error and return the number of
bytes written.

This helper only reports the problem; it does not throw. Callers decide for
themselves how to bail out afterwards.

Note: the name coincides with `Base.error`. Because this is a fresh, unqualified
definition, it has to be evaluated before `error` is first used in the module.
"""
function error(msg)
    # `STDERR` was renamed to `stderr` in Julia 0.7 and no longer exists in 1.x.
    return write(stderr, "$msg\n")
end

error (generic function with 1 method)

In [6]:
"""
    valueFromTf(tf)

Undo the escaping of a value read from a TF feature file and return the plain
string.

A doubled backslash stands for one literal backslash, backslash-t for a tab
character and backslash-n for a newline.
"""
function valueFromTf(tf)
    # Split on the doubled backslash first, so that an escaped backslash that
    # happens to be followed by `t` or `n` is not mistaken for a tab/newline escape.
    pieces = split(tf, "\\\\")
    # The three-positional-argument form of `replace` was removed in Julia 1.0;
    # the `pattern => replacement` pair form is the supported spelling.
    decoded = (replace(replace(p, "\\t" => "\t"), "\\n" => "\n") for p in pieces)
    return join(decoded, "\\")
end

valueFromTf (generic function with 1 method)

In [7]:
"""
    setFromSpec(spec)

Expand a node specification into the set of integers it denotes.

`spec` is a comma-separated list whose items are either a single integer
(`"7"`) or a range written as two integers joined by `-` (`"3-5"`). Range
bounds are inclusive and may be given in either order.

Returns a `Set{Int}`. An item that is not a valid integer makes `parse` throw an
`ArgumentError`.
"""
function setFromSpec(spec)
    # All members are parsed as `Int`, so give the set a concrete element type.
    covered = Set{Int}()
    for item in split(spec, ",")
        bounds = split(item, "-")
        if length(bounds) == 1
            push!(covered, parse(Int, item))
        else
            # Only the first two bounds are used; `minmax` accepts reversed ranges.
            lo, hi = minmax(parse(Int, bounds[1]), parse(Int, bounds[2]))
            union!(covered, lo:hi)
        end
    end
    return covered
end

setFromSpec (generic function with 1 method)

In [8]:
"""
    readTf(path)

Read the TF feature file at `path`.

The leading lines that start with `@` are skipped as metadata; they must be
followed by one blank line, after which the rest of the file is handed to
`readDataTf` together with the number of lines consumed so far.

Returns whatever `readDataTf` returns on success. Returns `false` after
reporting through `error` when the file does not exist or the blank line after
the metadata is missing. Those `return false` statements are only reached when
`error` returns instead of throwing, as the reporting helper defined in this
file does.
"""
function readTf(path)
    if !isfile(path)
        error("TF reading: feature file '$path' does not exist")
        return false
    end
    fh = open(path)
    # `finally` closes the handle on every exit path, including an exception
    # raised while the data section is being parsed.
    try
        i = 0
        for line in eachline(fh)
            i += 1
            text = rstrip(line)
            if startswith(text, "@")
                continue
            elseif text != ""
                error("Line $i: missing blank line after metadata")
                return false
            else
                # The blank separator line: the data section starts on the next line.
                break
            end
        end
        return readDataTf(fh, i)
    finally
        close(fh)
    end
end

readTf (generic function with 1 method)

In [9]:
"""
    readDataTf(fh, firstI; isNum=false)

Read the data section of a TF feature file from the open stream `fh`.

Each line holds either a value alone, or a node specification and a value
separated by a tab character. A value without a node specification is assigned
to the node following the highest node of the previous accepted line (starting
at node 1). A node specification is expanded with `setFromSpec` and the value is
assigned to every node in it.

# Arguments
- `fh`: stream positioned at the first data line.
- `firstI`: number of lines already consumed; used only for line numbers in
  error reports.

# Keywords
- `isNum`: when `true`, non-empty values are parsed as `Int` and empty values are
  skipped; when `false` (the default), values are decoded with `valueFromTf`
  and an empty value is stored as the empty string.

# Returns
A tuple `(errors, data)`: the number of rejected lines and a dictionary from
node number to value.
"""
function readDataTf(fh, firstI; isNum=false)
    i = firstI
    implicit_node = 1
    data = Dict{Int,Union{Int,String}}()
    normFields = 2
    errors = 0
    for line in eachline(fh)
        i += 1
        fields = split(rstrip(line, '\n'), "\t")
        lfields = length(fields)
        if lfields > normFields
            error("$(i) : wrongFields")
            errors += 1
            continue
        end
        # `split` always yields at least one field, so the value is the last field
        # whether or not a node specification precedes it.
        nodes = lfields == normFields ? setFromSpec(fields[1]) : Set([implicit_node])
        valTf = fields[end]
        implicit_node = maximum(nodes) + 1
        value = if isNum
            valTf == "" ? nothing : parse(Int, valTf)
        elseif valTf == ""
            ""
        else
            valueFromTf(valTf)
        end
        # An empty numeric value means "no value": leave the nodes unset.
        value === nothing && continue
        for n in nodes
            data[n] = value
        end
    end
    return (errors, data)
end

readDataTf (generic function with 1 method)

In [11]:
# Load one feature and show how many nodes it covers plus the value of node 2.
base = "$(homedir())/text-fabric-data/etcbc/bhsa/tf/c"
feature = "g_word_utf8"
featurePath = "$base/$feature.tf"
# `readTf` returns `false` instead of a tuple when the file is missing or its
# header is malformed; destructuring that directly would throw. Treat it as one
# error with no data so the "no results" branch below is taken.
result = readTf(featurePath)
(errors, data) = result === false ? (1, Dict()) : result
if errors == 0
    print("$(length(data))\n")
    print(data[2])
else
    print("no results")
end

426584
רֵאשִׁ֖ית

Execution time: around 3.5s