# From LAF-Fabric to Text-Fabric

This notebook converts the ETCBC4C dataset from its LAF-Fabric representation into Text-Fabric feature files.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from laf.fabric import LafFabric
# Create the LAF-Fabric processor; the load cell further down calls fabric.load() on it.
fabric = LafFabric()

  0.00s This is LAF-Fabric 4.8.3
API reference: http://laf-fabric.readthedocs.org/en/latest/texts/API-reference.html
Feature doc: https://shebanq.ancient-data.org/static/docs/featuredoc/texts/welcome.html



# Core etcbc data

This is what came out of the live Emdros database running on the jakob server at the ETCBC, on 2016-11-04.

In [3]:
# Names of the node features to export, one per line.
nodeFeatures = '''
otype
code
det
dist
dist_unit
domain
function
g_cons
g_cons_utf8
g_lex
g_lex_utf8
g_nme
g_nme_utf8
g_pfm
g_pfm_utf8
g_prs
g_prs_utf8
g_uvf
g_uvf_utf8
g_vbe
g_vbe_utf8
g_vbs
g_vbs_utf8
g_voc_lex
g_voc_lex_utf8
g_word
g_word_utf8
gn
is_root
kind
language
lex
lex_utf8
ls
mother_object_type
nme
nu
number
pdp
pfm
prs
prs_gn
prs_nu
prs_ps
ps
qere
qere_utf8
rela
sp
st
tab
trailer
trailer_utf8
txt
typ
uvf
vbe
vbs
vs
vt
book
chapter
label
verse
'''

# Names of the edge features to export, one per line.
# `oslots` is not loaded from LAF-Fabric: it is excluded from the load
# specification below and its data is built separately from the monads.
edgeFeatures = '''
oslots
mother
functional_parent
distributional_parent
'''

nodeFeatureList = nodeFeatures.strip().split()
edgeFeatureList = edgeFeatures.strip().split()

# Feature specifications in the space-separated form passed to fabric.load().
# `monads` is loaded in addition to the exported node features.
lfNodeFeatures = ' '.join(nodeFeatureList + ['monads'])
# Filter the list instead of subtracting sets: a set of strings has no stable
# order across interpreter runs, which made this specification string differ
# from one kernel session to the next.
lfEdgeFeatures = ' '.join(ef for ef in edgeFeatureList if ef != 'oslots')

In [4]:
# Load the selected node and edge features of the etcbc4c source through
# LAF-Fabric, with detailed progress messages.
API = fabric.load(
    'etcbc4c', '--', 'TF',
    dict(
        xmlids=dict(node=False, edge=False),
        features=(lfNodeFeatures, lfEdgeFeatures),
        primary=False,
    ),
    verbose='DETAIL',
)
# Executes the code template supplied by LAF-Fabric; presumably this is what
# defines the short API names (F, C, NN, ...) that later cells rely on.
exec(fabric.localnames.format(var='fabric'))

  0.00s LOADING API: please wait ... 
  0.00s DETAIL: COMPILING m: etcbc4c: UP TO DATE
  0.00s USING main: etcbc4c DATA COMPILED AT: 2016-11-09T19-16-37
  0.01s DETAIL: load main: G.node_anchor_min
  0.10s DETAIL: load main: G.node_anchor_max
  0.19s DETAIL: load main: G.node_sort
  0.24s DETAIL: load main: G.node_sort_inv
  0.64s DETAIL: load main: G.edges_from
  0.70s DETAIL: load main: G.edges_to
  0.76s DETAIL: load main: F.etcbc4_db_monads [node] 
  1.43s DETAIL: load main: F.etcbc4_db_otype [node] 
  2.01s DETAIL: load main: F.etcbc4_ft_code [node] 
  2.06s DETAIL: load main: F.etcbc4_ft_det [node] 
  2.27s DETAIL: load main: F.etcbc4_ft_dist [node] 
  2.46s DETAIL: load main: F.etcbc4_ft_dist_unit [node] 
  2.70s DETAIL: load main: F.etcbc4_ft_domain [node] 
  2.73s DETAIL: load main: F.etcbc4_ft_function [node] 
  2.84s DETAIL: load main: F.etcbc4_ft_g_cons [node] 
  3.02s DETAIL: load main: F.etcbc4_ft_g_cons_utf8 [node] 
  3.27s DETAIL: load main: F.etcbc4_ft_g_lex [node] 
  

In [10]:
import sys
from tf.fabric import Fabric
from tf.helpers import *

In [11]:
# Output directory for the Text-Fabric feature files.
# NOTE(review): this is an absolute, machine-specific path.
TFDIR = '/Users/dirk/github/text-fabric-data/hebrew/etcbc4c'
# Report how many features of each kind are about to be converted.
print(
    '{} node features'.format(len(nodeFeatureList)),
    '{} edge features'.format(len(edgeFeatureList)),
    sep='\n',
)

64 node features
4 edge features


In [12]:
# Point Text-Fabric at the output directory; the TF.save() calls below export there.
TF = Fabric(locations=TFDIR)

  0.00s Looking for available data features:
  0.01s   book                 from /Users/dirk/github/text-fabric-data/hebrew/etcbc4c/book.tf
  0.01s   book@am              from /Users/dirk/github/text-fabric-data/hebrew/etcbc4c/book@am.tf
  0.01s   book@ar              from /Users/dirk/github/text-fabric-data/hebrew/etcbc4c/book@ar.tf
  0.01s   book@bn              from /Users/dirk/github/text-fabric-data/hebrew/etcbc4c/book@bn.tf
  0.01s   book@da              from /Users/dirk/github/text-fabric-data/hebrew/etcbc4c/book@da.tf
  0.01s   book@de              from /Users/dirk/github/text-fabric-data/hebrew/etcbc4c/book@de.tf
  0.01s   book@el              from /Users/dirk/github/text-fabric-data/hebrew/etcbc4c/book@el.tf
  0.01s   book@en              from /Users/dirk/github/text-fabric-data/hebrew/etcbc4c/book@en.tf
  0.02s   book@es              from /Users/dirk/github/text-fabric-data/hebrew/etcbc4c/book@es.tf
  0.02s   book@fa              from /Users/dirk/github/text-fabric-data/hebr

## Preparations

### Map slot numbers
In TF we make sure that the slots go from 0-maxSlot consecutively.
So we have to map the original LAF-Fabric monad numbers
to the node numbers of the words in TF.

In [14]:
# Map each monad number (as an int) to the word node that carries it.
slotsFromMonad = {
    int(F.monads.v(wordNode)): wordNode
    for wordNode in F.otype.s('word')
}

In [15]:
def slotsFromMonadList(mList):
    '''Translate a collection of monad numbers into a set of slots.

    Every monad is looked up in ``slotsFromMonad``; a monad that has no
    entry there is passed through unchanged.
    '''
    slots = set()
    for monad in mList:
        slots.add(slotsFromMonad.get(monad, monad))
    return slots

## oslots

Here is the code that collects the `oslots` edge information, so that it can be written to a compact text file.

In [16]:
# Highest word node and highest node overall.
maxSlot = max(F.otype.s('word'))
maxNode = max(NN())
print('max slot = {:>7}\nmax node = {:>7}'.format(maxSlot, maxNode))

# For every node numbered above the last word node, expand its monads
# specification and translate the monads into slots.
oslotsData = {
    n: slotsFromMonadList(setFromSpec(F.monads.v(n)))
    for n in range(maxSlot + 1, maxNode + 1)
}

max slot =  426580
max node = 1436893


In [17]:
# Per node feature, the lookup table that LAF-Fabric holds for it.
nodeFeatures = {
    featName: F.item[featName].lookup
    for featName in nodeFeatureList
}

In [18]:
# Per edge feature, a mapping from node to the set of keys of its lookup entry.
# `oslots` is not read from LAF-Fabric; it uses the data computed above.
edgeFeatures = {}
for ef in edgeFeatureList:
    edgeFeatures[ef] = (
        oslotsData
        if ef == 'oslots'
        else {n: set(nDict.keys()) for (n, nDict) in C.item[ef].lookup.items()}
    )

In [19]:
# Export all collected node and edge features; the metadata under the
# empty key records where the data came from.
TF.save(
    nodeFeatures=nodeFeatures,
    edgeFeatures=edgeFeatures,
    metaData={'': {'source': 'ETCBC4c via LAF-Fabric'}},
)

    24s Exporting 64 node and 4 edge features to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c:
   |     0.07s T book                 to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c
   |     0.05s T chapter              to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c
   |     0.15s T code                 to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c
   |     0.88s T det                  to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c
   |     1.09s T dist                 to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c
   |     1.02s T dist_unit            to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c
   |     0.20s T domain               to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c
   |     0.43s T function             to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c
   |     0.90s T g_cons               to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c
   |     0.84s T g_cons_utf8          to /Users/dirk/github/text-fabric

# Additional data

## Book names international

For each language in which book names have been translated, we add a feature `book@ll` where 
`ll` is the language code (usually two letters, e.g. `en`; occasionally three, e.g. `syc`).
The feature gives for each book node the name of that book in that language.

In [13]:
from etcbc.blang import booklangs, booknames

In [15]:
# Language table and translated book names for the Hebrew corpus.
langs = booklangs['Hebrew']
names = booknames['Hebrew']
# Book nodes in the order delivered by LAF-Fabric, and the position of each.
books = list(F.otype.s('book'))
bookIndex = {b: i for (i, b) in enumerate(books)}

metaData = {}
nodeFeatures = {}
# One feature `book@<code>` per language, processed in order of language code.
for code in sorted(langs):
    (langEng, langOwn) = langs[code]
    fName = 'book@{}'.format(code)
    print(fName)
    metaData[fName] = {
        'source': 'blang.py in LAF-Fabric',
        'languageCode': code,
        'languageEnglish': langEng,
        'language': langOwn,
    }
    # The name list of a language is indexed by the position of the book.
    nodeFeatures[fName] = {b: names[code][bookIndex[b]] for b in books}

TF.save(nodeFeatures=nodeFeatures, metaData=metaData)

book@am
book@ar
book@bn
book@da
book@de
book@el
book@en
book@es
book@fa
book@fr
book@he
book@hi
book@id
book@ja
book@ko
book@la
book@nl
book@pa
book@pt
book@ru
book@sw
book@syc
book@tr
book@ur
book@yo
book@zh
  0.00s Exporting 26 node and 0 edge features to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c:
   |     0.00s T book@am              to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c
   |     0.00s T book@ar              to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c
   |     0.00s T book@bn              to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c
   |     0.00s T book@da              to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c
   |     0.00s T book@de              to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c
   |     0.00s T book@el              to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c
   |     0.00s T book@en              to /Users/dirk/github/text-fabric-data/hebrew/etcbc4c
   |     0.00s T book@es              to /User