In [5]:
import os, re, collections
from glob import glob

from tf.fabric import Fabric
from tf.timestamp import Timestamp

In [6]:
# Local clone of the DCS source repository; the corpus files sit in its `corpora` subdirectory.
REPO = os.path.expanduser('~/github/sanskrit_text_dcs')
TEXT_DIR = REPO + '/corpora'

In [7]:
# Timestamp supplies the indent()/info() progress messages used by the functions below.
tm = Timestamp()
# Text-Fabric handle on the dataset directory; TF.save() later writes the generated features through it.
TF = Fabric('~/github/text-fabric-data/sanskrit/dcs')

This is Text-Fabric 2.3.7
Api reference : https://github.com/ETCBC/text-fabric/wiki/Api
Tutorial      : https://github.com/ETCBC/text-fabric/blob/master/docs/tutorial.ipynb
Data sources  : https://github.com/ETCBC/text-fabric-data
Data docs     : https://etcbc.github.io/text-fabric-data
Shebanq docs  : https://shebanq.ancient-data.org/text
Slack team    : https://shebanq.slack.com/signup
Questions? Ask shebanq@ancient-data.org for an invite to Slack
0 features found and 0 ignored


  0.01s Grid feature "otype" not found in

  0.01s Grid feature "oslots" not found in



  0.01s Grid feature "otext" not found. Working without Text-API



In [16]:
# Names of the features / node types used throughout the conversion.
CHAR, TRAILER, WORD = 'char', 'trailer', 'word'
BOOK, SECTION, LINE = 'book', 'chapter', 'verse'

# Number of slots (characters) generated so far; readText() advances it.
slotNum = 0

# Module-level stores that the functions below fill and reset:
errors = collections.defaultdict(list)        # text name -> list of error messages
nodes = collections.defaultdict(list)         # node type -> list of (firstSlot, lastSlot, features)
nodeFeatures = collections.defaultdict(dict)  # feature name -> {node: value}
edgeFeatures = collections.defaultdict(dict)  # feature name -> {node: value}

def showErrorSummary():
    """Print a sample of the collected errors.

    Only the first three texts (in sorted order of their names) are shown,
    and for each of them only the first three messages.
    """
    sample = [
        message
        for textName in sorted(errors)[:3]
        for message in errors[textName][:3]
    ]
    for message in sample:
        print(message)

def readCorpus():
    """Read every `*.txt` file in TEXT_DIR and report what was found.

    The working directory is changed to TEXT_DIR, all module-level stores
    and the slot counter are reset, and readText() is called for each text
    in sorted order of file name (without extension). Afterwards the number
    of errors (with a short sample) and the slot/node counts are printed.
    """
    global slotNum

    tm.indent(reset=True)
    tm.info('Reading corpus')
    os.chdir(TEXT_DIR)

    # Start from a clean slate so that the function can be re-run.
    for store in (errors, nodeFeatures, edgeFeatures, nodes):
        store.clear()
    slotNum = 0

    textFiles = [os.path.splitext(path)[0] for path in glob('*.txt')]
    textFiles.sort()
    print('{} texts'.format(len(textFiles)))
    for textFile in textFiles:
        readText(textFile)

    if not errors:
        print('No errors')
    else:
        nErrors = sum(len(messages) for messages in errors.values())
        print('There were {} errors'.format(nErrors))
        showErrorSummary()

    summary = '''
{} slots
{} words in source
{} lines
{} sections
{} books
'''
    counts = [len(nodes[nodeType]) for nodeType in (WORD, LINE, SECTION, BOOK)]
    print(summary.format(slotNum, *counts))
    tm.info('Done')

In [17]:
# A text line has the shape
#     <text> / (<section>.<line>)
# with one or more slashes between the text and the reference.
# Raw strings avoid invalid escape sequences such as '\s' in an ordinary literal.
# The dot between section and line number is matched literally, so that a
# reference without a dot, e.g. "(272)", is rejected instead of being split
# at an arbitrary character.
linePat = re.compile(r'^\s*([^/]*)/+\s*\(([^.)]+)\.([^)]+)\)\s*')
# A line consisting of white space only.
emptyLinePat = re.compile(r'^\s*$')

In [18]:
def readText(textFile):
    """Parse one text file and add its slots and nodes to the module-level stores.

    The file `<textFile>.txt` is opened relative to the current working
    directory. White-space-only lines are skipped; lines that do not match
    `linePat` are recorded in `errors[textFile]` (with their 1-based line
    number) and skipped.

    For every matching line:

    * each character of each white-space separated word becomes a new slot,
      with its `char` feature set to the character and its `trailer` feature
      set to '' (or ' ' for the last character of a word);
    * a word node is recorded per word and a line node for the whole line;
    * a section node is recorded whenever the section number changes.

    If at least one line matched, the last section and a book node (named
    after `textFile`) are recorded; otherwise 'Empty book ...' is printed
    and no book node is created.
    """
    global slotNum
    # The texts are Unicode transliterations: state the encoding explicitly
    # instead of relying on the locale's default.
    with open('{}.txt'.format(textFile), encoding='utf8') as f:
        bookName = textFile
        bookStart = slotNum + 1
        curSection = None
        sectionStart = slotNum + 1
        for (n, line) in enumerate(f, 1):
            line = line.rstrip('\n')
            if emptyLinePat.match(line):
                continue
            match = linePat.match(line)
            if not match:
                errors[textFile].append('{}:{} - unexpected line\n\t{}\n'.format(textFile, n, line))
                continue
            text = match.group(1).rstrip()
            # Both groups are mandatory in linePat, so they are never None here.
            sectionNr = match.group(2)
            lineNr = match.group(3)
            if curSection != sectionNr:
                if curSection is not None:
                    # Close the previous section at the last slot generated so far.
                    nodes[SECTION].append((sectionStart, slotNum, {SECTION: curSection, BOOK: bookName}))
                curSection = sectionNr
                sectionStart = slotNum + 1
            lineStart = slotNum + 1
            for word in text.split():
                wordStart = slotNum + 1
                for letter in word:
                    slotNum += 1
                    nodeFeatures[CHAR][slotNum] = letter
                    nodeFeatures[TRAILER][slotNum] = ''
                # The last character of a word carries the separating space.
                nodeFeatures[TRAILER][slotNum] = ' '
                nodes[WORD].append((wordStart, slotNum, {WORD: word}))
            nodes[LINE].append((lineStart, slotNum, {LINE: lineNr, SECTION: curSection, BOOK: bookName}))
        if curSection is None:
            print('Empty book {}'.format(textFile))
        else:
            nodes[SECTION].append((sectionStart, slotNum, {SECTION: curSection, BOOK: bookName}))
            nodes[BOOK].append((bookStart, slotNum, {BOOK: bookName}))

## Conversion notes

190 texts

### Empty Texts

Empty book Gřḍhārthaprakāśaka
Empty book Kaulāvalīnirṇaya
Empty book Mṛgendraṭīkā
Empty book Nyāyacandrikāpaṇjikā
Empty book Śārṅgadharasaṃhitādīpikā
Empty book Tantrasaṃgraha
Empty book Tantrāloka

**Action taken**

These texts are skipped altogether: no book node is created for them.

### Irregular lines

There were 3 errors
Agastīyaratnaparīkṣā:55 - unexpected line
		[... auein Vers / Satzjh] // (27.2)hariśvetaṃ tathā vaṃśe pītaśvetaṃ ca śūkare // (28.1)

Gokarṇapurāṇasāraḥ:185 - unexpected line
		iti śrīskānde gokarṇakhaṇḍe śrīgokarṇamāhātmye sāroddhāre prathamo 'dhyāyaḥ / // (88.1)

Rasādhyāya:130 - unexpected line
		[... auein Vers / Satzjh] // (64.2)tāmrāt sūtaṃ rasāttāmraṃ pātanāya pṛthakkṛtam / (65.1)
        
**Action taken**

Cases 1 and 3: inserted a newline and changed the first `/` into a `~`

Case 2: removed the `//`

In [19]:
# Read all texts under TEXT_DIR and print the error report and the slot/node counts.
readCorpus()

  0.00s Reading corpus
190 texts
Empty book Gřḍhārthaprakāśaka
Empty book Kaulāvalīnirṇaya
Empty book Mṛgendraṭīkā
Empty book Nyāyacandrikāpaṇjikā
Empty book Śārṅgadharasaṃhitādīpikā
Empty book Tantrasaṃgraha
Empty book Tantrāloka
No errors

1161379 slots
136409 words in source
25729 lines
13010 sections
183 books

  0.95s Done


In [24]:
# Metadata per feature; the entry with the empty key holds the metadata of the dataset as a whole.
metaData = {
    '': {
        'createdBy': 'Tylor Neill and Dirk Roorda',
        'name': 'Sanskrit_Corpus_DCS',
        'title': 'Sanskrit Corpus',
        'provenance': '[DCS](http://kjc-fs-cluster.kjc.uni-heidelberg.de/dcs/index.php)',
        'description': 'DCS, the Digital Corpus of Sanskrit, is a searchable collection of lemmatized Sanskrit texts. It offers free internet access to a part of the database of the linguistic program SanskritTagger, which has been under constant development since 1999.',
    },
    'otext': {
        'sectionFeatures': ','.join([BOOK, SECTION, LINE]),
        'sectionTypes': ','.join([BOOK, SECTION, LINE]),
        # Text formats: the characters only, or the characters followed by their trailers.
        'fmt:text-orig-full': '{' + CHAR + '}',
        'fmt:text-orig-segmented': '{' + CHAR + '}{' + TRAILER + '}',
    },
    'otype': {'valueType': 'str'},
    'oslots': {'valueType': 'str'},
    'book@sa': {
        'valueType': 'str',
        'language': 'Saṃskṛtam',
        'languageCode': 'sa',
        'languageEnglish': 'sanskrit',
    },
    'trailer': {'valueType': 'str'},
}

# `book@sa` shares the very same dict object as the `book` feature, so values
# stored under `book` afterwards are visible under `book@sa` as well.
nodeFeatures['book@sa'] = nodeFeatures[BOOK]

# Line and section numbers are declared as integers, all other features as strings.
for featureName in (CHAR, WORD, LINE, SECTION, BOOK):
    if featureName in {LINE, SECTION}:
        featureType = 'int'
    else:
        featureType = 'str'
    metaData.setdefault(featureName, {})['valueType'] = featureType

In [25]:
def computeStatistics():
    """Add the features `freq` and `rank` to all word nodes.

    Word nodes are the nodes whose `otype` is WORD; their word form is the
    value of the WORD feature.

    * `freq` of a node is the number of word nodes with the same word form.
    * `rank` of a node is the number of distinct word forms that are strictly
      more frequent than its own form, so the most frequent form has rank 0
      and forms with equal frequency share a rank.

    Both features are stored as strings in `nodeFeatures` and declared with
    value type 'int' in `metaData`.
    """
    tm.info('Computing statistics')
    wordNodes = [node for (node, nodeType) in nodeFeatures['otype'].items() if nodeType == WORD]
    wordForms = nodeFeatures[WORD]

    freq = collections.Counter(wordForms[w] for w in wordNodes)

    # Walk through the forms by descending frequency (ties ordered by form);
    # a form's rank is the position at which its frequency group starts.
    rank = {}
    curRank = 0
    prevFreq = None
    byFrequency = sorted(freq.items(), key=lambda y: (-y[1], y[0]))
    for (position, (form, n)) in enumerate(byFrequency):
        if n != prevFreq:
            curRank = position
            prevFreq = n
        rank[form] = curRank
    tm.info('Done')

    tm.info('Adding statistics as features')
    for (ft, table) in (('freq', freq), ('rank', rank)):
        metaData.setdefault(ft, {})['valueType'] = 'int'
        nodeFeatures[ft] = {w: str(table[wordForms[w]]) for w in wordNodes}
    tm.info('Done')

In [26]:
def makeTextFabric():
    """Turn the collected slots and nodes into Text-Fabric features and save them.

    Slots 1..slotNum get otype 'letter'. The nodes recorded in `nodes` are
    numbered consecutively after the slots, in the order word, line,
    section, book; for each of them the otype, its own features and its
    `oslots` (the list of slots from start to end inclusive) are stored.
    Then the word statistics are added and everything is written by
    TF.save().
    """
    tm.indent(reset=True)
    tm.info('Generating text-fabric dataset')
    nodeFeatures['otype'] = dict((n, 'letter') for n in range(1, slotNum + 1))
    nodeNum = slotNum
    for nodeType in (WORD, LINE, SECTION, BOOK):
        for (start, end, feats) in nodes[nodeType]:
            nodeNum += 1
            nodeFeatures['otype'][nodeNum] = nodeType
            for (feat, value) in feats.items():
                nodeFeatures[feat][nodeNum] = value
            edgeFeatures['oslots'][nodeNum] = list(range(start, end + 1))
    # (Re)establish the language-specific copy of the book names here: an alias
    # created before this point does not survive readCorpus(), which clears
    # nodeFeatures, and the feature would then silently be missing from the export.
    nodeFeatures['book@sa'] = nodeFeatures[BOOK]
    computeStatistics()
    TF.save(nodeFeatures=nodeFeatures, edgeFeatures=edgeFeatures, metaData=metaData)
    tm.info('Done')

In [27]:
# Build the node and edge features from the corpus read above and save them through TF.
makeTextFabric()

  0.00s Generating text-fabric dataset
  0.64s Computing statistics
  1.20s Done
  1.20s Adding statistics as features
  1.43s Done
  0.00s Exporting 10 node and 1 edge and 1 config features to /Users/dirk/github/text-fabric-data/sanskrit/dcs:
   |     0.08s T book                 to /Users/dirk/github/text-fabric-data/sanskrit/dcs
   |     0.08s T book@sa              to /Users/dirk/github/text-fabric-data/sanskrit/dcs
   |     0.07s T chapter              to /Users/dirk/github/text-fabric-data/sanskrit/dcs
   |     2.01s T char                 to /Users/dirk/github/text-fabric-data/sanskrit/dcs
   |     0.30s T freq                 to /Users/dirk/github/text-fabric-data/sanskrit/dcs
   |     0.67s T otype                to /Users/dirk/github/text-fabric-data/sanskrit/dcs
   |     0.22s T rank                 to /Users/dirk/github/text-fabric-data/sanskrit/dcs
   |     1.70s T trailer              to /Users/dirk/github/text-fabric-data/sanskrit/dcs
   |     0.06s T verse              