# Notebook for exploring the Chinese-English Buddhist Dictionary

In [13]:
import re

import pandas as pd
import numpy as np

from curation_util import ExtractFromColophon
from curation_util import ExtractWords
from curation_util import P2englishPN
from curation_util import WriteColophon

# Read the tab-delimited words table into a DataFrame keyed by (headword, id)
colnames = [
    'id', 'simplified', 'traditional', 'pinyin',
    'english', 'grammar', 'concept_cn', 'concept_en',
    'domain_cn', 'domain_en', 'subdomain_cn', 'subdomain_en',
    'image', 'mp3', 'notes', 'headword',
]
# Every column is read as a generic object, except the two numeric keys and
# the two Chinese headword columns, which are overridden below.
types = dict((name, np.object) for name in colnames)
types.update({
    'id': np.uint32,
    'headword': np.uint32,
    'simplified': unicode,
    'traditional': unicode,
})
index_col = ['headword', 'id']
df = pd.read_table(
    '../data/dictionary/words.txt',
    names=colnames,
    dtype=types,
    index_col=index_col,
)

# Sanity check: display the entries whose simplified form is 佛,
# restricted to the columns that are easiest to eyeball.
cols = ["simplified", "traditional", "pinyin", "english", "notes"]
df.loc[df.simplified == '佛', cols]

Unnamed: 0_level_0,Unnamed: 1_level_0,simplified,traditional,pinyin,english,notes
headword,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3618,3618,佛,\N,fó,Buddha / Awakened One,"Sanskrit: buddha, Pali: buddha, Japanese: buts..."
3618,3619,佛,\N,fó,Buddhist / of Buddhism,As a modifier describing a concept that relate...


In [14]:
# Conversions for a word based on a traditional characters
# Input
luid = 46109
tradArr = ExtractWords(u"十方千五百佛名經")
print "len(tradArr) = %d" % len(tradArr)
sanskrit = u""
english = u""
grammar = "proper noun"
concept = u"\\N\t\\N"
#concept = u"佛\tBuddha"
#concept = u"菩萨\tBodhisattva"
concept = u"经\tSūtra"
#subdomain = u"中国佛教\tChinese Buddhism"
#subdomain = u"\\N\t\\N"
subdomain = u"大乘佛教\tMahāyāna Buddhism"

# Generated
domain = u"佛教\tBuddhism"
#domain = u"古文\tClassical Chinese"
traditional = ""
simplified = ""
pinyin = ""
for t in tradArr:
  sDF = df[df.traditional == t]['simplified']
  tDF = df[df.traditional == t]['traditional']
  pDF = df[df.traditional == t]['pinyin']
  if sDF.count() > 0:
    simplified += sDF.iloc[0].decode('utf-8')
    traditional +=tDF.iloc[0].decode('utf-8')
    pinyin += pDF.iloc[0].decode('utf-8') + " "
  else:
    sDF = df[df.simplified == t]['simplified']
    tDF = df[df.simplified == t]['simplified']
    pDF = df[df.simplified == t]['pinyin']
    if sDF.count() > 0:
      simplified += sDF.iloc[0].decode('utf-8')
      traditional += sDF.iloc[0].decode('utf-8')
      pinyin += pDF.iloc[0].decode('utf-8') + " "
    else:
      print "%s not found" % t
pinyin.strip()
print simplified
trad = traditional
if simplified == traditional:
  trad = "\\N"

# Do some guessing on the best English name
if english == "":
  english = P2englishPN(pinyin)
if sanskrit != "":
  if english != "":
    english = u"%s (%s)" % (sanskrit, english)
  else:
    english = u"%s" % sanskrit

print trad
print pinyin
print english

len(tradArr) = 4
十方千五百佛名经
十方千五百佛名經
Shí Fāng qiān wŭ bǎi fó míng jīng  
Shi Fang Qian Wu Bai Fo Ming Jing


In [15]:
# Template for a dictionary entry for a Buddhist term
# Input
tid = 442
kid =0

pali = u""
japanese = u""

# References
abc = u"ABC '%s'" % pinyin
bl = u"BL '%s'" % sanskrit
bscd = u"BCSD '%s'" % traditional
ccd = u"CCD '%s'" % simplified
fe = u"FE '%s'" % traditional
fgdb = u"FGDB '%s'" % traditional
gced = u"GCED '%s'" % traditional[0]
ghdc = u"GHDC '%s'" % simplified
jebd = u"JEBD '%s'" % japanese
kdc = u"KDC %d" % kid
mw = u"MW '%s'" % sanskrit
ncced = u"NCCED '%s'" % simplified
t = u"T %d" % tid
refArr = [kdc, t] # Add based on references checked

# Generated
if grammar != "proper noun":
  pinyin = pinyin.replace(" ", "")

# Sanskrit, Pali, and Japanese equivalents
fromLang = ""
if sanskrit != "":
  fromLang = u"From Sanskrit: %s" % sanskrit
if pali != "":
  fromLang += u", Pali: %s" % pali
if japanese != "":
  fromLang += u", Japanese: %s" % japanese
if fromLang != "":
  fromLang += u"; "

# References
ref = u""
for r in refArr:
  if r != "":
    ref += r + "; "
ref = re.sub("; $", "", ref)
if ref != "":
  ref = u"(%s)" % ref

note = u"the name of a bodhisattva who is central in the 《阿差末菩薩經》Akṣayamatinirdeśasūtra in the Chinese Buddhist canon "

notes = u"%s%s%s" % (fromLang, note, ref)
print u"%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t\\N\t\\N\t%s\t%d" % (
    luid, simplified, trad, pinyin, english, grammar, concept, domain, subdomain, notes, luid)

46109	十方千五百佛名经	十方千五百佛名經	Shí Fāng qiān wŭ bǎi fó míng jīng  	Shi Fang Qian Wu Bai Fo Ming Jing	proper noun	经	Sūtra	佛教	Buddhism	大乘佛教	Mahāyāna Buddhism	\N	\N	the name of a bodhisattva who is central in the 《阿差末菩薩經》Akṣayamatinirdeśasūtra in the Chinese Buddhist canon (KDC 0; T 442)	46109


In [16]:
# Template for a dictionary entry for a monastic
concept = u"法师\tMonastic"
daterange = u"490-589; "
notes = u"%sChinese monastic who translated sutras in the Chinese Buddhist canon %s" % (daterange, ref)
print u"%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t\\N\t\\N\t%s\t%d" % (
    luid, simplified, trad, pinyin, english, grammar, concept, domain, subdomain, notes, luid)

46109	十方千五百佛名经	十方千五百佛名經	Shí Fāng qiān wŭ bǎi fó míng jīng  	Shi Fang Qian Wu Bai Fo Ming Jing	proper noun	法师	Monastic	佛教	Buddhism	大乘佛教	Mahāyāna Buddhism	\N	\N	490-589; Chinese monastic who translated sutras in the Chinese Buddhist canon (KDC 0; T 442)	46109


In [17]:
# Add a title from the Taisho to the dictionary and corpus
# Template for a Taisho title as a word entry in the dictionary

# Input
colophon_cn = u"""第 14 冊　No. 0442

十方千五百佛名經
失譯

共 1 卷"""
(volume, translator, nscrolls) = ExtractFromColophon(colophon_cn)
print u"Volume %d" % volume
print u"Translator %s" % translator
print u"nscrolls %d" % nscrolls
daterange = u""
genre = u"jingji"
url = u"http://tripitaka.cbeta.org/T14n0442"
dynasty = u""
concept = u"经\tSutra"

# Generated
notes = u"%sthe name of a text in the Chinese Buddhist canon %s" % (
    fromLang, ref)
print u"%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t\\N\t\\N\t%s\t%d" % (
    luid, simplified, trad, pinyin, english, grammar, concept, domain, subdomain, notes, luid)
print

# Full title of the Taisho entry
title = u"%s 《%s》" % (english, traditional)
print u"          <tr>"
print u"            <td>%d</td>" % tid
print u"            <td><a href='/taisho/t0%d.html'>%s</a></td>" % (tid, title)
print u"          </tr>"
print

# Collection entry
entry = u"taisho/t0%d.csv\ttaisho/t0%d.html\t%s\tTranslated by %s\ttaisho/t0%d_00.txt\tTaishō\tSūtra\t%s\t%s" % (
    tid, tid, title, translator, tid, daterange, genre)
print entry
print

# Write list of scrolls to metadata file
scrollsDF = pd.DataFrame({"i": range(1, nscrolls + 1)})
genSource = lambda i: "taisho/t0%d_0%d.txt" % (tid, i) if (i < 10) else "taisho/t0%d_%d.txt" % (tid, i)
scrollsDF['source'] = scrollsDF['i'].map(genSource)
genOut = lambda i: "taisho/t0%d_0%d.html" % (tid, i) if (i < 10) else "taisho/t0%d_%d.html" % (tid, i)
scrollsDF['output'] = scrollsDF['i'].map(genOut)
genTitle = lambda i: "Scroll %d" % i
scrollsDF['title'] = scrollsDF['i'].map(genTitle)
filename = "../data/corpus/taisho/t0%d.csv" % tid 
cols = ['source', 'output', 'title']
scrollsDF.to_csv(filename, sep = "\t", header = False, columns = cols, index = False)

# Translation of colophon
WriteColophon(tid, colophon_cn, volume, english, traditional, url, nscrolls, kid, sanskrit, 
              translator, dynasty, daterange)

Line 1: '第 14 冊　No. 0442'
Line 4: '失譯' (2)
Line 6: '共 1 卷'
Volume 14
Translator Unknown
nscrolls 1
46109	十方千五百佛名经	十方千五百佛名經	Shí Fāng qiān wŭ bǎi fó míng jīng  	Shi Fang Qian Wu Bai Fo Ming Jing	proper noun	经	Sutra	佛教	Buddhism	大乘佛教	Mahāyāna Buddhism	\N	\N	the name of a text in the Chinese Buddhist canon (KDC 0; T 442)	46109

          <tr>
            <td>442</td>
            <td><a href='/taisho/t0442.html'>Shi Fang Qian Wu Bai Fo Ming Jing 《十方千五百佛名經》</a></td>
          </tr>

taisho/t0442.csv	taisho/t0442.html	Shi Fang Qian Wu Bai Fo Ming Jing 《十方千五百佛名經》	Translated by Unknown	taisho/t0442_00.txt	Taishō	Sūtra		jingji

Writing colophon to ../corpus/taisho/t0442_00.txt
