# Notebook for exploring the Chinese-English Buddhist Dictionary

In [1]:
import re

import pandas as pd
import numpy as np

import curation_util
from korean import getkoreanid
from taisho import geturl
from taisho import saveScrolls

# Load words table
# The file is read with read_table's default separator and no header row;
# column names and dtypes are supplied explicitly.
colnames = ['id', 'simplified', 'traditional', 'pinyin',
            'english', 'grammar', 'concept_cn', 'concept_en',
            'domain_cn', 'domain_en', 'subdomain_cn', 'subdomain_en',
            'image', 'mp3', 'notes', 'headword']
# Every column defaults to the builtin `object` dtype. `np.object` was only an
# alias for the builtin and has been removed from recent NumPy releases, so
# the builtin is used directly. Numeric ids and the two written forms are then
# overridden with their specific types.
types = {name: object for name in colnames}
types.update({'id': np.uint32, 'headword': np.uint32,
              'simplified': unicode, 'traditional': unicode})
index_col = ['headword', 'id']
df = pd.read_table('../data/dictionary/words.txt', names=colnames, dtype=types,
                   index_col=index_col)
# Display a sample lookup by simplified form, restricted to the main columns
cols = ["simplified", "traditional", "pinyin", "english", "notes"]
df[df.simplified == '脡'][cols]

Unnamed: 0_level_0,Unnamed: 1_level_0,simplified,traditional,pinyin,english,notes
headword,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
46847,46847,脡,\N,tǐng,strips of dried meat / jerky,(Unihan '脡')
46847,46863,脡,\N,tǐng,stiff and straight,(Unihan '脡')


In [2]:
# Load an entry from the Taisho canon
# Input: identifier of the text to look up
tid = "983B"

# Generated: the entry is kept for the following cells, which read its
# title, volume, dynasty, translator and scroll count
entry = curation_util.GetEntry(tid)
title_cn = entry["title"]
print(title_cn)


Translator 圓仁請 not in dictionary
GetTranslatorEn: Translator '圓仁請' not found
孔雀經真言等梵本


In [3]:
# Conversions for a word based on its traditional characters
# Input (title_cn comes from the previous cell unless overridden here)
#title_cn = u"胎藏界"
sanskrit = u"Mahāmāyūrīvidyārājñī"
english = u""
grammar = "noun"
# For concept and subdomain the last uncommented assignment wins
concept = u"\\N\t\\N"
#concept = u"佛\tBuddha"
#concept = u"菩萨\tBodhisattva"
concept = u"经\tSūtra"
#subdomain = u"中国佛教\tChinese Buddhism"
subdomain = u"\\N\t\\N"
#subdomain = u"大乘佛教\tMahāyāna Buddhism"
#subdomain = u"密教\tEsoteric Buddhism"

# Generated
# Next id: one more than the count of non-null values in the first column
luid = df.count()[0] + 1 #np.amax(df.index.values)[0] + 1
print(luid)
tradArr = curation_util.ExtractWords(title_cn)
if len(tradArr) == 1:
  print("Existing word")
else:
  print("New word")
domain = u"佛教\tBuddhism"
#domain = u"古文\tClassical Chinese"
traditional = ""
simplified = ""
pinyin = ""
for word in tradArr:
  # Prefer a match on the traditional form. Otherwise fall back to a match on
  # the simplified form, in which case the simplified text is used for both
  # written forms. The table is filtered once per lookup.
  matches = df[df.traditional == word]
  tradCol = 'traditional'
  if matches['simplified'].count() == 0:
    matches = df[df.simplified == word]
    tradCol = 'simplified'
  if matches['simplified'].count() == 0:
    print("%s not found" % word)
    continue
  simplified += matches['simplified'].iloc[0].decode('utf-8')
  traditional += matches[tradCol].iloc[0].decode('utf-8')
  pinyin += matches['pinyin'].iloc[0].decode('utf-8') + " "
# strip() returns a new string, so the result must be assigned back; otherwise
# the trailing space added in the loop is carried into every later use of pinyin
pinyin = pinyin.strip()
print(simplified)
# The traditional column holds \N when it does not differ from the simplified
trad = traditional
if simplified == traditional:
  trad = "\\N"

# Do some guessing on the best English name
title_en = u""
if english == "":
  english = curation_util.P2englishPN(pinyin)
if sanskrit != "":
  if english != "":
    # title_en is built first because it needs the bare English name
    title_en = u"%s (%s)" % (sanskrit, english)
    english = u"%s / %s" % (sanskrit, english)
  else:
    english = u"%s" % sanskrit
if title_en == "":
  title_en = english

print(trad)
print(pinyin)
print(english)

47822
New word
孔雀经真言等梵本
孔雀經真言等梵本
kǒngquè jīng zhēnyán děng Fàn běn 
Mahāmāyūrīvidyārājñī / Kongque Jing Zhenyan Deng Fan Ben


In [4]:
# Template for a dictionary entry for a Buddhist term
# Input
# The template is kept under its own name so that colophon_cn only ever holds
# the formatted colophon.
colophon_template = u"""第 %d 冊　No. %s

%s
%s%s %s%s

共 %d 卷"""
volume = entry["volume"]
dynasty_cn = entry["dynasty"]
dynasty_en = entry["dynasty_en"]
translator_cn = entry["translator"]
translator_en = entry["translator_en"]
nscrolls = entry["nscrolls"]
compiledby_cn = entry["compiledby_cn"]
compiledby_en = entry["compiledby_en"]
# Separate the compiler from the dynasty that follows it in the colophon
if compiledby_cn != "":
  compiledby_cn += " "
how_cn = entry["how_cn"]
how_en = entry["how_en"]
colophon_cn = colophon_template % (
    volume, tid, title_cn, compiledby_cn, dynasty_cn, translator_cn, how_cn, nscrolls)
#(volume, tid, nscrolls, translator_en, dynasty) = curation_util.ExtractFromColophon(colophon_cn)
kid = getkoreanid(tid)

pali = u""
japanese = u""

# References
# A palette of citation strings; only those placed in refArr below are used.
abc = u"ABC '%s'" % pinyin
bl = u"BL '%s'" % sanskrit
bscd = u"BCSD '%s'" % traditional
ccd = u"CCD '%s'" % simplified
fe = u"FE '%s'" % traditional
fgdb = u"FGDB '%s'" % traditional
# Slicing instead of indexing: yields the first character, or an empty string
# rather than an IndexError when no characters were resolved
gced = u"GCED '%s'" % traditional[:1]
ghdc = u"GHDC '%s'" % simplified
jebd = u"JEBD '%s'" % japanese
kdc = u"KDC %s" % kid
mw = u"MW '%s'" % sanskrit
ncced = u"NCCED '%s'" % simplified
t = u"T %s" % tid
# A kid of "0" is treated as "no Korean canon reference"
if kid != "0":
  refArr = [kdc, t] # Add based on references checked
else:
  refArr = [t]

# Generated
if grammar != "proper noun":
  pinyin = pinyin.replace(" ", "")

# Sanskrit, Pali, and Japanese equivalents
# Collected first and joined afterwards so that the text never starts with a
# stray ", " when the Sanskrit is empty but another equivalent is given.
equivalents = []
if sanskrit != "":
  equivalents.append(u"Sanskrit: %s" % sanskrit)
if pali != "":
  equivalents.append(u"Pali: %s" % pali)
if japanese != "":
  equivalents.append(u"Japanese: %s" % japanese)
fromLang = u""
if equivalents:
  fromLang = u"From %s; " % u", ".join(equivalents)

# References
# Non-empty references separated by "; " and wrapped in parentheses
ref = u"; ".join(r for r in refArr if r != "")
if ref != "":
  ref = u"(%s)" % ref

# NOTE(review): this text describes a bodhisattva and a different sūtra than
# the entry loaded above; it looks like it must be edited per entry - confirm.
note = u"the name of a bodhisattva who is central in the 《阿差末菩薩經》Akṣayamatinirdeśasūtra in the Chinese Buddhist canon "

notes = u"%s%s%s" % (fromLang, note, ref)
# One tab-separated row in the column order of the words table; the image and
# mp3 columns are written as \N and the headword equals the id
print(u"%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t\\N\t\\N\t%s\t%d" % (
    luid, simplified, trad, pinyin, english, grammar, concept, domain, subdomain, notes, luid))

47822	孔雀经真言等梵本	孔雀經真言等梵本	kǒngquèjīngzhēnyánděngFànběn	Mahāmāyūrīvidyārājñī / Kongque Jing Zhenyan Deng Fan Ben	noun	经	Sūtra	佛教	Buddhism	\N	\N	\N	\N	From Sanskrit: Mahāmāyūrīvidyārājñī; the name of a bodhisattva who is central in the 《阿差末菩薩經》Akṣayamatinirdeśasūtra in the Chinese Buddhist canon (BL 'Mahāmāyūrīvidyārājñī'; FGDB '孔雀經真言等梵本')	47822


In [5]:
# Template for a dictionary entry for a monastic
# Reuses the word forms, ids and references prepared by the earlier cells and
# only swaps in a monastic-specific concept and note.
#concept = u"法师\tMonastic"
concept = u"\\N\t\\N"
daterange = u""
notes = u"%sBuddhist monastic who translated texts included in the Chinese Buddhist canon %s" % (daterange, ref)
#notes = u"See 曼荼羅"
# One tab-separated row; the two literal \N fields are the image and mp3 columns
rowFormat = u"%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t\\N\t\\N\t%s\t%d"
rowFields = (luid, simplified, trad, pinyin, english, grammar,
             concept, domain, subdomain, notes, luid)
print(rowFormat % rowFields)

47822	孔雀经真言等梵本	孔雀經真言等梵本	kǒngquèjīngzhēnyánděngFànběn	Mahāmāyūrīvidyārājñī / Kongque Jing Zhenyan Deng Fan Ben	noun	\N	\N	佛教	Buddhism	\N	\N	\N	\N	Buddhist monastic who translated texts included in the Chinese Buddhist canon (BL 'Mahāmāyūrīvidyārājñī'; FGDB '孔雀經真言等梵本')	47822


In [6]:
# Add a title from the Taisho to the dictionary and corpus
# Template for a Taisho title as a word entry in the dictionary

# Input
daterange = u""
# The dynasty date range is only looked up when there is no Korean canon id
# and no range was entered by hand above
if kid == "0" and daterange == "":
  dateranges = {"Later Han": "25-220",
                "Liang": "502-557",
                "Liu Song": "420-479",
                "Tang": "618-907",
                "Song": "960-1279",
                "Yuan": "1279-1368",
                "Ming": "1368-1644",
                "Qing": "1644-1911",
                "Korean Goryeo": "918-1392"}
  daterange = dateranges.get(dynasty_en, daterange)
url = geturl(volume, tid)
genre = u"esoteric"
concept = u"经\tSutra"

# One tab-separated row in the column order of the words table; the image and
# mp3 columns are written as \N and the headword equals the id
rowFormat = u"%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t\\N\t\\N\t%s\t%d"

# Add a separate entry if the title begins with 佛說
if traditional[:2] == u"佛說":
  shortTrad = traditional[2:]
  notes = u"See %s" % shortTrad
  print(rowFormat % (
    luid, simplified, traditional, pinyin, english, grammar, concept, domain, subdomain, notes, luid))
  luid += 1
  simplified = simplified[2:]
  # Same convention as for the full title: the traditional column holds \N
  # when it does not differ from the simplified form
  trad = shortTrad
  if simplified == shortTrad:
    trad = "\\N"
  pinyin = pinyin.replace(u"Fó shuō ", "").strip()
  # The term template removes the spaces from pinyin for anything that is not
  # a proper noun, so the prefix may also arrive without spaces
  if pinyin.startswith(u"Fóshuō"):
    pinyin = pinyin[len(u"Fóshuō"):]
  english = english.replace(u"Fo Shuo ", "").strip()

# Capitalize the article only when it opens the note
the = u"the"
if fromLang == "":
  the = u"The"
notes = u"%s%s name of a text in the Chinese Buddhist canon %s" % (
    fromLang, the, ref)
print(rowFormat % (
    luid, simplified, trad, pinyin, english, grammar, concept, domain, subdomain, notes, luid))
print("")

# Full title of the Taisho entry
title = u"%s 《%s》" % (title_en, traditional)
# Table row for the volume page, followed by the placeholder for the next title
title_html = u"".join([
    u"          <tr>\n",
    u"            <td>%s</td>\n" % tid,
    u"            <td><a href='/taisho/t0%s.html'>%s</a></td>\n" % (tid, title),
    u"          </tr>\n\n",
    u"          {{next_title}}",
])
curation_util.InsertIntoVolume(volume, title_html)

# Collection entry
curation_util.WriteCollectionEntry(tid, title, translator_en, daterange, genre, how_en)

# Write list of scrolls to metadata file
# Scroll numbers are zero-padded to at least two digits in the file names
scrollsDF = pd.DataFrame({"i": range(1, nscrolls + 1)})
genSource = lambda i: "taisho/t0%s_%02d.txt" % (tid, i)
scrollsDF['source'] = scrollsDF['i'].map(genSource)
genOut = lambda i: "taisho/t0%s_%02d.html" % (tid, i)
scrollsDF['output'] = scrollsDF['i'].map(genOut)
genTitle = lambda i: "Scroll %d" % i
scrollsDF['title'] = scrollsDF['i'].map(genTitle)
filename = "../data/corpus/taisho/t0%s.csv" % tid
cols = ['source', 'output', 'title']
scrollsDF.to_csv(filename, sep = "\t", header = False, columns = cols, index = False)

# Translation of colophon
curation_util.WriteColophon(tid, colophon_cn, volume, title_en, traditional, url, nscrolls, kid, sanskrit,
              translator_en, dynasty_en, daterange, compiledby_en, how_en)
saveScrolls(volume, tid, nscrolls, traditional)

47822	孔雀经真言等梵本	孔雀經真言等梵本	kǒngquèjīngzhēnyánděngFànběn	Mahāmāyūrīvidyārājñī / Kongque Jing Zhenyan Deng Fan Ben	noun	经	Sutra	佛教	Buddhism	\N	\N	\N	\N	From Sanskrit: Mahāmāyūrīvidyārājñī; the name of a text in the Chinese Buddhist canon (BL 'Mahāmāyūrīvidyārājñī'; FGDB '孔雀經真言等梵本')	47822

Writing colophon to ../corpus/taisho/t0983B_00.txt
saveScrollFromWeb, title: 孔雀經真言等梵本
http://tripitaka.cbeta.org/T19n0983B_001
juanname' '孔雀經真言等梵本', tidStr = '0983B'
start, juanname: 孔雀經真言等梵本, m.group(2): 
