# Notebook for exploring the Chinese-English Buddhist Dictionary

In [28]:
import re

import pandas as pd
import numpy as np

from curation_util import ExtractFromColophon
from curation_util import ExtractWords
from curation_util import InsertIntoVolume
from curation_util import P2englishPN
from curation_util import WriteCollectionEntry
from curation_util import WriteColophon
from korean import getkoreanid
from taisho import geturl
from taisho import saveScrolls

# Load words table
# Column order of the tab-separated words file (the file is read without a header row).
colnames = [
    'id', 'simplified', 'traditional', 'pinyin', 'english', 'grammar',
    'concept_cn', 'concept_en', 'domain_cn', 'domain_en',
    'subdomain_cn', 'subdomain_en', 'image', 'mp3', 'notes', 'headword',
]

# Every column is read as a generic object except the two integer keys and the
# two Chinese text columns, which are overridden below.
types = dict((name, np.object) for name in colnames)
types.update({
    'id': np.uint32,
    'headword': np.uint32,
    'simplified': unicode,
    'traditional': unicode,
})

# Rows are indexed by (headword, id).
index_col = ['headword', 'id']
df = pd.read_table(
    '../data/dictionary/words.txt',
    names=colnames,
    dtype=types,
    index_col=index_col,
)

# Show the main text columns for one sample title as a quick check of the load.
cols = ["simplified", "traditional", "pinyin", "english", "notes"]
sample_rows = df[df.simplified == '藥師琉璃光如來本願功德經']
sample_rows[cols]

Unnamed: 0_level_0,Unnamed: 1_level_0,simplified,traditional,pinyin,english,notes
headword,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


In [29]:
# Conversions for a word based on its traditional characters:
# splits the title into known dictionary words and assembles the simplified
# text, traditional text, pinyin, and a best-guess English name.
# Input
title_cn = u"佛說捺女祗域因緣經"
sanskrit = u"Mātangīsūtra"
english = u""
grammar = "proper noun"
# Pick exactly one concept (Chinese<TAB>English); the others stay commented out.
#concept = u"\\N\t\\N"
#concept = u"佛\tBuddha"
#concept = u"菩萨\tBodhisattva"
concept = u"经\tSūtra"
#subdomain = u"中国佛教\tChinese Buddhism"
#subdomain = u"\\N\t\\N"
subdomain = u"大乘佛教\tMahāyāna Buddhism"

# Generated
# The index is (headword, id), so element 0 of the largest index entry is a
# headword value; the new entry takes the next number after it.
luid = np.amax(df.index.values)[0] + 1
tradArr = ExtractWords(title_cn)
if len(tradArr) == 1:
  print("Existing word")
else:
  print("New word")
domain = u"佛教\tBuddhism"
#domain = u"古文\tClassical Chinese"
traditional = ""
simplified = ""
pinyin = ""
for t in tradArr:
  # Filter once per word; prefer a match on the traditional column.
  rows = df[df.traditional == t]
  if rows['simplified'].count() > 0:
    simplified += rows['simplified'].iloc[0].decode('utf-8')
    traditional += rows['traditional'].iloc[0].decode('utf-8')
    pinyin += rows['pinyin'].iloc[0].decode('utf-8') + " "
    continue
  # Otherwise fall back to a match on the simplified column. The simplified
  # text is then used for both forms (presumably because such entries have no
  # separate traditional form - TODO confirm).
  rows = df[df.simplified == t]
  if rows['simplified'].count() > 0:
    matched = rows['simplified'].iloc[0].decode('utf-8')
    simplified += matched
    traditional += matched
    pinyin += rows['pinyin'].iloc[0].decode('utf-8') + " "
  else:
    print("%s not found" % t)
# Strings are immutable: the stripped value has to be assigned back, otherwise
# the trailing separator space leaks into the printed dictionary row.
pinyin = pinyin.strip()
print(simplified)
# The traditional column is written as \N when it is identical to the simplified text.
trad = traditional
if simplified == traditional:
  trad = "\\N"

# Do some guessing on the best English name
title_en = u""
if english == "":
  english = P2englishPN(pinyin)
if sanskrit != "":
  if english != "":
    # title_en must be built first: it needs the English name before the
    # Sanskrit prefix is added to it.
    title_en = u"%s (%s)" % (sanskrit, english)
    english = u"%s / %s" % (sanskrit, english)
  else:
    english = u"%s" % sanskrit
if title_en == "":
  title_en = english

print(trad)
print(pinyin)
print(english)

New word
佛说捺女祗域因缘经
佛說捺女祗域因緣經
Fó shuō nà nǚ zhī yù yīnyuán jīng 
Mātangīsūtra / Fo Shuo Na Nu Zhi Yu Yinyuan Jing


In [30]:
# Template for a dictionary entry for a Buddhist term:
# parses the colophon, assembles the notes field (equivalents in other
# languages + free-text note + references) and prints one tab-separated row.
# Input
# NOTE(review): the title line below uses 祇 whereas title_cn in the earlier
# cell uses 祗 - confirm which character is intended.
colophon_cn = u"""第 14 冊　No. 0553

佛說捺女祇域因緣經
後漢 安世高譯

共 1 卷"""
(volume, tid, nscrolls, translator, dynasty) = ExtractFromColophon(colophon_cn)
kid = getkoreanid(tid)

pali = u""
japanese = u""

# References
# Candidate citations; only the ones placed in refArr end up in the notes.
abc = u"ABC '%s'" % pinyin
bl = u"BL '%s'" % sanskrit
bscd = u"BCSD '%s'" % traditional
ccd = u"CCD '%s'" % simplified
fe = u"FE '%s'" % traditional
fgdb = u"FGDB '%s'" % traditional
# Slice instead of indexing so an empty `traditional` does not raise IndexError.
gced = u"GCED '%s'" % traditional[:1]
ghdc = u"GHDC '%s'" % simplified
jebd = u"JEBD '%s'" % japanese
kdc = u"KDC %s" % kid
mw = u"MW '%s'" % sanskrit
ncced = u"NCCED '%s'" % simplified
t = u"T %s" % tid
if kid != "0":
  refArr = [kdc, t] # Add based on references checked
else:
  refArr = [t]

# Generated
# Pinyin keeps its spaces only for proper nouns.
if grammar != "proper noun":
  pinyin = pinyin.replace(" ", "")

# Sanskrit, Pali, and Japanese equivalents
# Collect whichever equivalents are present and join them, so the text always
# starts with "From" and never with a stray comma when Sanskrit is missing.
equivalents = []
if sanskrit != "":
  equivalents.append(u"Sanskrit: %s" % sanskrit)
if pali != "":
  equivalents.append(u"Pali: %s" % pali)
if japanese != "":
  equivalents.append(u"Japanese: %s" % japanese)
fromLang = u""
if equivalents:
  fromLang = u"From %s; " % u", ".join(equivalents)

# References
# Non-empty references separated by "; " and wrapped in parentheses.
ref = u"; ".join(r for r in refArr if r != "")
if ref != "":
  ref = u"(%s)" % ref

# NOTE(review): this note describes a bodhisattva, while the concept chosen in
# the earlier cell is a sutra - confirm the text is meant for this entry.
note = u"the name of a bodhisattva who is central in the 《阿差末菩薩經》Akṣayamatinirdeśasūtra in the Chinese Buddhist canon "

notes = u"%s%s%s" % (fromLang, note, ref)
# luid is written twice: once as the id column and once as the headword column.
print(u"%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t\\N\t\\N\t%s\t%d" % (
    luid, simplified, trad, pinyin, english, grammar, concept, domain, subdomain, notes, luid))

46380	佛说捺女祗域因缘经	佛說捺女祗域因緣經	Fó shuō nà nǚ zhī yù yīnyuán jīng 	Mātangīsūtra / Fo Shuo Na Nu Zhi Yu Yinyuan Jing	proper noun	经	Sūtra	佛教	Buddhism	大乘佛教	Mahāyāna Buddhism	\N	\N	From Sanskrit: Mātangīsūtra; the name of a bodhisattva who is central in the 《阿差末菩薩經》Akṣayamatinirdeśasūtra in the Chinese Buddhist canon (KDC 782; T 553)	46380


In [31]:
# Dictionary-entry template for a monastic.
# Reuses the word fields and the reference string (`ref`) built in the earlier
# cells and prints one tab-separated dictionary row.
concept = u"法师\tMonastic"
daterange = u""

monastic_note = u"%sChinese monastic who translated sutras included in the Chinese Buddhist canon %s"
notes = monastic_note % (daterange, ref)

# luid appears twice: first as the id column, last as the headword column.
row_fields = (luid, simplified, trad, pinyin, english, grammar,
              concept, domain, subdomain, notes, luid)
print(u"%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t\\N\t\\N\t%s\t%d" % row_fields)

46380	佛说捺女祗域因缘经	佛說捺女祗域因緣經	Fó shuō nà nǚ zhī yù yīnyuán jīng 	Mātangīsūtra / Fo Shuo Na Nu Zhi Yu Yinyuan Jing	proper noun	法师	Monastic	佛教	Buddhism	大乘佛教	Mahāyāna Buddhism	\N	\N	Chinese monastic who translated sutras included in the Chinese Buddhist canon (KDC 782; T 553)	46380


In [32]:
# Add a title from the Taisho to the dictionary and corpus
# Template for a Taisho title as a word entry in the dictionary.
# Prints the dictionary row(s), then writes the volume index entry, the
# collection entry, the scroll list, the colophon, and the scroll texts.

# Input
daterange = u"148-170"
url = geturl(volume, tid)
genre = u"jingji"
concept = u"经\tSutra"

# Generated
print(u"Volume %d" % volume)
print(u"nscrolls %d" % nscrolls)
print(u"Translator %s" % translator)
print(u"dynasty %s" % dynasty)

# One tab-separated dictionary row; luid fills both the id and headword columns.
row_format = u"%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t\\N\t\\N\t%s\t%d"

# Add a separate entry if the title begins with 佛說
# NOTE: this branch rebinds luid and simplified, so running the cell again
# without re-running the earlier cells increments / strips a second time.
if traditional.startswith(u"佛說"):
  trad = traditional[2:]
  notes = u"See %s" % trad
  print(row_format % (
    luid, simplified, traditional, pinyin, english, grammar, concept, domain, subdomain, notes, luid))
  luid += 1
  simplified = simplified[2:]
  # Same convention as when the word was first converted: the traditional
  # column is \N when it does not differ from the simplified text.
  if simplified == trad:
    trad = "\\N"
  pinyin = pinyin.replace(u"Fó shuō ", "").strip()
  english = english.replace(u"Fo Shuo ", "").strip()

# Lower-case "the" when the note continues after the "From ...; " prefix.
the = u"the"
if fromLang == "":
  the = u"The"
notes = u"%s%s name of a text in the Chinese Buddhist canon %s" % (
    fromLang, the, ref)
print(row_format % (
    luid, simplified, trad, pinyin, english, grammar, concept, domain, subdomain, notes, luid))
print("")

# Full title of the Taisho entry
# NOTE(review): the literal "t0" prefix gives a four-digit file number only
# when tid has three digits - confirm for texts numbered outside 100-999.
title = u"%s 《%s》" % (title_en, traditional)
title_html = u"          <tr>\n"
title_html += u"            <td>%s</td>\n" % tid
title_html += u"            <td><a href='/taisho/t0%s.html'>%s</a></td>\n" % (tid, title)
title_html += u"          </tr>\n\n"
title_html += u"          {{next_title}}"
InsertIntoVolume(volume, title_html)

# Collection entry
WriteCollectionEntry(tid, title, translator, daterange, genre)

# Write list of scrolls to metadata file
def genSource(i):
  """Source text path for scroll i (scroll number zero-padded to two digits)."""
  return "taisho/t0%s_%02d.txt" % (tid, i)

def genOut(i):
  """Output HTML path for scroll i (scroll number zero-padded to two digits)."""
  return "taisho/t0%s_%02d.html" % (tid, i)

def genTitle(i):
  """Display title for scroll i."""
  return "Scroll %d" % i

scrollsDF = pd.DataFrame({"i": range(1, nscrolls + 1)})
scrollsDF['source'] = scrollsDF['i'].map(genSource)
scrollsDF['output'] = scrollsDF['i'].map(genOut)
scrollsDF['title'] = scrollsDF['i'].map(genTitle)
filename = "../data/corpus/taisho/t0%s.csv" % tid
cols = ['source', 'output', 'title']
# Tab-separated, no header row and no index column.
scrollsDF.to_csv(filename, sep = "\t", header = False, columns = cols, index = False)

# Translation of colophon
WriteColophon(tid, colophon_cn, volume, title_en, traditional, url, nscrolls, kid, sanskrit,
              translator, dynasty, daterange)
saveScrolls(volume, tid, nscrolls, traditional)

Volume 14
nscrolls 1
Translator An Shigao
dynasty Later Han
46380	佛说捺女祗域因缘经	佛說捺女祗域因緣經	Fó shuō nà nǚ zhī yù yīnyuán jīng 	Mātangīsūtra / Fo Shuo Na Nu Zhi Yu Yinyuan Jing	proper noun	经	Sutra	佛教	Buddhism	大乘佛教	Mahāyāna Buddhism	\N	\N	See 捺女祗域因緣經	46380
46381	捺女祗域因缘经	捺女祗域因緣經	nà nǚ zhī yù yīnyuán jīng	Mātangīsūtra / Na Nu Zhi Yu Yinyuan Jing	proper noun	经	Sutra	佛教	Buddhism	大乘佛教	Mahāyāna Buddhism	\N	\N	From Sanskrit: Mātangīsūtra; the name of a text in the Chinese Buddhist canon (KDC 782; T 553)	46381

Writing colophon to ../corpus/taisho/t0553_00.txt
saveScrollFromWeb, title: 佛說捺女祗域因緣經
http://tripitaka.cbeta.org/T14n0553_001
