# Notebook for adding a term to the Chinese-English Buddhist Dictionary

In [1]:
import re

import pandas as pd
import numpy as np

import curation_util
from korean import getkoreanid
from taisho import geturl
from taisho import saveScrolls

# Load the words table into a DataFrame indexed by (headword, id).
# The file is read with explicit column names in the order listed here.
colnames = ['id', 'simplified', 'traditional', 'pinyin',
            'english', 'grammar', 'concept_cn', 'concept_en',
            'domain_cn', 'domain_en', 'subdomain_cn', 'subdomain_en',
            'image', 'mp3', 'notes', 'headword']

# Start with every column as a generic object, then override the two numeric
# id columns and the two Chinese text columns.
types = dict.fromkeys(colnames, np.object)
types.update({
    'id': np.uint32,
    'headword': np.uint32,
    'simplified': unicode,
    'traditional': unicode,
})

index_col = ['headword', 'id']
df = pd.read_table(
    '../data/dictionary/words.txt',
    names=colnames,
    dtype=types,
    index_col=index_col,
)

# Spot check: display the entries for one simplified character.
cols = ["simplified", "traditional", "pinyin", "english", "notes"]
df.loc[df.simplified == '脡', cols]

Unnamed: 0_level_0,Unnamed: 1_level_0,simplified,traditional,pinyin,english,notes
headword,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
46847,46847,脡,\N,tǐng,strips of dried meat / jerky,(Unihan '脡')
46847,46863,脡,\N,tǐng,stiff and straight,(Unihan '脡')


In [2]:
# Conversions for a word based on its traditional characters.
#
# Edit the inputs below, then run the cell. The word is given as a list of
# parts (tradArr); each part is looked up in the words table (df) and the
# simplified form, traditional form and pinyin of the parts are concatenated.
tradArr = ["妙", "莊嚴", "王"]
sanskrit = u""
english = u"King Wonderful Adornment"
grammar = "noun"

# concept, subdomain and domain each hold a Chinese and an English value
# separated by a tab; "\N" marks an empty value. Uncomment one alternative.
concept = u"\\N\t\\N"
#concept = u"书名\tBook Title"
#concept = u"论\tTreatise"
#concept = u"典籍\tCanonical Text"
#concept = u"经疏\tSūtra Commentary"
#concept = u"论疏\tŚastra Commentary"
#concept = u"律疏\tVinaya Commentary"
#concept = u"佛\tBuddha"
#concept = u"菩萨\tBodhisattva"
#concept = u"经\tSūtra"
#concept = u"陀罗尼\tDhāraṇī"
#concept = u"寺院\tTemple"
subdomain = u"\\N\t\\N"
#subdomain = u"中国佛教\tChinese Buddhism"
#subdomain = u"大乘佛教\tMahāyāna Buddhism"
#subdomain = u"密教\tEsoteric Buddhism"
#subdomain = u"印度佛教\tIndian Buddhism"
#subdomain = u"佛光山\tFo Guang Shan"
domain = u"古文\tClassical Chinese"
#domain = u"佛教\tBuddhism"


def _to_text(value):
  """Return value as text, decoding it from UTF-8 when it is a byte string."""
  return value.decode('utf-8') if isinstance(value, bytes) else value


# Generated
# The new id is one past the largest existing id. Deriving it from a row
# count would collide with an existing id as soon as the id sequence has a gap.
existingIds = df.index.get_level_values('id')
luid = int(existingIds.max()) + 1 if len(existingIds) > 0 else 1
print(luid)

traditional = u""
simplified = u""
pinyin = u""
for t in tradArr:
  # Prefer a match on the traditional column. Parts that only appear in the
  # simplified column (their traditional value is not t) are used as their
  # own traditional form.
  rows = df[df.traditional == t]
  tradCol = 'traditional'
  if rows['simplified'].count() == 0:
    rows = df[df.simplified == t]
    tradCol = 'simplified'
  if rows['simplified'].count() == 0:
    # Unknown part: report it and leave it out of the generated forms.
    print("%s not found" % t)
    continue
  # When several entries match, the first one supplies the values.
  simplified += _to_text(rows['simplified'].iloc[0])
  traditional += _to_text(rows[tradCol].iloc[0])
  pinyin += _to_text(rows['pinyin'].iloc[0]) + u" "
pinyin = pinyin.strip()
print(simplified)

# A traditional form identical to the simplified form is written as \N.
trad = "\\N" if simplified == traditional else traditional
if grammar == "proper noun":
  pinyin = pinyin.title()

# Do some guessing on the best English name
title_en = u""
if english == "":
  english = curation_util.P2englishPN(pinyin)
if sanskrit != "":
  if english != "":
    # Both strings are built from the English value as it was before this
    # branch, so title_en must be computed before english is overwritten.
    title_en = u"%s (%s)" % (sanskrit, english)
    english = u"%s / %s" % (sanskrit, english)
  else:
    english = u"%s" % sanskrit
if title_en == "":
  title_en = english

print(trad)
print(pinyin)
print(english)

51335
白鲟
白鱘
bái xún
Chinese paddlefish


In [3]:
# Build the notes field and print the finished tab-separated entry row.
# Relies on values produced by the previous cell: simplified, traditional,
# trad, pinyin, english, sanskrit, grammar, concept, domain, subdomain, luid.

# Input
pali = u""
japanese = u""

# References: one pre-formatted citation per source. Put the ones that apply
# to this entry into refArr.
abc = u"ABC '%s'" % pinyin
bingenheimer = "Bingenheimer 2016"
bl = u"BL '%s'" % english
bsad = u"BSAD ID：PL000000015604"
bscd = u"BCSD '%s'" % traditional
ccd = u"CCD '%s'" % simplified
cedict = u"CC-CEDICT '%s'" % traditional
djbt = u"DJBT '%s'" % traditional
fe = u"FE '%s'" % traditional
fgdb = u"FGDB '%s'" % traditional
gced = u"GCED '%s'" % traditional
ghdc = u"GHDC '%s'" % simplified
jebd = u"JEBD '%s'" % japanese
mw = u"MW '%s'" % sanskrit
ncced = u"NCCED '%s'" % simplified
tgn = u"TGN ID: 1128933"
wc = u"WorldCat '%s'" % english
wiki= u"Wikipedia '%s'" % english
refArr = [wiki]

# Generated
# Pinyin is written without spaces unless the entry is a proper noun. This
# runs after the citations above, so those still use the spaced pinyin.
if grammar != "proper noun":
  pinyin = pinyin.replace(" ", "")

# Sanskrit, Pali, and Japanese equivalents.
# Collect the languages that have a value and join them, so the text always
# starts with "From" even when there is no Sanskrit term.
equivalents = [
    (u"Sanskrit", sanskrit),
    (u"Pali", pali),
    (u"Japanese", japanese),
]
sources = [u"%s: %s" % (lang, term) for lang, term in equivalents if term != ""]
fromLang = (u"From %s; " % u", ".join(sources)) if sources else u""

# References: non-empty citations separated by "; " and wrapped in parentheses
ref = u"; ".join(r for r in refArr if r != "")
if ref != "":
  ref = u"(%s)" % ref

# The trailing space separates the note from the parenthesized references.
note = u"Species: Psephurus gladius "

notes = u"%s%s%s" % (fromLang, note, ref)
#notes = u"See 長江三角洲"
# concept, domain and subdomain each contain a tab and so fill two columns;
# the two literal \N values fall between subdomain and notes.
print(u"%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t\\N\t\\N\t%s\t%d" % (
    luid, simplified, trad, pinyin, english, grammar, concept, domain, subdomain, notes, luid))

51335	白鲟	白鱘	báixún	Chinese paddlefish	noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	Species: Psephurus gladius (Wikipedia 'Chinese paddlefish')	51335
