# Notebook for exploring and curating the Chinese-English Dictionary

In [1]:
import re

import pandas as pd
import numpy as np
# Load words table
# Column names are supplied explicitly to read_table, in file order.
colnames = ['id', 'simplified', 'traditional', 'pinyin',
            'english', 'grammar', 'concept_cn', 'concept_en',
            'domain_cn', 'domain_en', 'subdomain_cn', 'subdomain_en',
            'image', 'mp3', 'notes', 'headword']

# Every column is read as a generic Python object, except the two numeric keys
# and the two Chinese text columns. The builtin `object` is used rather than
# the `np.object` alias: the alias was deprecated in NumPy 1.20 and removed in
# NumPy 1.24, while `object` is the very same type on every NumPy version.
types = {name: object for name in colnames}
types.update({'id': np.uint32, 'headword': np.uint32,
              'simplified': unicode, 'traditional': unicode})

# 'headword' and 'id' become the two-level row index of df, so they are not
# available as ordinary columns (df['id'] / df['headword']) after loading.
index_col = ['headword', 'id']
df = pd.read_table('../data/dictionary/words.txt', names=colnames, dtype=types,
                   index_col=index_col)

# Example row
df[df.simplified == '越']

Unnamed: 0_level_0,Unnamed: 1_level_0,simplified,traditional,pinyin,english,grammar,concept_cn,concept_en,domain_cn,domain_en,subdomain_cn,subdomain_en,image,mp3,notes
headword,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
828,828,越,\N,yuè,more,adverb,\N,\N,数量,Quantity,\N,\N,\N,yue4.mp3,Used in 越A越B constructions to mean as A increa...
828,830,越,\N,yuè,Vietnam,proper noun,国家,Country,地理,Geography,亚洲,Asia,\N,yue4.mp3,Short form for 越南.
828,13695,越,\N,yuè,to get over,verb,\N,\N,行为,Actions,\N,\N,\N,yue4.mp3,\N
828,38269,越,\N,yuè,yue,foreign,\N,\N,外语,Foreign Language,\N,\N,\N,yue4.mp3,Used to transliterate foreign names


In [2]:
# Load fgs_mwe.txt with the same column names and dtypes as the words table.
# No index_col is passed here, so 'id' and 'headword' stay regular columns.
fgs_path = '../data/dictionary/fgs_mwe.txt'
fgs = pd.read_table(fgs_path, names=colnames, dtype=types)

# Number of lexical units
lexical_unit_count = fgs['id'].count()
print(lexical_unit_count)

# Unique headwords
distinct_headwords = fgs['headword'].unique()
print(len(distinct_headwords))

# Example row
is_example_row = fgs.simplified == '度一切苦厄'
fgs[is_example_row]

3307
3236


Unnamed: 0,id,simplified,traditional,pinyin,english,grammar,concept_cn,concept_en,domain_cn,domain_en,subdomain_cn,subdomain_en,image,mp3,notes,headword
2200,1002201,度一切苦厄,\N,dù yī qiē kŭ è,Overcome All Sufferings,phrase,\N,\N,佛教,Buddhism,佛光山,Fo Guang Shan,\N,\N,Venerable Master Hsing Yun's One-Stroke Callig...,1002201


In [19]:
# Conversions for a word based on its traditional characters
# Input
luid = 45859
tradArr = ["阿彌陀", "鼓", "音聲", "王", "陀羅尼", "經"]
sanskrit = u"Aparimitāyurjñānahṛdayadhāraṇī"
english = u"Emituo Gu Yinsheng Wang Tuoluoni Jing"
grammar = "proper noun"
concept = u"\\N\t\\N"
#concept = u"佛\tBuddha"
#subdomain = u"中国佛教\tChinese Buddhism"
subdomain = u"大乘佛教\tMahāyāna Buddhism"
#subdomain = u"\\N\t\\N"

# Generated
domain = u"佛教\tBuddhism"
#domain = u"古文\tClassical Chinese"


def _to_text(value):
  """Return value as text, decoding UTF-8 byte strings and passing text through."""
  if isinstance(value, bytes):
    return value.decode('utf-8')
  return value


def _lookup_component(words, text):
  """Look up one component of the term in the words table.

  The traditional column is searched first. If nothing matches, the simplified
  column is searched and the simplified form is used for both scripts.

  Returns a (simplified, traditional, pinyin) tuple taken from the first
  matching row, or None when the component is in neither column.
  """
  by_traditional = words[words.traditional == text]
  if by_traditional['simplified'].count() > 0:
    return tuple(_to_text(by_traditional[column].iloc[0])
                 for column in ('simplified', 'traditional', 'pinyin'))
  by_simplified = words[words.simplified == text]
  if by_simplified['simplified'].count() > 0:
    simplified_form = _to_text(by_simplified['simplified'].iloc[0])
    return (simplified_form, simplified_form,
            _to_text(by_simplified['pinyin'].iloc[0]))
  return None


simplified_parts = []
traditional_parts = []
pinyin_parts = []
for component in tradArr:
  match = _lookup_component(df, component)
  if match is None:
    # Best effort: report the gap and keep assembling the remaining components.
    print("%s not found" % component)
    continue
  simplified_parts.append(match[0])
  traditional_parts.append(match[1])
  pinyin_parts.append(match[2])

simplified = u"".join(simplified_parts)
traditional = u"".join(traditional_parts)
# Joining with single spaces leaves no trailing space to strip afterwards.
pinyin = u" ".join(pinyin_parts)

print(simplified)
# The traditional column holds the \N placeholder when both scripts are identical.
trad = traditional
if simplified == traditional:
  trad = "\\N"
if sanskrit != "":
  english = u"%s / %s" % (sanskrit, english)
print(trad)
print(pinyin)
print(english)

阿弥陀鼓音声王陀罗尼经
阿彌陀鼓音聲王陀羅尼經
Āmítuó gǔ yīnshēng wáng tuóluóní jīng 
Aparimitāyurjñānahṛdayadhāraṇī / Emituo Gu Yinsheng Wang Tuoluoni Jing


In [20]:
# Template for a dictionary entry for a Buddhist term
# Relies on luid, simplified, traditional, trad, pinyin, english, grammar,
# concept, domain, subdomain and sanskrit set by the conversion cell.
# Input
pali = u""
japanese = u""
# Candidate citation strings, one per source abbreviation; pick the ones that
# were actually consulted when filling in refArr below.
bl = u"BL '%s'" % sanskrit
# NOTE(review): the variable is spelled bscd but the label it emits is BCSD.
bscd = u"BCSD '%s'" % traditional
ccd = u"CCD '%s'" % simplified
fe = u"FE '%s'" % traditional
fgdb = u"FGDB '%s'" % traditional
ghdc = u"GHDC '%s'" % simplified
jebd = u"JEBD '%s'" % japanese
mw = u"MW '%s'" % sanskrit
t = u"T 364"
refArr = [bscd, mw] # Add based on references checked

# Generated
# The dictionary row carries the pinyin without spaces between syllable groups.
pinyin = pinyin.replace(" ", "")

# Sanskrit, Pali, and Japanese equivalents
# Collect the non-empty equivalents first so the text always starts with
# "From <language>:", even when there is no Sanskrit term.
equivalents = [(u"Sanskrit", sanskrit), (u"Pali", pali), (u"Japanese", japanese)]
knownEquivalents = [u"%s: %s" % (language, term)
                    for language, term in equivalents if term != ""]
fromLang = u""
if knownEquivalents:
  fromLang = u"From %s; " % u", ".join(knownEquivalents)

# References
ref = u"; ".join([r for r in refArr if r != ""])
if ref != "":
  ref = u"(%s)" % ref

note = u"a sacred text or speech, a prayer or song of praise "
# With no note text the separator after the equivalents is dropped. This is
# done on a copy: fromLang itself is reused by the Taisho title template and
# must keep its "; " separator there.
notePrefix = fromLang
if note == "":
  notePrefix = fromLang.replace(";", "")

notes = u"%s%s%s" % (notePrefix, note, ref)
print(u"%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t\\N\t\\N\t%s\t%d" % (
    luid, simplified, trad, pinyin, english, grammar, concept, domain, subdomain, notes, luid))

45859	阿弥陀鼓音声王陀罗尼经	阿彌陀鼓音聲王陀羅尼經	Āmítuógǔyīnshēngwángtuóluóníjīng	Aparimitāyurjñānahṛdayadhāraṇī / Emituo Gu Yinsheng Wang Tuoluoni Jing	proper noun	\N	\N	佛教	Buddhism	大乘佛教	Mahāyāna Buddhism	\N	\N	From Sanskrit: Aparimitāyurjñānahṛdayadhāraṇī; a sacred text or speech, a prayer or song of praise (BCSD '阿彌陀鼓音聲王陀羅尼經'; MW 'Aparimitāyurjñānahṛdayadhāraṇī')	45859


In [50]:
# Template for a dictionary entry for a monastic
# Uses luid, simplified, trad, pinyin, english, grammar, domain, subdomain and
# ref as they were left in the kernel by the earlier cells.
concept = u"法师\tMonastic"
daterange = u"-1173; "

# Note text: the date range, then the description, then the reference list.
note_template = u"%sChinese monastic who lived in the Song %s"
notes = note_template % (daterange, ref)

# One tab-separated dictionary row; the two \N fields sit between the
# subdomain and the notes.
row_template = u"%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t\\N\t\\N\t%s\t%d"
row_values = (luid, simplified, trad, pinyin, english, grammar,
              concept, domain, subdomain, notes, luid)
print(row_template % row_values)

45829	王日休	\N	wángrìxiū	Wang Rixiu	proper noun	法师	Monastic	佛教	Buddhism	中国佛教	Chinese Buddhism	\N	\N	-1173; Chinese monastic who lived in the Song (FGDB '王日休'; T 364)	45829


In [21]:
# Add a title from the Taisho to the dictionary and corpus
# Template for a Taisho title as a word entry in the dictionary
# Uses luid, simplified, traditional, trad, pinyin, english, grammar, domain,
# subdomain and fromLang as they were left in the kernel by the earlier cells.

# Input
tid = 370
kid = 443
translator = u"Unknown"
daterange = u"502-557"
genre = u"pureland"
url = u"http://tripitaka.cbeta.org/T12n0370"
nscrolls = 1
volume = 12
dynasty = u"Liang"
datestr = u"2016-09-07"
concept = u"经\tSutra"

# Generated
# The Lancaster catalogue number is cited only when kid is non-zero.
kref = u""
if kid != 0:
  kref = u"; Lancaster 2004, K %d" % kid
notes = u"%sthe name of a text in the Chinese Buddhist canon (T %d%s)" % (
    fromLang, tid, kref)
print(u"%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t\\N\t\\N\t%s\t%d" % (
    luid, simplified, trad, pinyin, english, grammar, concept, domain, subdomain, notes, luid))
print("")

# Full title of the Taisho entry
# NOTE(review): the literal "t0" / "_0" prefixes below give four-digit text
# numbers and two-digit scroll numbers only for 100 <= tid <= 999 and scrolls
# 1-9; confirm the file naming convention before using other values.
title = u"%s 《%s》" % (english, traditional)
print(u"          <tr>")
print(u"            <td>%d</td>" % tid)
print(u"            <td><a href='/taisho/t0%d.html'>%s</a></td>" % (tid, title))
print(u"          </tr>")
print("")

# Collection entry
entry = u"taisho/t0%d.csv\ttaisho/t0%d.html\t%s\tTranslated by %s\ttaisho/t0%d_00.txt\tTaishō\tSūtra\t%s\t%s" % (
    tid, tid, title, translator, tid, daterange, genre)
print(entry)
print("")

# List of scrolls: source file, gloss output file, title
for i in range(1, nscrolls + 1):
  print("taisho/t0%d_0%d.txt\ttaisho/t0%d_0%d.html\tScroll %d" % (tid, i, tid, i, i))
print("")

# Translation of colophon
kReference = u""
if kid != 0:
  kReference = u"Sanskrit title and date %s from Lancaster (Lancaster 2004, 'K %d')" % (daterange, kid)
dynastyRef = u""
if dynasty != u"":
  dynastyRef = u"Translated by %s in the %s in %d scroll(s)" % (translator, dynasty, nscrolls)
print(u"Volume %d, No. %d" % (volume, tid))
print(english)
print(dynastyRef)
print("")
print("<h4>Notes</h4>")
print("")
print(kReference)
print("")
print("English translations: None")
print("")
print("<h4>Primary Source</h4>")
# The volume number comes from the `volume` input, so the citation cannot
# drift out of step with the "Volume ..., No. ..." line printed above.
print(u"%s, 《%s》 '%s,' in <i>Taishō shinshū Daizōkyō</i> 《大正新脩大藏經》, in Takakusu Junjiro, ed., (Tokyo: Taishō Shinshū Daizōkyō Kankōkai, 1988), Vol. %d, No. %d, Accessed %s, <a href='%s'>%s</a>." % (
    translator, traditional, english, volume, tid, datestr, url, url))
print("")
print("<h4>References</h4>")
print("""<ol><li>Lancaster, L.R. 2004, <i>The Korean Buddhist Canon: A Descriptive Catalogue</i>, <a href="http://www.acmuller.net/descriptive_catalogue/">http://www.acmuller.net/descriptive_catalogue</a>.</li></ol>""")

45859	阿弥陀鼓音声王陀罗尼经	阿彌陀鼓音聲王陀羅尼經	Āmítuógǔyīnshēngwángtuóluóníjīng	Aparimitāyurjñānahṛdayadhāraṇī / Emituo Gu Yinsheng Wang Tuoluoni Jing	proper noun	经	Sutra	佛教	Buddhism	大乘佛教	Mahāyāna Buddhism	\N	\N	From Sanskrit: Aparimitāyurjñānahṛdayadhāraṇī; the name of a text in the Chinese Buddhist canon (T 370; Lancaster 2004, K 443)	45859

          <tr>
            <td>370</td>
            <td><a href='/taisho/t0370.html'>Aparimitāyurjñānahṛdayadhāraṇī / Emituo Gu Yinsheng Wang Tuoluoni Jing 《阿彌陀鼓音聲王陀羅尼經》</a></td>
          </tr>

taisho/t0370.csv	taisho/t0370.html	Aparimitāyurjñānahṛdayadhāraṇī / Emituo Gu Yinsheng Wang Tuoluoni Jing 《阿彌陀鼓音聲王陀羅尼經》	Translated by Unknown	taisho/t0370_00.txt	Taishō	Sūtra	502-557	pureland

taisho/t0370_01.txt	taisho/t0370_01.html	Scroll 1

Volume 12, No. 370
Aparimitāyurjñānahṛdayadhāraṇī / Emituo Gu Yinsheng Wang Tuoluoni Jing
Translated by Unknown in the Liang in 1 scroll(s)

<h4>Notes</h4>

Sanskrit title and date 502-557 from Lancaster (Lancaster 2004, 'K 443')

