# Notebook for exploring and curating the Chinese-English Dictionary

In [66]:
import pandas as pd
import numpy as np
# Load words table
colnames = ['id', 'simplified', 'traditional', 'pinyin', 
            'english', 'grammar', 'concept_cn', 'concept_en', 
            'domain_cn', 'domain_en', 'subdomain_cn', 'subdomain_en',
            'image', 'mp3', 'notes', 'headword']
types = {'id': np.uint32, 'simplified': unicode, 'traditional': unicode, 
        'pinyin': np.object, 'english': np.object, 'grammar': np.object,
        'concept_cn': np.object, 'concept_en': np.object,
        'domain_cn': np.object, 'domain_en': np.object, 'subdomain_cn': np.object,
         'subdomain_en': np.object, 'image': np.object, 'mp3': np.object,
         'notes': np.object, 'headword': np.uint32}
index_col = ['headword', 'id']
df = pd.read_table('../data/dictionary/words.txt', names=colnames, dtype=types,
                  index_col = index_col)
#print df['id'].count() # Number of lexical units, add drop=False to set_index() above

# Unique headwords
#print len(df['headword'].unique())

# Example row
df[df.simplified == '越']
#df.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,simplified,traditional,pinyin,english,grammar,concept_cn,concept_en,domain_cn,domain_en,subdomain_cn,subdomain_en,image,mp3,notes
headword,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
828,828,越,\N,yuè,more,adverb,\N,\N,数量,Quantity,\N,\N,\N,yue4.mp3,Used in 越A越B constructions to mean as A increa...
828,830,越,\N,yuè,Vietnam,proper noun,国家,Country,地理,Geography,亚洲,Asia,\N,yue4.mp3,Short form for 越南.
828,13695,越,\N,yuè,to get over,verb,\N,\N,行为,Actions,\N,\N,\N,yue4.mp3,\N
828,38269,越,\N,yuè,yue,foreign,\N,\N,外语,Foreign Language,\N,\N,\N,yue4.mp3,Used to transliterate foreign names


In [58]:
fgs = pd.read_table('../data/dictionary/fgs_mwe.txt', names=colnames, dtype=types)
print fgs['id'].count() # Number of lexical units

print len(fgs['headword'].unique()) # Unique headwords

# Example row
fgs[fgs.simplified == '度一切苦厄']

3307
3236


Unnamed: 0,id,simplified,traditional,pinyin,english,grammar,concept_cn,concept_en,domain_cn,domain_en,subdomain_cn,subdomain_en,image,mp3,notes,headword
2200,1002201,度一切苦厄,\N,dù yī qiē kŭ è,Overcome All Sufferings,phrase,\N,\N,佛教,Buddhism,佛光山,Fo Guang Shan,\N,\N,Venerable Master Hsing Yun's One-Stroke Callig...,1002201


In [67]:
# Conversions for a word based on a traditional characters
tradArr = ["須摩提", "菩薩", "經"]
traditional = ""
simplified = ""
pinyin = ""
for t in tradArr:
  sDF = df[df.traditional == t]['simplified']
  tDF = df[df.traditional == t]['traditional']
  pDF = df[df.traditional == t]['pinyin']
  if sDF.count() > 0:
    simplified += sDF.iloc[0].decode('utf-8')
    traditional +=tDF.iloc[0].decode('utf-8')
    pinyin += pDF.iloc[0].decode('utf-8') + " "
  else:
    sDF = df[df.simplified == t]['simplified']
    tDF = df[df.simplified == t]['simplified']
    pDF = df[df.simplified == t]['pinyin']
    if sDF.count() > 0:
      simplified += sDF.iloc[0].decode('utf-8')
      traditional += sDF.iloc[0].decode('utf-8')
      pinyin += pDF.iloc[0].decode('utf-8') + " "
    else:
      print "%s not found" % t
pinyin.strip()
print simplified
trad = traditional
if simplified == traditional:
  trad = "\\N"
print trad
print pinyin

须摩提菩萨经
須摩提菩薩經
Xūmótí púsà jīng 


In [60]:
# Template for a dictionary entry for a monastic
grammar = "proper noun"
concept = u"法师\tMonastic"
notes = u"Buddhist monastic"
luid = 45709
english = u"Vīradattagṛhapatiparipṛcchā"
grammar = "proper noun"
domain = u"佛教\tBuddhism"
print u"%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t\\N\t\\N\t%s\t%d" % (
    luid, simplified, trad, pinyin, english, grammar, concept, domain, subdomain, notes, luid)

45709	大乘日子王所问经	大乘日子王所問經	Dàshèng rìzi wáng suǒ wèn jīng 	Vīradattagṛhapatiparipṛcchā	proper noun	法师	Monastic	佛教	Buddhism	大乘佛教	Mahāyāna Buddhism	\N	\N	Buddhist monastic	45709


In [69]:
# Add a title from the Taisho to the dictionary and corpus
# Template for a Taisho title as a word entry in the dictionary

# Change these values
luid = 45715
english = u"Sumatidārikāparipṛcchā (Xumoti Pusa Jing)"
tid = 334
kid = 39
notes = u"From Sanskrit: sumatidārikāparipṛcchā; a text in the Chinese Buddhist canon (T %d; Lancaster 2004, K %d)" % (
    tid, kid)
translator = u"Dharmarakṣa"
daterange = u"265-313"
genre = u"ratnakūṭa"
url = u"http://tripitaka.cbeta.org/T12n0334"
nscrolls = 1
volume = 12
dynasty = u"Western Jin"

grammar = "proper noun"
domain = u"佛教\tBuddhism"
concept = u"经\tSutra"
subdomain = u"大乘佛教\tMahāyāna Buddhism"
print u"%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t\\N\t\\N\t%s\t%d" % (
    luid, simplified, trad, pinyin, english, grammar, concept, domain, subdomain, notes, luid)
print

# Full title of the Taisho entry
title = u"%s 《%s》" % (english, traditional)
print title
print

# Collection entry
entry = u"taisho/t0%d.csv\ttaisho/t0%d.html\t%s\tTranslated by %s\ttaisho/t0%d_00.txt\tTaishō\tSūtra\t%s\t%s" % (
    tid, tid, title, translator, tid, daterange, genre)
print entry
print

# List of scrolls
#print "# Source file, Gloss output file, title"
for i in range(1, nscrolls + 1):
  print "taisho/t0%d_0%d.txt\ttaisho/t0%d_0%d.html\tScroll %d" % (tid, i, tid, i, i)
print

# Translation of colophon
print u"Volume %d, No. %d" % (volume, tid)
print english
print u"Translated by %s in the %s in %d scroll(s)" % (translator, dynasty, 3)
print
print "<h4>Notes</h4>"
print
print u"Sanskrit title and date %s from Lancaster (Lancaster 2004, 'K %d')" % (daterange, kid)
print
print "English translations: None"
print
print "<h4>Primary Source</h4>"
print u"%s, 《%s》 '%s,' in <i>Taishō shinshū Daizōkyō</i> 《大正新脩大藏經》, in Takakusu Junjiro, ed., (Tokyo: Taishō Shinshū Daizōkyō Kankōkai, 1988), Vol. 12, No. %d, Accessed 2016-08-31, <a href='%s'>%s</a>." % (
    translator, traditional, english, tid, url, url)
print
print "<h4>References</h4>"
print """<ol><li>Lancaster, L.R. 2004, <i>The Korean Buddhist Canon: A Descriptive Catalogue</i>, <a href="http://www.acmuller.net/descriptive_catalogue/">http://www.acmuller.net/descriptive_catalogue</a>.</li></ol>"""


45715	须摩提菩萨经	須摩提菩薩經	Xūmótí púsà jīng 	Sumatidārikāparipṛcchā (Xumoti Pusa Jing)	proper noun	经	Sutra	佛教	Buddhism	大乘佛教	Mahāyāna Buddhism	\N	\N	From Sanskrit: sumatidārikāparipṛcchā; a text in the Chinese Buddhist canon (T 334; Lancaster 2004, K 39)	45715

Sumatidārikāparipṛcchā (Xumoti Pusa Jing) 《須摩提菩薩經》

taisho/t0334.csv	taisho/t0334.html	Sumatidārikāparipṛcchā (Xumoti Pusa Jing) 《須摩提菩薩經》	Translated by Dharmarakṣa	taisho/t0334_00.txt	Taishō	Sūtra	265-313	ratnakūṭa

taisho/t0334_01.txt	taisho/t0334_01.html	Scroll 1

Volume 12, No. 334
Sumatidārikāparipṛcchā (Xumoti Pusa Jing)
Translated by Dharmarakṣa in the Western Jin in 3 scroll(s)

<h4>Notes</h4>

Sanskrit title and date 265-313 from Lancaster (Lancaster 2004, 'K 39')

English translations: None

<h4>Primary Source</h4>
Dharmarakṣa, 《須摩提菩薩經》 'Sumatidārikāparipṛcchā (Xumoti Pusa Jing),' in <i>Taishō shinshū Daizōkyō</i> 《大正新脩大藏經》, in Takakusu Junjiro, ed., (Tokyo: Taishō Shinshū Daizōkyō Kankōkai, 1988), Vol. 12, No. 334, Ac