# Notebook for exploring and curating the Chinese-English Dictionary

In [50]:
import pandas as pd
import numpy as np
# Load words table
# Column order of the tab-separated dictionary file (the file has no header row,
# so the names are supplied explicitly).
colnames = ['id', 'simplified', 'traditional', 'pinyin',
            'english', 'grammar', 'concept_cn', 'concept_en',
            'domain_cn', 'domain_en', 'subdomain_cn', 'subdomain_en',
            'image', 'mp3', 'notes', 'headword']

# Every column is read as a generic Python object unless overridden below.
# The builtin `object` is used rather than `np.object`: the latter was only an
# alias for the builtin and has been removed from newer NumPy releases.
types = {name: object for name in colnames}
types.update({
    'id': np.uint32,
    'simplified': unicode,   # Python 2 text type
    'traditional': unicode,
    'headword': np.uint32,
})

words_path = '../data/dictionary/words.txt'
df = pd.read_table(words_path, names=colnames, dtype=types)

# Number of lexical units
print(df['id'].count())

# Unique headwords (headword is an unsigned integer column, so there are no
# missing values for nunique() to skip)
print(df['headword'].nunique())

# Example row
df[df.simplified == '越']

45697
37196


Unnamed: 0,id,simplified,traditional,pinyin,english,grammar,concept_cn,concept_en,domain_cn,domain_en,subdomain_cn,subdomain_en,image,mp3,notes,headword
827,828,越,\N,yuè,more,adverb,\N,\N,数量,Quantity,\N,\N,\N,yue4.mp3,Used in 越A越B constructions to mean as A increa...,828
829,830,越,\N,yuè,Vietnam,proper noun,国家,Country,地理,Geography,亚洲,Asia,\N,yue4.mp3,Short form for 越南.,828
13694,13695,越,\N,yuè,to get over,verb,\N,\N,行为,Actions,\N,\N,\N,yue4.mp3,\N,828
38268,38269,越,\N,yuè,yue,foreign,\N,\N,外语,Foreign Language,\N,\N,\N,yue4.mp3,Used to transliterate foreign names,828


In [48]:
# Load the fgs_mwe table, reusing the column names and dtypes of the words table
fgs_path = '../data/dictionary/fgs_mwe.txt'
fgs = pd.read_table(fgs_path, names=colnames, dtype=types)

# Number of lexical units
fgs_unit_count = fgs['id'].count()
print(fgs_unit_count)

# Unique headwords
fgs_headwords = fgs['headword'].unique()
print(len(fgs_headwords))

# Example row
fgs[fgs.simplified == '度一切苦厄']

3307
3236


Unnamed: 0,id,simplified,traditional,pinyin,english,grammar,concept_cn,concept_en,domain_cn,domain_en,subdomain_cn,subdomain_en,image,mp3,notes,headword
2200,1002201,度一切苦厄,\N,dù yī qiē kŭ è,Overcome All Sufferings,phrase,\N,\N,佛教,Buddhism,佛光山,Fo Guang Shan,\N,\N,Venerable Master Hsing Yun's One-Stroke Callig...,1002201


In [64]:
# Add a title from the Taisho to the dictionary and corpus
#
# The title is given as a list of traditional-script segments. Each segment is
# looked up in the words table (`df`) and the simplified form, traditional form
# and pinyin of the first matching row are concatenated to build the full title.
tradArr = ["佛說", "須", "賴", "經"]
traditional = ""
simplified = ""
pinyin = ""
for t in tradArr:
    # First try the segment against the traditional column.
    matches = df[df.traditional == t]
    if matches['simplified'].count() > 0:
        simplified += matches['simplified'].iloc[0].decode('utf-8')
        traditional += matches['traditional'].iloc[0].decode('utf-8')
        pinyin += matches['pinyin'].iloc[0].decode('utf-8') + " "
        continue

    # Otherwise fall back to the simplified column. The simplified text is
    # reused as the traditional form here.
    # NOTE(review): presumably these are entries whose traditional form is the
    # same as the simplified one (the traditional column holds \N in the sample
    # rows) -- confirm.
    matches = df[df.simplified == t]
    if matches['simplified'].count() > 0:
        form = matches['simplified'].iloc[0].decode('utf-8')
        simplified += form
        traditional += form
        pinyin += matches['pinyin'].iloc[0].decode('utf-8') + " "
    else:
        print("%s not found" % t)

# strip() returns a new string, so the result has to be assigned back;
# otherwise the trailing separator space stays on the pinyin.
pinyin = pinyin.strip()
print(simplified)
print(traditional)
print(pinyin)

佛说须赖经
佛說須賴經
Fó shuō xū lài jīng 


In [65]:
# Add a Taisho title
# As a word entry in the dictionary
#
# Prints four things, separated by blank lines: a tab-separated word entry, the
# full title, a tab-separated collection entry, and an HTML primary source
# reference. `simplified`, `traditional` and `pinyin` must already be defined.

luid = 45702   # used both as the entry id and as its headword id
tid = 328      # Taisho text number
volume = 12    # Taisho volume containing the text
kid = 0        # number cited as "K" in the notes; NOTE(review): 0 looks like a placeholder -- confirm
english = u"Fo Shuo Xulai Jing"
grammar = "proper noun"
concept = u"经\tSutra"
domain = u"佛教\tBuddhism"
subdomain = u"大乘佛教\tMahāyāna Buddhism"
notes = u"A text in the Chinese Buddhist canon (T %d; Lancaster 2004, K %d)" % (
    tid, kid)
# Strip the pinyin so a stray leading/trailing space cannot end up inside the
# tab-separated field.
word_entry = u"%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t\\N\t\\N\t%s\t%d" % (
    luid, simplified, traditional, pinyin.strip(), english, grammar, concept,
    domain, subdomain, notes, luid)
print(word_entry)
print("")

# Full title of the Taisho entry
title = u"%s 《%s》" % (english, traditional)
print(title)
print("")

# Collection entry
translator = u"Bai Yan"
daterange = u"220-265"
genre = u"ratnakūṭa"
# The text number is zero-padded to four digits (328 -> t0328) so that numbers
# with fewer or more than three digits still give a well-formed file name.
entry = u"taisho/t%04d.csv\ttaisho/t%04d.html\t%s\tTranslated by %s\ttaisho/t%04d_00.txt\tTaishō\tSūtra\t%s\t%s" % (
    tid, tid, title, translator, tid, daterange, genre)
print(entry)
print("")

# Primary source reference
# The URL and the volume number in the citation are derived from `volume` and
# `tid` so they cannot drift out of sync with the values set above.
url = u"http://tripitaka.cbeta.org/T%02dn%04d" % (volume, tid)
reference = u"%s, 《%s》 '%s,' in <i>Taishō shinshū Daizōkyō</i> 《大正新脩大藏經》, in Takakusu Junjiro, ed., (Tokyo: Taishō Shinshū Daizōkyō Kankōkai, 1988), Vol. %d, No. %d, Accessed 2016-08-31, <a href='%s'>%s</a>." % (
    translator, traditional, english, volume, tid, url, url)
print(reference)

45702	佛说须赖经	佛說須賴經	Fó shuō xū lài jīng 	Fo Shuo Xulai Jing	proper noun	经	Sutra	佛教	Buddhism	大乘佛教	Mahāyāna Buddhism	\N	\N	A text in the Chinese Buddhist canon (T 328; Lancaster 2004, K 0)	45702

Fo Shuo Xulai Jing 《佛說須賴經》

taisho/t0328.csv	taisho/t0328.html	Fo Shuo Xulai Jing 《佛說須賴經》	Translated by Bai Yan	taisho/t0328_00.txt	Taishō	Sūtra	220-265	ratnakūṭa

Bai Yan, 《佛說須賴經》 'Fo Shuo Xulai Jing,' in <i>Taishō shinshū Daizōkyō</i> 《大正新脩大藏經》, in Takakusu Junjiro, ed., (Tokyo: Taishō Shinshū Daizōkyō Kankōkai, 1988), Vol. 12, No. 328, Accessed 2016-08-31, <a href='http://tripitaka.cbeta.org/T12n0328'>http://tripitaka.cbeta.org/T12n0328</a>.
