# Notebook to add or modify dictionary entries

In [583]:
import re
import pandas as pd
import numpy as np

# Load lexical units table
# Column names, in file order; passing them via `names=` means the file is
# read without a header row.
colnames = ['id', 'simplified', 'traditional', 'pinyin',
            'english', 'grammar', 'concept_cn', 'concept_en',
            'domain_cn', 'domain_en', 'subdomain_cn', 'subdomain_en',
            'image', 'mp3', 'notes', 'headword']
# Every column is read as a generic Python object (string) except the two
# numeric ids. The builtin `object` is used instead of the `np.object` alias,
# which was deprecated in NumPy 1.20 and removed in 1.24.
types = {name: object for name in colnames}
types['id'] = np.uint32
types['headword'] = np.uint32
df = pd.read_table('../data/words.txt', names=colnames, dtype=types)
# Words with no notes: the table stores the two-character string backslash-N
# for an empty field. Plain boolean indexing replaces the removed `df.ix`.
df[df.notes == '\\N'].head(5)

Unnamed: 0,id,simplified,traditional,pinyin,english,grammar,concept_cn,concept_en,domain_cn,domain_en,subdomain_cn,subdomain_en,image,mp3,notes,headword
343,344,途中,\N,túzhōng,en route,noun,\N,\N,移动,Movement,\N,\N,\N,tu2zhong1.mp3,\N,344
344,345,停留,\N,tíngliú,stay somewhere temporarily / stop over,verb,\N,\N,状况,Condition,\N,\N,\N,ting2liu2.mp3,\N,345
348,349,重量,\N,zhòngliàng,weight,noun,\N,\N,数量,Quantity,\N,\N,\N,zhong4liang4.mp3,\N,349
351,352,国会议员,國會議員,Guóhuì yìyuán,Member of Congress,pronoun,\N,\N,政治,Politics,\N,\N,\N,guo2hui4yi4yuan2.mp3,\N,352
352,353,超级,超級,chāojí,super,adjective,\N,\N,规模,Scale,\N,\N,\N,chao1ji2.mp3,\N,353


In [584]:
# Look up an entry by its simplified form, showing only the key columns.
cols = ["id", "simplified", "traditional", "pinyin", "english", "notes"]
# Row mask and column list are applied in a single .loc selection.
df.loc[df.simplified == '澧州', cols]

Unnamed: 0,id,simplified,traditional,pinyin,english,notes
51468,51469,澧州,\N,Lǐzhōu,Lizhou,A historic place in present-day Hunan (Tan 199...


In [585]:
# Add a new entry
# Find simplified and pinyin from a traditional text string.
#
# How to use this cell: edit the values under "Input", pick one constant per
# sense for `concept` and `domain`, choose a `subdomain`, then run the cell.
# `english`, `grammar`, `concept`, `domain` and `note` are parallel lists:
# position i of each list describes the i-th sense of the word, and the
# output cell indexes all of them with the same i.

# Input
# Parts of the new word; each part is looked up in the table separately
# (traditional column first, then simplified) and the results are joined.
tradArr = ["市", "中心"]
# One English gloss per sense.
english = [u"city center / downtown"
          ]
# One part of speech per sense.
grammar = ["phrase"
        ]
# Concept
# Each constant is "<Chinese>\t<English>": the embedded tab makes one value
# fill two adjacent columns of the printed row. `empty` is the same pair
# with the table's backslash-N empty-field marker on both sides.
empty = u"\\N\t\\N"
book = u"书名\tBook Title"
city = u"城市\tCity"
county = u"县\tCounty"
monastic = u"法师\tMonastic"
mountain = u"山\tMountain"
peak = u"峰\tPeak"
place = u"地名\tPlace Name"
reign = u"年号\tReign Name"
# One concept per sense.
concept = [empty
          ]

# Domain
# Same "<Chinese>\t<English>" layout as the concept constants.
architecture = u"建筑学\tArchitecture"
art = u"艺术\tArt"
aviation = u"航空\tAviation"
buddhism = u"佛教\tBuddhism"
christianity = u"基督教\tChristianity"
classical = u"古文\tClassical Chinese"
clothing = u"服装\tClothing"
commerce = u"商务\tCommerce"
comparison = u"比较\tComparison"
condition = u"状况\tCondition"
emotion = u"感情\tEmotion"
family = u"家\tFamily"
fire = u"火\tFire"
food = u"饮食\tFood and Drink"
function = u"虚词\tFunction Words"
geography = u"地理\tGeography"
health = u"健康\tHealth"
history = u"历史\tHistory"
idiom = u"成语\tIdiom"
# NOTE(review): 光纤 normally means "optical fibre", not "light" - confirm
# this matches the domain value already used in words.txt before changing it.
light = u"光纤\tLight"
language = u"语言\tLanguage"
law = u"法律\tLaw"
leisure = u"休闲\tLeisure"
media = u"媒体\tMedia"
medical = u"医疗\tMedicine"
military = u"军事\tMilitary"
mythology = u"神话\tMythology"
nature = u"大自然\tNature"
organization = u"组织\tOrganization"
people = u"人\tPeople"
places = u"地方\tPlaces"
poetry = u"诗\tPoetry"
politics = u"政治\tPolitics"
position = u"位\tPosition"
process = u"过程\tProcess"
religion = u"宗教\tReligion"
society = u"社会\tSociety"
sound = u"声\tSound"
spoken = u"口语\tSpoken Language"
taoism = u"道教\tTaoism"
time = u"时间\tTime"
thought = u"思想\tThought"
transport = u"交通\tTransportation"
writing = u"写作\tWriting"
# One domain per sense.
domain = [geography]

# Subdomain: a single value shared by every sense printed for this word.
# Uncomment one of the alternatives below to replace the empty default.
subdomain = u"\\N\t\\N"
#subdomain = u"中国\tChina"
#subdomain = u"中国佛教\tChinese Buddhism"
#subdomain = u"禅宗\tChan"
#subdomain = u"佛光山\tFo Guang Shan"
#subdomain = u"宋朝\tSong"
# One free-text note per sense; the references are appended to it on output.
note = [u""
       ]

# Generated
# Builds `simplified`, `traditional`, `trad` and `pinyin` for the new word by
# looking up each part of `tradArr` in the table, and reserves the next id.

def _as_text(value):
  """Return value as text, decoding UTF-8 bytes (a Python 2 str) if needed."""
  if isinstance(value, bytes):
    return value.decode('utf-8')
  return value

# Next free id: one more than the number of rows that have an id.
luid = int(df['id'].count()) + 1
print(luid)
headword = luid
traditional = ""
simplified = ""
pinyin = ""
for t in tradArr:
  # Try the traditional column first; filter once and reuse the match for
  # every field instead of re-filtering the frame per column.
  matches = df[df.traditional == t]
  found_by_traditional = matches['simplified'].count() > 0
  if not found_by_traditional:
    # Fall back to the simplified column.
    matches = df[df.simplified == t]
  if matches['simplified'].count() == 0:
    print("%s not found" % t)
    continue
  first_simplified = _as_text(matches['simplified'].iloc[0])
  simplified += first_simplified
  if found_by_traditional:
    traditional += _as_text(matches['traditional'].iloc[0])
  else:
    # Found only via the simplified column: the simplified form is used as
    # the traditional form as well.
    traditional += first_simplified
  pinyin += _as_text(matches['pinyin'].iloc[0])
# All spaces are removed from the joined pinyin, including any inside an
# individual part's own pinyin, so no separate strip() is needed.
pinyin = pinyin.replace(" ", "")
# The table stores backslash-N for the traditional form when it is identical
# to the simplified form.
trad = traditional
if simplified == traditional:
  trad = "\\N"
print(simplified)
print(trad)
print(pinyin)
# Lengths of the per-sense lists, for a quick visual check that they agree.
print("English: %d" % len(english))
print("Grammar: %d" % len(grammar))
print("Concept: %d" % len(concept))
print("Domain: %d" % len(domain))
print("Note: %d" % len(note))

52096
市中心
\N
shìzhōngxīn
English: 1
Grammar: 1
Concept: 1
Domain: 1
Note: 1


In [586]:
# Input - 
#pinyin = u"tài" # override for variant pronunciations

# Modify references as needed
# Repeat this for each lexical unit. See abbreviations.html for the abbreviations.
# Each constant below is a ready-made citation string built from the word's
# pinyin, simplified or traditional form; pick the ones that apply in refArr.
#bscd = u"BCSD '%s'" % traditional
abc = u"ABC '%s'" % pinyin
abc2 = u"ABC '%s' bf 2" % pinyin
bl = u"BL '%s'" % pinyin
ccd1 = u"CCD '%s'" % simplified
ced1 = u"CED '%s'" % simplified
cedict = u"CC-CEDICT '%s'" % traditional
fe1 = u"FE '%s'" % traditional
fe2 = u"FE '%s' 2" % traditional
fgdb = u"FGDB '%s'" % traditional
ghc1 = u"GHC '%s'" % simplified
#gced = u"GCED '%s'" % traditional[0]
gced = u"GCED, p. 1370"
ghdc1 = u"GHDC '%s'" % simplified
# NOTE(review): named hsk1 but labelled "GHDC" - possibly a copy of ghdc1;
# confirm the intended abbreviation before relying on it.
hsk1 = u"GHDC '%s' 3" % simplified
k1 = u"Kroll '%s' 1" % traditional
mhy = u"http://www.masterhsingyun.org, accessed 2016-12-11"
ncced1 = u"NCCED '%s'" % simplified
ncced2 = u"NCCED '%s' 2" % simplified
sun = "Sun 2006, loc. 1582"
tan = "Tan 1996, pp. 29-30"
tgn = "TGN ID: 8201340"
wiki = "Wikipedia 'Sui dynasty'"
mw = u"MW 'upāya'"
# add more references as needed
# One list of references per sense, in the same order as `english`.
refArr = [[cedict]
         ]

# Every per-sense list is indexed with the same position as `english`, so
# stop before printing anything if one of them is too short; otherwise the
# loop would fail part-way with a bare IndexError after emitting some rows.
for _list_name, _values in [("grammar", grammar), ("concept", concept),
                            ("domain", domain), ("note", note),
                            ("refArr", refArr)]:
  if len(_values) < len(english):
    raise ValueError("%s has %d item(s) but english has %d"
                     % (_list_name, len(_values), len(english)))

# Print entries to standard out
# One tab-separated row per sense; ids run consecutively from `headword`,
# and every row carries the first id as its headword.
for i in range(len(english)):
  luid = headword + i
  # Join the non-empty references with "; " and wrap them in parentheses.
  ref = u"; ".join(r for r in refArr[i] if r != "")
  if ref != "":
    ref = u"(%s)" % ref

  notes = u"%s%s" % (note[i], ref)
  # The two literal \N fields sit between subdomain and notes in the row.
  print(u"%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t\\N\t\\N\t%s\t%d" % (
      luid, simplified, trad, pinyin, english[i], grammar[i],
      concept[i], domain[i], subdomain, notes, headword))

52096	市中心	\N	shìzhōngxīn	city center / downtown	phrase	\N	\N	地理	Geography	\N	\N	\N	\N	(CC-CEDICT '市中心')	52096
