# Notebook to add or modify dictionary entries

In [93]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Load the lexical units table. Column names are passed explicitly via
# `names`, so the first line of the file is read as data, not as a header.
colnames = ['id', 'simplified', 'traditional', 'pinyin',
            'english', 'grammar', 'concept_cn', 'concept_en',
            'domain_cn', 'domain_en', 'subdomain_cn', 'subdomain_en',
            'image', 'mp3', 'notes', 'headword']
# Every column is read as a generic Python object (text) except the two
# numeric identifiers. The builtin `object` is used because the `np.object`
# alias was deprecated in NumPy 1.20 and removed in NumPy 1.24.
types = {name: object for name in colnames}
types['id'] = np.uint32
types['headword'] = np.uint32
df = pd.read_table('../data/words.txt', names=colnames, dtype=types)
# Words with no notes: the notes field holds the two-character marker \N.
# It is written '\\N' because a bare '\N' is a syntax error on Python 3, and
# `.loc` replaces the `.ix` indexer that was removed in pandas 1.0.
df.loc[df.notes == '\\N'].head(10)

Unnamed: 0,id,simplified,traditional,pinyin,english,grammar,concept_cn,concept_en,domain_cn,domain_en,subdomain_cn,subdomain_en,image,mp3,notes,headword
286,287,二月,\N,èryuè,February / the Second Month,proper noun,月份,month,时间,Time,日历,calendar,\N,\N,\N,287
287,288,三月,\N,sānyuè,March / the Third Month,proper noun,月份,month,时间,Time,日历,calendar,\N,\N,\N,288
288,289,四月,\N,sìyuè,April / the Fourth Month,proper noun,月份,month,时间,Time,日历,calendar,\N,\N,\N,289
289,290,五月,\N,wǔyuè,May / the Fifth Month,proper noun,月份,month,时间,Time,日历,calendar,\N,\N,\N,290
290,291,六月,\N,liùyuè,June / the Sixth Month,proper noun,月份,month,时间,Time,日历,calendar,\N,\N,\N,291
291,292,七月,\N,qīyuè,July / the Seventh Month,proper noun,月份,month,时间,Time,日历,calendar,\N,qi1yue4.mp3,\N,292
292,293,八月,\N,bāyuè,August / the Eighth Month,proper noun,月份,month,时间,Time,日历,calendar,\N,\N,\N,293
293,294,九月,\N,jiǔyuè,September / the Ninth Month,proper noun,月份,month,时间,Time,日历,calendar,\N,\N,\N,294
294,295,十月,\N,shíyuè,October / the Tenth Month,proper noun,月份,month,时间,Time,日历,calendar,\N,\N,\N,295
295,296,十一月,\N,shíyīyuè,November / the Eleventh Month,proper noun,月份,month,时间,Time,日历,calendar,\N,\N,\N,296


In [None]:
## Look up an entry

## Add a new entry

In [87]:
# Columns to show when checking whether an entry already exists
cols = [
    "id",
    "simplified",
    "traditional",
    "pinyin",
    "english",
    "notes",
]
# Select the rows whose simplified text equals the search string and, in the
# same indexing step, keep only the columns listed above.
df.loc[df["simplified"] == 'yi yue', cols]

Unnamed: 0,id,simplified,traditional,pinyin,english,notes


In [89]:
# Find simplified and pinyin from a traditional text string

# Input
headword = 46047
tradArr = ["一", "有"]
english = [u"as soon as there is"
          ]
grammar = ["phrase"
          ]
# "\N<TAB>\N": an empty Chinese / English pair of fields
empty = u"\\N\t\\N"
concept = [empty
          ]
# Ready-made domain values; each is a Chinese name and an English name
# separated by a tab.
classical = u"古文\tClassical Chinese"
family = u"家\tFamily"
function = u"虚词\tFunction Words"
position = u"位\tPosition"
sound = u"声\tSound"
time = u"时间\tTime"
domain = [time]
subdomain = u"\\N\t\\N"
note = [u""
       ]


def _as_text(value):
  """Return value as text, decoding UTF-8 byte strings when necessary.

  On Python 2 pandas yields byte strings for the text columns; on Python 3
  the values are already text and are returned unchanged.
  """
  if isinstance(value, bytes):
    return value.decode('utf-8')
  return value


# Generated
simplified_parts = []
traditional_parts = []
pinyin_parts = []
for t in tradArr:
  # Filter the table once per lookup rather than once per column.
  rows = df[df.traditional == t]
  trad_col = 'traditional'
  if rows['simplified'].count() == 0:
    # No entry has this text as its traditional form, so look it up as a
    # simplified form and use that same text for the traditional spelling.
    rows = df[df.simplified == t]
    trad_col = 'simplified'
  if rows['simplified'].count() == 0:
    print("%s not found" % t)
    continue
  first = rows.iloc[0]
  simplified_parts.append(_as_text(first['simplified']))
  traditional_parts.append(_as_text(first[trad_col]))
  pinyin_parts.append(_as_text(first['pinyin']))

simplified = "".join(simplified_parts)
traditional = "".join(traditional_parts)
# The combined pinyin is written without any spaces, including spaces that
# occur inside an individual entry's pinyin.
pinyin = "".join(pinyin_parts).replace(" ", "")
# The traditional field is stored as \N when it equals the simplified form.
trad = traditional
if simplified == traditional:
  trad = "\\N"
print(simplified)
print(trad)
print(pinyin)
# Report the list sizes so mismatched per-sense inputs are easy to spot.
print("English: %d" % len(english))
print("Grammar: %d" % len(grammar))
print("Concept: %d" % len(concept))
print("Domain: %d" % len(domain))
print("Note: %d" % len(note))

一有
\N
yīyǒu
English: 1
Grammar: 1
Concept: 1
Domain: 1
Note: 1


In [90]:
# Input - 
#pinyin = u"tài" # override for variant pronunciations

# Modify references as needed
# Repeat this for each lexical unit. See abbreviations.html for the abbreviations.
#bscd = u"BCSD '%s'" % traditional
abc1 = u"ABC '%s' bf 1" % pinyin

ccd1 = u"CCD '%s' 4" % simplified

ced1 = u"CED '%s' 1" % simplified

fe1 = u"FE '%s' 1" % traditional
fe2 = u"FE '%s' 2" % traditional

ghc1 = u"GHC '%s' 1" % simplified

# Cites only the first character of the traditional text
gced = u"GCED '%s'" % traditional[0]

ghdc1 = u"GHDC '%s' 1" % simplified

#fgdb = u"FGDB '%s'" % traditional

k1 = u"Kroll '%s' 1" % traditional

# NOTE(review): named hsk1 but the text cites GHDC - confirm this is intended
hsk1 = u"GHDC '%s' 3" % simplified

ncced1 = u"NCCED '%s' 1" % simplified
#mw = u"MW 'upāya'"
# add more references as needed
# One inner list of references per lexical unit, in the same order as `english`
refArr = [[gced]
         ]

# Check the per-sense lists before printing anything, so that a short list
# cannot leave a partial set of rows on standard out.
for list_name, values in (("grammar", grammar), ("concept", concept),
                          ("domain", domain), ("note", note),
                          ("refArr", refArr)):
  if len(values) < len(english):
    raise ValueError("%s has %d entries but english has %d"
                     % (list_name, len(values), len(english)))

# Print entries to standard out, one tab-separated row per English sense
for i in range(len(english)):
  # Lexical unit ids are consecutive, starting at the headword id.
  luid = headword + i
  # Join the non-empty references with "; " and wrap them in parentheses.
  ref = u"; ".join(r for r in refArr[i] if r != "")
  if ref != "":
    ref = u"(%s)" % ref

  notes = u"%s%s" % (note[i], ref)
  # concept, domain and subdomain each carry two tab-separated fields; the
  # two literal \N fields in the format string fill the image and mp3 columns.
  print(u"%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t\\N\t\\N\t%s\t%d" % (
      luid, simplified, trad, pinyin, english[i], grammar[i],
      concept[i], domain[i], subdomain, notes, headword))

46047	一有	\N	yīyǒu	as soon as there is	phrase	\N	\N	时间	Time	\N	\N	\N	\N	(GCED '一')	46047
