# Notebook to add dictionary entries

In [233]:
import re
import pandas as pd
import numpy as np

# Load the lexical units table: a headerless, tab-separated file with one
# lexical unit per row. Text following a '#' is treated as a comment by the
# parser and skipped.
colnames = ['id', 'simplified', 'traditional', 'pinyin',
            'english', 'grammar', 'concept_cn', 'concept_en',
            'domain_cn', 'domain_en', 'subdomain_cn', 'subdomain_en',
            'image', 'mp3', 'notes', 'headword']
# Every column is read as a plain Python object (str) except the two numeric
# id columns. The builtin `object` is used because the `np.object` alias has
# been removed from NumPy.
types = {name: object for name in colnames}
types['id'] = np.uint32
types['headword'] = np.uint32
df = pd.read_csv('../../buddhist-dictionary/data/dictionary/words.txt', names=colnames,
                 dtype=types, sep='\t', comment='#')
# Words with no notes (the notes column holds the literal text \N).
# `.loc` replaces the removed `.ix` indexer.
df.loc[df.notes == '\\N'].head(5)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


Unnamed: 0,id,simplified,traditional,pinyin,english,grammar,concept_cn,concept_en,domain_cn,domain_en,subdomain_cn,subdomain_en,image,mp3,notes,headword
2959,2961,合法,\N,héfǎ,lawful / legitimate / legal,adjective,\N,\N,现代汉语,Modern Chinese,法律,Law,\N,he2fa3.mp3,\N,2961
2960,2962,权益,權益,quányì,rights and benefits,noun,\N,\N,现代汉语,Modern Chinese,法律,Law,\N,quan2yi4.mp3,\N,2962
2962,2964,监督,監督,jiāndū,to control / to supervise / to inspect,verb,\N,\N,现代汉语,Modern Chinese,工作,Work,\N,jian1du1.mp3,\N,2964
2963,2965,廉洁,廉潔,liánjié,honest,adjective,\N,\N,现代汉语,Modern Chinese,道德,Morality,\N,lian2jie2.mp3,\N,2965
2964,2966,评价,評價,píngjià,evaluation / appreciation / appraisal,noun,\N,\N,现代汉语,Modern Chinese,思想,Thought,\N,ping2jia4.mp3,\N,2966


In [234]:
# Add a new entry
# Find simplified and pinyin from a traditional text string

# Input: the characters of the new term, followed by one English gloss and one
# grammar label per lexical unit to be generated.
tradArr = ["樂", "而", "不", "淫"]
english = ["joyful without being licentious"]
grammar = ["set phrase"]
# Concept options. Each value is "<Chinese label>\t<English label>", so a single
# value fills both the concept_cn and concept_en columns of the output row.
empty = "\\N\t\\N"
animal = "动物\tAnimal"
artist = "艺术家\tArtist"
association = "协会\tAssociation"
bird = "鸟\tBird"
book = "书名\tBook Title"
character = "主角\tCharacter"
city = "城市\tCity"
cl = "类\tClass"
clan = "氏\tClan"
color = "颜色\tColor"
company = "公司\tCompany"
compound = "化合物\tCompound"
constellation = "星座\tConstellation"
county = "县\tCounty"
country = "国家\tCountry"
deity = "神\tDeity"
discipline = "学科\tDiscipline"
# NOTE(review): 餾 looks like a traditional character in an otherwise simplified
# label - confirm against the existing concept_cn values before changing.
distilled = "蒸餾酒\tDistilled Beverage"
district = "地区\tDistrict"
dynasty = "朝代\tDynasty"
# NOTE(review): 黄帝 normally means the Yellow Emperor; the generic word for
# emperor is 皇帝 - confirm against the existing concept_cn values before changing.
emperor = "黄帝\tEmperor"
empress = "皇后\tEmpress"
epoch = "世\tEpoch"
family = "科\tFamily"
festival = "节日\tFestival"
fish = "鱼\tFish"
flower = "花\tFlower"
fruit = "水果\tFruit"
gemstone = "宝石\tGemstone"
genus = "属\tGenus"
geo_period = "地质年代\tGeological Period"
given = "姓名\tGiven Name"
heaven = "天\tHeaven"
industry = "产业\tIndustry"
insect = "昆虫\tInsect"
island = "岛\tIsland"
kingdom = "王国\tKingdom"
lake = "湖\tLake"
language_family = "语系\tLanguage Family"
language = "语言\tLanguage"
library = "图书馆\tLibrary"
mineral = "矿物\tMineral"
minority = "民族\tEthnic Group"
monarch = "君主\tMonarch"
monastic = "法师\tMonastic"
mountain = "山\tMountain"
mountain_range = "山脉\tMountain Range"
movie = "电影\tMovie"
newspaper = "报纸\tNewspaper"
# Named national_park rather than np so that the numpy alias imported in the
# first cell is not overwritten by a string.
national_park = "国家公园\tNational Park"
order = "目\tOrder"
organization = "团体\tOrganization"
peak = "峰\tPeak"
person = "人\tPerson"
place = "地名\tPlace Name"
plant = "植物\tPlant"
poet = "诗人\tPoet"
politician = "政治家\tPolitician"
posthumous = "谥号\tPosthumous Title"
prefecture = "郡\tPrefecture"
province = "省\tProvince"
publisher = "出版者\tPublisher"
reign = "年号\tReign Name"
river = "水名\tRiver"
scholar = "学者\tScholar"
school = "宗\tSchool"
scientist = "科学家\tScientist"
sect = "派别\tSect"
spice = "调料\tSpice"
state = "州\tState"
subphylum = "亚门\tSubphylum"
surname = "姓氏\tSurname"
temple = "寺院\tTemple"
tian = "天\tDeity"
title = "官名\tOfficial Title"
tree = "树\tTree"
tv_series = "电视剧\tTV Series"
vegetable = "蔬菜\tVegetable"
university = "大学\tUniversity"
warlord = "军阀\tWarlord"
writer = "作者\tAuthor"
# One concept per entry in `english`.
concept = [empty]

# Domain options. Each value is "<Chinese label>\t<English label>" and fills
# the domain_cn and domain_en columns of the output row.
art = "艺术\tArt"
buddhism = "佛教\tBuddhism"
commerce = "商务\tCommerce"
common = "俗语\tCommon Saying"
compsci = "计算机学\tComputer Science"
drama = "戏剧\tDrama"
education = "教育\tEducation"
history = "历史\tHistory"
idiom = "成语\tIdiom"
linguistics = "语言学\tLinguistics"
literary = "文言文\tLiterary Chinese"
literature = "文学\tLiterature"
modern = "现代汉语\tModern Chinese"
places = "地方\tPlaces"
politics = "政治\tPolitics"
proverb = "谚语\tProverb"
# One domain per entry in `english`.
domain = [idiom]

# Subdomain defaults to \N in both columns; uncomment one of the options below
# to reassign it.
subdomain = "\\N\t\\N"
#subdomain = u"能力\tAbility"
#subdomain = u"行为\tActions"
#subdomain = u"管理\tAdministration"
#subdomain = u"农业\tAgriculture"
#subdomain = u"人类学\tAnthropology"
#subdomain = u"建筑学\tArchitecture"
#subdomain = u"天文\tAstronomy"
#subdomain = u"占星术\tAstrology"
#subdomain = u"航空\tAviation"
#subdomain = u"澳大利亚\tAustralia"
#subdomain = u"美丽\tBeauty"
#subdomain = u"生物学\tBiology"
#subdomain = u"植物学\tBotany"
#subdomain = u"书法\tCalligraphy"
#subdomain = u"加拿大\tCanada"
#subdomain = u"禅宗\tChan"
#subdomain = u"变化\tChange"
#subdomain = u"特点\tCharacteristic"
#subdomain = u"化学\tChemistry"
#subdomain = u"中国\tChina"
#subdomain = u"中医\tChinese Medicine"
#subdomain = u"基督教\tChristianity"
#subdomain = u"土木\tCivil Engineering"
#subdomain = u"服装\tClothing"
#subdomain = u"颜色\tColor"
#subdomain = u"通讯\tCommunications"
#subdomain = u"比较\tComparison"
#subdomain = u"计算机图形\tComputer Graphics"
#subdomain = u"状况\tCondition"
#subdomain = u"冲突\tConflict"
#subdomain = u"儒家\tConfucianism"
#subdomain=u"容器\tContainer"
#subdomain = u"文化\tCulture"
#subdomain = u"习俗\tCustoms"
#subdomain = u"跳舞\tDancing"
#subdomain = u"方言\tDialect"
#subdomain = u"灾难\tDisaster"
#subdomain = u"戏剧\tDrama"
#subdomain = u"地球科学\tEarth Sciences"
#subdomain = u"电机工程学\tElectrical Engineering"
#subdomain = u"环境\tEnvironment"
#subdomain = u"经济\tEconomics"
#subdomain = u"教育\tEducation"
#subdomain = u"电器\tElectrical Appliances"
#subdomain = u"电\tElectricity"
#subdomain = u"电子工程\tElectronic Engineering"
#subdomain = u"感情\tEmotion"
#subdomain = u"英国\tEngland"
#subdomain = u"昆虫学\tEntomology"
#subdomain = u"欧洲\tEurope"
#subdomain = u"家\tFamily"
#subdomain = u"火\tFire"
#subdomain = u"财会\tFinance and Accounting"
#subdomain = u"饮食\tFood and Drink"
#subdomain = u"形态\tForm"
#subdomain = u"友谊\tFriendship"
#subdomain = u"虚词\tFunction Words"
#subdomain = u"地理\tGeography"
#subdomain = u"地质\tGeology"
#subdomain = u"政府\tGovernment"
#subdomain = u"几何\tGeometry"
#subdomain = u"语法\tGrammar"
#subdomain = u"帮助\tHelp"
#subdomain = u"健康\tHealth"
#subdomain = u"住房\tHousing"
#subdomain = u"幽默\tHumor"
#subdomain = u"卫生\tHygiene"
#subdomain = u"鱼类学\tIchthyology"
#subdomain = u"印度\tIndia"
#subdomain = u"产业\tIndustry"
#subdomain = u"信息\tInformation"
#subdomain = u"信息学\tInformation Science"
#subdomain = u"信息技术\tInformation Technology"
#subdomain = u"回教\tIslam"
#subdomain = u"配饰\tJewelry"
#subdomain = u"日本\tJapan"
#subdomain = u"韩国\tKorea"
#subdomain = u"语言\tLanguage"
#subdomain = u"法律\tLaw"
#subdomain = u"光纤\tLight"
#subdomain = u"休闲\tLeisure"
#subdomain = u"图书馆学\tLibrary Science"
#subdomain = u"生活\tLife"
#subdomain = u"逻辑\tLogic"
#subdomain = u"法术\tMagic"
#subdomain = u"制造业\tManufacturing"
#subdomain  =u"武术\tMartial Arts"
#subdomain = u"材料科学\tMaterials Science"
#subdomain = u"数学\tMathematics"
#subdomain = u"测控\tMeasurement and Control"
#subdomain = u"机械\tMechanical"
#subdomain = u"力学\tMechanics"
#subdomain = u"媒体\tMedia"
#subdomain = u"医疗\tMedicine"
#subdomain = u"气象学\tMeteorology"
#subdomain = u"军事\tMilitary"
#subdomain = u"道德\tMorality"
#subdomain = u"活动\tMovement"
#subdomain = u"音乐\tMusic"
#subdomain = u"神话\tMythology"
#subdomain = u"名字\tNames"
#subdomain = u"大自然\tNature"
#subdomain = u"网络\tNetworking"
#subdomain = u"观察\tObservation"
#subdomain = u"光学\tOptics"
#subdomain = u"有机化学\tOrganic Chemistry"
#subdomain = u"鸟类学\tOrnithology"
#subdomain = u"组织\tOrganization"
#subdomain = u"表演艺术\tPerforming Arts"
#subdomain = u"人\tPeople"
#subdomain = u"哲学\tPhilosophy"
#subdomain = u"语音学\tPhonetics"
#subdomain = u"摄影术\tPhotography"
#subdomain = u"物理\tPhysics"
#subdomain = u"位\tPosition"
#subdomain = u"过程\tProcess"
#subdomain = u"心理学\tPsychology"
#subdomain = u"公安\tPublic Security"
#subdomain = u"出版\tPublishing"
#subdomain = u"数量\tQuantity"
#subdomain = u"研究\tResearch"
#subdomain = u"宗教\tReligion"
#subdomain = u"规模\tScale"
#subdomain = u"科学\tScience"
#subdomain = u"交际\tSocial Interaction"
#subdomain = u"社会\tSociety"
#subdomain = u"社会学\tSocial Science"
#subdomain = u"声\tSound"
#subdomain = u"航天\tSpace Flight"
#subdomain = u"体育\tSport"
#subdomain = u"文具\tStationery"
#subdomain = u"统计学\tStatistics"
#subdomain = u"力气\tStrength"
#subdomain = u"唐\tTang"
#subdomain = u"道教\tTaoism"
#subdomain = u"交通\tTransportation"
#subdomain = u"温度\tTemperature"
#subdomain = u"思想\tThought"
#subdomain = u"时间\tTime"
#subdomain = u"技术\tTechnology"
#subdomain = u"工具\tTools"
#subdomain = u"美国\tUnited States"
#subdomain = u"暴力\tViolence"
#subdomain = u"船\tWatercraft"
#subdomain = u"水\tWater"
#subdomain = u"武器\tWeapons"
#subdomain = u"财富\tWealth"
#subdomain = u"写作\tWriting"
#subdomain = u"工作\tWork"
#subdomain = u"动物学\tZoology"
# Free-text note, one per entry in `english`; the output cell appends the
# reference list to it.
note = [""]

# Generated
# Next lexical unit id. In the table shown above the id is always two ahead of
# the zero-based row position, so (number of rows + 2) is the first unused id.
# NOTE(review): this assumes ids are contiguous - confirm, or switch to
# df['id'].max() + 1.
luid = df['id'].count() + 2
print(luid)
headword = luid
traditional = ""
simplified = ""
pinyin = ""
for t in tradArr:
  # First try the character as a traditional form. The table is filtered once
  # per lookup and the first matching row supplies all three fields.
  rows = df[df.traditional == t]
  if rows['simplified'].count() > 0:
    simplified += rows['simplified'].iloc[0]
    traditional += rows['traditional'].iloc[0]
    pinyin += rows['pinyin'].iloc[0]
    continue
  # No traditional match: look the character up in the simplified column and
  # use that same form for both scripts.
  rows = df[df.simplified == t]
  if rows['simplified'].count() > 0:
    simplified += rows['simplified'].iloc[0]
    traditional += rows['simplified'].iloc[0]
    pinyin += rows['pinyin'].iloc[0]
  else:
    # The character is skipped, so the generated strings will be incomplete.
    print("%s not found" % t)
# The traditional column holds \N when it would merely repeat the simplified form.
trad = traditional
if simplified == traditional:
  trad = "\\N"
# Remove any spaces carried over from the looked-up readings.
pinyin = pinyin.replace(" ", "")
print(simplified)
print(trad)
print(pinyin)
# Sanity check: the five input lists must all have the same length.
print("English: %d" % len(english))
print("Grammar: %d" % len(grammar))
print("Concept: %d" % len(concept))
print("Domain: %d" % len(domain))
print("Note: %d" % len(note))

107686
乐而不淫
樂而不淫
lèérbùyín
English: 1
Grammar: 1
Concept: 1
Domain: 1
Note: 1


In [235]:
# Input -
#pinyin = u"tài" # override for variant pronunciations

# Modify references as needed
# Repeat this for each lexical unit. See abbreviations.html for the abbreviations.
# Each name below is a ready-made citation string built from the generated
# fields; pick the ones that apply when filling in refArr.
aat = "AAT '%s'" % english[0]
abc1 = "ABC '%s' %s 1" % (pinyin, simplified)
abc2 = "ABC '%s' %s 2" % (pinyin, simplified)
bl = "BL '%s'" % pinyin
bcsd = "BCSD '%s'" % traditional
bar = "BAR '%s'" % english[0]
bussmann = "BUS '%s'" % english[0]
cbd = "CBD '%s'" % english[0]
ccd1 = "CCD '%s'" % simplified
ccd2 = "CCD '%s' 2" % simplified
cci = "CCI p. 43"
ced1 = "CED '%s' 1" % simplified
cedb = "CEDB '%s'" % simplified
cedict = "CC-CEDICT '%s'" % traditional
custom = "T 1666"
dcp = "DCP 'Z137'"
ecdi = "ECDI '%s'" % english[0]
fe1 = "FE '%s'" % traditional
fe2 = "FE '%s' 2" % traditional
fgdb = "FGDB '%s'" % traditional
ghc1 = "GHC '%s'" % simplified
ghc2 = "GHC '%s' 2" % simplified
ghc3 = "GHC '%s' 3" % simplified
#gced = u"GCED '%s'" % traditional[0]
ghdc1 = "GHDC '%s'" % simplified
glsib = "GLSIB, p. 114"
guoyu = "Guoyu '%s'" % traditional
handian = "Han Dian '%s'" % simplified
# NOTE(review): named hsk1 but emits a GHDC citation - confirm which is intended.
hsk1 = "GHDC '%s' 3" % simplified
jebd = "JEBD '%s'" % english[0]
k1 = "Kroll '%s'" % traditional
mw = "MW '%s'" % english[0]
ncced1 = "NCCED '%s'" % simplified
ncced2 = "NCCED '%s' 2" % simplified
ncced3 = "NCCED '%s' 3" % simplified
ncced4 = "NCCED '%s' 4" % simplified
# Use the first gloss, like every other English-keyed reference; formatting the
# whole list would embed its repr (brackets and quotes) in the citation.
seekers = "SEEKERS '%s'" % english[0]
soothill = "SH '%s'" % traditional
sun = "Sun 2006, loc. 1582"
tan = "Tan 1996, pp. 29-30"
tgn = "TGN ID: 8201340"
wiki = "Wikipedia '%s'" % traditional
wikt = "Wiktionary '%s'" % simplified
worldcat = "Worldcat Identities '%s'" % simplified
yixing = "Yixing"
# add more references as needed
# One inner list of references per entry in `english`.
refArr = [[cedict, guoyu, handian, wiki]]

# Print entries to standard out, one tab-separated row per English gloss, in
# the same 16-column layout as words.txt (concept, domain and subdomain each
# carry an embedded tab and so fill two columns; image and mp3 are written as \N).
for i in range(len(english)):
  # Consecutive ids starting at the headword id.
  luid = headword + i
  # Join the non-empty references with "; " and wrap them in parentheses.
  ref = "; ".join(r for r in refArr[i] if r != "")
  if ref != "":
    ref = "(%s)" % ref

  notes = "%s%s" % (note[i], ref)
  if notes == "":
    # Keep the table's convention of \N for an empty field rather than
    # emitting a blank notes column.
    notes = "\\N"
  print("%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t\\N\t\\N\t%s\t%d" % (
      luid, simplified, trad, pinyin, english[i], grammar[i],
      concept[i], domain[i], subdomain, notes, headword))

107686	乐而不淫	樂而不淫	lèérbùyín	joyful without being licentious	set phrase	\N	\N	成语	Idiom	\N	\N	\N	\N	(CC-CEDICT '樂而不淫'; Guoyu '樂而不淫'; Han Dian '乐而不淫'; Wikipedia '樂而不淫')	107686
