# Notebook for importing from the Unihan Database

In [4]:
# Assumes the Unihan database has been downloaded from http://www.unicode.org/Public/UCD/latest/ucd/Unihan.zip
# and placed in the directory ../Unihan
import sys

import pandas as pd
import numpy as np

# Load Unihan Readings table
colnames = ['codepoint', 'fieldname', 'value']
types = {'codepoint': np.string_, 'fieldname': np.string_, 'value': np.string_}
readings = pd.read_table('../../Unihan/Unihan_Readings.txt', names=colnames, dtype=types)
readings.set_index("codepoint")
print(readings['codepoint'].count()) # Number of rows

# Load Unihan Variants table
colnames = ['codepoint', 'fieldname', 'value']
types = {'codepoint': np.string_, 'fieldname': np.string_, 'value': np.string_}
variants = pd.read_table('../../Unihan/Unihan_Variants.txt', names=colnames, dtype=types)
print(variants['codepoint'].count()) # Number of rows

readings.tail()

  # This is added back by InteractiveShellApp.init_path()


187561
12375




Unnamed: 0,codepoint,fieldname,value
187556,U+2F994,kCantonese,fong1
187557,U+2F9B2,kCantonese,kwai4
187558,U+2F9BC,kCantonese,sip3
187559,U+2F9D4,kCantonese,gun3 gwun3
187560,# EOF,,


In [21]:
chinese = u'黽'
cp = 'U+' + hex(ord(chinese)).replace('0x', '').upper()
chineseDf = readings[readings.codepoint == cp]
chineseDf

Unnamed: 0,codepoint,fieldname,value
149974,U+9EFD,kCantonese,man5
149975,U+9EFD,kDefinition,to strive; to endeavor
149976,U+9EFD,kHangul,민
149977,U+9EFD,kHanyuPinyin,"74768.010:měng,mǐn,miǎn,méng"
149978,U+9EFD,kJapaneseKun,TSUTOMERU AOGAERU
149979,U+9EFD,kJapaneseOn,BOU BIN BEN
149980,U+9EFD,kKorean,MIN
149981,U+9EFD,kMandarin,miǎn
149982,U+9EFD,kXHC1983,0785.111:miǎn 0792.081:mǐn


In [3]:
variantsDf = variants[variants.codepoint == cp]
variantsDf

Unnamed: 0,codepoint,fieldname,value
7609,U+8892,kSemanticVariant,"U+8962<kLau,kMatthews"


In [17]:
english = "\\N"
englishDF = chineseDf[chineseDf.fieldname == 'kDefinition']['value']
if len(englishDF) > 0:
    english = englishDF.iloc[0]
print(english)

pinyin = chineseDf[chineseDf.fieldname == 'kMandarin']['value'].iloc[0]
print(pinyin)

# Kinds of variants
notes = ""
c = ""
value = variantsDf[variantsDf.fieldname == 'kSemanticVariant']['value']
if len(value) > 0:
    val = value.iloc[0].split('<')[0]
    val = val.replace("U+", "")
    c = unichr(int(val, 16))
    notes = "Semantic variant: %s " % c
print(notes)

simplified = chinese
traditional = "\\N"
value = variantsDf[variantsDf.fieldname == 'kSimplifiedVariant']['value']
if len(value) > 0:
    val = value.iloc[0].split('<')[0]
    val = val.replace("U+", "")
    simplified = unichr(int(val, 16))
    traditional = chinese
print(simplified)

grammar = "\\N"
if english != "\\N":
    grammar = "noun"
print(grammar)

luid = 57940
notes = "%s (Unihan '%s')" % (notes, chinese)
domain = u"古文\tClassical Chinese"
print(u"%d\t%s\t%s\t%s\t%s\t%s\t\\N\t\\N\t%s\t\\N\t\\N\t\\N\t\\N\t%s\t%d" % (
    luid, simplified, traditional, pinyin, english, grammar, domain, notes, luid))

spawn; roe; fy
kūn



NameError: name 'unichr' is not defined

In [73]:
# Generate lexical entries for the characters in file unknown.txt
luid = 105519
colnames = ['codepoint', 'char']
types = {'char': np.string_, 'codepoint': np.string_}
unknownDF = pd.read_table('unknown.txt', names = colnames, dtype=types, header = None)
unknownDF

Unnamed: 0,codepoint,char
0,U+6BFD,毽
1,U+5B1D,嬝
2,U+9329,錩
3,U+F96D,省
4,U+643E,搾
5,U+5613,嘓
6,U+95B0,閰
7,U+5C95,岕
8,U+F91F,蘭
9,U+F961,率


In [74]:
unknownReadings = pd.merge(unknownDF, readings, on = "codepoint")
unknownReadings.set_index("codepoint")
unknownReadings

Unnamed: 0,codepoint,char,fieldname,value
0,U+6BFD,毽,kCantonese,gin3 jin2
1,U+6BFD,毽,kDefinition,a shuttlecock
2,U+6BFD,毽,kHanyuPinyin,32001.050:jiàn
3,U+6BFD,毽,kMandarin,jiàn
4,U+6BFD,毽,kXHC1983,0558.030:jiàn
5,U+5B1D,嬝,kCantonese,niu5
6,U+5B1D,嬝,kDefinition,delicate; graceful
7,U+5B1D,嬝,kJapaneseKun,TAOYAKA
8,U+5B1D,嬝,kJapaneseOn,JOU
9,U+5B1D,嬝,kMandarin,niǎo


In [75]:
try:
  pivoted = unknownReadings.pivot("codepoint", "fieldname", "value")
  del pivoted["kCantonese"]
  del pivoted["kHanyuPinyin"]
  del pivoted["kJapaneseKun"]
  del pivoted["kJapaneseOn"]
  del pivoted["kVietnamese"]
  del pivoted["kXHC1983"]
  pivoted = pivoted.rename(columns = {"kDefinition":"english", "kMandarin":"pinyin"})
  pivoted
except KeyError as ke:
  print('KeyError, ', ke )

In [76]:
pivoted = pivoted.fillna("\\N")
pivoted

fieldname,english,kHangul,kKorean,pinyin
codepoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
U+5613,"gurgling sound, chattering",\N,KOYK,guō
U+5692,\N,\N,\N,mē
U+5763,\N,\N,\N,táng
U+5B1D,delicate; graceful,\N,\N,niǎo
U+5C95,\N,\N,\N,jiè
U+6075,"favor, benefit, confer kindness",\N,HYEY,huì
U+643E,"to crush with the hand, press, squeeze, extract",착,CHAK,zhà
U+6BCE,every,\N,MAY,měi
U+6BFD,a shuttlecock,\N,\N,jiàn
U+6C1A,tritium,\N,\N,chuān


In [77]:
unknownDF = unknownDF.join(pivoted, on = "codepoint", lsuffix = "_")
unknownDF

Unnamed: 0,codepoint,char,english,kHangul,kKorean,pinyin
0,U+6BFD,毽,a shuttlecock,\N,\N,jiàn
1,U+5B1D,嬝,delicate; graceful,\N,\N,niǎo
2,U+9329,錩,vessel,\N,\N,chāng
3,U+F96D,省,"province; save, economize",생,SAYNG,\N
4,U+643E,搾,"to crush with the hand, press, squeeze, extract",착,CHAK,zhà
5,U+5613,嘓,"gurgling sound, chattering",\N,KOYK,guō
6,U+95B0,閰,\N,\N,\N,jú
7,U+5C95,岕,\N,\N,\N,jiè
8,U+F91F,蘭,"orchid; elegant, graceful",난,NAN,\N
9,U+F961,率,to lead; ratio; rate; limit,률,LYUL,\N


In [78]:
unknownDF['id'] = range(luid, luid + len(unknownDF))
unknownDF['traditional'] = "\\N"
#getEng = lambda x: x.english if x.english != "" else "\\N"
#unknownDF['english'] = unknownDF.apply(getEng, axis = 1)
getGrammar = lambda x: "noun" if x.english != "\\N" else "\\N"
unknownDF['grammar'] = unknownDF.apply(getGrammar, axis = 1)
unknownDF['concept_cn'] = "\\N"
unknownDF['concept_en'] = "\\N"
unknownDF['domain_cn'] = "文言文"
unknownDF['domain_en'] = "Literary Chinese"
unknownDF['subdomain_cn'] = "\\N"
unknownDF['subdomain_en'] = "\\N"
unknownDF['mp3'] = "\\N"
unknownDF['image'] = "\\N"
writeNotes = lambda c: "(Unihan '" + c + "')"
unknownDF['notes'] = unknownDF['char'].map(writeNotes)
unknownDF['headword'] = unknownDF['id']
unknownDF

Unnamed: 0,codepoint,char,english,kHangul,kKorean,pinyin,id,traditional,grammar,concept_cn,concept_en,domain_cn,domain_en,subdomain_cn,subdomain_en,mp3,image,notes,headword
0,U+6BFD,毽,a shuttlecock,\N,\N,jiàn,105519,\N,noun,\N,\N,文言文,Literary Chinese,\N,\N,\N,\N,(Unihan '毽'),105519
1,U+5B1D,嬝,delicate; graceful,\N,\N,niǎo,105520,\N,noun,\N,\N,文言文,Literary Chinese,\N,\N,\N,\N,(Unihan '嬝'),105520
2,U+9329,錩,vessel,\N,\N,chāng,105521,\N,noun,\N,\N,文言文,Literary Chinese,\N,\N,\N,\N,(Unihan '錩'),105521
3,U+F96D,省,"province; save, economize",생,SAYNG,\N,105522,\N,noun,\N,\N,文言文,Literary Chinese,\N,\N,\N,\N,(Unihan '省'),105522
4,U+643E,搾,"to crush with the hand, press, squeeze, extract",착,CHAK,zhà,105523,\N,noun,\N,\N,文言文,Literary Chinese,\N,\N,\N,\N,(Unihan '搾'),105523
5,U+5613,嘓,"gurgling sound, chattering",\N,KOYK,guō,105524,\N,noun,\N,\N,文言文,Literary Chinese,\N,\N,\N,\N,(Unihan '嘓'),105524
6,U+95B0,閰,\N,\N,\N,jú,105525,\N,\N,\N,\N,文言文,Literary Chinese,\N,\N,\N,\N,(Unihan '閰'),105525
7,U+5C95,岕,\N,\N,\N,jiè,105526,\N,\N,\N,\N,文言文,Literary Chinese,\N,\N,\N,\N,(Unihan '岕'),105526
8,U+F91F,蘭,"orchid; elegant, graceful",난,NAN,\N,105527,\N,noun,\N,\N,文言文,Literary Chinese,\N,\N,\N,\N,(Unihan '蘭'),105527
9,U+F961,率,to lead; ratio; rate; limit,률,LYUL,\N,105528,\N,noun,\N,\N,文言文,Literary Chinese,\N,\N,\N,\N,(Unihan '率'),105528


In [79]:
cols = ['id', 'char', 'traditional', 'pinyin', 'english', 'grammar', 'concept_cn', 'concept_en', 'domain_cn', 'domain_en', 'subdomain_cn', 'subdomain_en', 'mp3', 'image', 'notes', 'headword']
unknownDF = unknownDF[cols]
unknownDF.to_csv(sys.stdout, index = False, header = False, sep = "\t", columns = cols)

105519	毽	\N	jiàn	a shuttlecock	noun	\N	\N	文言文	Literary Chinese	\N	\N	\N	\N	(Unihan '毽')	105519
105520	嬝	\N	niǎo	delicate; graceful	noun	\N	\N	文言文	Literary Chinese	\N	\N	\N	\N	(Unihan '嬝')	105520
105521	錩	\N	chāng	vessel	noun	\N	\N	文言文	Literary Chinese	\N	\N	\N	\N	(Unihan '錩')	105521
105522	省	\N	\N	province; save, economize	noun	\N	\N	文言文	Literary Chinese	\N	\N	\N	\N	(Unihan '省')	105522
105523	搾	\N	zhà	to crush with the hand, press, squeeze, extract	noun	\N	\N	文言文	Literary Chinese	\N	\N	\N	\N	(Unihan '搾')	105523
105524	嘓	\N	guō	gurgling sound, chattering	noun	\N	\N	文言文	Literary Chinese	\N	\N	\N	\N	(Unihan '嘓')	105524
105525	閰	\N	jú	\N	\N	\N	\N	文言文	Literary Chinese	\N	\N	\N	\N	(Unihan '閰')	105525
105526	岕	\N	jiè	\N	\N	\N	\N	文言文	Literary Chinese	\N	\N	\N	\N	(Unihan '岕')	105526
105527	蘭	\N	\N	orchid; elegant, graceful	noun	\N	\N	文言文	Literary Chinese	\N	\N	\N	\N	(Unihan '蘭')	105527
105528	率	\N	\N	to lead; ratio; rate; limit	noun	\N	\N	文言文	Literary Chinese	\N	\N	\N	\N	(Unihan '率')	105528
105