# Notebook for exploring the Unihan Database

In [21]:
# Assumes the Unihan database has been downloaded from http://www.unicode.org/Public/UCD/latest/ucd/Unihan.zip
# and placed in the directory ../Unihan
import sys

import pandas as pd
import numpy as np

# Load Unihan Readings table
colnames = ['codepoint', 'fieldname', 'value']
types = {'codepoint': np.string_, 'fieldname': np.string_, 'value': np.string_}
readings = pd.read_table('../Unihan/Unihan_Readings.txt', names=colnames, dtype=types)
readings.set_index("codepoint")
print readings['codepoint'].count() # Number of rows

# Load Unihan Variants table
colnames = ['codepoint', 'fieldname', 'value']
types = {'codepoint': np.string_, 'fieldname': np.string_, 'value': np.string_}
variants = pd.read_table('../Unihan/Unihan_Variants.txt', names=colnames, dtype=types)
print variants['codepoint'].count() # Number of rows

readings.tail()

187561
12375


Unnamed: 0,codepoint,fieldname,value
187556,U+2F994,kCantonese,fong1
187557,U+2F9B2,kCantonese,kwai4
187558,U+2F9BC,kCantonese,sip3
187559,U+2F9D4,kCantonese,gun3 gwun3
187560,# EOF,,


In [2]:
chinese = u'釴'
cp = 'U+' + hex(ord(chinese)).replace('0x', '').upper()
chineseDf = readings[readings.codepoint == cp]
chineseDf

Unnamed: 0,codepoint,fieldname,value
131959,U+91F4,kHanyuPinyin,64171.070:yì
131960,U+91F4,kMandarin,yì


In [3]:
variantsDf = variants[variants.codepoint == cp]
variantsDf

Unnamed: 0,codepoint,fieldname,value


In [4]:
english = "\\N"
englishDF = chineseDf[chineseDf.fieldname == 'kDefinition']['value']
if len(englishDF) > 0:
    english = englishDF.iloc[0]
print english

pinyin = chineseDf[chineseDf.fieldname == 'kMandarin']['value'].iloc[0].decode('utf-8')
print pinyin

# Kinds of variants
notes = ""
c = ""
value = variantsDf[variantsDf.fieldname == 'kSemanticVariant']['value']
if len(value) > 0:
    val = value.iloc[0].split('<')[0]
    val = val.replace("U+", "")
    c = unichr(int(val, 16))
    notes = "Semantic variant: %s " % c
print notes

simplified = chinese
traditional = "\\N"
value = variantsDf[variantsDf.fieldname == 'kSimplifiedVariant']['value']
if len(value) > 0:
    val = value.iloc[0].split('<')[0]
    val = val.replace("U+", "")
    simplified = unichr(int(val, 16))
    traditional = chinese
print simplified

grammar = "\\N"
if english != "\\N":
    grammar = "noun"
print grammar

luid = 45706
notes = "%s (Unihan '%s')" % (notes, chinese)
domain = u"古文\tClassical Chinese"
print u"%d\t%s\t%s\t%s\t%s\t%s\t\\N\t\\N\t%s\t\\N\t\\N\t\\N\t\\N\t%s\t%d" % (
    luid, simplified, traditional, pinyin, english, grammar, domain, notes, luid)

\N
yì

釴
\N
45706	釴	\N	yì	\N	\N	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	 (Unihan '釴')	45706


In [14]:
# Generate lexical entries for the characters in file unknown.txt
luid = 45805
colnames = ['codepoint', 'char']
types = {'char': np.string_, 'codepoint': np.string_}
unknownDF = pd.read_table('unknown.txt', names = colnames, dtype=types, header = None)
unknownDF

Unnamed: 0,codepoint,char
0,U+91F4,釴
1,U+61DE,懞
2,U+5AAE,媮
3,U+76CB,盋
4,U+8E5A,蹚
5,U+8E32,踲
6,U+7A4A,穊
7,U+7A82,窂
8,U+5097,傗


In [15]:
unknownReadings = pd.merge(unknownDF, readings, on = "codepoint")
unknownReadings.set_index("codepoint")
unknownReadings

Unnamed: 0,codepoint,char,fieldname,value
0,U+91F4,釴,kHanyuPinyin,64171.070:yì
1,U+91F4,釴,kMandarin,yì
2,U+61DE,懞,kCantonese,mung4
3,U+61DE,懞,kDefinition,variant of U+8499 蒙
4,U+61DE,懞,kHangul,몽
5,U+61DE,懞,kHanyuPinyin,"42360.040:méng,měng"
6,U+61DE,懞,kJapaneseKun,ATSUI
7,U+61DE,懞,kJapaneseOn,BOU MU
8,U+61DE,懞,kKorean,MONG
9,U+61DE,懞,kMandarin,méng


In [16]:
pivoted = unknownReadings.pivot("codepoint", "fieldname", "value")
del pivoted["kCantonese"]
del pivoted["kHanyuPinyin"]
del pivoted["kJapaneseKun"]
del pivoted["kJapaneseOn"]
del pivoted["kVietnamese"]
del pivoted["kXHC1983"]
pivoted = pivoted.rename(columns = {"kDefinition":"english", "kMandarin":"pinyin"})
pivoted

fieldname,english,kHangul,kKorean,pinyin
codepoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
U+5097,,,,chù
U+5AAE,"handsome, cheerful; steal",,,tōu
U+61DE,variant of U+8499 蒙,몽,MONG,méng
U+76CB,,,,bō
U+7A4A,plough deep; sown slowly,,,jì
U+7A82,,로,LO,láo
U+8E32,,,,dùn
U+8E5A,tread through mud and water,,,tāng
U+91F4,,,,yì


In [17]:
pivoted = pivoted.fillna("\\N")
pivoted

fieldname,english,kHangul,kKorean,pinyin
codepoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
U+5097,\N,\N,\N,chù
U+5AAE,"handsome, cheerful; steal",\N,\N,tōu
U+61DE,variant of U+8499 蒙,몽,MONG,méng
U+76CB,\N,\N,\N,bō
U+7A4A,plough deep; sown slowly,\N,\N,jì
U+7A82,\N,로,LO,láo
U+8E32,\N,\N,\N,dùn
U+8E5A,tread through mud and water,\N,\N,tāng
U+91F4,\N,\N,\N,yì


In [18]:
unknownDF = unknownDF.join(pivoted, on = "codepoint", lsuffix = "_")
unknownDF

Unnamed: 0,codepoint,char,english,kHangul,kKorean,pinyin
0,U+91F4,釴,\N,\N,\N,yì
1,U+61DE,懞,variant of U+8499 蒙,몽,MONG,méng
2,U+5AAE,媮,"handsome, cheerful; steal",\N,\N,tōu
3,U+76CB,盋,\N,\N,\N,bō
4,U+8E5A,蹚,tread through mud and water,\N,\N,tāng
5,U+8E32,踲,\N,\N,\N,dùn
6,U+7A4A,穊,plough deep; sown slowly,\N,\N,jì
7,U+7A82,窂,\N,로,LO,láo
8,U+5097,傗,\N,\N,\N,chù


In [19]:
unknownDF['id'] = range(luid, luid + len(unknownDF))
unknownDF['traditional'] = "\\N"
unknownDF['grammar'] = "verb"
unknownDF['concept_cn'] = "\\N"
unknownDF['concept_en'] = "\\N"
unknownDF['domain_cn'] = "古文"
unknownDF['domain_en'] = "Classical Chinese"
unknownDF['subdomain_cn'] = "\\N"
unknownDF['subdomain_en'] = "\\N"
unknownDF['mp3'] = "\\N"
unknownDF['image'] = "\\N"
writeNotes = lambda c: "(Unihan '" + c + "')"
unknownDF['notes'] = unknownDF['char'].map(writeNotes)
unknownDF['headword'] = unknownDF['id']
unknownDF

Unnamed: 0,codepoint,char,english,kHangul,kKorean,pinyin,id,traditional,grammar,concept_cn,concept_en,domain_cn,domain_en,subdomain_cn,subdomain_en,mp3,image,notes,headword
0,U+91F4,釴,\N,\N,\N,yì,45805,\N,verb,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '釴'),45805
1,U+61DE,懞,variant of U+8499 蒙,몽,MONG,méng,45806,\N,verb,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '懞'),45806
2,U+5AAE,媮,"handsome, cheerful; steal",\N,\N,tōu,45807,\N,verb,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '媮'),45807
3,U+76CB,盋,\N,\N,\N,bō,45808,\N,verb,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '盋'),45808
4,U+8E5A,蹚,tread through mud and water,\N,\N,tāng,45809,\N,verb,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '蹚'),45809
5,U+8E32,踲,\N,\N,\N,dùn,45810,\N,verb,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '踲'),45810
6,U+7A4A,穊,plough deep; sown slowly,\N,\N,jì,45811,\N,verb,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '穊'),45811
7,U+7A82,窂,\N,로,LO,láo,45812,\N,verb,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '窂'),45812
8,U+5097,傗,\N,\N,\N,chù,45813,\N,verb,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '傗'),45813


In [22]:
cols = ['id', 'char', 'traditional', 'pinyin', 'english', 'grammar', 'concept_cn', 'concept_en', 'domain_cn', 'domain_en', 'subdomain_cn', 'subdomain_en', 'mp3', 'image', 'notes', 'headword']
unknownDF = unknownDF[cols]
unknownDF.to_csv(sys.stdout, index = False, header = False, sep = "\t", cols = cols)

45805	釴	\N	yì	\N	verb	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '釴')	45805
45806	懞	\N	méng	variant of U+8499 蒙	verb	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '懞')	45806
45807	媮	\N	tōu	handsome, cheerful; steal	verb	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '媮')	45807
45808	盋	\N	bō	\N	verb	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '盋')	45808
45809	蹚	\N	tāng	tread through mud and water	verb	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '蹚')	45809
45810	踲	\N	dùn	\N	verb	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '踲')	45810
45811	穊	\N	jì	plough deep; sown slowly	verb	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '穊')	45811
45812	窂	\N	láo	\N	verb	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '窂')	45812
45813	傗	\N	chù	\N	verb	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '傗')	45813
