# Notebook for exploring the Unihan Database

In [2]:
# Assumes the Unihan database has been downloaded from http://www.unicode.org/Public/UCD/latest/ucd/Unihan.zip
# and placed in the directory ../Unihan
import pandas as pd
import numpy as np

# Load Unihan Readings table
colnames = ['codepoint', 'fieldname', 'value']
types = {'codepoint': np.string_, 'fieldname': np.string_, 'value': np.string_}
readings = pd.read_table('../Unihan/Unihan_Readings.txt', names=colnames, dtype=types)
readings['codepoint'].count() # Number of rows

187561

In [3]:
# Load Unihan Variants table
colnames = ['codepoint', 'fieldname', 'value']
types = {'codepoint': np.string_, 'fieldname': np.string_, 'value': np.string_}
variants = pd.read_table('../Unihan/Unihan_Variants.txt', names=colnames, dtype=types)
variants['codepoint'].count() # Number of rows

12375

In [4]:
chinese = u'鏡'
cp = 'U+' + hex(ord(chinese)).replace('0x', '').upper()
chineseDf = readings[readings.codepoint == cp]
chineseDf

Unnamed: 0,codepoint,fieldname,value
134586,U+93E1,kCantonese,geng3
134587,U+93E1,kDefinition,mirror; lens; glass; glasses
134588,U+93E1,kHangul,경
134589,U+93E1,kHanyuPinlu,jìng(285)
134590,U+93E1,kHanyuPinyin,64250.100:jìng
134591,U+93E1,kJapaneseKun,KAGAMI
134592,U+93E1,kJapaneseOn,KYOU KEI
134593,U+93E1,kKorean,KYENG
134594,U+93E1,kMandarin,jìng
134595,U+93E1,kTang,*giæ̀ng


In [5]:
english = "\\N"
englishDF = chineseDf[chineseDf.fieldname == 'kDefinition']['value']
if len(englishDF) > 0:
    english = englishDF.iloc[0]
english

'mirror; lens; glass; glasses'

In [6]:
pinyin = chineseDf[chineseDf.fieldname == 'kMandarin']['value'].iloc[0].decode('utf-8')
print pinyin

jìng


In [7]:
variantsDf = variants[variants.codepoint == cp]
variantsDf

Unnamed: 0,codepoint,fieldname,value
9353,U+93E1,kSimplifiedVariant,U+955C


In [8]:
notes = ""
c = ""
value = variantsDf[variantsDf.fieldname == 'kSemanticVariant']['value']
if len(value) > 0:
    val = value.iloc[0].split('<')[0]
    val = val.replace("U+", "")
    c = unichr(int(val, 16))
    notes = "Semantic variant: %s " % c
print notes




In [9]:
simplified = chinese
traditional = "\\N"
value = variantsDf[variantsDf.fieldname == 'kSimplifiedVariant']['value']
if len(value) > 0:
    val = value.iloc[0].split('<')[0]
    val = val.replace("U+", "")
    simplified = unichr(int(val, 16))
    traditional = chinese
print simplified

镜


In [10]:
grammar = "\\N"
if english != "\\N":
    grammar = "noun"
grammar

'noun'

In [11]:
luid = 45678
notes = "%s (Unihan '%s')" % (notes, chinese)
print u"%d\t%s\t%s\t%s\t%s\t%s\t\\N\t\\N\t古文\tClassical Chinese\t\\N\t\\N\t\\N\t\\N\t%s\t%d" % (
    luid, simplified, traditional, pinyin, english, grammar, notes, luid)

9402	镜	鏡	jìng	mirror; lens; glass; glasses	noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	 (Unihan '鏡')	9402
