# Notebook for exploring the Unihan Database

In [13]:
# Assumes the Unihan database has been downloaded from http://www.unicode.org/Public/UCD/latest/ucd/Unihan.zip
# and placed in the directory ../Unihan
import sys

import pandas as pd
import numpy as np

# Load Unihan Readings table
colnames = ['codepoint', 'fieldname', 'value']
types = {'codepoint': np.string_, 'fieldname': np.string_, 'value': np.string_}
readings = pd.read_table('../Unihan/Unihan_Readings.txt', names=colnames, dtype=types)
readings.set_index("codepoint")
print readings['codepoint'].count() # Number of rows

# Load Unihan Variants table
colnames = ['codepoint', 'fieldname', 'value']
types = {'codepoint': np.string_, 'fieldname': np.string_, 'value': np.string_}
variants = pd.read_table('../Unihan/Unihan_Variants.txt', names=colnames, dtype=types)
print variants['codepoint'].count() # Number of rows

readings.tail()

187561
12375


Unnamed: 0,codepoint,fieldname,value
187556,U+2F994,kCantonese,fong1
187557,U+2F9B2,kCantonese,kwai4
187558,U+2F9BC,kCantonese,sip3
187559,U+2F9D4,kCantonese,gun3 gwun3
187560,# EOF,,


In [14]:
chinese = u'呤'
cp = 'U+' + hex(ord(chinese)).replace('0x', '').upper()
chineseDf = readings[readings.codepoint == cp]
chineseDf

Unnamed: 0,codepoint,fieldname,value
34388,U+5464,kCantonese,ling4
34389,U+5464,kDefinition,purine
34390,U+5464,kHangul,령
34391,U+5464,kHanyuPinyin,"10606.020:líng,lìng"
34392,U+5464,kJapaneseKun,SASAYAKI
34393,U+5464,kJapaneseOn,REI RYOU
34394,U+5464,kKorean,LYENG
34395,U+5464,kMandarin,lìng
34396,U+5464,kVietnamese,gầm
34397,U+5464,kXHC1983,0724.010:lìng


In [15]:
variantsDf = variants[variants.codepoint == cp]
variantsDf

Unnamed: 0,codepoint,fieldname,value


In [16]:
english = "\\N"
englishDF = chineseDf[chineseDf.fieldname == 'kDefinition']['value']
if len(englishDF) > 0:
    english = englishDF.iloc[0]
print english

pinyin = chineseDf[chineseDf.fieldname == 'kMandarin']['value'].iloc[0].decode('utf-8')
print pinyin

# Kinds of variants
notes = ""
c = ""
value = variantsDf[variantsDf.fieldname == 'kSemanticVariant']['value']
if len(value) > 0:
    val = value.iloc[0].split('<')[0]
    val = val.replace("U+", "")
    c = unichr(int(val, 16))
    notes = "Semantic variant: %s " % c
print notes

simplified = chinese
traditional = "\\N"
value = variantsDf[variantsDf.fieldname == 'kSimplifiedVariant']['value']
if len(value) > 0:
    val = value.iloc[0].split('<')[0]
    val = val.replace("U+", "")
    simplified = unichr(int(val, 16))
    traditional = chinese
print simplified

grammar = "\\N"
if english != "\\N":
    grammar = "noun"
print grammar

luid = 45706
notes = "%s (Unihan '%s')" % (notes, chinese)
domain = u"古文\tClassical Chinese"
print u"%d\t%s\t%s\t%s\t%s\t%s\t\\N\t\\N\t%s\t\\N\t\\N\t\\N\t\\N\t%s\t%d" % (
    luid, simplified, traditional, pinyin, english, grammar, domain, notes, luid)

purine
lìng

呤
noun
45706	呤	\N	lìng	purine	noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	 (Unihan '呤')	45706


In [17]:
# Generate lexical entries for the characters in file unknown.txt
luid = 45959
colnames = ['codepoint', 'char']
types = {'char': np.string_, 'codepoint': np.string_}
unknownDF = pd.read_table('unknown.txt', names = colnames, dtype=types, header = None)
unknownDF

Unnamed: 0,codepoint,char
0,U+5464,呤
1,U+98BC,颼
2,U+5246,剆
3,U+8E26,踦
4,U+568C,嚌
5,U+6D91,涑
6,U+8B6B,譫
7,U+99BA,馺
8,U+4EDB,仛
9,U+5432,吲


In [18]:
unknownReadings = pd.merge(unknownDF, readings, on = "codepoint")
unknownReadings.set_index("codepoint")
unknownReadings

Unnamed: 0,codepoint,char,fieldname,value
0,U+5464,呤,kCantonese,ling4
1,U+5464,呤,kDefinition,purine
2,U+5464,呤,kHangul,령
3,U+5464,呤,kHanyuPinyin,"10606.020:líng,lìng"
4,U+5464,呤,kJapaneseKun,SASAYAKI
5,U+5464,呤,kJapaneseOn,REI RYOU
6,U+5464,呤,kKorean,LYENG
7,U+5464,呤,kMandarin,lìng
8,U+5464,呤,kVietnamese,gầm
9,U+5464,呤,kXHC1983,0724.010:lìng


In [19]:
pivoted = unknownReadings.pivot("codepoint", "fieldname", "value")
del pivoted["kCantonese"]
del pivoted["kHanyuPinyin"]
del pivoted["kJapaneseKun"]
del pivoted["kJapaneseOn"]
del pivoted["kVietnamese"]
del pivoted["kXHC1983"]
pivoted = pivoted.rename(columns = {"kDefinition":"english", "kMandarin":"pinyin"})
pivoted

fieldname,english,kHangul,kKorean,pinyin,kTang
codepoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
U+249D5,,,,yì,
U+4EDB,"young girl; strange, different",,CHA THAK,tuō,
U+4EF3,"separate, part company",비,PI,pǐ,
U+511A,"(J) equivalent to 果敢 U+679C U+6562, fleeting, ...",맹,MENG,méng,
U+5246,,,,luǒ,
U+5432,smile at; sneer at,,SIN,yǐn,
U+5464,purine,령,LYENG,lìng,
U+549C,to scold,,THA,ta,
U+55BF,chirping of birds,,SO,zào,
U+568C,to sip; (Cant.) aspect marker of excessive extent,,CEY,jì,


In [20]:
pivoted = pivoted.fillna("\\N")
pivoted

fieldname,english,kHangul,kKorean,pinyin,kTang
codepoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
U+249D5,\N,\N,\N,yì,\N
U+4EDB,"young girl; strange, different",\N,CHA THAK,tuō,\N
U+4EF3,"separate, part company",비,PI,pǐ,\N
U+511A,"(J) equivalent to 果敢 U+679C U+6562, fleeting, ...",맹,MENG,méng,\N
U+5246,\N,\N,\N,luǒ,\N
U+5432,smile at; sneer at,\N,SIN,yǐn,\N
U+5464,purine,령,LYENG,lìng,\N
U+549C,to scold,\N,THA,ta,\N
U+55BF,chirping of birds,\N,SO,zào,\N
U+568C,to sip; (Cant.) aspect marker of excessive extent,\N,CEY,jì,\N


In [21]:
unknownDF = unknownDF.join(pivoted, on = "codepoint", lsuffix = "_")
unknownDF

Unnamed: 0,codepoint,char,english,kHangul,kKorean,pinyin,kTang
0,U+5464,呤,purine,령,LYENG,lìng,\N
1,U+98BC,颼,sound of wind; blow chilly,수,SWU,sōu,shriou
2,U+5246,剆,\N,\N,\N,luǒ,\N
3,U+8E26,踦,the shin; to pierce; to touch,\N,\N,yǐ,\N
4,U+568C,嚌,to sip; (Cant.) aspect marker of excessive extent,\N,CEY,jì,\N
5,U+6D91,涑,river in Shansi province,속,SO,sù,\N
6,U+8B6B,譫,talkative; incoherent talk,섬,SEM,zhān,\N
7,U+99BA,馺,\N,\N,\N,sà,\N
8,U+4EDB,仛,"young girl; strange, different",\N,CHA THAK,tuō,\N
9,U+5432,吲,smile at; sneer at,\N,SIN,yǐn,\N


In [22]:
unknownDF['id'] = range(luid, luid + len(unknownDF))
unknownDF['traditional'] = "\\N"
getGrammar = lambda x: "noun" if x.english != "\\N" else "\\N"
unknownDF['grammar'] = unknownDF.apply(getGrammar, axis = 1)
unknownDF['concept_cn'] = "\\N"
unknownDF['concept_en'] = "\\N"
unknownDF['domain_cn'] = "古文"
unknownDF['domain_en'] = "Classical Chinese"
unknownDF['subdomain_cn'] = "\\N"
unknownDF['subdomain_en'] = "\\N"
unknownDF['mp3'] = "\\N"
unknownDF['image'] = "\\N"
writeNotes = lambda c: "(Unihan '" + c + "')"
unknownDF['notes'] = unknownDF['char'].map(writeNotes)
unknownDF['headword'] = unknownDF['id']
unknownDF

Unnamed: 0,codepoint,char,english,kHangul,kKorean,pinyin,kTang,id,traditional,grammar,concept_cn,concept_en,domain_cn,domain_en,subdomain_cn,subdomain_en,mp3,image,notes,headword
0,U+5464,呤,purine,령,LYENG,lìng,\N,45959,\N,noun,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '呤'),45959
1,U+98BC,颼,sound of wind; blow chilly,수,SWU,sōu,shriou,45960,\N,noun,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '颼'),45960
2,U+5246,剆,\N,\N,\N,luǒ,\N,45961,\N,\N,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '剆'),45961
3,U+8E26,踦,the shin; to pierce; to touch,\N,\N,yǐ,\N,45962,\N,noun,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '踦'),45962
4,U+568C,嚌,to sip; (Cant.) aspect marker of excessive extent,\N,CEY,jì,\N,45963,\N,noun,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '嚌'),45963
5,U+6D91,涑,river in Shansi province,속,SO,sù,\N,45964,\N,noun,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '涑'),45964
6,U+8B6B,譫,talkative; incoherent talk,섬,SEM,zhān,\N,45965,\N,noun,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '譫'),45965
7,U+99BA,馺,\N,\N,\N,sà,\N,45966,\N,\N,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '馺'),45966
8,U+4EDB,仛,"young girl; strange, different",\N,CHA THAK,tuō,\N,45967,\N,noun,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '仛'),45967
9,U+5432,吲,smile at; sneer at,\N,SIN,yǐn,\N,45968,\N,noun,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '吲'),45968


In [23]:
cols = ['id', 'char', 'traditional', 'pinyin', 'english', 'grammar', 'concept_cn', 'concept_en', 'domain_cn', 'domain_en', 'subdomain_cn', 'subdomain_en', 'mp3', 'image', 'notes', 'headword']
unknownDF = unknownDF[cols]
unknownDF.to_csv(sys.stdout, index = False, header = False, sep = "\t", cols = cols)

45959	呤	\N	lìng	purine	noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '呤')	45959
45960	颼	\N	sōu	sound of wind; blow chilly	noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '颼')	45960
45961	剆	\N	luǒ	\N	\N	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '剆')	45961
45962	踦	\N	yǐ	the shin; to pierce; to touch	noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '踦')	45962
45963	嚌	\N	jì	to sip; (Cant.) aspect marker of excessive extent	noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '嚌')	45963
45964	涑	\N	sù	river in Shansi province	noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '涑')	45964
45965	譫	\N	zhān	talkative; incoherent talk	noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '譫')	45965
45966	馺	\N	sà	\N	\N	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '馺')	45966
45967	仛	\N	tuō	young girl; strange, different	noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '仛')	45967
45968	吲	\N	yǐn	smile at; sneer at	noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '吲')	45968
4596