# Notebook for exploring the Unihan Database

In [31]:
# Assumes the Unihan database has been downloaded from http://www.unicode.org/Public/UCD/latest/ucd/Unihan.zip
# and placed in the directory ../Unihan
import sys

import pandas as pd
import numpy as np

# Load Unihan Readings table
colnames = ['codepoint', 'fieldname', 'value']
types = {'codepoint': np.string_, 'fieldname': np.string_, 'value': np.string_}
readings = pd.read_table('../Unihan/Unihan_Readings.txt', names=colnames, dtype=types)
readings.set_index("codepoint")
print readings['codepoint'].count() # Number of rows

# Load Unihan Variants table
colnames = ['codepoint', 'fieldname', 'value']
types = {'codepoint': np.string_, 'fieldname': np.string_, 'value': np.string_}
variants = pd.read_table('../Unihan/Unihan_Variants.txt', names=colnames, dtype=types)
print variants['codepoint'].count() # Number of rows

readings.tail()

187561
12375


Unnamed: 0,codepoint,fieldname,value
187556,U+2F994,kCantonese,fong1
187557,U+2F9B2,kCantonese,kwai4
187558,U+2F9BC,kCantonese,sip3
187559,U+2F9D4,kCantonese,gun3 gwun3
187560,# EOF,,


In [32]:
chinese = u'鑫'
cp = 'U+' + hex(ord(chinese)).replace('0x', '').upper()
chineseDf = readings[readings.codepoint == cp]
chineseDf

Unnamed: 0,codepoint,fieldname,value
135258,U+946B,kCantonese,jam1
135259,U+946B,kDefinition,used in names
135260,U+946B,kHanyuPinyin,"64272.160:xīn,xùn"
135261,U+946B,kJapaneseOn,KIN KON KUN
135262,U+946B,kMandarin,xīn
135263,U+946B,kXHC1983,1281.030:xīn


In [33]:
variantsDf = variants[variants.codepoint == cp]
variantsDf

Unnamed: 0,codepoint,fieldname,value


In [34]:
english = "\\N"
englishDF = chineseDf[chineseDf.fieldname == 'kDefinition']['value']
if len(englishDF) > 0:
    english = englishDF.iloc[0]
print english

pinyin = chineseDf[chineseDf.fieldname == 'kMandarin']['value'].iloc[0].decode('utf-8')
print pinyin

# Kinds of variants
notes = ""
c = ""
value = variantsDf[variantsDf.fieldname == 'kSemanticVariant']['value']
if len(value) > 0:
    val = value.iloc[0].split('<')[0]
    val = val.replace("U+", "")
    c = unichr(int(val, 16))
    notes = "Semantic variant: %s " % c
print notes

simplified = chinese
traditional = "\\N"
value = variantsDf[variantsDf.fieldname == 'kSimplifiedVariant']['value']
if len(value) > 0:
    val = value.iloc[0].split('<')[0]
    val = val.replace("U+", "")
    simplified = unichr(int(val, 16))
    traditional = chinese
print simplified

grammar = "\\N"
if english != "\\N":
    grammar = "noun"
print grammar

luid = 45706
notes = "%s (Unihan '%s')" % (notes, chinese)
domain = u"古文\tClassical Chinese"
print u"%d\t%s\t%s\t%s\t%s\t%s\t\\N\t\\N\t%s\t\\N\t\\N\t\\N\t\\N\t%s\t%d" % (
    luid, simplified, traditional, pinyin, english, grammar, domain, notes, luid)

used in names
xīn

鑫
noun
45706	鑫	\N	xīn	used in names	noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	 (Unihan '鑫')	45706


In [35]:
# Generate lexical entries for the characters in file unknown.txt
luid = 45891
colnames = ['codepoint', 'char']
types = {'char': np.string_, 'codepoint': np.string_}
unknownDF = pd.read_table('unknown.txt', names = colnames, dtype=types, header = None)
unknownDF

Unnamed: 0,codepoint,char
0,U+946B,鑫
1,U+829B,芛
2,U+761C,瘜
3,U+555D,啝
4,U+56C4,囄
5,U+588B,墋
6,U+5622,嘢
7,U+6014,怔
8,U+9198,醘
9,U+3021,〡


In [36]:
unknownReadings = pd.merge(unknownDF, readings, on = "codepoint")
unknownReadings.set_index("codepoint")
unknownReadings

Unnamed: 0,codepoint,char,fieldname,value
0,U+946B,鑫,kCantonese,jam1
1,U+946B,鑫,kDefinition,used in names
2,U+946B,鑫,kHanyuPinyin,"64272.160:xīn,xùn"
3,U+946B,鑫,kJapaneseOn,KIN KON KUN
4,U+946B,鑫,kMandarin,xīn
5,U+946B,鑫,kXHC1983,1281.030:xīn
6,U+829B,芛,kHanyuPinyin,53185.020:wěi
7,U+829B,芛,kJapaneseOn,I SHUN SHUTSU JUCHI ITSU ICHI
8,U+829B,芛,kMandarin,wěi
9,U+761C,瘜,kCantonese,sik1


In [37]:
pivoted = unknownReadings.pivot("codepoint", "fieldname", "value")
del pivoted["kCantonese"]
del pivoted["kHanyuPinyin"]
del pivoted["kJapaneseKun"]
del pivoted["kJapaneseOn"]
del pivoted["kVietnamese"]
del pivoted["kXHC1983"]
pivoted = pivoted.rename(columns = {"kDefinition":"english", "kMandarin":"pinyin"})
pivoted

fieldname,english,kHangul,kHanyuPinlu,kKorean,pinyin,kTang
codepoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
U+25646,,,,,xiàn,
U+4C77,"(same as 魚 漁) to fish, to seize",,,,yú,
U+54CA,,,,,yòu,
U+5545,,,,THAK,zhuó,
U+555D,(Cant.) final partical expressing surprise,,,WA,hé,
U+5575,phonetic,,,,bō,
U+55B0,"to eat, drink",식,,SIK,cān,
U+5622,(Cant.) thing,,,,yě,
U+56C4,(Cant.) to come,,,,lí,
U+588B,,,,,chěn,


In [38]:
pivoted = pivoted.fillna("\\N")
pivoted

fieldname,english,kHangul,kHanyuPinlu,kKorean,pinyin,kTang
codepoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
U+25646,\N,\N,\N,\N,xiàn,\N
U+4C77,"(same as 魚 漁) to fish, to seize",\N,\N,\N,yú,\N
U+54CA,\N,\N,\N,\N,yòu,\N
U+5545,\N,\N,\N,THAK,zhuó,\N
U+555D,(Cant.) final partical expressing surprise,\N,\N,WA,hé,\N
U+5575,phonetic,\N,\N,\N,bō,\N
U+55B0,"to eat, drink",식,\N,SIK,cān,\N
U+5622,(Cant.) thing,\N,\N,\N,yě,\N
U+56C4,(Cant.) to come,\N,\N,\N,lí,\N
U+588B,\N,\N,\N,\N,chěn,\N


In [39]:
unknownDF = unknownDF.join(pivoted, on = "codepoint", lsuffix = "_")
unknownDF

Unnamed: 0,codepoint,char,english,kHangul,kHanyuPinlu,kKorean,pinyin,kTang
0,U+946B,鑫,used in names,\N,\N,\N,xīn,\N
1,U+829B,芛,\N,\N,\N,\N,wěi,\N
2,U+761C,瘜,a polypus,\N,\N,\N,xī,\N
3,U+555D,啝,(Cant.) final partical expressing surprise,\N,\N,WA,hé,\N
4,U+56C4,囄,(Cant.) to come,\N,\N,\N,lí,\N
5,U+588B,墋,\N,\N,\N,\N,chěn,\N
6,U+5622,嘢,(Cant.) thing,\N,\N,\N,yě,\N
7,U+6014,怔,a disease resembling neurosis,정,zhēng(15),CENG,zhēng,\N
8,U+9198,醘,\N,\N,\N,\N,kē,\N
9,U+3021,〡,,,,,,


In [40]:
getGrammar = lambda x: "noun" if x.english != "\\N" else "\\N"
unknownDF['grammar'] = unknownDF.apply(getGrammar, axis = 1)
unknownDF['grammar']

0     noun
1       \N
2     noun
3     noun
4     noun
5       \N
6     noun
7     noun
8       \N
9     noun
10      \N
11    noun
12    noun
13    noun
14      \N
15      \N
16      \N
17      \N
18    noun
19    noun
20    noun
21    noun
22    noun
23      \N
24    noun
25      \N
26    noun
27      \N
28    noun
29    noun
30    noun
31    noun
32    noun
33      \N
34    noun
35    noun
36    noun
37    noun
38      \N
39      \N
40    noun
41    noun
42    noun
43    noun
44    noun
45      \N
46      \N
47      \N
48    noun
49      \N
Name: grammar, dtype: object

In [41]:
unknownDF['id'] = range(luid, luid + len(unknownDF))
unknownDF['traditional'] = "\\N"
unknownDF['concept_cn'] = "\\N"
unknownDF['concept_en'] = "\\N"
unknownDF['domain_cn'] = "古文"
unknownDF['domain_en'] = "Classical Chinese"
unknownDF['subdomain_cn'] = "\\N"
unknownDF['subdomain_en'] = "\\N"
unknownDF['mp3'] = "\\N"
unknownDF['image'] = "\\N"
writeNotes = lambda c: "(Unihan '" + c + "')"
unknownDF['notes'] = unknownDF['char'].map(writeNotes)
unknownDF['headword'] = unknownDF['id']
unknownDF

Unnamed: 0,codepoint,char,english,kHangul,kHanyuPinlu,kKorean,pinyin,kTang,grammar,id,...,concept_cn,concept_en,domain_cn,domain_en,subdomain_cn,subdomain_en,mp3,image,notes,headword
0,U+946B,鑫,used in names,\N,\N,\N,xīn,\N,noun,45891,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '鑫'),45891
1,U+829B,芛,\N,\N,\N,\N,wěi,\N,\N,45892,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '芛'),45892
2,U+761C,瘜,a polypus,\N,\N,\N,xī,\N,noun,45893,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '瘜'),45893
3,U+555D,啝,(Cant.) final partical expressing surprise,\N,\N,WA,hé,\N,noun,45894,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '啝'),45894
4,U+56C4,囄,(Cant.) to come,\N,\N,\N,lí,\N,noun,45895,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '囄'),45895
5,U+588B,墋,\N,\N,\N,\N,chěn,\N,\N,45896,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '墋'),45896
6,U+5622,嘢,(Cant.) thing,\N,\N,\N,yě,\N,noun,45897,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '嘢'),45897
7,U+6014,怔,a disease resembling neurosis,정,zhēng(15),CENG,zhēng,\N,noun,45898,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '怔'),45898
8,U+9198,醘,\N,\N,\N,\N,kē,\N,\N,45899,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '醘'),45899
9,U+3021,〡,,,,,,,noun,45900,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '〡'),45900


In [42]:
cols = ['id', 'char', 'traditional', 'pinyin', 'english', 'grammar', 'concept_cn', 'concept_en', 'domain_cn', 'domain_en', 'subdomain_cn', 'subdomain_en', 'mp3', 'image', 'notes', 'headword']
unknownDF = unknownDF[cols]
unknownDF.to_csv(sys.stdout, index = False, header = False, sep = "\t", cols = cols)

45891	鑫	\N	xīn	used in names	noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '鑫')	45891
45892	芛	\N	wěi	\N	\N	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '芛')	45892
45893	瘜	\N	xī	a polypus	noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '瘜')	45893
45894	啝	\N	hé	(Cant.) final partical expressing surprise	noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '啝')	45894
45895	囄	\N	lí	(Cant.) to come	noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '囄')	45895
45896	墋	\N	chěn	\N	\N	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '墋')	45896
45897	嘢	\N	yě	(Cant.) thing	noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '嘢')	45897
45898	怔	\N	zhēng	a disease resembling neurosis	noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '怔')	45898
45899	醘	\N	kē	\N	\N	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '醘')	45899
45900	〡	\N			noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '〡')	45900
45901	鐹	\N	guǒ	\N	\N	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '鐹')	45901
45902	鉋	\N	bào	