# Notebook for importing from the Unihan Database

In [154]:
# Assumes the Unihan database has been downloaded from http://www.unicode.org/Public/UCD/latest/ucd/Unihan.zip
# and unzipped into the directory ../Unihan
import sys

import pandas as pd
import numpy as np

# Load Unihan Readings table
colnames = ['codepoint', 'fieldname', 'value']
types = {'codepoint': np.string_, 'fieldname': np.string_, 'value': np.string_}
readings = pd.read_table('../Unihan/Unihan_Readings.txt', names=colnames, dtype=types)
readings.set_index("codepoint")
print readings['codepoint'].count() # Number of rows

# Load Unihan Variants table
colnames = ['codepoint', 'fieldname', 'value']
types = {'codepoint': np.string_, 'fieldname': np.string_, 'value': np.string_}
variants = pd.read_table('../Unihan/Unihan_Variants.txt', names=colnames, dtype=types)
print variants['codepoint'].count() # Number of rows

readings.tail()

187561
12375


Unnamed: 0,codepoint,fieldname,value
187556,U+2F994,kCantonese,fong1
187557,U+2F9B2,kCantonese,kwai4
187558,U+2F9BC,kCantonese,sip3
187559,U+2F9D4,kCantonese,gun3 gwun3
187560,# EOF,,


In [158]:
# Select every Unihan reading row for one sample character.
chinese = u'沨'
# Rows are keyed by "U+" followed by the upper-case hexadecimal code point.
cp = 'U+%X' % ord(chinese)
chineseDf = readings.loc[readings['codepoint'] == cp]
chineseDf

Unnamed: 0,codepoint,fieldname,value
74919,U+6CA8,kMandarin,fēng
74920,U+6CA8,kXHC1983,0331.010:fēng


In [159]:
# Variant rows recorded for the same code point as the sample character.
variantsDf = variants.loc[variants['codepoint'] == cp]
variantsDf

Unnamed: 0,codepoint,fieldname,value
4372,U+6CA8,kTraditionalVariant,U+6E22
4373,U+6CA8,kZVariant,U+6E22


In [160]:
# Build one tab-separated lexical entry for `chinese` from its Unihan readings
# (chineseDf) and variants (variantsDf). "\N" marks an empty field.


def first_value(df, fieldname):
    """Return the first value stored under `fieldname` in `df`, or None if there is none."""
    matches = df[df.fieldname == fieldname]['value']
    if len(matches) == 0:
        return None
    return matches.iloc[0]


def variant_char(df, fieldname):
    """Return the first variant character listed under `fieldname`, or None.

    The value is cut at the first '<' (as the original code did) and then at the
    first whitespace, so a value listing several code points yields the first
    one instead of failing in int().
    NOTE(review): the multi-value layout is assumed from the Unihan variants
    format (UAX #38) -- confirm.
    """
    raw = first_value(df, fieldname)
    if raw is None:
        return None
    first_codepoint = raw.split('<')[0].split()[0]
    return unichr(int(first_codepoint.replace("U+", ""), 16))


# Values come from the tables as UTF-8 byte strings, so decode them before they
# are formatted into the unicode output line below.
english = "\\N"
definition = first_value(chineseDf, 'kDefinition')
if definition is not None:
    english = definition.decode('utf-8')
print(english)

# A character without a Mandarin reading gets the empty marker instead of
# raising IndexError.
pinyin = u"\\N"
mandarin = first_value(chineseDf, 'kMandarin')
if mandarin is not None:
    pinyin = mandarin.decode('utf-8')
print(pinyin)

# Kinds of variants
notes = ""
semantic = variant_char(variantsDf, 'kSemanticVariant')
if semantic is not None:
    notes = u"Semantic variant: %s" % semantic
print(notes)

# When Unihan lists a simplified variant, the looked-up character is the
# traditional form and the variant becomes the simplified headword.
# NOTE(review): kTraditionalVariant is never consulted, so a simplified input
# keeps traditional = "\N" -- confirm this is intended.
simplified = chinese
traditional = "\\N"
simplified_variant = variant_char(variantsDf, 'kSimplifiedVariant')
if simplified_variant is not None:
    simplified = simplified_variant
    traditional = chinese
print(simplified)

# Entries that have an English definition are labelled as nouns.
grammar = "\\N"
if english != "\\N":
    grammar = "noun"
print(grammar)

luid = 45706
# Join the note parts with single spaces so an empty variant note leaves no
# leading blank, giving the same "(Unihan 'X')" text as the batch export.
source_note = u"(Unihan '%s')" % chinese
notes = u" ".join(part for part in (notes, source_note) if part)
domain = u"古文\tClassical Chinese"
print(u"%d\t%s\t%s\t%s\t%s\t%s\t\\N\t\\N\t%s\t\\N\t\\N\t\\N\t\\N\t%s\t%d" % (
    luid, simplified, traditional, pinyin, english, grammar, domain, notes, luid))

\N
fēng

沨
\N
45706	沨	\N	fēng	\N	\N	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	 (Unihan '沨')	45706


In [161]:
# Generate lexical entries for the characters in file unknown.txt.
# The file has no header row; read_table's default tab separator splits each
# line into a code point and the character itself.
luid = 56026  # id assigned to the first generated entry
colnames = ['codepoint', 'char']
types = dict.fromkeys(colnames, np.string_)
unknownDF = pd.read_table('unknown.txt', header=None, names=colnames, dtype=types)
unknownDF

Unnamed: 0,codepoint,char
0,U+6CA8,沨
1,U+4694,䚔
2,U+514E,兎
3,U+82AA,芪
4,U+53AC,厬
5,U+729D,犝
6,U+374C,㝌
7,U+2A627,𪘧
8,U+9C1C,鰜
9,U+39DA,㧚


In [162]:
# Attach every Unihan reading row to its unknown character. merge() performs an
# inner join by default, so characters with no row in `readings` are absent from
# this frame. "codepoint" stays an ordinary column because the pivot in the next
# cell addresses it by name. The previous set_index() call discarded its result
# and had no effect, so it is removed.
unknownReadings = pd.merge(unknownDF, readings, on="codepoint")
unknownReadings

Unnamed: 0,codepoint,char,fieldname,value
0,U+6CA8,沨,kMandarin,fēng
1,U+6CA8,沨,kXHC1983,0331.010:fēng
2,U+4694,䚔,kCantonese,baan1 bin1 paan5
3,U+4694,䚔,kDefinition,to look suddenly; to look shortly
4,U+4694,䚔,kHanyuPinyin,63676.080:bīn
5,U+4694,䚔,kMandarin,bīn
6,U+514E,兎,kCantonese,tou3
7,U+514E,兎,kDefinition,"rabbit, hare"
8,U+514E,兎,kHangul,토
9,U+514E,兎,kJapaneseKun,USAGI


In [163]:
# Reshape the long (codepoint, fieldname, value) rows into one row per code
# point with one column per Unihan field. Keyword arguments keep the call valid
# on pandas versions that no longer accept them positionally.
pivoted = unknownReadings.pivot(index="codepoint", columns="fieldname", values="value")

# Fields that are not needed for the lexical entries. Which fields appear depends
# on the characters in unknown.txt, so drop only those actually present; a bare
# `del` raises KeyError for an absent column.
unused_fields = ["kCantonese", "kHanyuPinyin", "kJapaneseKun", "kJapaneseOn",
                 "kVietnamese", "kXHC1983"]
present_unused = [field for field in unused_fields if field in pivoted.columns]
pivoted = pivoted.drop(present_unused, axis=1)

pivoted = pivoted.rename(columns={"kDefinition": "english", "kMandarin": "pinyin"})

# Later cells read the "english" and "pinyin" columns; create them empty when no
# character in this batch supplied that field.
for required in ("english", "pinyin"):
    if required not in pivoted.columns:
        pivoted[required] = float("nan")
pivoted

fieldname,english,kHangul,kHanyuPinlu,kKorean,pinyin,kTang
codepoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
U+20000,the sound made by breathing in; oh! (cf. U+311...,,,,hē,
U+2010C,,,,,jué,
U+20112,,,,,chuí,
U+202A9,,,,,zhuàn,
U+202FA,,,,,sāo,
U+203EE,,,,,duì,
U+2043F,,,,,hōng,
U+204D7,,,,,fù,
U+204DC,,,,,liǎng,
U+20509,a roll,,,,juàn,


In [164]:
# Replace every missing field with the literal "\N" placeholder used for empty
# values in the generated entries.
pivoted = pivoted.fillna(value="\\N")
pivoted

fieldname,english,kHangul,kHanyuPinlu,kKorean,pinyin,kTang
codepoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
U+20000,the sound made by breathing in; oh! (cf. U+311...,\N,\N,\N,hē,\N
U+2010C,\N,\N,\N,\N,jué,\N
U+20112,\N,\N,\N,\N,chuí,\N
U+202A9,\N,\N,\N,\N,zhuàn,\N
U+202FA,\N,\N,\N,\N,sāo,\N
U+203EE,\N,\N,\N,\N,duì,\N
U+2043F,\N,\N,\N,\N,hōng,\N
U+204D7,\N,\N,\N,\N,fù,\N
U+204DC,\N,\N,\N,\N,liǎng,\N
U+20509,a roll,\N,\N,\N,juàn,\N


In [165]:
# Left-join the per-character readings onto the character list. A character with
# no row in `pivoted` comes back from the join with NaN in every reading column,
# so fill again after the join. Otherwise such rows would pass the
# `english != "\N"` grammar test and be exported with blank fields.
unknownDF = unknownDF.join(pivoted, on="codepoint", lsuffix="_").fillna("\\N")
unknownDF

Unnamed: 0,codepoint,char,english,kHangul,kHanyuPinlu,kKorean,pinyin,kTang
0,U+6CA8,沨,\N,\N,\N,\N,fēng,\N
1,U+4694,䚔,to look suddenly; to look shortly,\N,\N,\N,bīn,\N
2,U+514E,兎,"rabbit, hare",토,\N,THO,tù,\N
3,U+82AA,芪,celery,기,\N,KI,qí,\N
4,U+53AC,厬,\N,\N,\N,KWEY,guǐ,\N
5,U+729D,犝,\N,\N,\N,\N,tóng,\N
6,U+374C,㝌,"(same as 疚) prolonged illness, mental discomfo...",\N,\N,\N,jiù,\N
7,U+2A627,𪘧,\N,\N,\N,\N,zú,\N
8,U+9C1C,鰜,big-mouthed flounder,\N,\N,\N,qiàn,\N
9,U+39DA,㧚,"(a dialect character) cup the hand, (corrupted...",\N,\N,\N,wǎ,\N


In [166]:
def getGrammar(x):
    """Return "noun" for a row that has an English definition, otherwise "\\N"."""
    if x.english != "\\N":
        return "noun"
    return "\\N"


def writeNotes(c):
    """Return the source note recorded for character `c`."""
    return "(Unihan '" + c + "')"


# Add the remaining columns of a lexical entry, in the order they should appear
# in the frame. Scalars are broadcast to every row.
entry_columns = [
    ('id', range(luid, luid + len(unknownDF))),  # consecutive ids starting at luid
    ('traditional', "\\N"),
    ('grammar', unknownDF.apply(getGrammar, axis = 1)),
    ('concept_cn', "\\N"),
    ('concept_en', "\\N"),
    ('domain_cn', "古文"),
    ('domain_en', "Classical Chinese"),
    ('subdomain_cn', "\\N"),
    ('subdomain_en', "\\N"),
    ('mp3', "\\N"),
    ('image', "\\N"),
    ('notes', unknownDF['char'].map(writeNotes)),
]
for column_name, column_values in entry_columns:
    unknownDF[column_name] = column_values

# The headword column repeats the entry's own id.
unknownDF['headword'] = unknownDF['id']
unknownDF

Unnamed: 0,codepoint,char,english,kHangul,kHanyuPinlu,kKorean,pinyin,kTang,id,traditional,...,concept_cn,concept_en,domain_cn,domain_en,subdomain_cn,subdomain_en,mp3,image,notes,headword
0,U+6CA8,沨,\N,\N,\N,\N,fēng,\N,56026,\N,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '沨'),56026
1,U+4694,䚔,to look suddenly; to look shortly,\N,\N,\N,bīn,\N,56027,\N,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '䚔'),56027
2,U+514E,兎,"rabbit, hare",토,\N,THO,tù,\N,56028,\N,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '兎'),56028
3,U+82AA,芪,celery,기,\N,KI,qí,\N,56029,\N,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '芪'),56029
4,U+53AC,厬,\N,\N,\N,KWEY,guǐ,\N,56030,\N,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '厬'),56030
5,U+729D,犝,\N,\N,\N,\N,tóng,\N,56031,\N,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '犝'),56031
6,U+374C,㝌,"(same as 疚) prolonged illness, mental discomfo...",\N,\N,\N,jiù,\N,56032,\N,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '㝌'),56032
7,U+2A627,𪘧,\N,\N,\N,\N,zú,\N,56033,\N,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '𪘧'),56033
8,U+9C1C,鰜,big-mouthed flounder,\N,\N,\N,qiàn,\N,56034,\N,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '鰜'),56034
9,U+39DA,㧚,"(a dialect character) cup the hand, (corrupted...",\N,\N,\N,wǎ,\N,56035,\N,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '㧚'),56035


In [167]:
# Output columns of a lexical entry, in file order.
cols = [
    'id', 'char', 'traditional', 'pinyin', 'english', 'grammar',
    'concept_cn', 'concept_en', 'domain_cn', 'domain_en',
    'subdomain_cn', 'subdomain_en', 'mp3', 'image', 'notes', 'headword',
]
unknownDF = unknownDF[cols]
# Write the entries as tab-separated text without header or index. The column
# list is passed as `columns`; the old `cols` keyword was deprecated and later
# removed from to_csv.
unknownDF.to_csv(sys.stdout, index = False, header = False, sep = "\t", columns = cols)

56026	沨	\N	fēng	\N	\N	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '沨')	56026
56027	䚔	\N	bīn	to look suddenly; to look shortly	noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '䚔')	56027
56028	兎	\N	tù	rabbit, hare	noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '兎')	56028
56029	芪	\N	qí	celery	noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '芪')	56029
56030	厬	\N	guǐ	\N	\N	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '厬')	56030
56031	犝	\N	tóng	\N	\N	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '犝')	56031
56032	㝌	\N	jiù	(same as 疚) prolonged illness, mental discomfort, to stay in one place for a long period	noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '㝌')	56032
56033	𪘧	\N	zú	\N	\N	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '𪘧')	56033
56034	鰜	\N	qiàn	big-mouthed flounder	noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '鰜')	56034
56035	㧚	\N	wǎ	(a dialect character) cup the hand, (corrupted form of U+65CA 瓬) clay pottery; earthenware	noun	\N	\N	古文	Classical C