# Notebook for importing from the Unihan Database

In [23]:
# Assumes the Unihan database has been downloaded from http://www.unicode.org/Public/UCD/latest/ucd/Unihan.zip
# and placed in the directory ../Unihan
import sys

import pandas as pd
import numpy as np

# Load the Unihan Readings and Variants tables. Both files are read with the
# same three string columns and no header row.
colnames = ['codepoint', 'fieldname', 'value']
types = {'codepoint': np.string_, 'fieldname': np.string_, 'value': np.string_}

# Load Unihan Readings table
# NOTE: codepoint stays an ordinary column (it is not made the index) because
# the following cells filter and merge on readings.codepoint.
readings = pd.read_table('../Unihan/Unihan_Readings.txt', names=colnames, dtype=types)
# Number of rows. The file's trailing "# EOF" line is read as a row with an
# empty fieldname/value, so comment lines like it are part of this count.
print(readings['codepoint'].count())

# Load Unihan Variants table
variants = pd.read_table('../Unihan/Unihan_Variants.txt', names=colnames, dtype=types)
print(variants['codepoint'].count())  # Number of rows

readings.tail()

187561
12375


Unnamed: 0,codepoint,fieldname,value
187556,U+2F994,kCantonese,fong1
187557,U+2F9B2,kCantonese,kwai4
187558,U+2F9BC,kCantonese,sip3
187559,U+2F9D4,kCantonese,gun3 gwun3
187560,# EOF,,


In [24]:
# Look up every Unihan reading recorded for a single character.
chinese = u'羮'
# Unihan keys have the form "U+7FAE": the code point in upper-case hex.
cp = 'U+%X' % ord(chinese)
is_target = readings['codepoint'] == cp
chineseDf = readings[is_target]
chineseDf

Unnamed: 0,codepoint,fieldname,value
103897,U+7FAE,kCantonese,gang1
103898,U+7FAE,kDefinition,"soup, broth"
103899,U+7FAE,kJapaneseKun,ATSUMONO
103900,U+7FAE,kJapaneseOn,KOU KAN
103901,U+7FAE,kMandarin,gēng
103902,U+7FAE,kVietnamese,canh


In [25]:
# Variant entries (kSemanticVariant, kZVariant, ...) for the same code point.
variantsDf = variants.loc[variants['codepoint'] == cp]
variantsDf

Unnamed: 0,codepoint,fieldname,value
6639,U+7FAE,kSemanticVariant,"U+7FB9<kMatthews,kMeyerWempe"
6640,U+7FAE,kZVariant,U+7FB9


In [15]:
# Build one tab-separated lexical entry for the character in `chinese` from
# its Unihan readings (chineseDf) and variants (variantsDf).
# NOTE: this is Python 2 code (unichr, byte-string decode), like the rest of
# the notebook.


def _first_value(df, fieldname):
    """Return the first value stored for fieldname in df, or None if absent."""
    matches = df[df.fieldname == fieldname]['value']
    if len(matches) == 0:
        return None
    first = matches.iloc[0]
    # Values may be UTF-8 byte strings; decode them so they can be combined
    # with the unicode format strings below without a UnicodeDecodeError.
    if isinstance(first, bytes):
        first = first.decode('utf-8')
    return first


def _first_variant_char(df, fieldname):
    """Return the first variant character listed under fieldname, or None."""
    raw = _first_value(df, fieldname)
    if raw is None:
        return None
    # A value looks like "U+7FB9<kMatthews,kMeyerWempe" and may list several
    # space-separated variants: keep the first code point without its
    # "<source" tags.
    first = raw.split()[0].split('<')[0]
    return unichr(int(first.replace("U+", ""), 16))


english = _first_value(chineseDf, 'kDefinition')
if english is None:
    english = "\\N"
print(english)

# Not every character has a kMandarin reading; fall back to \N like english.
pinyin = _first_value(chineseDf, 'kMandarin')
if pinyin is None:
    pinyin = "\\N"
print(pinyin)

# Kinds of variants
semantic = _first_variant_char(variantsDf, 'kSemanticVariant')
notes = u""
if semantic is not None:
    notes = u"Semantic variant: %s" % semantic
print(notes)

# When a simplified variant exists, the looked-up character is the
# traditional form; otherwise it is used as the simplified form itself.
simplified = chinese
traditional = "\\N"
simplified_variant = _first_variant_char(variantsDf, 'kSimplifiedVariant')
if simplified_variant is not None:
    simplified = simplified_variant
    traditional = chinese
print(simplified)

grammar = "\\N"
if english != "\\N":
    grammar = "noun"
print(grammar)

luid = 45706
# Join the parts with a single space so that an empty variant note does not
# leave a leading blank in the notes column.
source_note = u"(Unihan '%s')" % chinese
if notes:
    notes = notes + u" " + source_note
else:
    notes = source_note
domain = u"古文\tClassical Chinese"
print(u"%d\t%s\t%s\t%s\t%s\t%s\t\\N\t\\N\t%s\t\\N\t\\N\t\\N\t\\N\t%s\t%d" % (
    luid, simplified, traditional, pinyin, english, grammar, domain, notes, luid))

chariot
fén

轒
noun
45706	轒	\N	fén	chariot	noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	 (Unihan '轒')	45706


In [16]:
# Generate lexical entries for the characters in file unknown.txt
luid = 51655  # id assigned to the first generated entry
# unknown.txt has no header row: the code point comes first, then the character
colnames = ['codepoint', 'char']
types = dict.fromkeys(colnames, np.string_)
unknownDF = pd.read_table('unknown.txt', header=None, names=colnames, dtype=types)
unknownDF

Unnamed: 0,codepoint,char
0,U+8F52,轒
1,U+88A5,袥
2,U+5113,儓
3,U+9E11,鸑
4,U+9CFD,鳽
5,U+9ABD,骽
6,U+50EC,僬
7,U+7B35,笵
8,U+8745,蝅
9,U+9DEB,鷫


In [17]:
# Attach the Unihan readings to the characters listed in unknown.txt. The
# merge is an inner join on "codepoint": the result has one row per
# (character, fieldname) pair, and characters without any reading are absent.
# "codepoint" is kept as a regular column because the pivot in a later cell
# refers to it by name.
unknownReadings = pd.merge(unknownDF, readings, on="codepoint")
unknownReadings

Unnamed: 0,codepoint,char,fieldname,value
0,U+8F52,轒,kCantonese,fan4
1,U+8F52,轒,kDefinition,chariot
2,U+8F52,轒,kHangul,분
3,U+8F52,轒,kHanyuPinyin,53555.040:fén
4,U+8F52,轒,kJapaneseOn,FUN BUN
5,U+8F52,轒,kKorean,PWUN
6,U+8F52,轒,kMandarin,fén
7,U+88A5,袥,kHanyuPinyin,53082.070:tuō
8,U+88A5,袥,kMandarin,tuō
9,U+5113,儓,kCantonese,toi4


In [18]:
# Reshape to one row per code point with one column per Unihan field.
pivoted = unknownReadings.pivot(index="codepoint", columns="fieldname", values="value")
# Drop the readings that are not used for the lexical entries. Which fields
# exist depends on the characters in unknown.txt, so skip any that are
# missing instead of failing with a KeyError.
for unused in ["kCantonese", "kHanyuPinyin", "kJapaneseKun", "kJapaneseOn",
               "kVietnamese", "kXHC1983"]:
    if unused in pivoted.columns:
        del pivoted[unused]
pivoted = pivoted.rename(columns={"kDefinition": "english", "kMandarin": "pinyin"})
pivoted

fieldname,english,kHangul,kHanyuPinlu,kKorean,pinyin,kTang
codepoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
U+349D,to deceive; artful; false,,,,miè,
U+35BF,"a sound; a voice; a tone, an interjection; to ...",,,,xié,
U+3817,"(same as 奧 嶴) deep in the mountain, name of a ...",,,,ào,
U+3AE4,"(a variant of 昶) a long day, bright, extended,...",,,,chǎng,
U+3B30,(non-classical form of 臾) a moment; an instant...,,,,yú,
U+3B9A,"(an ancient form of 栗) the chestnut tree, a ki...",,,,lì,
U+3D75,"swift currents of the stream, sound of water f...",,,,zòu,
U+3E15,(same as 攫) to seize; to take hold of; to snatch,,,,jué,
U+4046,"eyes, closed eyes",,,,yè,
U+455E,"Henbane, poisonous, seeds for medical use, a p...",,,,làng,


In [19]:
# Replace every missing reading with the literal \N marker.
pivoted = pivoted.fillna(value="\\N")
pivoted

fieldname,english,kHangul,kHanyuPinlu,kKorean,pinyin,kTang
codepoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
U+349D,to deceive; artful; false,\N,\N,\N,miè,\N
U+35BF,"a sound; a voice; a tone, an interjection; to ...",\N,\N,\N,xié,\N
U+3817,"(same as 奧 嶴) deep in the mountain, name of a ...",\N,\N,\N,ào,\N
U+3AE4,"(a variant of 昶) a long day, bright, extended,...",\N,\N,\N,chǎng,\N
U+3B30,(non-classical form of 臾) a moment; an instant...,\N,\N,\N,yú,\N
U+3B9A,"(an ancient form of 栗) the chestnut tree, a ki...",\N,\N,\N,lì,\N
U+3D75,"swift currents of the stream, sound of water f...",\N,\N,\N,zòu,\N
U+3E15,(same as 攫) to seize; to take hold of; to snatch,\N,\N,\N,jué,\N
U+4046,"eyes, closed eyes",\N,\N,\N,yè,\N
U+455E,"Henbane, poisonous, seeds for medical use, a p...",\N,\N,\N,làng,\N


In [20]:
# Add the english/pinyin (and remaining Unihan) columns to each character.
# This is a left join, so a character that has no row in `pivoted` comes back
# with NaN in every joined column. Fill those with the \N marker as well:
# the grammar rule in the next cell compares english against "\N" and would
# otherwise tag such rows as nouns.
unknownDF = unknownDF.join(pivoted, on="codepoint", lsuffix="_").fillna("\\N")
unknownDF

Unnamed: 0,codepoint,char,english,kHangul,kHanyuPinlu,kKorean,pinyin,kTang
0,U+8F52,轒,chariot,분,\N,PWUN,fén,\N
1,U+88A5,袥,\N,\N,\N,\N,tuō,\N
2,U+5113,儓,servant,대,\N,TAY,tái,\N
3,U+9E11,鸑,"a large, duck-like waterfowl with red eyes; a ...",\N,\N,\N,yuè,\N
4,U+9CFD,鳽,\N,\N,\N,\N,jiān,\N
5,U+9ABD,骽,"leg, thigh",\N,\N,\N,tuǐ,\N
6,U+50EC,僬,clever; alert in mind pigmies,초,\N,CHO,jiāo,\N
7,U+7B35,笵,a bamboo form; a model,범,\N,PEM,fàn,\N
8,U+8745,蝅,\N,\N,\N,\N,cán,\N
9,U+9DEB,鷫,turquoise kingfisher,숙,\N,SWUK,sù,\N


In [21]:
# Fill in the remaining columns of the lexical entries.


def getGrammar(row):
    # Rows with an English definition are tagged "noun"; the rest get \N.
    if row.english == "\\N":
        return "\\N"
    return "noun"


def writeNotes(c):
    # Record the character itself as the source note.
    return "".join(["(Unihan '", c, "')"])


# Consecutive ids, starting at luid, in row order.
unknownDF['id'] = [luid + offset for offset in range(len(unknownDF))]
unknownDF['traditional'] = "\\N"
unknownDF['grammar'] = unknownDF.apply(getGrammar, axis=1)
# Columns that hold the same value for every generated entry.
for column, constant in [
        ('concept_cn', "\\N"),
        ('concept_en', "\\N"),
        ('domain_cn', "古文"),
        ('domain_en', "Classical Chinese"),
        ('subdomain_cn', "\\N"),
        ('subdomain_en', "\\N"),
        ('mp3', "\\N"),
        ('image', "\\N"),
]:
    unknownDF[column] = constant
unknownDF['notes'] = unknownDF['char'].map(writeNotes)
# The headword column repeats the entry's own id.
unknownDF['headword'] = unknownDF['id']
unknownDF

Unnamed: 0,codepoint,char,english,kHangul,kHanyuPinlu,kKorean,pinyin,kTang,id,traditional,...,concept_cn,concept_en,domain_cn,domain_en,subdomain_cn,subdomain_en,mp3,image,notes,headword
0,U+8F52,轒,chariot,분,\N,PWUN,fén,\N,51655,\N,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '轒'),51655
1,U+88A5,袥,\N,\N,\N,\N,tuō,\N,51656,\N,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '袥'),51656
2,U+5113,儓,servant,대,\N,TAY,tái,\N,51657,\N,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '儓'),51657
3,U+9E11,鸑,"a large, duck-like waterfowl with red eyes; a ...",\N,\N,\N,yuè,\N,51658,\N,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '鸑'),51658
4,U+9CFD,鳽,\N,\N,\N,\N,jiān,\N,51659,\N,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '鳽'),51659
5,U+9ABD,骽,"leg, thigh",\N,\N,\N,tuǐ,\N,51660,\N,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '骽'),51660
6,U+50EC,僬,clever; alert in mind pigmies,초,\N,CHO,jiāo,\N,51661,\N,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '僬'),51661
7,U+7B35,笵,a bamboo form; a model,범,\N,PEM,fàn,\N,51662,\N,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '笵'),51662
8,U+8745,蝅,\N,\N,\N,\N,cán,\N,51663,\N,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '蝅'),51663
9,U+9DEB,鷫,turquoise kingfisher,숙,\N,SWUK,sù,\N,51664,\N,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '鷫'),51664


In [22]:
# Write the finished entries to stdout as tab-separated text, without a
# header row or index column.
cols = ['id', 'char', 'traditional', 'pinyin', 'english', 'grammar', 'concept_cn', 'concept_en', 'domain_cn', 'domain_en', 'subdomain_cn', 'subdomain_en', 'mp3', 'image', 'notes', 'headword']
unknownDF = unknownDF[cols]
# unknownDF now holds exactly these columns in this order, so no column list
# is passed to to_csv. The old `cols` keyword was replaced by `columns` and
# is rejected by newer pandas versions.
unknownDF.to_csv(sys.stdout, index=False, header=False, sep="\t")

51655	轒	\N	fén	chariot	noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '轒')	51655
51656	袥	\N	tuō	\N	\N	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '袥')	51656
51657	儓	\N	tái	servant	noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '儓')	51657
51658	鸑	\N	yuè	a large, duck-like waterfowl with red eyes; a young phoenix	noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '鸑')	51658
51659	鳽	\N	jiān	\N	\N	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '鳽')	51659
51660	骽	\N	tuǐ	leg, thigh	noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '骽')	51660
51661	僬	\N	jiāo	clever; alert in mind pigmies	noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '僬')	51661
51662	笵	\N	fàn	a bamboo form; a model	noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '笵')	51662
51663	蝅	\N	cán	\N	\N	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '蝅')	51663
51664	鷫	\N	sù	turquoise kingfisher	noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '鷫')	51664
51665	斄	\N	lí	\N	\N	\N	\N	古文	Classical Chinese	\N	\N	\N