# Notebook for importing from the Unihan Database

In [1]:
# Assumes the Unihan database has been downloaded from http://www.unicode.org/Public/UCD/latest/ucd/Unihan.zip
# and placed in the directory ../Unihan
import sys

import pandas as pd
import numpy as np

# Load Unihan Readings table
colnames = ['codepoint', 'fieldname', 'value']
types = {'codepoint': np.string_, 'fieldname': np.string_, 'value': np.string_}
readings = pd.read_table('../Unihan/Unihan_Readings.txt', names=colnames, dtype=types)
readings.set_index("codepoint")
print(readings['codepoint'].count()) # Number of rows

# Load Unihan Variants table
colnames = ['codepoint', 'fieldname', 'value']
types = {'codepoint': np.string_, 'fieldname': np.string_, 'value': np.string_}
variants = pd.read_table('../Unihan/Unihan_Variants.txt', names=colnames, dtype=types)
print(variants['codepoint'].count()) # Number of rows

readings.tail()

187561
12375


Unnamed: 0,codepoint,fieldname,value
187556,U+2F994,kCantonese,fong1
187557,U+2F9B2,kCantonese,kwai4
187558,U+2F9BC,kCantonese,sip3
187559,U+2F9D4,kCantonese,gun3 gwun3
187560,# EOF,,


In [2]:
chinese = u'𪙾'
cp = 'U+' + hex(ord(chinese)).replace('0x', '').upper()
chineseDf = readings[readings.codepoint == cp]
chineseDf

Unnamed: 0,codepoint,fieldname,value
187305,U+2A67E,kHanyuPinyin,74802.040:yín
187306,U+2A67E,kMandarin,yín


In [3]:
variantsDf = variants[variants.codepoint == cp]
variantsDf

Unnamed: 0,codepoint,fieldname,value


In [10]:
english = "\\N"
englishDF = chineseDf[chineseDf.fieldname == 'kDefinition']['value']
if len(englishDF) > 0:
    english = englishDF.iloc[0]
print(english)

pinyin = chineseDf[chineseDf.fieldname == 'kMandarin']['value'].iloc[0]
print(pinyin)

# Kinds of variants
notes = ""
c = ""
value = variantsDf[variantsDf.fieldname == 'kSemanticVariant']['value']
if len(value) > 0:
    val = value.iloc[0].split('<')[0]
    val = val.replace("U+", "")
    c = unichr(int(val, 16))
    notes = "Semantic variant: %s " % c
print(notes)

simplified = chinese
traditional = "\\N"
value = variantsDf[variantsDf.fieldname == 'kSimplifiedVariant']['value']
if len(value) > 0:
    val = value.iloc[0].split('<')[0]
    val = val.replace("U+", "")
    simplified = unichr(int(val, 16))
    traditional = chinese
print(simplified)

grammar = "\\N"
if english != "\\N":
    grammar = "noun"
print(grammar)

luid = 57940
notes = "%s (Unihan '%s')" % (notes, chinese)
domain = u"古文\tClassical Chinese"
print(u"%d\t%s\t%s\t%s\t%s\t%s\t\\N\t\\N\t%s\t\\N\t\\N\t\\N\t\\N\t%s\t%d" % (
    luid, simplified, traditional, pinyin, english, grammar, domain, notes, luid))

\N
yín

𪙾
\N
57940	𪙾	\N	yín	\N	\N	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	 (Unihan '𪙾')	57940


In [29]:
# Generate lexical entries for the characters in file unknown.txt
luid = 102915
colnames = ['codepoint', 'char']
types = {'char': np.string_, 'codepoint': np.string_}
unknownDF = pd.read_table('unknown.txt', names = colnames, dtype=types, header = None)
unknownDF

Unnamed: 0,codepoint,char
0,U+6712,朒
1,U+9560,镠
2,U+8162,腢
3,U+8D5F,赟
4,U+74AE,璮
5,U+5F93,従
6,U+7983,禃
7,U+92EE,鋮
8,U+6EB5,溵
9,U+4E93,亓


In [30]:
unknownReadings = pd.merge(unknownDF, readings, on = "codepoint")
unknownReadings.set_index("codepoint")
unknownReadings

Unnamed: 0,codepoint,char,fieldname,value
0,U+6712,朒,kCantonese,nuk6
1,U+6712,朒,kHanyuPinyin,32067.190:nǜ
2,U+6712,朒,kJapaneseOn,NIKU JIKU
3,U+6712,朒,kMandarin,nǜ
4,U+6712,朒,kXHC1983,0842.060:nǜ
5,U+9560,镠,kCantonese,lau4
6,U+9560,镠,kDefinition,pure gold
7,U+9560,镠,kJapaneseOn,RYUU
8,U+9560,镠,kMandarin,liú
9,U+9560,镠,kXHC1983,0727.050:liú


In [31]:
try:
  pivoted = unknownReadings.pivot("codepoint", "fieldname", "value")
  del pivoted["kCantonese"]
  del pivoted["kHanyuPinyin"]
  del pivoted["kJapaneseKun"]
  del pivoted["kJapaneseOn"]
  del pivoted["kVietnamese"]
  del pivoted["kXHC1983"]
  pivoted = pivoted.rename(columns = {"kDefinition":"english", "kMandarin":"pinyin"})
  pivoted
except KeyError as ke:
  print('KeyError, ', ke )

In [32]:
pivoted = pivoted.fillna("\\N")
pivoted

fieldname,english,kHangul,kHanyuPinlu,kKorean,pinyin,kTang
codepoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
U+2016C,\N,\N,\N,\N,bāo,\N
U+201A9,(Cant.) to play,\N,\N,\N,fàn,\N
U+201D7,\N,\N,\N,\N,kuā,\N
U+203FF,\N,\N,\N,\N,fěi,\N
U+205E6,to lean on; to trust in,\N,\N,\N,píng,\N
U+20646,\N,\N,\N,\N,jù,\N
U+20732,\N,\N,\N,\N,yā,\N
U+20764,\N,\N,\N,\N,chōng,\N
U+207CD,\N,\N,\N,\N,chōng,\N
U+2080E,\N,\N,\N,\N,kuò,\N


In [33]:
unknownDF = unknownDF.join(pivoted, on = "codepoint", lsuffix = "_")
unknownDF

Unnamed: 0,codepoint,char,english,kHangul,kHanyuPinlu,kKorean,pinyin,kTang
0,U+6712,朒,\N,\N,\N,\N,nǜ,\N
1,U+9560,镠,pure gold,\N,\N,\N,liú,\N
2,U+8162,腢,the collar-bone,\N,\N,\N,ǒu,\N
3,U+8D5F,赟,"affable, agreeable, pleasant",\N,\N,YUN,yūn,\N
4,U+74AE,璮,\N,\N,\N,\N,tǎn,\N
5,U+5F93,従,"from, by, since, whence, through",\N,\N,CONG,cóng,\N
6,U+7983,禃,\N,\N,\N,\N,zhí,\N
7,U+92EE,鋮,person's name,\N,\N,\N,chéng,\N
8,U+6EB5,溵,\N,은,\N,UN,yīn,\N
9,U+4E93,亓,"(archaic form) his, her, its, their; that",\N,\N,KI,qí,\N


In [34]:
unknownDF['id'] = range(luid, luid + len(unknownDF))
unknownDF['traditional'] = "\\N"
getGrammar = lambda x: "noun" if x.english != "\\N" else "\\N"
unknownDF['grammar'] = unknownDF.apply(getGrammar, axis = 1)
unknownDF['concept_cn'] = "\\N"
unknownDF['concept_en'] = "\\N"
unknownDF['domain_cn'] = "古文"
unknownDF['domain_en'] = "Classical Chinese"
unknownDF['subdomain_cn'] = "\\N"
unknownDF['subdomain_en'] = "\\N"
unknownDF['mp3'] = "\\N"
unknownDF['image'] = "\\N"
writeNotes = lambda c: "(Unihan '" + c + "')"
unknownDF['notes'] = unknownDF['char'].map(writeNotes)
unknownDF['headword'] = unknownDF['id']
unknownDF

Unnamed: 0,codepoint,char,english,kHangul,kHanyuPinlu,kKorean,pinyin,kTang,id,traditional,...,concept_cn,concept_en,domain_cn,domain_en,subdomain_cn,subdomain_en,mp3,image,notes,headword
0,U+6712,朒,\N,\N,\N,\N,nǜ,\N,102915,\N,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '朒'),102915
1,U+9560,镠,pure gold,\N,\N,\N,liú,\N,102916,\N,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '镠'),102916
2,U+8162,腢,the collar-bone,\N,\N,\N,ǒu,\N,102917,\N,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '腢'),102917
3,U+8D5F,赟,"affable, agreeable, pleasant",\N,\N,YUN,yūn,\N,102918,\N,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '赟'),102918
4,U+74AE,璮,\N,\N,\N,\N,tǎn,\N,102919,\N,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '璮'),102919
5,U+5F93,従,"from, by, since, whence, through",\N,\N,CONG,cóng,\N,102920,\N,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '従'),102920
6,U+7983,禃,\N,\N,\N,\N,zhí,\N,102921,\N,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '禃'),102921
7,U+92EE,鋮,person's name,\N,\N,\N,chéng,\N,102922,\N,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '鋮'),102922
8,U+6EB5,溵,\N,은,\N,UN,yīn,\N,102923,\N,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '溵'),102923
9,U+4E93,亓,"(archaic form) his, her, its, their; that",\N,\N,KI,qí,\N,102924,\N,...,\N,\N,古文,Classical Chinese,\N,\N,\N,\N,(Unihan '亓'),102924


In [36]:
cols = ['id', 'char', 'traditional', 'pinyin', 'english', 'grammar', 'concept_cn', 'concept_en', 'domain_cn', 'domain_en', 'subdomain_cn', 'subdomain_en', 'mp3', 'image', 'notes', 'headword']
unknownDF = unknownDF[cols]
unknownDF.to_csv(sys.stdout, index = False, header = False, sep = "\t", columns = cols)

102915	朒	\N	nǜ	\N	\N	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '朒')	102915
102916	镠	\N	liú	pure gold	noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '镠')	102916
102917	腢	\N	ǒu	the collar-bone	noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '腢')	102917
102918	赟	\N	yūn	affable, agreeable, pleasant	noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '赟')	102918
102919	璮	\N	tǎn	\N	\N	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '璮')	102919
102920	従	\N	cóng	from, by, since, whence, through	noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '従')	102920
102921	禃	\N	zhí	\N	\N	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '禃')	102921
102922	鋮	\N	chéng	person's name	noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '鋮')	102922
102923	溵	\N	yīn	\N	\N	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '溵')	102923
102924	亓	\N	qí	(archaic form) his, her, its, their; that	noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Unihan '亓')	102924
102925	紥	\N	zā	tie, fasten, bind	noun	\N	\N	古文	Class