# Notebook for exploring the Unihan Database

In [1]:
# Assumes Unihan.zip has been downloaded from http://www.unicode.org/Public/UCD/latest/ucd/Unihan.zip
# and unzipped into the directory ../Unihan (so that ../Unihan/Unihan_Readings.txt exists)
import sys

import numpy as np
import pandas as pd

# Load Unihan Readings table
colnames = ['codepoint', 'fieldname', 'value']
types = {'codepoint': np.string_, 'fieldname': np.string_, 'value': np.string_}
readings = pd.read_table('../Unihan/Unihan_Readings.txt', names=colnames, dtype=types)
readings.set_index("codepoint")
print readings['codepoint'].count() # Number of rows

# Load Unihan Variants table
colnames = ['codepoint', 'fieldname', 'value']
types = {'codepoint': np.string_, 'fieldname': np.string_, 'value': np.string_}
variants = pd.read_table('../Unihan/Unihan_Variants.txt', names=colnames, dtype=types)
print variants['codepoint'].count() # Number of rows

readings.tail()

187561
12375


Unnamed: 0,codepoint,fieldname,value
187556,U+2F994,kCantonese,fong1
187557,U+2F9B2,kCantonese,kwai4
187558,U+2F9BC,kCantonese,sip3
187559,U+2F9D4,kCantonese,gun3 gwun3
187560,# EOF,,


In [2]:
# Character to look up and its code point written in "U+XXXX" notation
chinese = u'敺'
cp = 'U+{0:X}'.format(ord(chinese))
# Every Readings row recorded for that code point
chineseDf = readings.loc[readings['codepoint'] == cp]
chineseDf

Unnamed: 0,codepoint,fieldname,value
63452,U+657A,kCantonese,au2 keoi1
63453,U+657A,kDefinition,"expel, drive away; beat, assault"
63454,U+657A,kHanyuPinyin,"21472.120:qū,ōu"
63455,U+657A,kJapaneseKun,KAAKERU KAARU
63456,U+657A,kJapaneseOn,KU
63457,U+657A,kMandarin,qū
63458,U+657A,kXHC1983,0942.052:qū


In [3]:
# Variants rows for the code point held in `cp`
same_codepoint = variants['codepoint'] == cp
variantsDf = variants.loc[same_codepoint]
variantsDf

Unnamed: 0,codepoint,fieldname,value
3558,U+657A,kSemanticVariant,"U+9A45<kLau,kMatthews"


In [4]:
try:
    unichr  # Python 2 built-in
except NameError:
    unichr = chr  # Python 3: chr() covers the whole Unicode range

NULL = "\\N"  # marker written to the entry when a field has no value


def to_text(value):
    """Return ``value`` as text, decoding it from UTF-8 if it is a byte string."""
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value


def first_value(df, fieldname, default=NULL):
    """Return the first ``value`` in ``df`` whose ``fieldname`` matches.

    ``default`` is returned when ``df`` has no row for ``fieldname``, so a
    missing field never raises ``IndexError``.
    """
    matches = df.loc[df['fieldname'] == fieldname, 'value']
    if len(matches) == 0:
        return default
    return to_text(matches.iloc[0])


def first_variant_char(df, fieldname):
    """Return the first variant character listed under ``fieldname``, or None.

    A variant value may list several space-separated code points, each
    optionally followed by ``<source`` tags, e.g. ``U+9A45<kLau,kMatthews``.
    Only the first code point is used.
    """
    raw = first_value(df, fieldname, default=None)
    tokens = raw.split() if raw else []
    if not tokens:
        return None
    code = tokens[0].split('<')[0].replace("U+", "")
    return unichr(int(code, 16))


english = first_value(chineseDf, 'kDefinition')
print(english)

pinyin = first_value(chineseDf, 'kMandarin')
print(pinyin)

# Kinds of variants
notes = ""
c = first_variant_char(variantsDf, 'kSemanticVariant') or ""
if c:
    notes = "Semantic variant: %s " % c
print(notes)

# Without a simplified variant the character itself goes in the simplified
# column and the traditional column is left empty.
simplified = chinese
traditional = NULL
simplified_variant = first_variant_char(variantsDf, 'kSimplifiedVariant')
if simplified_variant:
    simplified = simplified_variant
    traditional = chinese
print(simplified)

grammar = NULL
if english != NULL:
    grammar = "noun"
print(grammar)

luid = 45706
notes = "%s (Unihan '%s')" % (notes, chinese)
domain = u"古文\tClassical Chinese"
print(u"%d\t%s\t%s\t%s\t%s\t%s\t\\N\t\\N\t%s\t\\N\t\\N\t\\N\t\\N\t%s\t%d" % (
    luid, simplified, traditional, pinyin, english, grammar, domain, notes, luid))

expel, drive away; beat, assault
qū
Semantic variant: 驅 
敺
noun
45706	敺	\N	qū	expel, drive away; beat, assault	noun	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	Semantic variant: 驅  (Unihan '敺')	45706


In [37]:
# Generate lexical entries for the characters in file unknown.txt
luid = 45716
# unknown.txt has no header row; its single column is read as strings
unknownDF = pd.read_table(
    'unknown.txt',
    header=None,
    names=['char'],
    dtype={'char': np.string_}
)
unknownDF

Unnamed: 0,char
0,鯹
1,躘
2,敺
3,跙


In [38]:
def to_codepoint(char):
    """Format a single character in ``U+XXXX`` notation.

    ``char`` may be a UTF-8 encoded byte string or an already decoded text
    string; surrounding whitespace is ignored.

    A named function is used instead of rebinding ``cp``, which the
    single-character lookup cells use for a code point *string*.
    """
    if isinstance(char, bytes):
        char = char.decode("utf-8")
    return 'U+%X' % ord(char.strip())


unknownDF["codepoint"] = unknownDF['char'].map(to_codepoint)
unknownDF

Unnamed: 0,char,codepoint
0,鯹,U+9BF9
1,躘,U+8E98
2,敺,U+657A
3,跙,U+8DD9


In [39]:
# Attach every Readings row to its character. pd.merge defaults to an inner
# join, so characters without any reading do not appear in the result.
# "codepoint" is kept as an ordinary column (the former set_index call
# discarded its result and had no effect).
unknownReadings = pd.merge(unknownDF, readings, on="codepoint")
unknownReadings

Unnamed: 0,char,codepoint,fieldname,value
0,鯹,U+9BF9,kJapaneseKun,NAMAGUSAI
1,鯹,U+9BF9,kJapaneseOn,SEI SHOU SOU
2,鯹,U+9BF9,kMandarin,xīng
3,躘,U+8E98,kCantonese,lung4
4,躘,U+8E98,kDefinition,to walk
5,躘,U+8E98,kHanyuPinyin,"63749.100:lóng,lǒng"
6,躘,U+8E98,kMandarin,lóng
7,躘,U+8E98,kVietnamese,ruông
8,敺,U+657A,kCantonese,au2 keoi1
9,敺,U+657A,kDefinition,"expel, drive away; beat, assault"


In [40]:
# One row per code point, one column per Unihan field
pivoted = unknownReadings.pivot(index="codepoint", columns="fieldname", values="value")
# Keep only the two fields the lexical entries use. reindex() picks them by
# name: other fields are dropped whichever ones happen to be present, and a
# wanted field that no character has becomes an all-NaN column instead of
# raising KeyError.
pivoted = pivoted.reindex(columns=["kDefinition", "kMandarin"])
pivoted = pivoted.rename(columns={"kDefinition": "english", "kMandarin": "pinyin"})
pivoted

fieldname,english,pinyin
codepoint,Unnamed: 1_level_1,Unnamed: 2_level_1
U+657A,"expel, drive away; beat, assault",qū
U+8DD9,"weak, lame",jù
U+8E98,to walk,lóng
U+9BF9,,xīng


In [41]:
# Write the literal \N marker wherever a field has no value
pivoted = pivoted.fillna(value="\\N")
pivoted

fieldname,english,pinyin
codepoint,Unnamed: 1_level_1,Unnamed: 2_level_1
U+657A,"expel, drive away; beat, assault",qū
U+8DD9,"weak, lame",jù
U+8E98,to walk,lóng
U+9BF9,\N,xīng


In [42]:
# Left join: every character is kept. A character whose code point has no row
# in `pivoted` gets NaN for both fields, so those are replaced with the \N
# marker here rather than being exported as empty fields.
unknownDF = (
    unknownDF
    .join(pivoted, on="codepoint", lsuffix="_")
    .fillna({"english": "\\N", "pinyin": "\\N"})
)
unknownDF

Unnamed: 0,char,codepoint,english,pinyin
0,鯹,U+9BF9,\N,xīng
1,躘,U+8E98,to walk,lóng
2,敺,U+657A,"expel, drive away; beat, assault",qū
3,跙,U+8DD9,"weak, lame",jù


In [55]:
# Consecutive ids starting at luid, one per character
unknownDF['id'] = range(luid, luid + len(unknownDF))

# Columns that hold the same value for every generated entry
constant_columns = [
    ('traditional', "\\N"),
    ('grammar', "verb"),
    ('concept_cn', "\\N"),
    ('concept_en', "\\N"),
    ('domain_cn', "古文"),
    ('domain_en', "Classical Chinese"),
    ('subdomain_cn', "\\N"),
    ('subdomain_en', "\\N"),
    ('mp3', "\\N"),
    ('image', "\\N"),
]
for column_name, constant in constant_columns:
    unknownDF[column_name] = constant

# Note recording the character each entry was generated from
unknownDF['notes'] = unknownDF['char'].map(lambda ch: "(Unihan '" + ch + "')")
# Each entry is its own headword
unknownDF['headword'] = unknownDF['id']
unknownDF

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


Unnamed: 0,id,char,traditional,pinyin,english,grammar,concept,domain,subdomain,mp3,image,notes,headword,concept_cn,concept_en,domain_cn,domain_en,subdomain_cn,subdomain_en
0,45716,鯹,\N,xīng,\N,verb,\N\t\N,古文\tClassical Chinese,\N\t\N,\N,\N,(Unihan '鯹'),45716,\N,\N,古文,Classical Chinese,\N,\N
1,45717,躘,\N,lóng,to walk,verb,\N\t\N,古文\tClassical Chinese,\N\t\N,\N,\N,(Unihan '躘'),45717,\N,\N,古文,Classical Chinese,\N,\N
2,45718,敺,\N,qū,"expel, drive away; beat, assault",verb,\N\t\N,古文\tClassical Chinese,\N\t\N,\N,\N,(Unihan '敺'),45718,\N,\N,古文,Classical Chinese,\N,\N
3,45719,跙,\N,jù,"weak, lame",verb,\N\t\N,古文\tClassical Chinese,\N\t\N,\N,\N,(Unihan '跙'),45719,\N,\N,古文,Classical Chinese,\N,\N


In [54]:
# Output columns, in the order they are written
cols = ['id', 'char', 'traditional', 'pinyin', 'english', 'grammar', 'concept_cn', 'concept_en', 'domain_cn', 'domain_en', 'subdomain_cn', 'subdomain_en', 'mp3', 'image', 'notes', 'headword']
# .copy() makes the selection an independent frame, so assigning columns to
# unknownDF afterwards does not raise SettingWithCopyWarning.
unknownDF = unknownDF[cols].copy()
# The frame already holds exactly `cols`, so no column argument is needed
# (the former `cols=` keyword is not accepted by current pandas).
unknownDF.to_csv(sys.stdout, index=False, header=False, sep="\t")

KeyError: "['concept_cn' 'concept_en' 'domain_cn' 'domain_en' 'subdomain_cn'\n 'subdomain_en'] not in index"