# Terminology Extraction 

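This notebook performs terminology extraction on the bigrams found in the corpus analysis, checking them against existing dictionary headwords and then drafting new dictionary entries.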

In [102]:
import numpy as np
import pandas as pd

from char_util import ToSimplified
from term_extraction import check_ngrams, load_headwords

# Loads the headwords from the Ding Fubao Buddhist dictionary
dingfubao = load_headwords('dingfubao.txt')
# Check the bigrams from corpus analysis against the Ding Fubao headwords.
# NOTE(review): presumably this call is what produces newwords.txt, which is
# read back below -- confirm in term_extraction.check_ngrams.
check_ngrams(dingfubao)

# Report on new words
# Column names exactly as they appear in the header row of newwords.txt
# (the spelling 'Occurences' has to match the file, so it is kept as is).
colnames = ['Traditional', 'Occurences']
# dtype keys are matched against the header case-sensitively; the previous
# lower-case 'traditional' key matched nothing and the mapping was never
# handed to the reader at all.
types = {'Traditional': object, 'Occurences': np.uint32}
df = pd.read_table('newwords.txt', dtype=types)
# Number of non-null entries in the word column (label based, so it does not
# rely on positional indexing of the per-column count Series).
numnew = df[colnames[0]].count()
# Single-argument print(...) behaves the same under Python 2 and Python 3 and
# keeps the output text unchanged.
print("%s %d" % ("Number of new words: ", numnew))
print(df.head(40))

Loading:  # Ding Fubao Buddhist Dictionary 《丁福保佛學大辭典》, source http://buddhaspace.org/dict/dfb/data/

Number of new words:  9958
   Traditional  Occurences
0           心王         996
1           摩納         994
2           領解         990
3           泥犁         988
4          三昧門         986
5          惡知識         986
6           解行         985
7           上人         979
8           受食         977
9           實智         975
10          十惡         967
11          提舍         962
12          拔濟         960
13          佛乘         959
14          象王         958
15          入空         957
16          攝心         955
17         心解脫         953
18          取蘊         952
19          十門         950
20          一識         950
21          勤行         950
22          留難         949
23          見分         948
24          料簡         948
25        阿修羅王         946
26          僧殘         945
27         學無學         943
28          一地         941
29         苦法智         940
30          信行         940
31      

In [100]:
# Load NTI Reader dictionary
# Column names are supplied here because read_table is called with `names`,
# so every line of words.txt is treated as data rather than as a header.
colnames = ['id', 'simplified', 'traditional', 'pinyin',
            'english', 'grammar', 'concept_cn', 'concept_en',
            'domain_cn', 'domain_en', 'subdomain_cn', 'subdomain_en',
            'image', 'mp3', 'notes', 'headword']
# All columns are read as generic Python objects except the two numeric id
# columns. The builtin `object` is used instead of the `np.object` alias,
# which is deprecated and no longer exists in recent NumPy releases.
types = {name: object for name in colnames}
types['id'] = np.uint32
types['headword'] = np.uint32
nti_df = pd.read_table('../data/dictionary/words.txt', names=colnames, dtype=types)
# Traditional Chinese term currently being worked on; the word-entry cell
# further down reads this variable too.
trad = u"曹山"
# Glossary search link for the term, printed for manual lookup.
url = u"http://buddhistinformatics.ddbc.edu.tw/glossaries/search.php?op=search&text=%s&submit=Search" % trad
# Single-argument print(...) behaves the same under Python 2 and Python 3.
print(url)

http://buddhistinformatics.ddbc.edu.tw/glossaries/search.php?op=search&text=曹山&submit=Search


In [101]:
# Create a word entry
# Prints one tab-separated line for the term in `trad`, with the fields in the
# same order as the dictionary columns (id ... notes, headword).

# Next free id: one past the largest id already in the dictionary. Counting
# rows instead would reuse an existing id as soon as the ids have a gap.
# An empty dictionary starts at 1.
luid = int(nti_df['id'].max()) + 1 if len(nti_df) else 1
simplified, traditional, pinyin = ToSimplified(trad)
english = u"Benji"
grammar = u"proper noun"

# Concept
# Each option below is a "<Chinese>\t<English>" pair that fills two adjacent
# columns; `empty` fills both with the \N placeholder. Pick one via `concept`.
empty = u"\\N\t\\N"
arhat = u"罗汉\tArhat"
author = u"作家\tAuthor"
bodhisattva = u"菩萨\tBodhisattva"
book = u"书\tBook"
canonical = u"典籍\tCanonical Text"
monastic = u"师父\tMonastic"
place = u"地名\tPlace Name"
service = u"法会\tDharma Service"
temple = u"寺院\tTemple"
concept = place

# Domain
# Same "<Chinese>\t<English>" convention; pick one via `domain`.
buddhism = u"佛教\tBuddhism"
classical = u"古文\tClassical Chinese"
culture = u"文化\tCulture"
education = u"教育\tEducation"
emotion = u"感情\tEmotion"
food = u"饮食\tFood and Drink"
history = u"历史\tHistory"
idiom = u"成语\tIdiom"
poetry = u"诗\tPoetry"
politics = u"政治\tPolitics"
psychology = u"心理学\tPsychology"
social_interaction = u"交际\tSocial Interaction"
thought = u"思想\tThought"
domain = history

# Subdomain
# Same convention again; pick one via `subdomain`.
china = u"中国\tChina"
chinese_buddhism = u"中国佛教\tChinese Buddhism"
chan = u"禅宗\tChan"
esoteric = u"密教\tEsoteric Buddhism"
fgs = u"佛光山\tFo Guang Shan"
korea = u"韓國\tKorea"
india = u"印度\tIndia"
indian = u"印度佛教\tIndian Buddhism"
mahayana = u"大乘佛教\tMahāyāna Buddhism"
subdomain = chinese_buddhism

# Notes
# `bcsd` is an alternative source citation; it is not part of the entry
# printed below.
bcsd = u"BCSD '%s'" % trad
notes = u"(Ding '%s'; FGDB '%s'; SH '%s')" % (trad, trad, trad)
#notes = u"See 本寂"
#notes = u"(CC-CEDICT '%s'; Guoyu '%s')" % (trad, trad)

# The two literal \N fields sit between the subdomain pair and the notes,
# i.e. in the image and mp3 positions; the new id is repeated as the final
# (headword) field. Single-argument print(...) behaves the same under
# Python 2 and Python 3.
print(u"%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t\\N\t\\N\t%s\t%d" % (
      luid, simplified, traditional, pinyin, english, grammar,
      concept, domain, subdomain, notes, luid))

64185	曹山	\N	Cáoshān	Benji	proper noun	地名	Place Name	历史	History	中国佛教	Chinese Buddhism	\N	\N	(Ding '曹山'; FGDB '曹山'; SH '曹山')	64185
