# Terminology Extraction 

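This notebook extracts terminology from the bigrams found in the corpus analysis: the bigrams are checked against the headwords of the Ding Fubao Buddhist dictionary, and the resulting new words are reported.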

In [689]:
import numpy as np
import pandas as pd

from char_util import ToSimplified
from term_extraction import check_ngrams, load_headwords

# Load the headwords from the Ding Fubao Buddhist dictionary
dingfubao = load_headwords('dingfubao.txt')
# Check the bigrams from corpus analysis against the Ding Fubao headwords
check_ngrams(dingfubao)

# Report on new words.
# read_table is called without `names`, so the first line of newwords.txt
# is taken as the header and supplies the column names.
# NOTE(review): newwords.txt is presumably written by check_ngrams above -
# confirm in term_extraction.
df = pd.read_table('newwords.txt')

# Number of non-null entries in the first column. Select the column by
# position with .iloc; indexing the label-indexed result of df.count() with
# a bare integer is deprecated in recent pandas.
numnew = df.iloc[:, 0].count()

# Single-argument print(...) renders identically under Python 2 and 3.
print("Number of new words:  %d" % numnew)
print(df.head(20))


Loading:  # Ding Fubao Buddhist Dictionary 《丁福保佛學大辭典》, source http://buddhaspace.org/dict/dfb/data/

Number of new words:  10053
   Traditional  Occurences
0           一類        1987
1          無上道        1976
2           佛地        1970
3           釋子        1969
4           隨逐        1965
5          自相空        1961
6           一明        1958
7           波旬        1942
8           四分        1933
9           等持        1931
10          正觀        1916
11          身界        1902
12          能緣        1892
13          教誡        1874
14          說經        1856
15          一佛        1851
16          二邊        1844
17          知法        1844
18         三菩提        1843
19          覺觀        1840


In [690]:
# Load NTI Reader dictionary
colnames = ['id', 'simplified', 'traditional', 'pinyin',
            'english', 'grammar', 'concept_cn', 'concept_en',
            'domain_cn', 'domain_en', 'subdomain_cn', 'subdomain_en',
            'image', 'mp3', 'notes', 'headword']

# Column dtypes: everything is read with the generic object dtype except the
# two numeric identifier columns. The builtin `object` is used instead of the
# `np.object` alias, which newer NumPy releases no longer provide. Deriving
# the mapping from colnames keeps the two in sync.
types = {name: object for name in colnames}
types['id'] = np.uint32
types['headword'] = np.uint32

# The file has no header row of its own: column names come from `colnames`.
nti_df = pd.read_table('../data/dictionary/words.txt', names=colnames, dtype=types)

# Term currently under review and the glossary search URL for looking it up.
trad = u"一類"
url = u"http://buddhistinformatics.ddbc.edu.tw/glossaries/search.php?op=search&text=%s&submit=Search" % trad
# Single-argument print(...) renders identically under Python 2 and 3.
print(url)

http://buddhistinformatics.ddbc.edu.tw/glossaries/search.php?op=search&text=一類&submit=Search


In [691]:
# Create a word entry: compose one tab-separated line, in the column order
# of the NTI Reader dictionary, for the term held in `trad`.

# New id = number of non-null ids already in the dictionary + 1. The first
# column is selected by position with .iloc; indexing the label-indexed
# result of nti_df.count() with a bare integer is deprecated in recent pandas.
# NOTE(review): this assumes existing ids are contiguous and start at 1;
# if the file has gaps the new id could collide - confirm against words.txt.
luid = nti_df.iloc[:, 0].count() + 1
simplified, traditional, pinyin = ToSimplified(trad)
english = u"the same kind"
grammar = u"set phrase"

# Concept: each option is a "Chinese<TAB>English" pair filling two columns.
# The unused options are kept as a palette to pick from when editing.
empty = u"\\N\t\\N"
author = u"作家\tAuthor"
book = u"书\tBook"
canonical = u"典籍\tCanonical Text"
monastic = u"师父\tMonastic"
service = u"法会\tDharma Service"
concept = empty

# Domain
buddhism = u"佛教\tBuddhism"
classical = u"古文\tClassical Chinese"
domain = classical

# Subdomain
chinese_buddhism = u"中国佛教\tChinese Buddhism"
chan = u"禅宗\tChan"
esoteric = u"密教\tEsoteric Buddhism"
mahayana = u"大乘佛教\tMahāyāna Buddhism"
subdomain = empty

# Notes: alternative templates are left commented out for quick switching.
notes = u"(Ding '%s'; FGDB '%s'; SH '%s')" % (trad, trad, trad)
#notes = u"See 證入"
#notes = u"(CC-CEDICT '%s'; Guoyu '%s')" % (trad, trad)

# The two literal \N fields stand in for the image and mp3 columns; the
# trailing id fills the headword column with the same value as the entry id.
# Single-argument print(...) renders identically under Python 2 and 3.
entry = u"%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t\\N\t\\N\t%s\t%d" % (
    luid, simplified, traditional, pinyin, english, grammar,
    concept, domain, subdomain, notes, luid)
print(entry)

62680	一类	一類	yīlèi	the same kind	set phrase	\N	\N	古文	Classical Chinese	\N	\N	\N	\N	(Ding '一類'; FGDB '一類'; SH '一類')	62680
