# Terminology Extraction 

This notebook extracts candidate terminology from the bigrams found during corpus analysis.

In [4]:
import numpy as np
import pandas as pd

from char_util import ToSimplified
from term_extraction import check_ngrams, load_headwords

# Load the headwords from the Ding Fubao Buddhist dictionary.
dingfubao = load_headwords('dingfubao.txt')
# Check the bigrams from corpus analysis against the Ding Fubao headwords.
check_ngrams(dingfubao)

# Report on new words.
# read_table is called without `names`/`dtype`, so the column labels
# ('Traditional', 'Occurences') come from the header row of newwords.txt.
# NOTE(review): newwords.txt is presumably written by check_ngrams() -- confirm.
df = pd.read_table('newwords.txt')
# Count rows rather than the non-null cells of the first column, so an
# NA-like first field cannot make the total come out low.
numnew = len(df)
# Single-argument print(...) emits the same text under Python 2 and Python 3.
print("Number of new words:  %d" % numnew)
print(df.head(10))

Loading:  # Ding Fubao Buddhist Dictionary 《丁福保佛學大辭典》, source http://buddhaspace.org/dict/dfb/data/

Number of new words:  9556
  Traditional  Occurences
0         涅槃界         454
1         安樂國         454
2          善神         453
3          壞色         452
4          中劫         452
5          修惑         452
6         無想處         452
7        無邊世界         451
8          怨親         451
9          二足         451


In [74]:
# Load NTI Reader dictionary
# Column labels, in file order, for the header-less words.txt table.
colnames = ['id', 'simplified', 'traditional', 'pinyin',
            'english', 'grammar', 'concept_cn', 'concept_en',
            'domain_cn', 'domain_en', 'subdomain_cn', 'subdomain_en',
            'image', 'mp3', 'notes', 'headword']
# Only these two columns are numeric; every other column is read as a generic
# Python object. The builtin `object` is used instead of the `np.object`
# alias, which newer NumPy releases no longer provide.
numeric_cols = ('id', 'headword')
types = {name: (np.uint32 if name in numeric_cols else object)
         for name in colnames}
# Lines starting with '#' in the data file are skipped as comments.
nti_df = pd.read_table('../data/dictionary/words.txt', names=colnames,
                       dtype=types, comment='#')

# Term (traditional characters) to add, and a glossary search URL for it.
# NOTE(review): `trad` is inserted into the query string without
# percent-encoding; fine for pasting into a browser, confirm before fetching
# the URL programmatically.
trad = u"似宗"
url = u"http://buddhistinformatics.ddbc.edu.tw/glossaries/search.php?op=search&text=%s&submit=Search" % trad
print(url)

http://buddhistinformatics.ddbc.edu.tw/glossaries/search.php?op=search&text=似宗&submit=Search


In [75]:
# Create a word entry
# New id = number of non-null values in the 'id' column, plus 2.
# NOTE(review): the reason for the +2 offset is not visible here -- confirm
# that it still yields an unused id.
luid = nti_df['id'].count() + 2
simplified, traditional, pinyin = ToSimplified(trad)
english = u"pseudo thesis"
grammar = u"noun"

# Concept options. Each value is a tab-joined "<Chinese>\t<English>" pair, so
# it fills two tab-separated fields of the printed entry. Assign the one that
# applies to `concept` below; `empty` writes \N in both fields.
empty = u"\\N\t\\N"
arhat = u"罗汉\tArhat"
author = u"作家\tAuthor"
bodhisattva = u"菩萨\tBodhisattva"
book = u"书名\tBook Title"
buddha = u"佛\tBuddha"
canonical = u"典籍\tCanonical Text"
character = u"人物\tCharacter"
enumeration = u"名数\tEnumeration"
deity = u"神\tDeity"
deva = u"天\tDeva"
deva_king = u"天王\tHeavenly King"
heaven = u"天\tHeaven"
koan = u"公案\tKoan"
king = u"国王\tKing"
kingdom = u"王国\tKingdom"
monastic = u"师父\tMonastic"
mudra = u"手印\tmudrā"
person = u"人\tPerson"
pagoda = u"佛塔\tPagoda"
post = u"职事\tPost"
place = u"地名\tPlace Name"
sastra_commentary = u"论疏\tŚastra Commentary"
scholar = u"学者\tScholar"
school = u"宗派\tSchool"
service = u"法会\tDharma Service"
state = u"国\tState"
sutra = u"经\tSutra"
temple = u"寺院\tTemple"
treatise = u"论\tTreatise"
concept = empty

# Domain (same tab-joined pair layout as the concept options)
domain = u"佛教\tBuddhism"

# Subdomain options; assign the one that applies to `subdomain` below.
china = u"中国\tChina"
chinese_buddhism = u"中国佛教\tChinese Buddhism"
chan = u"禅宗\tChan School"
esoteric = u"密教\tEsoteric Buddhism"
fgs = u"佛光山\tFo Guang Shan"
india = u"印度\tIndia"
indian = u"印度佛教\tIndian Buddhism"
japan = u"日本佛教\tJapanese Buddhism"
# NOTE(review): this label uses traditional characters while the neighbouring
# labels look simplified -- confirm which form the dictionary file expects.
korea = u"韓國\tKorea"
logic = u"因明\tBuddhist Logic"
mahayana = u"大乘佛教\tMahāyāna Buddhism"
subdomain = logic

# Notes options. `bcsd` is an alternative citation that is not used in the
# entry printed below.
bcsd = u"BCSD '%s'" % trad
notes = u"(Ding '%s'; FGDB '%s'; SH '%s')" % (trad, trad, trad)
# Alternative wording for a synonym entry:
#notes = u"Same as 該攝 (FGDB '該攝')"

# One tab-separated line for the new entry. The two literal \N fields sit
# between the subdomain pair and the notes field; the id is written first and
# again as the last field.
entry_template = u"%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t\\N\t\\N\t%s\t%d"
entry_fields = (luid, simplified, traditional, pinyin, english, grammar,
                concept, domain, subdomain, notes, luid)
print(entry_template % entry_fields)

102051	似宗	\N	sìzōng	pseudo thesis	noun	\N	\N	佛教	Buddhism	印度	India	\N	\N	(Ding '似宗'; FGDB '似宗'; SH '似宗')	102051
