## Official document：http://www.nltk.org/book/ch02.html (WordNet) ・ http://www.nltk.org/book/ch03.html (Stemming / Lemmatization)
- Editor: Youngmi Huang
- Update: 2018/06

In [1]:
import nltk
import matplotlib.pyplot as plt
from nltk.corpus import wordnet as wn 
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer

# Wordnet

## 1. 找出同義詞集合

In [2]:
# Find every synset (synonym set) that contains the word 'motorcar'.
wn.synsets('motorcar')

[Synset('car.n.01')]

## 2. 找出同義字

In [3]:
# List the lemma names (the synonymous words) that make up synset car.n.01.
wn.synset('car.n.01').lemma_names()

['car', 'auto', 'automobile', 'machine', 'motorcar']

In [4]:
# Example of a polysemous word: 'trunk' belongs to several synsets,
# so print the lemma names of each synset on its own line.
for trunk_sense in wn.synsets('trunk'):
    sense_lemmas = trunk_sense.lemma_names()
    print(sense_lemmas)

['trunk', 'tree_trunk', 'bole']
['trunk']
['torso', 'trunk', 'body']
['luggage_compartment', 'automobile_trunk', 'trunk']
['proboscis', 'trunk']


## 3. 查詢 synset 定義說明

In [5]:
# Definition text of the synset trunk.n.01.
wn.synset('trunk.n.01').definition()

'the main stem of a tree; usually covered with bark; the bole is usually the part that is commercially useful for lumber'

In [6]:
# Example sentences recorded for the synset car.n.01.
wn.synset('car.n.01').examples()

['he needs a car to get to work']

## 4. hypernym/hyponym (上位詞/下位詞)


In [7]:
# Keep the car.n.01 synset and its hyponym synsets in variables;
# the following cells read both names.
motorcar = wn.synset('car.n.01')
types_of_motorcar = motorcar.hyponyms()

In [8]:
# Hyponyms of motorcar, word-centred view: walk each hyponym synset,
# take the name of every lemma in it, and show all names in sorted order.
sorted(
    lemma.name()
    for hyponym_synset in types_of_motorcar
    for lemma in hyponym_synset.lemmas()
)

['Model_T',
 'S.U.V.',
 'SUV',
 'Stanley_Steamer',
 'ambulance',
 'beach_waggon',
 'beach_wagon',
 'bus',
 'cab',
 'compact',
 'compact_car',
 'convertible',
 'coupe',
 'cruiser',
 'electric',
 'electric_automobile',
 'electric_car',
 'estate_car',
 'gas_guzzler',
 'hack',
 'hardtop',
 'hatchback',
 'heap',
 'horseless_carriage',
 'hot-rod',
 'hot_rod',
 'jalopy',
 'jeep',
 'landrover',
 'limo',
 'limousine',
 'loaner',
 'minicar',
 'minivan',
 'pace_car',
 'patrol_car',
 'phaeton',
 'police_car',
 'police_cruiser',
 'prowl_car',
 'race_car',
 'racer',
 'racing_car',
 'roadster',
 'runabout',
 'saloon',
 'secondhand_car',
 'sedan',
 'sport_car',
 'sport_utility',
 'sport_utility_vehicle',
 'sports_car',
 'squad_car',
 'station_waggon',
 'station_wagon',
 'stock_car',
 'subcompact',
 'subcompact_car',
 'taxi',
 'taxicab',
 'tourer',
 'touring_car',
 'two-seater',
 'used-car',
 'waggon',
 'wagon']

In [9]:
# Hypernym synsets of motorcar.
motorcar.hypernyms()

[Synset('motor_vehicle.n.01')]

In [10]:
# Full hypernym paths: keep following hypernyms upward from motorcar.
# Each path in the output runs from entity.n.01 down to car.n.01.
motorcar.hypernym_paths()

[[Synset('entity.n.01'),
  Synset('physical_entity.n.01'),
  Synset('object.n.01'),
  Synset('whole.n.02'),
  Synset('artifact.n.01'),
  Synset('instrumentality.n.03'),
  Synset('container.n.01'),
  Synset('wheeled_vehicle.n.01'),
  Synset('self-propelled_vehicle.n.01'),
  Synset('motor_vehicle.n.01'),
  Synset('car.n.01')],
 [Synset('entity.n.01'),
  Synset('physical_entity.n.01'),
  Synset('object.n.01'),
  Synset('whole.n.02'),
  Synset('artifact.n.01'),
  Synset('instrumentality.n.03'),
  Synset('conveyance.n.03'),
  Synset('vehicle.n.01'),
  Synset('wheeled_vehicle.n.01'),
  Synset('self-propelled_vehicle.n.01'),
  Synset('motor_vehicle.n.01'),
  Synset('car.n.01')]]

In [11]:
# Hypernyms: jump straight to the top-most level of the hierarchy.
motorcar.root_hypernyms()

[Synset('entity.n.01')]

## 5. 找尋最低位共同上位詞組 (lowest common hypernyms)

In [12]:
# Whale example: look up the synsets that the following cells compare.
# The names are unpacked in the same order as the synset identifiers below.
right, orca, minke, tortoise, novel = (
    wn.synset(synset_name)
    for synset_name in (
        'right_whale.n.01',
        'orca.n.01',
        'minke_whale.n.01',
        'tortoise.n.01',
        'novel.n.01',
    )
)
# Lowest common hypernym(s) shared by the right whale and the minke whale.
right.lowest_common_hypernyms(minke)

[Synset('baleen_whale.n.01')]

In [13]:
# Right whale vs. minke whale
right.lowest_common_hypernyms(minke)

[Synset('baleen_whale.n.01')]

In [14]:
# Lowest common hypernym(s) of the right whale against, in order:
# the orca (killer whale), the tortoise, and the novel.
for other_synset in (orca, tortoise, novel):
    print(right.lowest_common_hypernyms(other_synset))

[Synset('whale.n.02')]
[Synset('vertebrate.n.01')]
[Synset('entity.n.01')]


## 6. synset 之間的階層計算以及上下位結構的相似度

In [15]:
# Depth of each synset in the hypernym hierarchy, listed from the most
# specific synset to the root. The root entity.n.01 prints 0, so a larger
# number means the synset sits further away from the root.
for synset_name in (
    'baleen_whale.n.01',
    'whale.n.02',
    'vertebrate.n.01',
    'entity.n.01',
):
    print(wn.synset(synset_name).min_depth())

14
13
8
0


In [16]:
# Path similarity within the hypernym/hyponym hierarchy: the closer the
# score is to 1, the more alike the paths (a synset compared with itself
# prints 1.0). The right whale is compared with itself, the minke whale,
# the orca, the tortoise and the novel, in that order.
for other_synset in (right, minke, orca, tortoise, novel):
    print(right.path_similarity(other_synset))

1.0
0.25
0.16666666666666666
0.07692307692307693
0.043478260869565216


## 詞幹提取 ( Stemming )

In [17]:
# Initialise one instance of each stemmer used in the cells below:
# Porter, Lancaster, and Snowball (configured for 'english').
pst, lst, snow = (
    PorterStemmer(),
    LancasterStemmer(),
    SnowballStemmer('english'),
)

In [18]:
# Porter stemmer on two inflected forms of "eat".
for word in ('eating', 'eats'):
    print(pst.stem(word))

eat
eat


In [19]:
# Lancaster stemmer on the same two inflected forms of "eat".
for word in ('eating', 'eats'):
    print(lst.stem(word))

eat
eat


In [20]:
# Snowball stemmer on the same two inflected forms of "eat".
for word in ('eating', 'eats'):
    print(snow.stem(word))

eat
eat


In [21]:
# Custom example 1: Porter-stem several word forms related to "enthusiasm".
example_words = ['enthusiasms', 'enthusiasm', 'enthusiastic', 'enthusiastically']
for word in example_words:
    stemmed = pst.stem(word)
    print(stemmed)

enthusiasm
enthusiasm
enthusiast
enthusiast


In [22]:
# Custom example 2: Porter-stem several forms of "help", plus "hope".
example_words = ['help', 'helped', 'helps', 'helpers', 'hope']
for word in example_words:
    stemmed = pst.stem(word)
    print(stemmed)

help
help
help
helper
hope


## 詞形還原 ( Lemmatization )

In [23]:
# Initialise the WordNet lemmatizer used by the cells below.
wtlem = WordNetLemmatizer()

In [24]:
# Lemmatize without passing a part of speech; in the recorded output
# both words come back unchanged.
for word in ('ate', 'eating'):
    print(wtlem.lemmatize(word))

ate
eating


In [25]:
# Lemmatize with the part of speech given as 'v' (verb); in the recorded
# output both words reduce to "eat".
for word in ('ate', 'eating'):
    print(wtlem.lemmatize(word, 'v'))

eat
eat
