In [3]:
from nltk.stem import WordNetLemmatizer 
from nltk.stem.porter import PorterStemmer
import nltk
# NLTK data packages are listed at https://www.nltk.org/nltk_data/
# Uncomment the next line on first use to download the 'wordnet' corpus:
#nltk.download('wordnet')
## Create the Porter stemmer (shared by the cells below as `ps`)
ps = PorterStemmer()

## Create the WordNet lemmatizer (shared by the cells below as `lemmatizer`)
lemmatizer = WordNetLemmatizer() 

### example

In [4]:
# Stemming: the Porter stemmer strips the suffix, giving 'amus'.
stemmed = ps.stem('amusing')
print('Porter Stemmer on amusing: {}'.format(stemmed))

# Lemmatization: the result depends on the part of speech that is passed in.
lemma_as_verb = lemmatizer.lemmatize('amusing', pos='v')
print('Lemmatization on amusing with pos=v: {}'.format(lemma_as_verb))
lemma_as_noun = lemmatizer.lemmatize('amusing', pos='n')
print('Lemmatization on amusing with pos=n: {}'.format(lemma_as_noun))

Porter Stemmer on amusing: amus
Lemmatization on amusing with pos=v: amuse
Lemmatization on amusing with pos=n: amusing


### 運用tokenize技巧結合stemming提取每個單詞的詞幹

In [5]:
# Define the sentence to be stemmed
sentence = "The striped bats are hanging on their feet for best"

# Tokenize: Split the sentence into words
word_list = nltk.word_tokenize(sentence)
print(word_list)
#> ['The', 'striped', 'bats', 'are', 'hanging', 'on', 'their', 'feet', 'for', 'best']

# Stem every token with the Porter stemmer and join the stems back into one string.
# Note in the result below that 'The' comes back lowercased and the irregular
# plural 'feet' is left untouched.
stemming_output = ' '.join([ps.stem(w) for w in word_list])
print(stemming_output)
#> the stripe bat are hang on their feet for best

['The', 'striped', 'bats', 'are', 'hanging', 'on', 'their', 'feet', 'for', 'best']
the stripe bat are hang on their feet for best


### 運用tokenize技巧結合lemmatize提取每個單詞的lemma

In [6]:
# Sentence whose tokens will be lemmatized
sentence = "The striped bats are hanging on their feet for best"

# Split the sentence into word tokens
word_list = nltk.word_tokenize(sentence)
print(word_list)
#> ['The', 'striped', 'bats', 'are', 'hanging', 'on', 'their', 'feet', 'for', 'best']

# Lemmatize the whole token list twice: first treating every token as a noun,
# then treating every token as a verb. After the loop `lemmatized_output`
# holds the verb pass.
for pos in ("n", "v"):
    lemmatized_output = ' '.join(lemmatizer.lemmatize(w, pos=pos) for w in word_list)
    print(lemmatized_output)
#> The striped bat are hanging on their foot for best    (pos="n")
#> The strip bat be hang on their feet for best          (pos="v")

['The', 'striped', 'bats', 'are', 'hanging', 'on', 'their', 'feet', 'for', 'best']
The striped bat are hanging on their foot for best
The strip bat be hang on their feet for best


### 有時單詞的lemma會隨著詞性而有所改變

In [7]:
# Lemmatize the same word as a verb, an adjective and a noun (in that order)
# to show that the lemma changes with the part of speech.
for pos in ('v', 'a', 'n'):
    print('Lemmatization amusing: {}'.format(lemmatizer.lemmatize('amusing', pos=pos)))

Lemmatization amusing: amuse
Lemmatization amusing: amusing
Lemmatization amusing: amusing


### 運用pos_tag技巧結合lemmatize提取每個單詞的lemma

In [8]:
# Lemmatize with POS Tag
from nltk.corpus import wordnet

def get_wordnet_pos(word, verbose=True):
    """Map the POS tag of ``word`` to the POS format the lemmatizer expects.

    The word is tagged on its own with ``nltk.pos_tag`` and the first letter
    of the resulting tag selects the matching WordNet constant.

    Args:
        word: A single token to tag.
        verbose: When True (the default), print the raw ``(word, tag)`` list
            returned by the tagger so it shows up in the cell output. Pass
            False to tag silently.

    Returns:
        ``wordnet.ADJ``, ``wordnet.NOUN``, ``wordnet.VERB`` or ``wordnet.ADV``
        for tags starting with J, N, V or R respectively; ``wordnet.NOUN``
        for every other tag.
    """
    # Tag once and reuse the result for both the trace and the lookup
    # instead of running the tagger twice on the same word.
    tagged = nltk.pos_tag([word])
    if verbose:
        print(tagged)

    # Only the first letter of the tag is needed (e.g. 'VBG' -> 'V').
    tag = tagged[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    # Anything that is not an adjective/noun/verb/adverb is treated as a noun.
    return tag_dict.get(tag, wordnet.NOUN)


In [9]:
word = 'using'
# Look up the WordNet POS first, then lemmatize the word with it.
wordnet_pos = get_wordnet_pos(word)
print(lemmatizer.lemmatize(word, wordnet_pos))

[('using', 'VBG')]
use


### Lemmatize 字串中每個單詞並加入 POS tag

In [10]:
sentence = "The striped bats are hanging on their feet for best"

# Tokenize, then lemmatize each token using the POS derived for that token.
tokens = nltk.word_tokenize(sentence)
lemmas = [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]
print(lemmas)
#> ['The', 'strip', 'bat', 'be', 'hang', 'on', 'their', 'foot', 'for', 'best']

[('The', 'DT')]
[('striped', 'VBD')]
[('bats', 'NNS')]
[('are', 'VBP')]
[('hanging', 'VBG')]
[('on', 'IN')]
[('their', 'PRP$')]
[('feet', 'NNS')]
[('for', 'IN')]
[('best', 'JJS')]
['The', 'strip', 'bat', 'be', 'hang', 'on', 'their', 'foot', 'for', 'best']
