In [1]:
from nltk.stem import WordNetLemmatizer 
from nltk.stem.porter import PorterStemmer
import nltk
nltk.download(['wordnet','punkt','averaged_perceptron_tagger'])
## 創建stemmer
ps=PorterStemmer()

## 創建Lemmatizer
lemmatizer = WordNetLemmatizer() 

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### example

In [2]:
print('Stemming amusing : {}'.format(ps.stem('amusing')))
print('lemmatization amusing : {}'.format(lemmatizer.lemmatize('amusing',pos = 'v')))

Stemming amusing : amus
lemmatization amusing : amuse


### 運用tokenize技巧結合stemming提取每個單詞的詞幹

In [3]:
# Define the sentence to be lemmatized
sentence = "The striped bats are hanging on their feet for best"

# Tokenize: Split the sentence into words
word_list = nltk.word_tokenize(sentence)
print(word_list)
#> ['The', 'striped', 'bats', 'are', 'hanging', 'on', 'their', 'feet', 'for', 'best']

stemming_output = ' '.join([ps.stem(w) for w in word_list])
print(stemming_output)
#> the stripe bat are hang on their feet for best

['The', 'striped', 'bats', 'are', 'hanging', 'on', 'their', 'feet', 'for', 'best']
the stripe bat are hang on their feet for best


### 運用tokenize技巧結合lemmatize提取每個單詞的lemma

In [4]:
# Define the sentence to be lemmatized
sentence = "The striped bats are hanging on their feet for best"

# Tokenize: Split the sentence into words
word_list = nltk.word_tokenize(sentence)
print(word_list)
#> ['The', 'striped', 'bats', 'are', 'hanging', 'on', 'their', 'feet', 'for', 'best']

# Lemmatize list of words and join
lemmatized_output = ' '.join([lemmatizer.lemmatize(w) for w in word_list])
print(lemmatized_output)
#> The striped bat are hanging on their foot for best

['The', 'striped', 'bats', 'are', 'hanging', 'on', 'their', 'feet', 'for', 'best']
The striped bat are hanging on their foot for best


### 有時單詞的lemma會隨著詞性而有所改變

In [5]:
print('lemmatization amusing : {}'.format(lemmatizer.lemmatize('amusing',pos = 'v'))) ##動詞
print('lemmatization amusing : {}'.format(lemmatizer.lemmatize('amusing',pos = 'a'))) ##形容詞

lemmatization amusing : amuse
lemmatization amusing : amusing


### 運用pos_tag技巧結合lemmatize提取每個單詞的lemma

In [6]:
# Lemmatize with POS Tag
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """
    word_tokenize(s):Tokenize a string
    return:list(str)

    將pos_tag結果mapping到lemmatizer中pos的格式
    input:a list of string
    return:[(word , pos)]
    """
    tag = nltk.pos_tag([word])[0][1][0].upper()
    print(tag)
    tag_dict = {"J": wordnet.ADJ,
           "N": wordnet.NOUN,
           "V": wordnet.VERB,
           "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)


In [7]:
word = 'using'
print(lemmatizer.lemmatize(word, get_wordnet_pos(word)))

V
use


### Lemmatize 字串中每個單詞並加入 POS tag

In [8]:
sentence = "The striped bats are hanging on their feet for best"

word_list = nltk.word_tokenize(sentence)
lemma = [lemmatizer.lemmatize(w) for w in word_list]

print(nltk.pos_tag(lemma))
#> ['The', 'strip', 'bat', 'be', 'hang', 'on', 'their', 'foot', 'for', 'best']

[('The', 'DT'), ('striped', 'JJ'), ('bat', 'NN'), ('are', 'VBP'), ('hanging', 'VBG'), ('on', 'IN'), ('their', 'PRP$'), ('foot', 'NN'), ('for', 'IN'), ('best', 'JJS')]
