In [3]:
import pandas as pd

import nltk

from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
from nltk.stem.lancaster import LancasterStemmer
lancaster_stemmer = LancasterStemmer()
from nltk.stem import SnowballStemmer
snowball_stemmer = SnowballStemmer('english')
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

#nltk.download('topwords')

from nltk.corpus import stopwords
stops = stopwords.words('english')
from string import punctuation

In [5]:
#tokenize(斷詞) 這個動詞的意思就是，把一個句子拆成一個個的單字。以下示範nltk中的兩種tokenize的方式。

testStr = "This value is also called cut-off in the literature. If float, the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None."

#nltk.download('punkt')

tokens = nltk.word_tokenize(testStr)
print(tokens)
tokens = nltk.wordpunct_tokenize(testStr) ## 請注意，差異在cut-off
print(tokens)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\02445\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
['This', 'value', 'is', 'also', 'called', 'cut-off', 'in', 'the', 'literature', '.', 'If', 'float', ',', 'the', 'parameter', 'represents', 'a', 'proportion', 'of', 'documents', ',', 'integer', 'absolute', 'counts', '.', 'This', 'parameter', 'is', 'ignored', 'if', 'vocabulary', 'is', 'not', 'None', '.']
['This', 'value', 'is', 'also', 'called', 'cut', '-', 'off', 'in', 'the', 'literature', '.', 'If', 'float', ',', 'the', 'parameter', 'represents', 'a', 'proportion', 'of', 'documents', ',', 'integer', 'absolute', 'counts', '.', 'This', 'parameter', 'is', 'ignored', 'if', 'vocabulary', 'is', 'not', 'None', '.']


In [7]:
#stemming and lemmatize stemming和lemmatize是一個把所有不同時態或是不同變化相同的字變成同一個字。
#而stemming比較像是去掉ed或是s這種添加在字後面的小字母，lemmatize則是字根化，就是把字還原到字根的型態。以下讓我們來看一下示範。

#nltk.download('wordnet')

df = pd.DataFrame(index = tokens)
df['porter_stemmer'] = [porter_stemmer.stem(t) for t in tokens]
df['lancaster_stemmer'] = [lancaster_stemmer.stem(t) for t in tokens]
df['snowball_stemmer'] = [snowball_stemmer.stem(t) for t in tokens]
df['wordnet_lemmatizer'] = [wordnet_lemmatizer.lemmatize(t) for t in tokens]
df

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\02445\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


Unnamed: 0,porter_stemmer,lancaster_stemmer,snowball_stemmer,wordnet_lemmatizer
This,thi,thi,this,This
value,valu,valu,valu,value
is,is,is,is,is
also,also,also,also,also
called,call,cal,call,called
cut,cut,cut,cut,cut
-,-,-,-,-
off,off,off,off,off
in,in,in,in,in
the,the,the,the,the


In [8]:
#前處理

#不過在前處理上，我們除了會使用tokenize配上stemming或是lemmatize之外，還會把英文字轉乘小寫，看句子的長度決定要不要把停用字跟標點符號拿掉。

#停用字: 這邊先使用nltk內建的停用字

df = pd.DataFrame(index = [t for t in tokens if t not in stops])
df['porter_stemmer'] = [porter_stemmer.stem(t.lower()) for t in tokens if t not in stops]
df['lancaster_stemmer'] = [lancaster_stemmer.stem(t.lower()) for t in tokens if t not in stops]
df['snowball_stemmer'] = [snowball_stemmer.stem(t.lower()) for t in tokens if t not in stops]
df['wordnet_lemmatizer'] = [wordnet_lemmatizer.lemmatize(t.lower()) for t in tokens if t not in stops]
df

Unnamed: 0,porter_stemmer,lancaster_stemmer,snowball_stemmer,wordnet_lemmatizer
This,thi,thi,this,this
value,valu,valu,valu,value
also,also,also,also,also
called,call,cal,call,called
cut,cut,cut,cut,cut
-,-,-,-,-
literature,literatur,lit,literatur,literature
.,.,.,.,.
If,if,if,if,if
float,float,flo,float,float


In [11]:
#Tag 目前套件已經很方便，讓大家可以在每一個詞上面標註詞性，不過大家要注意，
#為了讓詞性標註更準確，建議在標註詞性時，不要使用stemming、lemmatize、lower或是去除停用字或標點符號。

#nltk.download('averaged_perceptron_tagger')
#nltk.download('universal_tagset')

df_tag = pd.DataFrame(index = tokens)
df_tag['default'] = nltk.pos_tag(tokens)
df_tag['universal'] = nltk.pos_tag(tokens, tagset='universal')
df_tag

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\02445\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\02445\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\universal_tagset.zip.


Unnamed: 0,default,universal
This,"(This, DT)","(This, DET)"
value,"(value, NN)","(value, NOUN)"
is,"(is, VBZ)","(is, VERB)"
also,"(also, RB)","(also, ADV)"
called,"(called, VBN)","(called, VERB)"
cut,"(cut, VBN)","(cut, VERB)"
-,"(-, :)","(-, .)"
off,"(off, RB)","(off, ADV)"
in,"(in, IN)","(in, ADP)"
the,"(the, DT)","(the, DET)"
