In [2]:
# 匯入套件
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer



In [3]:
# 下載分詞器
nltk.download('punkt')

# 下載停用詞
nltk.download('stopwords')

# 下載已經定義字詞-語義關係來尋找上下位關係的英文詞典，也包含了同義詞、時態、名詞單複數等資訊
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\xdxd2\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\xdxd2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\xdxd2\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# 輸入文本
text = '''Hey Jude, don't make it bad. Take a sad song and make it better. Remember to let her into your heart. Then you can start to make it better. Hey Jude, don't be afraid. You were made to go out and get her. The minute you let her under your skin. Then you begin to make it better.'''

In [6]:
# 文本清洗，去除標點符號與數字等非字母的字符
text = ''.join(c for c in text if c.isalpha() or c.isspace()); text

'Hey Jude dont make it bad Take a sad song and make it better Remember to let her into your heart Then you can start to make it better Hey Jude dont be afraid You were made to go out and get her The minute you let her under your skin Then you begin to make it better'

In [10]:
# 分詞
tokens = word_tokenize(text); tokens


['Hey',
 'Jude',
 'dont',
 'make',
 'it',
 'bad',
 'Take',
 'a',
 'sad',
 'song',
 'and',
 'make',
 'it',
 'better',
 'Remember',
 'to',
 'let',
 'her',
 'into',
 'your',
 'heart',
 'Then',
 'you',
 'can',
 'start',
 'to',
 'make',
 'it',
 'better',
 'Hey',
 'Jude',
 'dont',
 'be',
 'afraid',
 'You',
 'were',
 'made',
 'to',
 'go',
 'out',
 'and',
 'get',
 'her',
 'The',
 'minute',
 'you',
 'let',
 'her',
 'under',
 'your',
 'skin',
 'Then',
 'you',
 'begin',
 'to',
 'make',
 'it',
 'better']

In [9]:
# 轉換為小寫
tokens = [word.lower() for word in tokens]; tokens

['hey',
 'jude',
 'dont',
 'make',
 'it',
 'bad',
 'take',
 'a',
 'sad',
 'song',
 'and',
 'make',
 'it',
 'better',
 'remember',
 'to',
 'let',
 'her',
 'into',
 'your',
 'heart',
 'then',
 'you',
 'can',
 'start',
 'to',
 'make',
 'it',
 'better',
 'hey',
 'jude',
 'dont',
 'be',
 'afraid',
 'you',
 'were',
 'made',
 'to',
 'go',
 'out',
 'and',
 'get',
 'her',
 'the',
 'minute',
 'you',
 'let',
 'her',
 'under',
 'your',
 'skin',
 'then',
 'you',
 'begin',
 'to',
 'make',
 'it',
 'better']

---

In [15]:
# 去除停用詞
stop_words = set(stopwords.words('english'))
tokens = [word for word in tokens if word not in stop_words]; tokens

['Hey',
 'Jude',
 'dont',
 'make',
 'bad',
 'Take',
 'sad',
 'song',
 'make',
 'better',
 'Remember',
 'let',
 'heart',
 'Then',
 'start',
 'make',
 'better',
 'Hey',
 'Jude',
 'dont',
 'afraid',
 'You',
 'made',
 'go',
 'get',
 'The',
 'minute',
 'let',
 'skin',
 'Then',
 'begin',
 'make',
 'better']

In [16]:
tokens = [ word for word in tokens if word not in stop_words]; tokens


['Hey',
 'Jude',
 'dont',
 'make',
 'bad',
 'Take',
 'sad',
 'song',
 'make',
 'better',
 'Remember',
 'let',
 'heart',
 'Then',
 'start',
 'make',
 'better',
 'Hey',
 'Jude',
 'dont',
 'afraid',
 'You',
 'made',
 'go',
 'get',
 'The',
 'minute',
 'let',
 'skin',
 'Then',
 'begin',
 'make',
 'better']

---

In [19]:
# 詞幹提取
ps = PorterStemmer()
tokens_stem = [ps.stem(word) for word in tokens]; tokens_stem



['hey',
 'jude',
 'dont',
 'make',
 'bad',
 'take',
 'sad',
 'song',
 'make',
 'better',
 'rememb',
 'let',
 'heart',
 'then',
 'start',
 'make',
 'better',
 'hey',
 'jude',
 'dont',
 'afraid',
 'you',
 'made',
 'go',
 'get',
 'the',
 'minut',
 'let',
 'skin',
 'then',
 'begin',
 'make',
 'better']

In [20]:
# 詞形還原
lemmatizer = WordNetLemmatizer()
tokens_lemm = [lemmatizer.lemmatize(word) for word in tokens]; tokens_lemm

['Hey',
 'Jude',
 'dont',
 'make',
 'bad',
 'Take',
 'sad',
 'song',
 'make',
 'better',
 'Remember',
 'let',
 'heart',
 'Then',
 'start',
 'make',
 'better',
 'Hey',
 'Jude',
 'dont',
 'afraid',
 'You',
 'made',
 'go',
 'get',
 'The',
 'minute',
 'let',
 'skin',
 'Then',
 'begin',
 'make',
 'better']