In [7]:
import pandas as pd
import numpy as np 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords       #語料庫
from nltk import WordNetLemmatizer      #詞型還原，去掉贅詞
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import wordnet as wn   #針對英語的詞彙資料庫，將詞彙組織成詞集（一組同義詞）並提供它們之間的詞彙關係
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection,naive_bayes,svm
from sklearn.metrics import accuracy_score




如果腳本保持一致，則每次運行都會重現相同的結果，否則每次運行都會產生不同的結果。種子可以設置為任意數字。

# STEP -2: Set random seed

In [8]:
np.random.seed(500)

# STEP -3: Add the Corpus

In [9]:
df = pd.read_csv(r"C:\Users\yifun\Desktop\python\corpus.csv",encoding='latin-1')
df

Unnamed: 0,text,label
0,Stuning even for the non-gamer: This sound tr...,__label__2
1,The best soundtrack ever to anything.: I'm re...,__label__2
2,Amazing!: This soundtrack is my favorite musi...,__label__2
3,Excellent Soundtrack: I truly like this sound...,__label__2
4,"Remember, Pull Your Jaw Off The Floor After H...",__label__2
...,...,...
9995,A revelation of life in small town America in...,__label__2
9996,Great biography of a very interesting journal...,__label__2
9997,Interesting Subject; Poor Presentation: You'd...,__label__1
9998,Don't buy: The box looked used and it is obvi...,__label__1


# STEP -4: Data pre-processing

In [10]:
df['text'].dropna(inplace=True)     #移除空的列
df['text'] = [i.lower() for i in df['text']]   #將每列的字都變成小寫
df['text'] = [word_tokenize(i) for i in df['text']]   #Tokenization: 將每個字做分詞化(各自變成一個單字)
df  

Unnamed: 0,text,label
0,"[stuning, even, for, the, non-gamer, :, this, ...",__label__2
1,"[the, best, soundtrack, ever, to, anything, .,...",__label__2
2,"[amazing, !, :, this, soundtrack, is, my, favo...",__label__2
3,"[excellent, soundtrack, :, i, truly, like, thi...",__label__2
4,"[remember, ,, pull, your, jaw, off, the, floor...",__label__2
...,...,...
9995,"[a, revelation, of, life, in, small, town, ame...",__label__2
9996,"[great, biography, of, a, very, interesting, j...",__label__2
9997,"[interesting, subject, ;, poor, presentation, ...",__label__1
9998,"[do, n't, buy, :, the, box, looked, used, and,...",__label__1


In [11]:
#wordnet 使用範例
'''
from collections import defaultdict
from nltk.corpus import wordnet as wn

tag_map = defaultdict(lambda: wn.NOUN)    #建立一個字典，目的是將詞性標記（Part-of-Speech tags）映射到 WordNet 中對應的詞性類別。
                                          #預設情況下，tag_map 的預設值被設置為 wn.NOUN，表示詞性標記沒有被明確指定時，預設為名詞（Noun）。
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

word = 'run'
tag = 'V'  # Verb tag

synsets = wn.synsets(word, tag_map[tag])   #把這個字加進去並標註詞性
for synset in synsets:
    print(synset.name(), synset.definition())

>>run.v.01 move fast by using one's feet, with one foot off the ground at any given time
scat.v.01 flee; take to one's heels; cut and run
run.v.03 stretch out over a distance, space, time, or scope; run or extend between two points or beyond a certain point
operate.v.01 direct or control; projects, businesses, etc.
run.v.05 have a particular form
run.v.06 move along, of liquids
function.v.01 perform as expected when applied
range.v.01 change or be different within limits
campaign.v.01 run, stand, or compete for an office or a position
play.v.18 cause to emit recorded audio or video
run.v.11 move about freely and without restraint, or act as if running around in an uncontrolled way
tend.v.01 have a tendency or disposition to do or be something; be inclined
run.v.13 be operating, running or functioning
run.v.14 change from one state to another
run.v.15 cause to perform
run.v.16 be affected by; be subjected to
prevail.v.03 continue to exist
run.v.18 occur persistently
run.v.19 carry out a process or program, as on a computer or a machine
carry.v.15 include as the content; broadcast or publicize
run.v.21 carry out
.
.
.
'''

"\nfrom collections import defaultdict\nfrom nltk.corpus import wordnet as wn\n\ntag_map = defaultdict(lambda: wn.NOUN)    #建立一個字典，目的是將詞性標記（Part-of-Speech tags）映射到 WordNet 中對應的詞性類別。\n                                          #預設情況下，tag_map 的預設值被設置為 wn.NOUN，表示詞性標記沒有被明確指定時，預設為名詞（Noun）。\ntag_map['J'] = wn.ADJ\ntag_map['V'] = wn.VERB\ntag_map['R'] = wn.ADV\n\nword = 'run'\ntag = 'V'  # Verb tag\n\nsynsets = wn.synsets(word, tag_map[tag])   #把這個字加進去並標註詞性\nfor synset in synsets:\n    print(synset.name(), synset.definition())\n\n>>run.v.01 move fast by using one's feet, with one foot off the ground at any given time\nscat.v.01 flee; take to one's heels; cut and run\nrun.v.03 stretch out over a distance, space, time, or scope; run or extend between two points or beyond a certain point\noperate.v.01 direct or control; projects, businesses, etc.\nrun.v.05 have a particular form\nrun.v.06 move along, of liquids\nfunction.v.01 perform as expected when applied\nrange.v.01 change or be different wit

In [12]:
from nltk import pos_tag   #詞性標籤(part-of-speech tagging)，對單詞的詞性進行標記，標記後的結果是二元數組格式
from collections import defaultdict
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

for index,entry in enumerate(df['text']):   # 於將一個可遍歷的數據對象(如列表、元組或字符串)組合為一個索引序列
    Final_words = []
    word_Lemmatized = WordNetLemmatizer()    #建立WordNetLemmatizer物件，對詞彙進行詞幹還原
    for word,tag in pos_tag(entry):          #循環遍歷每個單詞及其對應的詞性標籤，從pos_tag(entry)獲取
        if word not in stopwords.words('english') and word.isalpha():   #檢查該單詞是否不在常用英文停用詞列表中（停用詞是NLP常過濾掉的常用詞），且是否僅包含字母
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]]) #若條件滿足，使用WordNetLemmatizer物件和tag_map(字典)中對應的詞性標籤對單詞進行詞幹還原
                                                                         #(只有單字詞性標籤是ADJ、VERB、ADV 才會使用相應的詞性標籤進行詞幹還原)
                                                                         #tag[0]的目的是獲取詞性標籤的第一個字母，以便將其用於tag_map字典中查找相應的WordNet詞性標籤
            Final_words.append(word_Final)
    df.loc[index,'text_final'] = str(Final_words)    #將Final_words 轉成str 並存於dataframe中

In [13]:
df

Unnamed: 0,text,label,text_final
0,"[stuning, even, for, the, non-gamer, :, this, ...",__label__2,"['stun', 'even', 'sound', 'track', 'beautiful'..."
1,"[the, best, soundtrack, ever, to, anything, .,...",__label__2,"['best', 'soundtrack', 'ever', 'anything', 're..."
2,"[amazing, !, :, this, soundtrack, is, my, favo...",__label__2,"['amaze', 'soundtrack', 'favorite', 'music', '..."
3,"[excellent, soundtrack, :, i, truly, like, thi...",__label__2,"['excellent', 'soundtrack', 'truly', 'like', '..."
4,"[remember, ,, pull, your, jaw, off, the, floor...",__label__2,"['remember', 'pull', 'jaw', 'floor', 'hear', '..."
...,...,...,...
9995,"[a, revelation, of, life, in, small, town, ame...",__label__2,"['revelation', 'life', 'small', 'town', 'ameri..."
9996,"[great, biography, of, a, very, interesting, j...",__label__2,"['great', 'biography', 'interesting', 'journal..."
9997,"[interesting, subject, ;, poor, presentation, ...",__label__1,"['interest', 'subject', 'poor', 'presentation'..."
9998,"[do, n't, buy, :, the, box, looked, used, and,...",__label__1,"['buy', 'box', 'look', 'use', 'obviously', 'ne..."


# STEP -5: Prepare Train and Test Data sets

In [14]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(df['text_final'],df['label'],test_size=0.3)  
                                                                      #特徵數據        #目標(標籤數據)

# STEP -6: Encoding

In [15]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)   #對目標變量進行編碼，將字串轉成數值
Test_Y = Encoder.fit_transform(Test_Y)

# STEP -7: Word Vectorization

In [16]:
Tfidf_vect = TfidfVectorizer(max_features=5000)    #Term Frequency - Inverse Document Frequency (詞頻/逆向文件頻率)  反映出文字對於文件的重要性，建立一個只選擇最常見的 5000 個單詞作為特徵的模型
Tfidf_vect.fit(df['text_final'])                #丟入資料進入模型中，將文本轉成向量
Train_X_Tfidf = Tfidf_vect.transform(Train_X)  #將 Train_X 中的每個文本句子轉換為 TF-IDF 特徵向量。每個特徵向量都表示了該句子中每個單詞的 TF-IDF 權重
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

#將文本數據轉換為數值特徵表示，以便於機器學習模型進行訓練。轉換後的特徵向量可以作為輸入餵給機器學習模型進行訓練，
# 從而將文本數據與標籤關聯起來，並學習出預測模型。

In [17]:
print(Tfidf_vect.vocabulary_)   #查看此模型中從語料庫中，學習到的詞彙



In [18]:
print(Train_X_Tfidf)   #向量化後的數據
'''
舉例: (0,4502)  0.37634188677099956

0: 代表是Train_X_Tfidf 的第一列資料
4502: 表示第一列中每個單字的唯一整數編號
0.37 :表示通過 TF-IDF Vectorizer 計算的分數
'''

  (0, 4502)	0.37634188677099956
  (0, 4501)	0.1502086671688917
  (0, 3983)	0.35870975205557054
  (0, 3897)	0.25152943577361386
  (0, 3864)	0.2690840463105974
  (0, 3750)	0.3469774999759746
  (0, 3665)	0.28971770688512954
  (0, 3571)	0.29440491517773787
  (0, 2931)	0.22969709983777647
  (0, 1944)	0.13398240399394393
  (0, 1528)	0.17762585383071805
  (0, 514)	0.3210759641783664
  (0, 485)	0.1230432680090133
  (0, 235)	0.24487094004433968
  (1, 4694)	0.36974013511943044
  (1, 4073)	0.6167222431544791
  (1, 3441)	0.367922932130556
  (1, 2588)	0.3755181501193181
  (1, 1250)	0.3587203442870721
  (1, 593)	0.27907786873623097
  (2, 4741)	0.18369761701331289
  (2, 4627)	0.1499525624356807
  (2, 4463)	0.10285168719742008
  (2, 4198)	0.11682860303877614
  (2, 3856)	0.23042292688427712
  :	:
  (6998, 2566)	0.12648150924495427
  (6998, 2515)	0.11841716778929348
  (6998, 2124)	0.14040672300699006
  (6998, 1977)	0.0732676023027426
  (6998, 1791)	0.22643036986644238
  (6998, 1755)	0.2050525986673158
 

# STEP -8: Use the ML Algorithms to Predict the outcome

X :  Train、Test X將句子轉換為TF-IDF 特徵向量

Y :  Train、Test y將label轉為數值去進行編碼

In [19]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)    #將訓練集資料餵入模型訓練
# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)  #將訓練好的模型拿來預測測試集的X
# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)  #將預測值與實際標籤進行比對，並計算出預測準確的樣本佔總樣本數的比例

Naive Bayes Accuracy Score ->  83.16666666666667


In [20]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')  # c: 懲罰參數，Penalty Parameter，參數控制著分類器的正規化（regularization）程度
SVM.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score ->  84.8
