In [1]:
import nltk
import re
import numpy as np
from sklearn.utils import shuffle

from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup


wordnet_lemmatizer = WordNetLemmatizer()

# Stopword list, one word per line.
# from http://www.lextek.com/manuals/onix/stopwords1.html
# The file is read inside a `with` block so the handle is closed right away.
with open('stopwords.txt') as stopwords_file:
    stopwords = {line.rstrip() for line in stopwords_file}

# An alternative source of stopwords:
# from nltk.corpus import stopwords
# stopwords.words('english')


def _load_review_elements(path):
    """Parse the review file at `path` and return its <review_text> elements.

    The file is read as UTF-8 and parsed with the html5lib parser; the file
    handle is closed before the parsed result is returned.
    """
    with open(path, encoding='utf-8') as review_file:
        soup = BeautifulSoup(review_file.read(), features="html5lib")
    return soup.findAll('review_text')


# Load the positive and the negative reviews.
# data courtesy of http://www.cs.jhu.edu/~mdredze/datasets/sentiment/index2.html
positive_reviews = _load_review_elements('electronics/positive.review')
negative_reviews = _load_review_elements('electronics/negative.review')

In [2]:
# Display the first two parsed <review_text> elements as a sanity check.
positive_reviews[:2]

[<review_text>
 I purchased this unit due to frequent blackouts in my area and 2 power supplies going bad.  It will run my cable modem, router, PC, and LCD monitor for 5 minutes.  This is more than enough time to save work and shut down.   Equally important, I know that my electronics are receiving clean power.
 
 I feel that this investment is minor compared to the loss of valuable data or the failure of equipment due to a power spike or an irregular power supply.
 
 As always, Amazon had it to me in &lt;2 business days
 </review_text>, <review_text>
 I ordered 3 APC Back-UPS ES 500s on the recommendation of an employee of mine who used to work at APC. I've had them for about a month now without any problems. They've functioned properly through a few unexpected power interruptions. I'll gladly order more if the need arises.
 
 Pros:
  - Large plug spacing, good for power adapters
  - Simple design
  - Long cord
 
 Cons:
  - No line conditioning (usually an expensive option
 </review_t

In [3]:
# Custom tokenizer built on top of nltk

def get_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag`` and only the first
    letter of the resulting tag (upper-cased) is inspected:
    J -> adjective, N -> noun, V -> verb, R -> adverb.
    Any other initial letter falls back to ``wordnet.NOUN``.
    """
    tagged = nltk.pos_tag([word])
    initial = tagged[0][1][0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised initial both map to NOUN.
    return wordnet.NOUN

def my_tokenizer(s):
    """Convert a raw review string into a list of cleaned, lemmatized tokens.

    Steps, applied in this order:
      1. lower-case the text and replace every character outside a-z with a space;
      2. split into tokens with ``nltk.tokenize.word_tokenize``;
      3. drop tokens of three characters or fewer;
      4. lemmatize each remaining token using the POS from ``get_pos``;
      5. drop lemmas that appear in ``stopwords``.
    """
    letters_only = re.sub('[^a-z]', ' ', s.lower())
    kept = []
    for word in nltk.tokenize.word_tokenize(letters_only):
        if len(word) <= 3:
            continue  # short words are discarded before lemmatization
        lemma = wordnet_lemmatizer.lemmatize(word, get_pos(word))
        if lemma not in stopwords:
            kept.append(lemma)
    return kept

In [4]:
# Build the word-to-index map first; the word-frequency vectors are built from it later.
# The tokenized reviews are stored as well so tokenization never has to be repeated.
word_index_map = {}
current_index = 0
positive_tokenized = []
negative_tokenized = []
orig_reviews = []

# Positive reviews are processed before negative ones, so orig_reviews holds
# all positive texts followed by all negative texts.
for reviews, tokenized in ((positive_reviews, positive_tokenized),
                           (negative_reviews, negative_tokenized)):
    for review in reviews:
        text = review.text
        orig_reviews.append(text)
        tokens = my_tokenizer(text)
        tokenized.append(tokens)
        for token in tokens:
            if token in word_index_map:
                continue
            # First time this token is seen: give it the next free index.
            word_index_map[token] = current_index
            current_index += 1

print("len(word_index_map):", len(word_index_map))

len(word_index_map): 7438


In [5]:
# now let's create our input matrices
def tokens_to_vector(tokens, label, index_map=None):
    """Turn a token list into a normalized word-frequency vector plus label.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one review. Every token must be a key of the index map.
    label : number
        Class label, stored in the last element of the result.
    index_map : dict, optional
        Token -> column index mapping. When omitted, the module-level
        ``word_index_map`` is used.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``len(index_map) + 1``: the relative frequency of
        each token followed by ``label``. When ``tokens`` is empty the
        frequency part is all zeros (instead of NaN from a 0/0 division).

    Raises
    ------
    KeyError
        If a token is not present in the index map.
    """
    if index_map is None:
        index_map = word_index_map
    x = np.zeros(len(index_map) + 1)  # the last element is reserved for the label
    for t in tokens:
        x[index_map[t]] += 1
    total = x.sum()
    if total > 0:
        # Normalize counts to proportions; skipped for empty input so the
        # vector stays finite.
        x = x / total
    x[-1] = label
    return x

N = len(positive_tokenized) + len(negative_tokenized)
# (N x D+1) matrix - features and label are kept together so they can be shuffled as one.
data = np.zeros((N, len(word_index_map) + 1))

# Positive rows (label 1) come first, followed by negative rows (label 0).
labelled_tokens = [(tokens, 1) for tokens in positive_tokenized]
labelled_tokens += [(tokens, 0) for tokens in negative_tokenized]
for row, (tokens, label) in enumerate(labelled_tokens):
    data[row, :] = tokens_to_vector(tokens, label)


In [6]:
# Shuffle the rows, then create the train/test split.
# Change SEED to try a different split.
SEED = 42      # pinned so a fresh run reproduces the same split and accuracies
N_TEST = 100   # number of trailing rows held out for testing

# NOTE: this reassigns orig_reviews and data, so re-running only this cell
# shuffles the already-shuffled rows again.
orig_reviews, data = shuffle(orig_reviews, data, random_state=SEED)

X = data[:, :-1]   # every column except the last: features
Y = data[:, -1]    # last column: label

# The last N_TEST rows are the test set; everything before them is training data.
split = max(len(X) - N_TEST, 0)
Xtrain, Xtest = X[:split], X[split:]
Ytrain, Ytest = Y[:split], Y[split:]


In [7]:
# Baseline model: logistic regression on the word-frequency features.
# fit() returns the estimator itself, so `model` is the fitted classifier.
model = LogisticRegression().fit(Xtrain, Ytrain)

train_accuracy = model.score(Xtrain, Ytrain)
test_accuracy = model.score(Xtest, Ytest)
print("Train accuracy:", train_accuracy)
print("Test accuracy:", test_accuracy)


Train accuracy: 0.8047368421052632
Test accuracy: 0.82


In [8]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree on the same split.
# random_state is pinned because tree construction is randomized; without it
# the reported accuracies change from run to run.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(Xtrain, Ytrain)
print("Train accuracy:", dt.score(Xtrain, Ytrain))
print("Test accuracy:", dt.score(Xtest, Ytest))


Train accuracy: 1.0
Test accuracy: 0.72


In [9]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with default settings on the same split.
# random_state is pinned so reruns reproduce the reported accuracies.
ada = AdaBoostClassifier(random_state=42)

ada.fit(Xtrain, Ytrain)
print("Train accuracy:", ada.score(Xtrain, Ytrain))
print("Test accuracy:", ada.score(Xtest, Ytest))

Train accuracy: 0.8115789473684211
Test accuracy: 0.76


In [10]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 50 trees on the same split.
# random_state is pinned so the fitted forest, its accuracies and the feature
# importances read from it later are reproducible.
rf = RandomForestClassifier(n_estimators=50, random_state=42)

rf.fit(Xtrain, Ytrain)
print("Train accuracy:", rf.score(Xtrain, Ytrain))
print("Test accuracy:", rf.score(Xtest, Ytest))

Train accuracy: 1.0
Test accuracy: 0.84


In [11]:
# build index_to_word map

# Inverse of word_index_map: column index -> word.
index_to_word = {position: word for word, position in word_index_map.items()}

In [12]:
# Rank every feature by its random-forest importance, most important first.
# feature_importances_ is read once up front: on scikit-learn forests it is
# computed on access, so reading it inside the loop repeats that work for
# every vocabulary entry.
importances = rf.feature_importances_
sort_index = list(np.argsort(importances))
sort_index.reverse()

# One (column index, word, importance) tuple per feature, in ranked order.
sort_importance = [
    (index, index_to_word[index], importances[index])
    for index in sort_index
]

sort_importance[:20]

[(227, 'return', 0.020806125188699284),
 (116, 'price', 0.012334816641974507),
 (1040, 'bad', 0.009347897441354945),
 (471, 'try', 0.009154045840986247),
 (1, 'this', 0.008039384042679301),
 (182, 'highly', 0.007670795725157828),
 (103, 'perfect', 0.007623075764007412),
 (1604, 'waste', 0.007410307731764408),
 (67, 'easy', 0.007379002719783689),
 (230, 'excellent', 0.007067332880954998),
 (407, 'poor', 0.00670830051265114),
 (189, 'support', 0.006612368719582323),
 (4622, 'refund', 0.0063713277329665394),
 (461, 'then', 0.0063348895024097),
 (81, 'quality', 0.006286659246516459),
 (120, 'money', 0.006045861741864559),
 (101, 'item', 0.0052201033945255305),
 (449, 'disappointed', 0.005042908510257112),
 (109, 'fast', 0.004961898594208305),
 (207, 'stop', 0.004791310176796104)]

In [13]:
# 列出每個字的正負 weight
# 用不同的 threshold values!
from future.utils import iteritems

# Collect the column indices of words whose logistic-regression weight has a
# magnitude above the threshold, printing each such word with its weight.
indexs = []
threshold = 0.5
coefficients = model.coef_[0]
for word, index in word_index_map.items():
    weight = coefficients[index]
    if abs(weight) > threshold:
        print(word, weight)
        indexs.append(index)

unit -0.7538078927145454
cable 0.767100494683746
time -0.726223384728383
month -0.8138183269413682
space 0.6225515661287355
sound 0.9415876965017499
easy 2.237605658770963
quality 1.4511810807116838
company -0.6567263804005102
item -1.0927078510174726
perfect 1.2293113909512634
fast 1.0859964956223818
price 2.9279483326231057
value 0.5822351320830279
money -1.1176839401008392
memory 1.0387507759065526
picture 0.6537187302855398
happy 0.6073924511806442
travel 0.5609180069083064
pretty 0.7990758582758279
pleased 0.5119059026005125
highly 1.2408898316921535
recommend 0.8304659260252012
customer -0.7753894694335149
support -1.042725745634144
little 1.1340326920537787
stop -0.8081967049624343
amaze 0.5083003367541621
worth 0.5373256673569853
sent -0.5472605966183283
return -2.8377019893601494
excellent 1.4888815795249368
extra 0.502023276829129
love 1.2204364589752805
video 0.5989701405674107
feature 0.5522186957764698
software -0.5579373573833781
home 0.6335105114296778
piece -0.661765961

In [14]:
# Display the first ten selected column indices.
indexs[:10]

[2, 7, 12, 36, 45, 64, 67, 81, 82, 101]

In [15]:
# Keep only the columns of the words whose weight exceeded the threshold.

Xtrain_new = Xtrain[:, indexs]
Xtest_new = Xtest[:, indexs]

# Same forest size as before, trained on the reduced feature set.
# random_state is pinned so reruns reproduce the reported accuracies.
rf_new = RandomForestClassifier(n_estimators=50, random_state=42)

rf_new.fit(Xtrain_new, Ytrain)
print("Train accuracy:", rf_new.score(Xtrain_new, Ytrain))
print("Test accuracy:", rf_new.score(Xtest_new, Ytest))

Train accuracy: 0.9752631578947368
Test accuracy: 0.8
