<font size=6> Word2Vec、Tf-Idf </font>

<font size=3> 
 利用
 <font size=3 color=orange> Word2Vec、Tf-Idf </font>
 萃取評論向量後，輸入
 <font size=3 color=orange> 隨機森林 </font>
 以預測餐廳評分 (0, 1) </font> <br/>
<font size=3> 評分為 1 ~ 5，將 4 以上的評分轉為 1，其餘轉為0 </font>

In [None]:
# data preprocess
import re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Only the star rating and the review text are needed from the CSV.
my_col = ['stars', 'text']
data = pd.read_csv('yelp.csv', usecols=my_col)
# Binarize the rating: anything below 4 stars becomes 0, everything else 1.
data['stars'] = np.where(data['stars'] < 4, 0, 1)
# Lower-case the reviews, then replace '?', '.', '!', '|' and newlines with a
# space in one vectorized pass. '|' stays in the character class on purpose:
# the previous pattern '[?|.|!|\n]' also stripped it, because inside [...] a
# pipe is a literal character, not alternation.
data['text'] = data['text'].str.lower().str.replace(r'[?.!|\n]', ' ', regex=True)
data.head()

In [None]:
# Split the data into a training set and a test set
def k_fold(k, data, sub_size=2500):
    """Return the train/test split for fold ``k`` of a contiguous partition.

    Rows ``[k * sub_size, (k + 1) * sub_size)`` are held out as the test fold;
    all remaining rows are concatenated into the training set, which is
    re-indexed from 0.

    Args:
        k: Zero-based index of the fold to hold out.
        data: DataFrame with a 'text' column and a 'stars' column.
        sub_size: Number of rows per fold. Defaults to 2500, the value that
            used to be hard-coded.

    Returns:
        A tuple ``(train_X, train_Y, test_X, test_Y)`` of pandas Series.
        The test Series keep the index labels they had in ``data``.

    Raises:
        ValueError: If ``k`` is negative or ``sub_size`` is not positive.
    """
    if k < 0:
        raise ValueError("k must be a non-negative fold index")
    if sub_size <= 0:
        raise ValueError("sub_size must be a positive number of rows")

    # start/end delimit the rows that are held out for testing.
    start = k * sub_size
    end = start + sub_size
    test = data.iloc[start:end]
    train = pd.concat(
        [data.iloc[:start], data.iloc[end:]], axis=0, ignore_index=True
    )

    # Selecting a single column yields a Series for every returned value.
    return train['text'], train['stars'], test['text'], test['stars']

# Convert comments into averaged word vectors
def trans_word_vec(X):
    """Embed each comment as the mean of its in-vocabulary Word2Vec vectors.

    The model is loaded from "w2v.model" on every call.

    Args:
        X: Iterable of comments. Each comment is iterated directly and every
            element is looked up in the model vocabulary, so a list of tokens
            is looked up token by token, whereas a plain string is looked up
            character by character.

    Returns:
        A float ndarray with one row per comment and one column per embedding
        dimension. A comment without any in-vocabulary element becomes an
        all-zero row; an empty ``X`` yields an array with zero rows.
    """
    model = Word2Vec.load("w2v.model")
    wv = model.wv
    # Take the dimensionality from the model instead of assuming 400.
    dim = wv.vector_size

    rows = []
    for comment in X:
        # Elements missing from the vocabulary are skipped.
        vectors = [wv[word] for word in comment if word in wv]
        if vectors:
            rows.append(np.mean(vectors, axis=0, dtype=float))
        else:
            rows.append(np.zeros(dim, dtype=float))

    if not rows:
        return np.zeros((0, dim), dtype=float)
    # Stack once at the end; stacking inside the loop re-copies every
    # previous row on each iteration.
    return np.vstack(rows)

<font size=5> Tf-Idf </font>

<font size=3> 
 隨機森林主要可調整的參數為 max_features、n_estimators、min_samples_leaf <br/>
 max_features : 每次尋找最佳分裂時考慮的最大特徵數量。可設為 None (使用全部特徵)、"sqrt"(特徵數開根號)、或比例 (如 0.2 表示使用 20%) <br/>
 n_estimators : 子樹數量。在電腦可承受的範圍，越多越好 <br/>
 min_samples_leaf : leaf (決策樹的末端節點) 至少需包含的樣本數。太低會 overfit <br/>
 超參數之實驗結果在最後
</font>

In [None]:
# Tf -Idf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# Original author's note: the corpus has 28,880 distinct terms; keep only the
# 15,000 most frequent ones as features.
tv = TfidfVectorizer(stop_words='english', max_features=15000)
avg_acc = 0
# k_fold is called without a fold size, so every fold is k_fold's built-in
# 2500 rows; 4 folds therefore cover 10,000 rows.
k = 4
for i in range(k):
    # Split first, then fit Tf-Idf on the training fold only so that no
    # vocabulary or IDF statistics leak from the test fold.
    train_X, train_Y, test_X, test_Y = k_fold(i, data)
    train_X = tv.fit_transform(train_X)
    test_X = tv.transform(test_X)
    # Predict with a random forest.
    forest = RandomForestClassifier(max_features='sqrt', random_state=36)
    forest.fit(train_X, train_Y)
    pred_Y = forest.predict(test_X)
    # Derive the accuracy from the predictions already made instead of
    # calling forest.score, which would run inference a second time.
    acc = accuracy_score(test_Y, pred_Y)
    avg_acc += acc
    print( '{} th, accuracy is {}'.format(i+1, acc) )

print('Average accuracy is ', round(avg_acc/k, 3))

<font size=5> Word2Vec </font>

In [None]:
# Word2Vec
from gensim.models import Word2Vec

# Borrow scikit-learn's built-in English stop-word list.
vectorizer = CountVectorizer(stop_words='english')
stopwords = vectorizer.get_stop_words()
# Split each review on spaces, drop the stop words, and join the remaining
# words back into a single string.
data['text'] = data['text'].apply(lambda x: ' '.join( [word for word in x.split(' ') if word not in stopwords] ))
train_X, train_Y, test_X, test_Y = k_fold(0, data)
# Word2Vec expects every sentence to be a list of tokens. Passing the raw
# strings makes it iterate each review character by character and learn
# character embeddings, so tokenize into words first.
train_tokens = [comment.split() for comment in train_X]
test_tokens = [comment.split() for comment in test_X]
model = Word2Vec(sentences=train_tokens, vector_size=400, window=5, min_count=2, epochs=10, workers=3)
model.save("w2v.model")
# trans_word_vec looks up each element of a comment in the vocabulary, so it
# must receive the same token lists the model was trained on.
train_X_vec = trans_word_vec(train_tokens)
test_X_vec = trans_word_vec(test_tokens)

forest = RandomForestClassifier(max_features='sqrt', random_state=36)
forest.fit(train_X_vec, train_Y)
pred_Y = forest.predict(test_X_vec)
# Reuse the predictions for the accuracy instead of running inference again
# through forest.score.
acc = accuracy_score(test_Y, pred_Y)
print(' 準確率是 ', acc)

<font size=5> Hyperparameter </font>

<font size=3>
 random_state = [20, 25, 29, 30, 36, 39, 49, 50, 55, 66, 69]。55 is the best <br/>
 min_samples_leaf = [1, 5, 10, 20, 30, 50, 75, 100]。1 is the best, which is also the default <br/>
 criterion : 'entropy'(0.795) 略好於 'gini'(0.792)。多 0.003，每個 fold 的結果也比較 tight
</font>