In [4]:
import math

# Simple positive and negative word counts (word -> number of occurrences
# observed in reviews of that class).
pos_counter = {"good": 10, "great": 15, "excellent": 7, "love": 8, "amazing": 6}
neg_counter = {"terrible": 12, "bad": 14, "awful": 9, "hate": 10, "horrible": 8}

# Example review to classify.
review = "This crib was terrible"

# Prior probability of each class.
percent_pos = 0.5
percent_neg = 0.5

# Total number of word occurrences recorded for each class.
total_pos = sum(pos_counter.values())
total_neg = sum(neg_counter.values())

# Laplace (add-one) smoothing has to add the size of the vocabulary shared by
# BOTH classes to each denominator; using each class's own dictionary size
# would normalise the two word distributions over different vocabularies.
vocab_size = len(set(pos_counter) | set(neg_counter))

# Tokenize the review: lowercase, then split on whitespace.
review_words = review.lower().split()

# Naive Bayes core: accumulate log-likelihoods instead of multiplying raw
# probabilities, so that long reviews do not underflow to 0.0 (which would
# make both classes tie at zero and always fall through to "negative").
log_pos = 0.0
log_neg = 0.0
for word in review_words:
    # Occurrences of the word in each class (0 if the word was never seen).
    word_in_pos = pos_counter.get(word, 0)
    word_in_neg = neg_counter.get(word, 0)

    # Smoothed conditional probability of the word given the class.
    log_pos += math.log((word_in_pos + 1) / (total_pos + vocab_size))
    log_neg += math.log((word_in_neg + 1) / (total_neg + vocab_size))

# Linear-space values kept under their original names for inspection.
# They may round to 0.0 for very long reviews; the decision below does not
# depend on them.
pos_probability = math.exp(log_pos)
neg_probability = math.exp(log_neg)
final_pos = pos_probability * percent_pos
final_neg = neg_probability * percent_neg

# Log-space posterior scores (a prior of 0 maps to -inf rather than raising).
log_final_pos = log_pos + (math.log(percent_pos) if percent_pos > 0 else float("-inf"))
log_final_neg = log_neg + (math.log(percent_neg) if percent_neg > 0 else float("-inf"))

# Print the classification result (ties go to "negative", as before).
if log_final_pos > log_final_neg:
    print("The review is positive")
else:
    print("The review is negative")


The review is negative


# Formatting the Data for scikit-learn

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Hypothetical negative and positive review lists
neg_list = [
    "This crib was terrible",
    "I hate this product",
    "Worst experience ever",
]
pos_list = [
    "This crib was amazing",
    "I love this product",
    "Best experience ever",
]

# Review whose word counts we want to inspect
review = "This crib was amazing"

# Learn the vocabulary from the whole corpus (negatives first, then positives)
corpus = neg_list + pos_list
counter = CountVectorizer().fit(corpus)
print(counter.vocabulary_)

# Count vector for the single review, printed as a dense array
review_counts = counter.transform([review])
print(review_counts.toarray())

# Count matrix for every training review
training_counts = counter.transform(corpus)

{'this': 9, 'crib': 2, 'was': 10, 'terrible': 8, 'hate': 5, 'product': 7, 'worst': 11, 'experience': 4, 'ever': 3, 'amazing': 0, 'love': 6, 'best': 1}
[[1 0 1 0 0 0 0 0 0 1 1 0]]


In [21]:
from sklearn.naive_bayes import MultinomialNB

# Build the training data: three template reviews per class, each list
# repeated N_REPEATS times.
N_REPEATS = 334
neg_list = N_REPEATS * [
    "This crib was terrible",
    "I hate this product",
    "Worst experience ever",
]
pos_list = N_REPEATS * [
    "This crib was amazing",
    "I love this product",
    "Best experience ever",
]

# 建立詞彙表
def _tokenize(text, lowercase=True):
    """Split ``text`` on whitespace, lowercasing it first when ``lowercase`` is true."""
    return (text.lower() if lowercase else text).split()


def build_vocab(texts, lowercase=True):
    """Map every distinct word in ``texts`` to a column index.

    Indices are assigned in order of first appearance, starting at 0.

    Args:
        texts: Iterable of sentences (strings).
        lowercase: When true (the default), words are lowercased before being
            indexed so that "This" and "this" share one feature, matching
            CountVectorizer's default behaviour. Pass False for the previous
            case-sensitive behaviour.

    Returns:
        dict mapping word -> integer index. Empty if ``texts`` is empty.
    """
    vocab = {}
    for sentence in texts:
        for word in _tokenize(sentence, lowercase):
            # len(vocab) is the next free index; setdefault leaves an
            # already-indexed word untouched.
            vocab.setdefault(word, len(vocab))
    return vocab


# Convert a text into a word-count vector
def text_to_vector(text, vocab, lowercase=True):
    """Count how often each vocabulary word occurs in ``text``.

    Args:
        text: Sentence to vectorise.
        vocab: Mapping word -> index, as returned by ``build_vocab``.
        lowercase: Must match the setting used to build ``vocab``; when true
            (the default), words are lowercased before lookup.

    Returns:
        list of ints of length ``len(vocab)``; position ``vocab[word]`` holds
        the number of occurrences of ``word``. Words not in ``vocab`` are
        ignored.
    """
    vector = [0] * len(vocab)
    for word in _tokenize(text, lowercase):
        index = vocab.get(word)
        if index is not None:
            vector[index] += 1
    return vector

# Assemble the corpus (negatives first, then positives) and index its words
texts = neg_list + pos_list
vocab = build_vocab(texts)

# One word-count vector per training sentence
training_counts = []
for sentence in texts:
    training_counts.append(text_to_vector(sentence, vocab))

# Class labels aligned with `texts` -- 0: negative, 1: positive
training_labels = [0 for _ in neg_list] + [1 for _ in pos_list]

# Review to classify, vectorised with the same vocabulary
review = "This crib was great amazing and wonderful"
review_vector = text_to_vector(review, vocab)

# Fit the Naive Bayes model on the count vectors
classifier = MultinomialNB()
classifier.fit(training_counts, training_labels)

# Predict the class and the class probabilities for the test review
review_batch = [review_vector]
print("預測結果:", classifier.predict(review_batch))
print("預測機率:", classifier.predict_proba(review_batch))


預測結果: [1]
預測機率: [[0.00297619 0.99702381]]
