In [1]:
import pandas as pd

# Load the pre-processed book-review CSV.
file_path = "C:/Users/USER/Downloads/NLP-Courses/NLP220/Assignments/processed_books_data.csv"
df = pd.read_csv(file_path)

# Preview the first five rows.
print(df.head())

# Keep only reviews scored 1, 2, 4 or 5; rows with any other score are excluded.
# .copy() makes df_filtered an independent frame, so adding the 'label' column
# below does not assign into a slice of df (pandas SettingWithCopyWarning).
df_filtered = df[df['review/score'].isin([1, 2, 4, 5])].copy()

# Binary label derived from the score: 1 when score >= 4, otherwise 0.
# Explicit int64 keeps the dtype the same on every platform.
df_filtered['label'] = (df_filtered['review/score'] >= 4).astype('int64')

# Inspect the score -> label mapping.
print(df_filtered[['review/score', 'label']].head())
# Show how many rows fall into each label.
print(df_filtered['label'].value_counts())
import sklearn
from sklearn.model_selection import train_test_split

# Split the data: 85% for training, 15% for testing, stratified on 'label'
# with a fixed random_state.
train_data, test_data = train_test_split(
    df_filtered,
    test_size=0.15,
    random_state=42,
    stratify=df_filtered['label'],
)

# Report the shape of each partition.
print("訓練集大小:", train_data.shape)

print("測試集大小:", test_data.shape)
from sklearn.feature_extraction.text import CountVectorizer
# Drop rows whose 'review/text' is NaN before vectorizing.
train_data = train_data.dropna(subset=['review/text'])
test_data = test_data.dropna(subset=['review/text'])

# Bag-of-words counts limited to 100 features with English stop words removed.
# The vectorizer is fitted on the training text only and then applied
# unchanged to the test text.
vectorizer_count = CountVectorizer(
    stop_words='english',
    max_features=100,
)
X_train_count = vectorizer_count.fit_transform(train_data['review/text'])
X_test_count = vectorizer_count.transform(test_data['review/text'])
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features on the review text, configured like the count vectorizer
# (English stop words removed, at most 100 features).  Fitted on the training
# text only, then applied to the test text.
vectorizer_tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=100,
)
X_train_tfidf = vectorizer_tfidf.fit_transform(train_data['review/text'])
X_test_tfidf = vectorizer_tfidf.transform(test_data['review/text'])
# Build one text field from summary + review body.
# Missing values are replaced by '' BEFORE concatenating: in pandas,
# NaN + str is NaN, so filling afterwards would turn the whole combined field
# into '' and throw away the review body whenever the summary is missing.
train_data['combined_text'] = (
    train_data['review/summary'].fillna('') + " " + train_data['review/text'].fillna('')
)
test_data['combined_text'] = (
    test_data['review/summary'].fillna('') + " " + test_data['review/text'].fillna('')
)

# TF-IDF features on the combined text (fit on train, apply to test).
vectorizer_combined = TfidfVectorizer(stop_words='english', max_features=100)
X_train_combined = vectorizer_combined.fit_transform(train_data['combined_text'])
X_test_combined = vectorizer_combined.transform(test_data['combined_text'])
from joblib import parallel_backend
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import time

# Create one independent linear-kernel SVC per feature set (a non-linear
# kernel such as 'rbf' would be slower to compute).
svc_model_count, svc_model_tfidf, svc_model_combined = (
    SVC(kernel='linear') for _ in range(3)
)



           Id                           Title  Price         User_id  \
0  1882931173  Its Only Art If Its Well Hung!    NaN   AVCGYZL8FQQTD   
1  0826414346        Dr. Seuss: American Icon    NaN  A30TK6U7DNS82R   
2  0826414346        Dr. Seuss: American Icon    NaN  A3UH4UZ4RSVO82   
3  0826414346        Dr. Seuss: American Icon    NaN  A2MVUWT453QH61   
4  0826414346        Dr. Seuss: American Icon    NaN  A22X4XUPKF66MR   

                          profileName review/helpfulness  review/score  \
0               Jim of Oz "jim-of-oz"                7/7           4.0   
1                       Kevin Killian              10/10           5.0   
2                        John Granger              10/11           5.0   
3  Roy E. Perry "amateur philosopher"                7/7           4.0   
4     D. H. Richards "ninthwavestore"                3/3           4.0   

   review/time                                   review/summary  \
0    940636800           Nice collection of Julie Strai

In [None]:
# Training and prediction helper
def train_and_evaluate(model, X_train, X_test, y_train, y_test):
    """Fit ``model``, predict on the test features, and score the predictions.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features passed to ``model.fit``.
        X_test: Test features passed to ``model.predict``.
        y_train: Training labels passed to ``model.fit``.
        y_test: True labels the predictions are compared against.

    Returns:
        A tuple ``(accuracy, f1, cm, train_time, inference_time)`` holding the
        accuracy, the macro-averaged F1 score, the confusion matrix, and the
        elapsed seconds spent in ``fit`` and in ``predict`` respectively.
    """
    # perf_counter() is monotonic and high resolution, so the measured
    # interval cannot be distorted by system clock adjustments the way a
    # time.time() difference can.
    fit_started = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - fit_started

    # Time the prediction step separately from training.
    predict_started = time.perf_counter()
    y_pred = model.predict(X_test)
    inference_time = time.perf_counter() - predict_started

    # Evaluation metrics on the held-out labels.
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    cm = confusion_matrix(y_test, y_pred)

    return accuracy, f1, cm, train_time, inference_time

# Label columns for the train and test partitions.
y_train = train_data['label']
y_test = test_data['label']

# Run the three experiments inside a joblib 'threading' backend that requests
# all available CPU cores (n_jobs=-1).
# NOTE(review): whether SVC.fit actually parallelises under this backend is
# not visible here — confirm before relying on a speed-up.
with parallel_backend('threading', n_jobs=-1):
    # Train and evaluate on the CountVectorizer features.
    (acc_count, f1_count, cm_count,
     train_time_count, inference_time_count) = train_and_evaluate(
        svc_model_count, X_train_count, X_test_count, y_train, y_test
    )

    # Train and evaluate on the TF-IDF features.
    (acc_tfidf, f1_tfidf, cm_tfidf,
     train_time_tfidf, inference_time_tfidf) = train_and_evaluate(
        svc_model_tfidf, X_train_tfidf, X_test_tfidf, y_train, y_test
    )

    # Train and evaluate on the combined summary + text features.
    (acc_combined, f1_combined, cm_combined,
     train_time_combined, inference_time_combined) = train_and_evaluate(
        svc_model_combined, X_train_combined, X_test_combined, y_train, y_test
    )

# Print the metrics and timings for each feature set.
print(
    "Count Vectorizer - Accuracy: {:.4f}, F1: {:.4f}, Training time: {:.4f}s, Inference time: {:.4f}s".format(
        acc_count, f1_count, train_time_count, inference_time_count
    )
)
print(
    "Tfidf Vectorizer - Accuracy: {:.4f}, F1: {:.4f}, Training time: {:.4f}s, Inference time: {:.4f}s".format(
        acc_tfidf, f1_tfidf, train_time_tfidf, inference_time_tfidf
    )
)
print(
    "Combined Features - Accuracy: {:.4f}, F1: {:.4f}, Training time: {:.4f}s, Inference time: {:.4f}s".format(
        acc_combined, f1_combined, train_time_combined, inference_time_combined
    )
)

# Print the confusion matrix for each feature set.
print("Confusion Matrix for Count Vectorizer:\n", cm_count)
print("Confusion Matrix for Tfidf Vectorizer:\n", cm_tfidf)
print("Confusion Matrix for Combined Features:\n", cm_combined)


In [7]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.naive_bayes import MultinomialNB

# Load the pre-processed book-review CSV.
file_path = "C:/Users/USER/Downloads/NLP-Courses/NLP220/Assignments/processed_books_data.csv"
df = pd.read_csv(file_path)

# Keep only reviews scored 1, 2, 4 or 5; rows with any other score are excluded.
# .copy() makes df_filtered an independent frame, so adding the 'label' column
# below does not assign into a slice of df (pandas SettingWithCopyWarning).
df_filtered = df[df['review/score'].isin([1, 2, 4, 5])].copy()

# Binary label: scores of 4 or 5 are positive (1), scores of 1 or 2 negative (0).
# Explicit int64 keeps the dtype the same on every platform.
df_filtered['label'] = (df_filtered['review/score'] >= 4).astype('int64')

# Drop rows whose 'review/text' is NaN.
df_filtered = df_filtered.dropna(subset=['review/text'])

# Split the data: 85% for training, 15% for testing, stratified on 'label'.
train_data, test_data = train_test_split(
    df_filtered,
    test_size=0.15,
    random_state=42,
    stratify=df_filtered['label'],
)

# Convert the text to bag-of-words counts (fit on train, apply to test).
vectorizer_count = CountVectorizer(stop_words='english', max_features=1000)
X_train_count = vectorizer_count.fit_transform(train_data['review/text'])
X_test_count = vectorizer_count.transform(test_data['review/text'])

# Hand-written Naive Bayes classifier
class MyNaiveBayes:
    """Multinomial Naive Bayes with additive (Laplace) smoothing.

    Works on count features supplied either as a dense array or as a scipy
    sparse matrix, so callers do not have to densify large matrices.
    All learned attributes are ``None`` until :meth:`fit` has been called.
    """

    def __init__(self, alpha=1):
        # alpha: smoothing constant added to every per-class feature count.
        self.alpha = alpha
        # class_priors[i]: fraction of training rows labelled classes[i].
        self.class_priors = None
        # feature_log_prob[i, j]: log of the smoothed P(feature j | classes[i]).
        self.feature_log_prob = None
        # classes: unique labels found by np.unique during fit.
        self.classes = None

    def fit(self, X, y):
        """Estimate class priors and smoothed feature log-probabilities.

        Args:
            X: Count features with one row per sample; a dense array or a
                scipy sparse matrix. Objects without ``.shape`` are converted
                with ``np.asarray``.
            y: Class label for each row of ``X`` (any array-like).

        Returns:
            ``self``, so calls can be chained.
        """
        # Coerce labels to an ndarray: with a plain list, `y == c` is a single
        # bool instead of an element-wise mask, which silently produced zero
        # priors and empty per-class counts.
        y = np.asarray(y)
        if not hasattr(X, "shape"):
            X = np.asarray(X)
        n_features = X.shape[1]

        # Unique class labels.
        self.classes = np.unique(y)
        n_classes = len(self.classes)

        self.class_priors = np.zeros(n_classes)
        feature_count = np.zeros((n_classes, n_features))
        for idx, c in enumerate(self.classes):
            mask = y == c
            # Prior P(y = c): share of training rows carrying label c.
            self.class_priors[idx] = mask.sum() / len(y)
            # Per-class feature totals.  asarray + ravel flattens the
            # 1 x n_features matrix a scipy sparse sum returns; for a dense
            # array it is a no-op.
            feature_count[idx, :] = np.asarray(X[mask].sum(axis=0)).ravel()

        # Conditional probabilities P(x | y) with alpha smoothing.
        smoothed = feature_count + self.alpha
        totals = feature_count.sum(axis=1, keepdims=True) + self.alpha * n_features
        self.feature_log_prob = np.log(smoothed / totals)
        return self

    def predict(self, X):
        """Return the most probable class label for each row of ``X``.

        Args:
            X: Count features with the same columns used in :meth:`fit`;
                a dense array or a scipy sparse matrix.

        Returns:
            Array of labels taken from ``self.classes``, one per row.
        """
        if not hasattr(X, "shape"):
            X = np.asarray(X)
        # Joint log-score per class: log prior + sum_j count_j * log P(j | class).
        log_probs = np.log(self.class_priors) + np.asarray(X @ self.feature_log_prob.T)
        return self.classes[np.argmax(log_probs, axis=1)]


# Reference model: scikit-learn's MultinomialNB on the count features.
sklearn_nb = MultinomialNB().fit(X_train_count, train_data['label'])
sklearn_preds = sklearn_nb.predict(X_test_count)

# Custom Naive Bayes model on the same features.
# NOTE(review): .toarray() materialises the full dense matrix, so memory grows
# with rows x vocabulary size — confirm this fits in RAM for the full dataset.
my_nb = MyNaiveBayes()
my_nb.fit(
    X_train_count.toarray(),
    train_data['label'].values,
)
my_preds = my_nb.predict(X_test_count.toarray())

# Compute and display the results
def evaluate_model(y_true, y_pred, model_name):
    """Print accuracy, macro-averaged F1 and the confusion matrix for a model.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels to compare against ``y_true``.
        model_name: Text prefixed to every printed line.
    """
    # All three metrics are computed before anything is printed.
    report = (
        f"{model_name} Accuracy: {accuracy_score(y_true, y_pred)}",
        f"{model_name} F1 Score: {f1_score(y_true, y_pred, average='macro')}",
        f"{model_name} Confusion Matrix:\n{confusion_matrix(y_true, y_pred)}\n",
    )
    for line in report:
        print(line)

# Evaluate the scikit-learn model and the custom model on the test labels.
evaluate_model(
    y_true=test_data['label'],
    y_pred=sklearn_preds,
    model_name="Sklearn Naive Bayes",
)
evaluate_model(
    y_true=test_data['label'],
    y_pred=my_preds,
    model_name="My Naive Bayes",
)


Sklearn Naive Bayes Accuracy: 0.8747180439717862
Sklearn Naive Bayes F1 Score: 0.7320381100061164
Sklearn Naive Bayes Confusion Matrix:
[[ 29863  23049]
 [ 28549 330394]]

My Naive Bayes Accuracy: 0.8747059037768147
My Naive Bayes F1 Score: 0.7320060502466966
My Naive Bayes Confusion Matrix:
[[ 29859  23053]
 [ 28550 330393]]



In [8]:
Sklearn Naive Bayes Accuracy: 0.8747180439717862
Sklearn Naive Bayes F1 Score: 0.7320381100061164
Sklearn Naive Bayes Confusion Matrix:
[[ 29863  23049]
 [ 28549 330394]]

My Naive Bayes Accuracy: 0.8747180439717862
My Naive Bayes F1 Score: 0.7320381100061164
My Naive Bayes Confusion Matrix:
[[ 29863  23049]
 [ 28549 330394]]
======================================================
Sklearn Naive Bayes Accuracy: 0.8747180439717862
Sklearn Naive Bayes F1 Score: 0.7320381100061164
Sklearn Naive Bayes Confusion Matrix:
[[ 29863  23049]
 [ 28549 330394]]

My Naive Bayes Accuracy: 0.8747204720107805
My Naive Bayes F1 Score: 0.7320412723898135
My Naive Bayes Confusion Matrix:
[[ 29863  23049]
 [ 28548 330395]]
=========================================================
Sklearn Naive Bayes Accuracy: 0.8747180439717862
Sklearn Naive Bayes F1 Score: 0.7320381100061164
Sklearn Naive Bayes Confusion Matrix:
[[ 29863  23049]
 [ 28549 330394]]

My Naive Bayes Accuracy: 0.8747204720107805
My Naive Bayes F1 Score: 0.7320412723898135
My Naive Bayes Confusion Matrix:
[[ 29863  23049]
 [ 28548 330395]]

SyntaxError: invalid syntax (1536368558.py, line 1)