In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load the labeled review datasets from CSV.
# NOTE(review): the test CSV is read from the same `train` directory as the
# training CSV -- confirm this is the intended location.
TRAIN_CSV_PATH = r'C:\Users\USER\Downloads\aclImdb_v1\aclImdb\train\train_reviews.csv'
TEST_CSV_PATH = r"C:\Users\USER\Downloads\aclImdb_v1\aclImdb\train\test_reviews.csv"
train_df = pd.read_csv(TRAIN_CSV_PATH)
test_df = pd.read_csv(TEST_CSV_PATH)

# Split the training data further: 90% is kept for fitting and 10% is held
# out as a validation set. The fixed seed keeps the split identical between
# runs.
X_train, X_val, y_train, y_val = train_test_split(
    train_df['review_text'],
    train_df['label'],
    random_state=42,
    test_size=0.1,
)

# Extract TF-IDF features over unigrams and bigrams (ngram_range=(1, 2)),
# with English stop words removed.
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))

# The vocabulary and IDF weights are learned from the training split only;
# the validation and test texts are then projected onto that fitted vocabulary.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf, X_test_tfidf = (
    vectorizer.transform(texts) for texts in (X_val, test_df['review_text'])
)

# Hyperparameters for each candidate classifier, keyed by the display name
# that is also used as the key in `models`.
# NOTE(review): the values look like the result of an earlier tuning run --
# confirm where they came from before changing them.
best_params = {
    "Naive Bayes": {'alpha': 0.05450760213833489},
    "Logistic Regression": {'C': 89.67647856745852, 'solver': 'saga', 'max_iter': 1000},
    "Decision Tree": {'max_depth': 20, 'min_samples_split': 3},
    "Random Forest": {'n_estimators': 189, 'max_depth': 18, 'min_samples_split': 6},
    "KNN": {'n_neighbors': 7}
}

# Seed for every estimator that uses randomness internally (the 'saga' solver
# shuffles the data; tree-based models permute features and bootstrap samples).
# Without it, accuracies -- and potentially the selected best model -- vary
# from run to run. The value matches the seed used for the train/validation
# split.
RANDOM_STATE = 42

# Candidate classifiers to train and compare on the validation set.
models = {
    "Naive Bayes": MultinomialNB(**best_params["Naive Bayes"]),
    "Logistic Regression": LogisticRegression(
        random_state=RANDOM_STATE, **best_params["Logistic Regression"]
    ),
    "Decision Tree": DecisionTreeClassifier(
        random_state=RANDOM_STATE, **best_params["Decision Tree"]
    ),
    "Random Forest": RandomForestClassifier(
        random_state=RANDOM_STATE, **best_params["Random Forest"]
    ),
    "KNN": KNeighborsClassifier(**best_params["KNN"])
}

# Track the highest validation accuracy seen so far and the name of the model
# that achieved it.
# Start below any achievable accuracy so that a model is always selected: with
# an initial value of 0 and a strict `>` comparison, a run in which every model
# scores 0.0 would leave best_model_name as None and break the later
# models[best_model_name] lookup.
best_model_name = None
best_val_accuracy = float("-inf")

# Fit each candidate on the training features and report its accuracy on the
# validation set.
for model_name, model in models.items():
    print("model_name = ", model_name)
    model.fit(X_train_tfidf, y_train)
    val_predictions = model.predict(X_val_tfidf)
    val_accuracy = accuracy_score(y_val, val_predictions)
    print(f"{model_name} Validation Accuracy: {val_accuracy}")

    # Keep the best validation performer; on ties the earlier model wins
    # because the comparison is strict.
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        best_model_name = model_name

# Evaluate only the model that performed best on the validation set against
# the test set, so the test data plays no part in model selection.
print(f"\nBest Model: {best_model_name}")
best_model = models[best_model_name]
test_predictions = best_model.predict(X_test_tfidf)
test_accuracy = accuracy_score(
    y_true=test_df['label'],
    y_pred=test_predictions,
)
print(f"{best_model_name} Test Accuracy: {test_accuracy}")


model_name =  Naive Bayes
Naive Bayes Validation Accuracy: 0.892
model_name =  Logistic Regression
Logistic Regression Validation Accuracy: 0.8944
model_name =  Decision Tree
Decision Tree Validation Accuracy: 0.7224
model_name =  Random Forest
Random Forest Validation Accuracy: 0.8492
model_name =  KNN
KNN Validation Accuracy: 0.8008

Best Model: Logistic Regression
Logistic Regression Test Accuracy: 0.88412
