In [34]:
import pandas as pd
# 讀取training & test 資料
train_data = pd.read_csv("https://raw.githubusercontent.com/cblancac/SentimentAnalysisBert/main/data/train_150k.txt", sep="\t", names=["feeling", "text"])
test_data = pd.read_csv("https://raw.githubusercontent.com/cblancac/SentimentAnalysisBert/main/data/test_62k.txt", sep="\t", names=["feeling", "text"])

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
# 資料雜質過濾
nltk.download("punkt")
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))
negation_list = ["not", "no", "never"]
def preprocess_text(text):
    # 全部小寫
    text = text.lower()
    # 去除標點符號
    text = text.translate(str.maketrans("", "", string.punctuation))
    # 分詞
    words = word_tokenize(text)
    # 去除無相關字
    words = [w for w in words if not w in stop_words]
    # 否定語法代換
    new_words = []
    negation = False
    for word in words:
        if word in negation_list:
            negation = True
        elif negation:
            word = "not_" + word
            negation = False
        new_words.append(word)
    return " ".join(words)
train_data["text"] = train_data["text"].apply(preprocess_text)
test_data["text"] = test_data["text"].apply(preprocess_text)

from sklearn.feature_extraction.text import CountVectorizer
# 將資料轉為詞袋
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_data["text"])
X_test = vectorizer.transform(test_data["text"])

from sklearn.linear_model import LogisticRegression
# 創建並訓練邏輯迴歸分類器 (Logistic Regression)
lr = LogisticRegression(max_iter=500)
lr.fit(X_train, train_data["feeling"])
y_pred_lr = lr.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(test_data["feeling"], y_pred_lr))

from sklearn.naive_bayes import MultinomialNB
# 創建貝氏分類器
nb = MultinomialNB()
# 定義需要嘗試的貝氏alpha值
param_grid = {"alpha":  [2.066, 2.067, 2.068]}
from sklearn.model_selection import GridSearchCV
# 使用網格搜索最佳貝氏alpha值
grid_search = GridSearchCV(nb, param_grid, cv=5)
grid_search.fit(X_train, train_data["feeling"])
# 输出最佳貝氏alpha值 
#print(grid_search.best_params_) #2.067

# 創建並訓練貝氏分類並使用最佳貝氏alpha值
nb_best = MultinomialNB(alpha=2.067)
nb_best.fit(X_train, train_data["feeling"])
y_pred_nb = nb_best.predict(X_test)
print("Naive Bayes Accuracy:", accuracy_score(test_data["feeling"], y_pred_nb))

from sklearn.ensemble import VotingClassifier
# 創建並結合貝式與邏輯分類器的投票分類器
voting_clf = VotingClassifier(estimators=[("lr", lr), ("nb", nb_best)], voting="soft")
# 訓練投票分類器
voting_clf.fit(X_train, train_data["feeling"])
y_pred_voting = voting_clf.predict(X_test)
print("Voting Classifier Accuracy:", accuracy_score(test_data["feeling"], y_pred_voting))

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

#輸出模型參考值
print("Precision:", precision_score(test_data["feeling"], y_pred_voting, average="macro"))
print("Recall:", recall_score(test_data["feeling"], y_pred_voting, average="macro"))
print("F measure:", f1_score(test_data["feeling"], y_pred_voting, average="macro"))
print("Accuracy:", accuracy_score(test_data["feeling"], y_pred_voting))

#輸出分類檔
classification_output = [(str(y_pred_voting[i]) + ' ' + str(test_data["text"][i]) + '\n') for i in range(len(y_pred_voting))]
fp = open("/content/drive/MyDrive/My Drive/Colab Notebooks/自然語言處理與文件探勘/test_output.txt", "w")
fp.writelines(classification_output)
fp.close()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Logistic Regression Accuracy: 0.7674763702054905
Naive Bayes Accuracy: 0.7590244846607955
Voting Classifier Accuracy: 0.7686215684376916
Precision: 0.7691114352180215
Recall: 0.7686420346673658
F measure: 0.7685252012008035
Accuracy: 0.7686215684376916
