In [43]:
import os
import math
from collections import defaultdict, Counter

### 1. reading data

In [None]:
# 1. 讀取 training.txt 
def load_training_data(training_file):
    """Parse a training index file into parallel document/label lists.

    Each non-blank line of ``training_file`` is read as whitespace-separated
    integers: a class ID followed by the IDs of the documents in that class.

    Args:
        training_file: Path to the training index file.

    Returns:
        A tuple ``(docs, labels, train_doc_ids)``:
        ``docs`` is a list with one entry per training document, each entry
        being the document ID rendered as a string; ``labels`` is the list of
        class IDs aligned with ``docs``; ``train_doc_ids`` is the set of all
        integer document IDs seen in the file.

    Raises:
        ValueError: If a token on a line cannot be parsed as an integer.
    """
    docs = []
    labels = []
    train_doc_ids = set()
    with open(training_file, 'r') as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Blank (or whitespace-only) lines carry no class information.
                continue
            class_id = int(parts[0])
            # Materialize the IDs as a list: a bare map() iterator would be
            # exhausted by the set update below, leaving nothing for the loop
            # that fills docs/labels.
            doc_ids = [int(token) for token in parts[1:]]
            train_doc_ids.update(doc_ids)
            for doc_id in doc_ids:
                docs.append(str(doc_id))
                labels.append(class_id)
    return docs, labels, train_doc_ids



#讀取 test 資料
def load_test_data(test_folder):
    """Read every numerically named ``.txt`` file in a folder, ordered by ID.

    A file contributes a document when its name ends with ``.txt`` and the
    part of the name before the first ``.`` parses as an integer; that integer
    is the document ID. Any other directory entry is ignored.

    Args:
        test_folder: Path of the directory holding the document files.

    Returns:
        A tuple ``(docs, doc_ids)`` of two aligned lists sorted by ascending
        document ID: the stripped UTF-8 text of each file and its integer ID.
    """
    numbered_files = []
    for file_name in os.listdir(test_folder):
        # Filter before parsing the ID so stray entries (hidden files, other
        # extensions, sub-folders) cannot make int() raise during sorting.
        if not file_name.endswith(".txt"):
            continue
        try:
            doc_id = int(file_name.split(".")[0])
        except ValueError:
            continue
        numbered_files.append((doc_id, file_name))
    # Sort on the ID only so entries with equal IDs keep their listing order.
    numbered_files.sort(key=lambda pair: pair[0])

    docs = []
    doc_ids = []
    for doc_id, file_name in numbered_files:
        file_path = os.path.join(test_folder, file_name)
        with open(file_path, 'r', encoding="utf-8") as file:
            docs.append(file.read().strip())
        doc_ids.append(doc_id)
    return docs, doc_ids


### 2. 計算 TF 與 DF（詞頻與文件頻率）


In [45]:
def compute_tf_df(docs):
    """Count term frequencies per document and document frequencies overall.

    Documents are tokenized by splitting on whitespace.

    Args:
        docs: Iterable of document strings.

    Returns:
        A tuple ``(tf_list, df)``: ``tf_list`` holds one ``Counter`` of token
        counts per document, in input order; ``df`` is a ``defaultdict(int)``
        mapping each token to the number of documents that contain it.
    """
    tf_list = [Counter(text.split()) for text in docs]

    doc_freq = defaultdict(int)
    for counts in tf_list:
        # Each distinct token of a document adds one to its document frequency.
        for token in counts:
            doc_freq[token] += 1
    return tf_list, doc_freq

### 3. Chi-Square 特徵選擇

In [46]:
def chi_square_selection(docs, labels, df, num_features=500):
    """Rank terms by summed per-class chi-square score and keep the best ones.

    For every term in ``df`` and every class, a 2x2 contingency table is built
    from document counts (A: in-class docs with the term, B: out-of-class docs
    with the term, C: in-class docs without it, D: out-of-class docs without
    it). The chi-square statistics of all classes are added up per term.

    Args:
        docs: Sequence of document strings, tokenized by whitespace.
        labels: Class labels aligned with ``docs``.
        df: Mapping whose keys are the candidate terms; only the keys are used.
        num_features: Maximum number of terms to return.

    Returns:
        A list of at most ``num_features`` terms in descending score order.
        Terms whose contingency table is degenerate for every class (zero
        denominator) receive no score and are never returned. Ties keep the
        iteration order of ``df``.
    """
    class_term_count = defaultdict(lambda: defaultdict(int))
    class_doc_count = defaultdict(int)
    total_docs = len(docs)

    # Per class: number of documents, and number of documents containing each term.
    for doc, label in zip(docs, labels):
        class_doc_count[label] += 1
        for term in set(doc.split()):
            class_term_count[label][term] += 1

    chi_scores = {}
    for term in df.keys():
        # Gather the per-class counts once per term; .get avoids inserting
        # zero entries into the count tables while only reading them.
        in_class_counts = {
            class_id: class_term_count[class_id].get(term, 0)
            for class_id in class_doc_count
        }
        docs_with_term = sum(in_class_counts.values())

        for class_id, A in in_class_counts.items():
            # Docs with the term outside this class: overall total minus this
            # class's share, instead of re-summing every other class.
            B = docs_with_term - A
            C = class_doc_count[class_id] - A
            D = total_docs - (A + B + C)

            numerator = total_docs * (A * D - B * C) ** 2
            denominator = (A + C) * (B + D) * (A + B) * (C + D)
            if denominator > 0:
                chi_scores[term] = chi_scores.get(term, 0.0) + numerator / denominator

    # Stable sort: equally scored terms stay in their first-scored order.
    selected_features = sorted(chi_scores, key=chi_scores.get, reverse=True)[:num_features]
    return selected_features

### 4. 訓練 Multinomial Naive Bayes

In [47]:
def train_naive_bayes(docs, labels, selected_features):
    """Estimate multinomial Naive Bayes parameters over the selected features.

    Documents are tokenized by whitespace and only tokens present in
    ``selected_features`` are counted. Likelihoods use add-one smoothing with
    ``len(selected_features)`` as the vocabulary size.

    Args:
        docs: Sequence of document strings.
        labels: Class labels aligned with ``docs``.
        selected_features: Collection of terms that make up the vocabulary.

    Returns:
        A tuple ``(class_priors, class_word_probs)``: ``class_priors`` maps
        each class to its fraction of the training documents;
        ``class_word_probs`` is a ``defaultdict(dict)`` mapping each class to
        a dict of smoothed ``P(term | class)`` for every selected feature.
    """
    class_term_count = defaultdict(lambda: defaultdict(int))
    class_doc_count = defaultdict(int)
    vocab_size = len(selected_features)
    total_docs = len(docs)
    # Set membership is constant-time; a list scan would cost O(vocabulary)
    # for every token of every document.
    feature_set = set(selected_features)

    # Count documents per class and selected-feature occurrences per class.
    for doc, label in zip(docs, labels):
        class_doc_count[label] += 1
        for term in doc.split():
            if term in feature_set:
                class_term_count[label][term] += 1

    class_priors = {c: count / total_docs for c, count in class_doc_count.items()}

    class_word_probs = defaultdict(dict)
    # Walk every class that has documents, not only classes that happened to
    # contain a selected feature, so each class with a prior also gets a
    # properly smoothed likelihood table.
    for class_id in class_doc_count:
        term_count = class_term_count[class_id]
        total_terms = sum(term_count.values())
        smoothed_total = total_terms + vocab_size
        class_word_probs[class_id] = {
            term: (term_count.get(term, 0) + 1) / smoothed_total
            for term in selected_features
        }

    return class_priors, class_word_probs


### 5. 測試 Multinomial Naive Bayes

In [48]:
def predict_naive_bayes(docs, class_priors, class_word_probs, selected_features):
    """Assign each document the class with the highest log-posterior score.

    A class score is ``log(prior)`` plus the sum of ``log P(term | class)``
    over every token occurrence that belongs to ``selected_features``.

    Args:
        docs: Iterable of document strings, tokenized by whitespace.
        class_priors: Mapping of class to prior probability.
        class_word_probs: Mapping of class to a dict of term probabilities.
            A class or term missing from it falls back to
            ``1 / (len(selected_features) + 1)``.
        selected_features: Collection of terms that make up the vocabulary.

    Returns:
        A list with one predicted class per document. When ``class_priors`` is
        empty no score can be computed and class ``1`` is used as the default.
        Ties go to the class that comes first in ``class_priors``.

    Raises:
        ValueError: If a prior or a term probability is not positive
            (``math.log`` domain error).
    """
    # Set membership avoids a linear scan of the feature list per token.
    feature_set = set(selected_features)
    fallback_prob = 1 / (len(selected_features) + 1)

    predictions = []
    for doc in docs:
        # Filter once per document instead of once per class.
        feature_terms = [term for term in doc.split() if term in feature_set]

        class_scores = {}
        for class_id, prior in class_priors.items():
            # .get keeps plain dicts usable and never inserts into a
            # defaultdict while merely reading from it.
            word_probs = class_word_probs.get(class_id, {})
            score = math.log(prior)
            for term in feature_terms:
                score += math.log(word_probs.get(term, fallback_prob))
            class_scores[class_id] = score

        if class_scores:
            predictions.append(max(class_scores, key=class_scores.get))
        else:
            # No classes to score against: default to class 1.
            predictions.append(1)
    return predictions

### main

In [None]:
if __name__ == "__main__":
    # Input locations.
    training_file = "train_docs/training.txt"
    test_folder = "test_docs"

    # 1. Load the training index and every document file in the folder.
    train_docs, train_labels, train_doc_ids = load_training_data(training_file)
    test_docs, test_doc_ids = load_test_data(test_folder)

    # load_training_data yields document IDs rendered as strings, not text.
    # Training on those strings would learn features that never occur in real
    # documents, so each ID is swapped for the content of the matching file
    # loaded above. An ID with no matching file keeps its ID string.
    # NOTE(review): assumes the training documents live in test_folder too
    # (the filter below implies the folder mixes both) - confirm.
    content_by_id = dict(zip(test_doc_ids, test_docs))
    train_texts = [content_by_id.get(int(doc), doc) for doc in train_docs]
    unresolved = sum(1 for doc in train_docs if int(doc) not in content_by_id)
    if unresolved:
        print(f"Warning: {unresolved} training document(s) not found in {test_folder}; using their IDs as text")

    # Keep only documents that are not listed in the training index.
    filtered_test_docs = []
    filtered_test_doc_ids = []
    for doc, doc_id in zip(test_docs, test_doc_ids):
        if doc_id not in train_doc_ids:
            filtered_test_docs.append(doc)
            filtered_test_doc_ids.append(doc_id)

    # 2. Term statistics and feature selection.
    tf_list, df = compute_tf_df(train_texts)
    selected_features = chi_square_selection(train_texts, train_labels, df)

    # 3. Train the model.
    class_priors, class_word_probs = train_naive_bayes(train_texts, train_labels, selected_features)

    # 4. Classify the held-out documents.
    predictions = predict_naive_bayes(filtered_test_docs, class_priors, class_word_probs, selected_features)

    # 5. Write the predictions.
    with open("submission.csv", "w") as f:
        f.write("doc_id,class_id\n")
        for doc_id, pred in zip(filtered_test_doc_ids, predictions):
            f.write(f"{doc_id},{pred}\n")
    print("Results saved to submission.csv")


Results saved to submission.csv
