# In [None]:
## likelihood
import os
import math
from collections import defaultdict, Counter
from sklearn.feature_extraction.text import TfidfVectorizer # type: ignore
from nltk.corpus import stopwords # type: ignore
import nltk # type: ignore
import numpy as np # type: ignore

# Make sure the NLTK stopword corpus is available, then keep the English
# stopwords in a set so that membership tests during preprocessing are cheap.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

# Data Preprocessing
def preprocess(text):
    """Clean a raw document and return its list of content tokens.

    The text is lowercased, stripped of punctuation and digits, has its third
    line removed, and is split on whitespace. English stopwords (the
    module-level ``stop_words`` set) and the literal token ``'embed'`` are
    then filtered out.

    Args:
        text: Raw document contents as a single string.

    Returns:
        list[str]: The surviving tokens in document order. A list (not a
        joined string) is returned because the other functions in this file
        iterate over each document token by token.
    """
    import re

    text = text.lower()  # Convert to lowercase
    # Newlines count as whitespace (\s), so line boundaries survive both
    # substitutions and the third-line removal below still works.
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers

    # Remove the third line (index 2) of the document.
    # NOTE(review): presumably that line is boilerplate in this corpus - confirm.
    lines = text.split('\n')
    if len(lines) > 2:
        text = '\n'.join(lines[:2] + lines[3:])

    # Remove the specified word and the stopwords. The module-level
    # `stop_words` is only read here; rebinding it inside the function would
    # make it a local name and break the lookup. A whitespace split is enough
    # because punctuation is already gone, and it avoids depending on the
    # NLTK 'punkt' tokenizer data, which the visible part of this file never
    # downloads.
    return [
        word for word in text.split()
        if word not in stop_words and word != 'embed'
    ]

# Load Documents
def load_documents(folder_path):
    """Load and preprocess every numerically named document in a folder.

    Args:
        folder_path: Directory whose files are named ``<integer>.<ext>``.

    Returns:
        dict: Maps each document ID (the integer before the first ``.`` in
        the file name) to the result of ``preprocess`` on the file's text.

    Directory entries that are not regular files, or whose name does not
    start with an integer ID (for example hidden files such as
    ``.DS_Store``), are skipped instead of aborting the whole load.
    """
    docs = {}
    for filename in os.listdir(folder_path):
        path = os.path.join(folder_path, filename)
        if not os.path.isfile(path):
            continue
        try:
            doc_id = int(filename.split('.')[0])
        except ValueError:
            # Not a "<number>.<ext>" document file; ignore it.
            continue
        with open(path, 'r', encoding='utf-8') as file:
            docs[doc_id] = preprocess(file.read())
    return docs

# Calculate TF-IDF and Reduce Vocabulary
def calculate_tfidf(docs, max_features=1000):
    """Fit a TF-IDF vectorizer on the documents and return its vocabulary.

    Args:
        docs: Mapping of document ID to an iterable of string tokens. Each
            document is re-joined with spaces before vectorizing.
        max_features: Upper bound on the vocabulary size, passed straight to
            ``TfidfVectorizer``.

    Returns:
        The feature names reported by ``get_feature_names_out()``.

    NOTE: scikit-learn documents ``max_features`` as keeping the terms with
    the highest term frequency across the corpus, so the cut is by raw
    frequency rather than by TF-IDF weight.
    """
    corpus = [' '.join(docs[doc_id]) for doc_id in sorted(docs)]
    vectorizer = TfidfVectorizer(max_features=max_features)
    # Only the learned vocabulary is used, so fit without materialising the
    # TF-IDF matrix that was previously computed and thrown away.
    vectorizer.fit(corpus)
    return vectorizer.get_feature_names_out()

# Calculate Likelihood Ratio for Feature Selection
def calculate_likelihood_ratio(docs, labels, vocab):
    """Rank vocabulary terms by how unevenly they are spread across classes.

    For every term and class, ``observed`` is the number of documents of that
    class containing the term and ``expected`` is
    ``doc_freq(term) * class_size / total_docs``. A term's score is the sum
    over classes of ``(observed - expected) ** 2 / expected``. Despite the
    function name, this is a chi-square-style statistic over the
    "term present" cells, not a log-likelihood ratio.

    Args:
        docs: Sequence of documents, each an iterable of tokens, aligned with
            ``labels``.
        labels: Class label of each document in ``docs``.
        vocab: Iterable of candidate terms; tokens outside it are ignored.

    Returns:
        list: Terms from ``vocab`` that occur in at least one document,
        sorted by descending score. Ties keep their order from ``vocab``.
    """
    # Build the lookup set once: `term in vocab` on a list or numpy array is
    # a linear scan, repeated for every term of every document.
    vocab_set = set(vocab)

    term_class_counts = Counter()
    term_counts = Counter()
    class_counts = Counter(labels)
    total_docs = len(labels)

    # Document frequency per term and per (term, class); each document
    # contributes at most once per term.
    for i, doc in enumerate(docs):
        label = labels[i]
        for term in vocab_set.intersection(doc):
            term_class_counts[(term, label)] += 1
            term_counts[term] += 1

    likelihood_scores = defaultdict(float)
    for term in vocab:
        doc_freq = term_counts[term]
        for cls, class_size in class_counts.items():
            expected = (doc_freq * class_size) / total_docs
            if expected > 0:  # Terms that never occur have no defined score
                observed = term_class_counts[(term, cls)]
                likelihood_scores[term] += ((observed - expected) ** 2) / expected

    return sorted(likelihood_scores, key=likelihood_scores.get, reverse=True)

# Train Multinomial Naive Bayes
def train_naive_bayes(class_docs, docs, selected_vocab):
    """Train Multinomial Naive Bayes with add-one smoothing.

    Args:
        class_docs: Mapping of class label to the list of training document
            IDs belonging to that class.
        docs: Mapping of document ID to an iterable of tokens.
        selected_vocab: Sequence of terms the model is restricted to; tokens
            outside it are ignored.

    Returns:
        tuple: ``(class_probs, term_probs)`` where ``class_probs[cls]`` is the
        log prior of ``cls`` and ``term_probs[cls][term]`` is the smoothed log
        probability of ``term`` given ``cls`` for every term in
        ``selected_vocab``.

    Raises:
        ValueError: If a class has no training documents (log of zero).
    """
    # Set for O(1) membership; the sequence itself is still used for the
    # smoothing denominator and for the order of the output entries.
    vocab_set = set(selected_vocab)
    vocab_size = len(selected_vocab)

    class_counts = Counter()
    term_counts = {cls: Counter() for cls in class_docs}

    # Count in-vocabulary term occurrences for each class.
    for cls, doc_ids in class_docs.items():
        class_counts[cls] += len(doc_ids)
        cls_term_counts = term_counts[cls]
        for doc_id in doc_ids:
            cls_term_counts.update(
                term for term in docs[doc_id] if term in vocab_set
            )

    total_docs = sum(class_counts.values())
    class_probs = {
        cls: math.log(count / total_docs) for cls, count in class_counts.items()
    }

    term_probs = {cls: defaultdict(float) for cls in class_counts}
    for cls in class_counts:
        # Add-one smoothing: one pseudo-count per vocabulary term.
        total_terms = sum(term_counts[cls].values()) + vocab_size
        for term in selected_vocab:
            term_probs[cls][term] = math.log((term_counts[cls][term] + 1) / total_terms)

    return class_probs, term_probs

# Classify Documents
def classify_document(doc, selected_vocab, class_probs, term_probs):
    """Return the class with the highest Naive Bayes log score for a document.

    Args:
        doc: Iterable of tokens for the document.
        selected_vocab: Collection of terms the model knows; other tokens are
            ignored.
        class_probs: Mapping of class label to log prior.
        term_probs: Mapping of class label to a mapping of term to log
            probability.

    Returns:
        The best-scoring class label. On a tie, the class that comes first in
        ``class_probs`` wins.

    Raises:
        ValueError: If ``class_probs`` is empty.
    """
    # Filter the document once against a set instead of scanning the
    # vocabulary for every token of every class.
    if isinstance(selected_vocab, (set, frozenset)):
        vocab_set = selected_vocab
    else:
        vocab_set = set(selected_vocab)
    known_terms = [term for term in doc if term in vocab_set]

    scores = {}
    for cls, prior in class_probs.items():
        score = prior
        for term in known_terms:
            score += term_probs[cls][term]
        scores[cls] = score
    return max(scores, key=scores.get)

# Classify Testing Documents with TF-IDF and Likelihood Ratio
def _parse_training_data(training_data):
    """Parse 'class id id ...' lines into per-class IDs, labels and ordered IDs."""
    class_docs = defaultdict(list)
    labels = []
    train_doc_ids = []
    for line in training_data.split("\n"):
        parts = line.split()
        if not parts:
            # Tolerate blank lines anywhere, not only at the start or end.
            continue
        cls = int(parts[0])
        docs_in_class = [int(part) for part in parts[1:]]
        class_docs[cls].extend(docs_in_class)
        labels.extend([cls] * len(docs_in_class))
        train_doc_ids.extend(docs_in_class)
    return class_docs, labels, train_doc_ids


def classify_and_output_with_tfidf_likelihood(training_data, docs, output_path, tfidf_features=1000, lr_features=500):
    """Classify the non-training documents and write the predictions to CSV.

    The vocabulary is first reduced with ``calculate_tfidf``, then ranked with
    ``calculate_likelihood_ratio`` and cut to ``lr_features`` terms. A
    Multinomial Naive Bayes model trained on those terms labels every document
    in ``docs`` whose ID is not listed in ``training_data``.

    Args:
        training_data: Text with one line per class: the integer class label
            followed by the integer IDs of its training documents, all
            separated by whitespace.
        docs: Mapping of document ID to its tokens.
        output_path: Destination CSV path; written with an ``Id,Value`` header
            and one row per test document in ascending ID order.
        tfidf_features: Vocabulary size kept by the TF-IDF step.
        lr_features: Number of top-ranked terms kept for the classifier.

    Raises:
        KeyError: If ``training_data`` names a document ID missing from
            ``docs``.
    """
    class_docs, labels, train_doc_ids = _parse_training_data(training_data)

    missing_ids = [doc_id for doc_id in train_doc_ids if doc_id not in docs]
    if missing_ids:
        raise KeyError(f"Training document IDs not found in docs: {missing_ids}")

    train_docs = [docs[doc_id] for doc_id in train_doc_ids]

    # Testing documents are all IDs that are not in the training set.
    test_doc_ids = sorted(set(docs) - set(train_doc_ids))

    # Step 1: reduce the vocabulary using the training documents only.
    tfidf_vocab = calculate_tfidf(
        {doc_id: docs[doc_id] for doc_id in train_doc_ids},
        max_features=tfidf_features,
    )

    # Step 2: rank the reduced vocabulary and keep the top terms.
    lr_vocab = calculate_likelihood_ratio(train_docs, labels, tfidf_vocab)[:lr_features]

    # Train Naive Bayes on the selected features.
    class_probs, term_probs = train_naive_bayes(class_docs, docs, lr_vocab)

    # test_doc_ids is sorted, so the dict (and the CSV rows) are in ID order.
    test_predictions = {
        doc_id: classify_document(docs[doc_id], lr_vocab, class_probs, term_probs)
        for doc_id in test_doc_ids
    }

    with open(output_path, 'w', encoding='utf-8') as file:
        file.write("Id,Value\n")  # Write header
        file.writelines(
            f"{doc_id},{predicted_class}\n"
            for doc_id, predicted_class in test_predictions.items()
        )

    print(f"Results saved to {output_path}")

if __name__ == "__main__":
    # One line per class: the class label followed by the IDs of its
    # training documents.
    training_data = """
    1 11 19 29 113 115 169 278 301 316 317 321 324 325 338 341
    2 1 2 3 4 5 6 7 8 9 10 12 13 14 15 16
    3 813 817 818 819 820 821 822 824 825 826 828 829 830 832 833
    4 635 680 683 702 704 705 706 708 709 719 720 722 723 724 726
    5 646 751 781 794 798 799 801 812 815 823 831 839 840 841 842
    6 995 998 999 1003 1005 1006 1007 1009 1011 1012 1013 1014 1015 1016 1019
    7 700 730 731 732 733 735 740 744 752 754 755 756 757 759 760
    8 262 296 304 308 337 397 401 443 445 450 466 480 513 533 534
    9 130 131 132 133 134 135 136 137 138 139 140 141 142 143 145
    10 31 44 70 83 86 92 100 102 305 309 315 320 326 327 328
    11 240 241 243 244 245 248 250 254 255 256 258 260 275 279 295
    12 535 542 571 573 574 575 576 578 581 582 583 584 585 586 588
    13 485 520 523 526 527 529 530 531 532 536 537 538 539 540 541
    """
    folder_path = "/Users/sophiehuang/Documents/113-1/113-1-IRTM/HW3/IRTM"  # Replace with your folder path
    output_path = "output_tfidf_likelihood.csv"  # The output file path

    docs = load_documents(folder_path)
    # The call below already prints "Results saved to ..." once it has written
    # the file, so no second confirmation is printed here.
    classify_and_output_with_tfidf_likelihood(
        training_data,
        docs,
        output_path,
        tfidf_features=1000,  # Number of features to keep after TF-IDF
        lr_features=500       # Number of features to keep after Likelihood Ratio
    )

