In [1]:
import numpy as np
import pandas as pd

def load_data(documents_path, train_questions_path, test_questions_path):
    try:
        documents = pd.read_csv(documents_path)
        train_questions = pd.read_csv(train_questions_path)
        test_questions = pd.read_csv(test_questions_path)
        return documents, train_questions, test_questions
    except FileNotFoundError as e:
        print(f"Error: File not found. {e}")
        return None, None, None

def preprocess_text(text):
    punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    for p in punctuation:
        text = text.replace(p, ' ')
    words = text.lower().split()
    stopwords = {'the', 'a', 'an', 'in', 'on', 'at', 'to', 'for', 'of', 'and', 'or', 'but'}
    return [word for word in words if word not in stopwords]

def create_vocabulary_and_doc_term_matrix(documents):
    all_words = []
    for doc in documents:
        all_words.extend(preprocess_text(doc))
    
    vocabulary = sorted(set(all_words))
    vocab_dict = {word: idx for idx, word in enumerate(vocabulary)}
    
    matrix = np.zeros((len(documents), len(vocabulary)))
    
    for i, doc in enumerate(documents):
        words = preprocess_text(doc)
        unique_words, counts = np.unique(words, return_counts=True)
        for word, count in zip(unique_words, counts):
            if word in vocab_dict:
                matrix[i, vocab_dict[word]] = count
    
    df_vector = np.sum(matrix > 0, axis=0)
    
    return vocabulary, matrix, df_vector

def vector_model(doc_matrix, query_vector, N, df_vector):
    idf = np.log((N + 1) / (df_vector + 1)) + 1
    query_weights = np.where(query_vector > 0, 1 + np.log(query_vector), 0) * idf
    query_norm = np.linalg.norm(query_weights)

    doc_weights = np.where(doc_matrix > 0, 1 + np.log(doc_matrix), 0) * idf
    doc_norms = np.linalg.norm(doc_weights, axis=1)

    similarities = np.dot(doc_weights, query_weights) / (doc_norms * query_norm + 1e-8)
    return similarities

class BM25:
    def __init__(self, documents, k1=1.5, b=0.75):
        self.documents = documents
        self.vocabulary, self.doc_term_matrix, self.df_vector = create_vocabulary_and_doc_term_matrix(documents)
        self.doc_count = len(documents)
        self.avgdl = np.mean(np.sum(self.doc_term_matrix, axis=1))
        self.k1 = k1
        self.b = b
        self.idf = self._calculate_idf()
        self.doc_lens = np.sum(self.doc_term_matrix, axis=1)
        self.denominator_base = self.k1 * (1 - self.b + self.b * self.doc_lens / self.avgdl)

    def _calculate_idf(self):
        return np.log((self.doc_count - self.df_vector + 0.5) / (self.df_vector + 0.5) + 1)

    def get_scores(self, query):
        query_terms = preprocess_text(query)
        query_vector = np.zeros(len(self.vocabulary))
        for term in query_terms:
            if term in set(self.vocabulary):
                query_vector[self.vocabulary.index(term)] += 1
        
        scores = np.zeros(self.doc_count)
        query_term_indices = [i for i, count in enumerate(query_vector) if count > 0]
        
        for idx in query_term_indices:
            f = self.doc_term_matrix[:, idx]
            scores += (self.idf[idx] * f * (self.k1 + 1) /
                      (f + self.denominator_base))
        
        return scores

def get_top_3(scores, doc_ids):
    top_3_indices = np.argpartition(scores, -3)[-3:]
    top_3_indices = top_3_indices[np.argsort(-scores[top_3_indices])]
    return [doc_ids[i] for i in top_3_indices]

def combine_scores(vector_scores, bm25_scores, vector_model_weight=0.5, bm25_weight=0.5):
    combined_scores = (vector_model_weight * vector_scores) + (bm25_weight * bm25_scores)
    return combined_scores

def main():
    documents_path = "/kaggle/input/ir-hw1/documents_data.csv"
    train_questions_path = "/kaggle/input/ir-hw1/test_question.csv"
    test_questions_path = "/kaggle/input/ir-hw1/test_question.csv"
    output_path = "/kaggle/working/submission.csv"

    documents, train_questions, test_questions = load_data(documents_path, train_questions_path, test_questions_path)
    if documents is None or train_questions is None or test_questions is None:
        return

    doc_texts = documents['Document_HTML'].tolist()
    doc_ids = documents['Document ID'].tolist()
    test_questions_texts = test_questions['Question'].tolist()
    total_questions = len(test_questions_texts)

    vocabulary, doc_term_matrix, df_vector = create_vocabulary_and_doc_term_matrix(doc_texts)
    N = len(doc_texts)
    bm25 = BM25(doc_texts)
    
    print(f"Total questions to process: {total_questions}")
    results = []
    for i, query in enumerate(test_questions_texts, 1):
        print(f"Processing question {i}/{total_questions}")
        
        query_vector = np.zeros(len(vocabulary))
        for term in preprocess_text(query):
            if term in vocabulary:
                query_vector[vocabulary.index(term)] += 1

        vector_similarities = vector_model(doc_term_matrix, query_vector, N, df_vector)
        bm25_scores = bm25.get_scores(query)

        # 調整權重，向量模型權重增加
        combined_scores = combine_scores(vector_similarities, bm25_scores, vector_model_weight=0.4, bm25_weight=0.6)
        combined_top_3_ids = get_top_3(combined_scores, doc_ids)
        results.append(' '.join(map(str, combined_top_3_ids)))

    df = pd.read_csv("/kaggle/input/ir-hw1/sample_submission.csv")
    df['answer'] = results
    df.to_csv(output_path, index=False)
    print("Submission file created: sample_submission.csv")

if __name__ == "__main__":
    main()



Total questions to process: 400
Processing question 1/400


  query_weights = np.where(query_vector > 0, 1 + np.log(query_vector), 0) * idf
  doc_weights = np.where(doc_matrix > 0, 1 + np.log(doc_matrix), 0) * idf


Processing question 2/400
Processing question 3/400
Processing question 4/400
Processing question 5/400
Processing question 6/400
Processing question 7/400
Processing question 8/400
Processing question 9/400
Processing question 10/400
Processing question 11/400
Processing question 12/400
Processing question 13/400
Processing question 14/400
Processing question 15/400
Processing question 16/400
Processing question 17/400
Processing question 18/400
Processing question 19/400
Processing question 20/400
Processing question 21/400
Processing question 22/400
Processing question 23/400
Processing question 24/400
Processing question 25/400
Processing question 26/400
Processing question 27/400
Processing question 28/400
Processing question 29/400
Processing question 30/400
Processing question 31/400
Processing question 32/400
Processing question 33/400
Processing question 34/400
Processing question 35/400
Processing question 36/400
Processing question 37/400
Processing question 38/400
Processin