## 0. Setting

#### (1) Base Setting

In [84]:
import json
import pandas as pd
import numpy as np
from rank_bm25 import BM25Okapi, BM25Plus
from datasets import load_from_disk
from nltk import ngrams

In [62]:
# Load the wiki dump (a JSON object) and keep only each entry's 'text' field,
# in the order the entries appear in the file.
with open('./exp_data/unq_wikipedia_documents.json', encoding='utf-8') as wiki_file:
    wiki_data = json.load(wiki_file)

documents = [record['text'] for record in wiki_data.values()]

In [63]:
# Read the training questions; 'question' and 'doc_id' are pulled out as
# parallel Python lists so they can be indexed together later.
train_dataset = pd.read_csv('./exp_data/unq_train_dataset.csv')

queries = train_dataset['question'].tolist()
correct_doc_ids = train_dataset['doc_id'].tolist()

total_queries = train_dataset.shape[0]

#### (2) Experiment Module

In [74]:
def apply_n_gram(tokens, n):
    """Turn a token sequence into space-joined n-gram strings.

    :param tokens: tokens produced by a tokenizer
    :param n: n-gram size; with 1 the input is handed back untouched
    :return: ``tokens`` itself when ``n == 1``, otherwise a new list where each
        item is one n-gram from ``nltk.ngrams`` joined with single spaces
    """
    if n != 1:
        joined_grams = []
        for gram in ngrams(tokens, n):
            joined_grams.append(' '.join(gram))
        return joined_grams
    return tokens

In [75]:
def get_bm25_model(corpus, tokenizer_fn, model_type="okapi", n_gram=1):
    """Build a BM25 index over ``corpus``.

    :param corpus: iterable of documents; each one is passed to ``tokenizer_fn``
    :param tokenizer_fn: callable mapping one document to its tokens
    :param model_type: ``"okapi"`` for ``BM25Okapi`` or ``"plus"`` for ``BM25Plus``
    :param n_gram: n-gram size applied to every tokenized document via
        ``apply_n_gram`` (1 keeps the tokens as they are)
    :return: the BM25 model built from the tokenized corpus
    :raises ValueError: if ``model_type`` is neither ``"okapi"`` nor ``"plus"``
    """
    # Resolve the model class first so that a bad model_type fails right away
    # instead of after the whole corpus has already been tokenized.
    if model_type == "okapi":
        model_cls = BM25Okapi
    elif model_type == "plus":
        model_cls = BM25Plus
    else:
        raise ValueError("model_type은 'okapi' 또는 'plus' 중 하나여야 합니다.")

    tokenized_corpus = [apply_n_gram(tokenizer_fn(doc), n_gram) for doc in corpus]

    return model_cls(tokenized_corpus)

In [65]:
def run_bm25_experiment(queries, correct_doc_ids, bm25_model, tokenize_fn, topk):
    """Score every query with the BM25 model and record where its answer landed.

    :param queries: list of query strings
    :param correct_doc_ids: answer ids, indexed in parallel with ``queries``;
        each one is compared against the positions returned by ``argsort``
        over the model's scores
    :param bm25_model: object whose ``get_scores(tokens)`` returns a score array
    :param tokenize_fn: callable turning a query into tokens
    :param topk: number of best-scoring documents inspected per query
    :return: list with one dict per query holding ``query_id``, ``question``,
        ``correct_document_id``, ``rank`` (1-based, or ``topk + 1`` when the
        answer is not among the top-k) and ``incorrect_top5`` (the first five
        retrieved indices on a miss, otherwise ``None``)
    """
    # Sentinel rank meaning "answer was not found in the top-k".
    miss_rank = topk + 1
    records = []

    for query_id, question in enumerate(queries):
        scores = bm25_model.get_scores(tokenize_fn(question))
        # Highest score first, truncated to the k best.
        ranked_indices = scores.argsort()[::-1][:topk]

        answer_id = correct_doc_ids[query_id]
        rank = next(
            (
                position
                for position, doc_index in enumerate(ranked_indices, start=1)
                if doc_index == answer_id
            ),
            miss_rank,
        )

        if rank == miss_rank:
            wrong_top5 = ranked_indices[:5].tolist()
        else:
            wrong_top5 = None

        records.append({
            'query_id': query_id,
            'question': question,
            'correct_document_id': answer_id,
            'rank': rank,
            'incorrect_top5': wrong_top5
        })

    return records

In [94]:
def analyze_experiment_results(results, total_queries, output_path):
    """Save per-query results to CSV and report top-20/10/5 hit rates.

    :param results: list of per-query dicts with at least a ``rank`` key; when
        an ``incorrect_top5`` key is present, rows where it is set are treated
        as misses regardless of their ``rank`` value
    :param total_queries: denominator for the percentages
    :param output_path: CSV file the results table is written to
    :return: tuple ``(within_20_ratio, within_10_ratio, within_5_ratio)`` in
        percent; all zeros when ``total_queries`` is 0
    """
    results_df = pd.DataFrame(results)
    results_df.to_csv(output_path, index=False)

    if 'rank' in results_df.columns:
        hit_ranks = results_df['rank']
        if 'incorrect_top5' in results_df.columns:
            # A miss is stored with the sentinel rank topk + 1, which falls
            # below the 10/20 cut-offs when topk is small. Rows that carry an
            # incorrect_top5 list are misses, so keep them out of the counts.
            hit_ranks = hit_ranks[results_df['incorrect_top5'].isna()]
    else:
        # An empty result list produces a frame without a 'rank' column.
        hit_ranks = pd.Series([], dtype='int64')

    def _count_and_ratio(cutoff):
        # Number of hits at or above the cutoff and its share of all queries.
        count = int((hit_ranks <= cutoff).sum())
        ratio = count / total_queries * 100 if total_queries else 0.0
        return count, ratio

    within_20, within_20_ratio = _count_and_ratio(20)
    within_10, within_10_ratio = _count_and_ratio(10)
    within_5, within_5_ratio = _count_and_ratio(5)

    print(f"Experiment Results:")
    print(f"topk = 20: {within_20} ({within_20_ratio:.2f}%)")
    print(f"topk = 10: {within_10} ({within_10_ratio:.2f}%)")
    print(f"topk =  5: {within_5} ({within_5_ratio:.2f}%)")

    return within_20_ratio, within_10_ratio, within_5_ratio

In [76]:
def perform_experiment(queries, correct_doc_ids, documents, tokenize_fn, model_type, topk, total_queries, output_path, n_gram=1):
    """
    Run one retrieval experiment: build the index, rank the queries, report hit rates.

    :param queries: list of questions
    :param correct_doc_ids: list with the answer document id for each question
    :param documents: list of documents to search
    :param tokenize_fn: function used to tokenize both queries and documents
    :param model_type: BM25 model type to use ('okapi' or 'plus')
    :param topk: number of top documents retrieved per query
    :param total_queries: total number of queries
    :param output_path: file path the results are saved to
    :param n_gram: n used for n-grams (default 1, i.e. plain tokenization)
    """
    def tokenize_query(query):
        # Queries get the same tokenizer + n-gram treatment as the documents.
        return apply_n_gram(tokenize_fn(query), n_gram)

    bm25_model = get_bm25_model(documents, tokenize_fn, model_type, n_gram)
    results = run_bm25_experiment(
        queries,
        correct_doc_ids,
        bm25_model,
        tokenize_query,
        topk,
    )
    analyze_experiment_results(results, total_queries, output_path)

#### (3) Tokenizer

In [78]:
def blank_tokenize(text):
    """Split ``text`` on every single space character.

    Consecutive spaces yield empty-string tokens, and other whitespace
    (tabs, newlines) stays inside the tokens.
    """
    separator = ' '
    return text.split(separator)

# Preview: space-split the first three questions and print up to ten tokens each.
sample_queries = list(map(blank_tokenize, queries[:3]))
for i, query in enumerate(sample_queries):
    print(f"질문 {i+1}: {query[:10]}...")

질문 1: ['대통령을', '포함한', '미국의', '행정부', '견제권을', '갖는', '국가', '기관은?']...
질문 2: ['현대적', '인사조직관리의', '시발점이', '된', '책은?']...
질문 3: ['강희제가', '1717년에', '쓴', '글은', '누구를', '위해', '쓰여졌는가?']...


In [79]:
from konlpy.tag import Okt

# Single Okt instance reused by the Okt-based tokenizers in this file.
okt = Okt()
def okt_tokenize(text):
    """Return the morphemes of ``text`` as produced by ``okt.morphs``."""
    morphemes = okt.morphs(text)
    return morphemes

# Preview: Okt morphemes of the first three questions, up to ten tokens each.
sample_queries = list(map(okt_tokenize, queries[:3]))
for i, query in enumerate(sample_queries):
    print(f"질문 {i+1}: {query[:10]}...")

질문 1: ['대통령', '을', '포함', '한', '미국', '의', '행정부', '견제', '권', '을']...
질문 2: ['현대', '적', '인사', '조직', '관리', '의', '시발', '점', '이', '된']...
질문 3: ['강희제', '가', '1717년', '에', '쓴', '글', '은', '누구', '를', '위해']...


In [87]:
def okt_tokenize_remove_josa(text):
    """Tokenize with Okt POS tagging and drop every token tagged ``'Josa'``.

    ``okt.pos`` is called with ``norm=True`` and ``stem=True``, so the words
    returned are whatever normalized/stemmed forms it produces.
    """
    kept_words = []
    for word, pos in okt.pos(text, norm=True, stem=True):
        if pos == 'Josa':
            continue
        kept_words.append(word)

    return kept_words

# Preview: Okt tokens without 'Josa' for the first three questions, up to ten each.
sample_queries = list(map(okt_tokenize_remove_josa, queries[:3]))
for i, query in enumerate(sample_queries):
    print(f"질문 {i+1}: {query[:10]}...")

질문 1: ['대통령', '포함', '미국', '행정부', '견제', '권', '갖다', '국가', '기관', '?']...
질문 2: ['현대', '적', '인사', '조직', '관리', '시발', '점', '되다', '책', '?']...
질문 3: ['강희제', '1717년', '에', '쓸다', '글', '누구', '위해', '쓰이다', '?']...


In [88]:
def char_tokenize(text):
    """Split ``text`` into single characters, skipping space characters.

    Only the plain space ``" "`` is removed; tabs and newlines are kept.
    """
    return [char for char in text if char != " "]

# Preview: space-free character tokens of the first three questions, up to ten each.
sample_queries = list(map(char_tokenize, queries[:3]))
for i, query in enumerate(sample_queries):
    print(f"질문 {i+1}: {query[:10]}...")

질문 1: ['대', '통', '령', '을', '포', '함', '한', '미', '국', '의']...
질문 2: ['현', '대', '적', '인', '사', '조', '직', '관', '리', '의']...
질문 3: ['강', '희', '제', '가', '1', '7', '1', '7', '년', '에']...


In [89]:
def char_tokenize_space(text):
    """Split ``text`` into single characters, keeping spaces as tokens."""
    return [*text]

# Preview: character tokens (spaces included) of the first three questions, up to ten each.
sample_queries = list(map(char_tokenize_space, queries[:3]))
for i, query in enumerate(sample_queries):
    print(f"질문 {i+1}: {query[:10]}...")

질문 1: ['대', '통', '령', '을', ' ', '포', '함', '한', ' ', '미']...
질문 2: ['현', '대', '적', ' ', '인', '사', '조', '직', '관', '리']...
질문 3: ['강', '희', '제', '가', ' ', '1', '7', '1', '7', '년']...


In [93]:
from transformers import ElectraTokenizer

# Tokenizer loaded once from the pretrained checkpoint and shared by koelectra_tokenize.
monologg_tokenizer = ElectraTokenizer.from_pretrained(
    "monologg/koelectra-base-v3-finetuned-korquad"
)

def koelectra_tokenize(text):
    """Return the tokens ``monologg_tokenizer.tokenize`` produces for ``text``."""
    pieces = monologg_tokenizer.tokenize(text)
    return pieces

# Preview: KoELECTRA tokens of the first three questions, up to ten each.
sample_queries = list(map(koelectra_tokenize, queries[:3]))
for i, query in enumerate(sample_queries):
    print(f"질문 {i+1}: {query[:10]}...")



질문 1: ['대통령', '##을', '포함', '##한', '미국', '##의', '행정부', '견제', '##권', '##을']...
질문 2: ['현대', '##적', '인사', '##조', '##직', '##관리', '##의', '시발점', '##이', '된']...
질문 3: ['강희', '##제', '##가', '171', '##7', '##년', '##에', '쓴', '글', '##은']...


## 1. Experiment

#### (1) Blank & Plus

In [81]:
# Space-split tokens, BM25Plus, unigrams.
perform_experiment(
    tokenize_fn=blank_tokenize,
    model_type="plus",
    n_gram=1,
    queries=queries,
    correct_doc_ids=correct_doc_ids,
    documents=documents,
    total_queries=len(queries),
    topk=20,
    output_path='./exp_result/blank_plus_n1.csv',
)

Experiment Results:
topk = 20: 2516, 63.66%
topk = 10: 2356, 59.62%
topk =  5: 2166, 54.81%


#### (2) Blank & Okapi

In [82]:
# Space-split tokens, BM25Okapi, unigrams.
# The results go to their own okapi-specific CSV; writing to
# 'blank_plus_n1.csv' would overwrite the BM25Plus run's saved results.
perform_experiment(
    queries=queries,
    correct_doc_ids=correct_doc_ids,
    documents=documents,
    tokenize_fn=blank_tokenize,
    model_type="okapi",
    topk=20,
    total_queries=len(queries),
    output_path='./exp_result/blank_okapi_n1.csv',
    n_gram=1
)

Experiment Results:
topk = 20: 2518, 63.71%
topk = 10: 2352, 59.51%
topk =  5: 2155, 54.53%


#### (3) Blank & Plus & Bigram

In [85]:
# Space-split tokens, BM25Plus, bigrams of adjacent tokens.
perform_experiment(
    tokenize_fn=blank_tokenize,
    model_type="plus",
    n_gram=2,
    queries=queries,
    correct_doc_ids=correct_doc_ids,
    documents=documents,
    total_queries=len(queries),
    topk=20,
    output_path='./exp_result/blank_plus_n2.csv',
)

Experiment Results:
topk = 20: 1103, 27.91%
topk = 10: 1057, 26.75%
topk =  5: 978, 24.75%


#### (4) Char & Plus & Bigram

In [90]:
# Character tokens (spaces removed), BM25Plus, character bigrams.
perform_experiment(
    tokenize_fn=char_tokenize,
    model_type="plus",
    n_gram=2,
    queries=queries,
    correct_doc_ids=correct_doc_ids,
    documents=documents,
    total_queries=len(queries),
    topk=20,
    output_path='./exp_result/char_plus_n2.csv',
)

Experiment Results:
topk = 20: 3627, 91.78%
topk = 10: 3496, 88.46%
topk =  5: 3333, 84.34%


#### (5) Monologg & Plus

In [95]:
# KoELECTRA tokens, BM25Plus, unigrams.
# NOTE(review): the other runs name their files '..._n<n_gram>.csv'; this one
# ends in '_n.csv' — confirm whether 'koele_plus_n1.csv' was intended.
perform_experiment(
    tokenize_fn=koelectra_tokenize,
    model_type="plus",
    n_gram=1,
    queries=queries,
    correct_doc_ids=correct_doc_ids,
    documents=documents,
    total_queries=len(queries),
    topk=20,
    output_path='./exp_result/koele_plus_n.csv',
)

Experiment Results:
topk = 20: 3585 (90.71%)
topk = 10: 3449 (87.27%)
topk =  5: 3278 (82.95%)
