## 0. setting

In [1]:
import json
import pandas as pd
from rank_bm25 import BM25Okapi
from datasets import load_from_disk

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
with open('../data/wikipedia_documents.json', 'r', encoding='utf-8') as f:
    wiki_data = json.load(f)

documents = [v['text'] for v in wiki_data.values()]

In [None]:
documents

In [3]:
train_dataset = load_from_disk('../data/train_dataset')
train_split = train_dataset['train']

queries = [train_split[i]['question'] for i in range(len(train_split))]
correct_doc_ids = [train_split[i]['document_id'] for i in range(len(train_split))]

total_queries = 3952

In [4]:
topk = 20

## 1. experiment

#### (1) Base
- 공백 토크나이징
- 전처리 없음

In [5]:
def tokenize1(text):
    return text.split(' ')

tokenized_corpus_1 = [tokenize1(doc) for doc in documents]
bm25_1 = BM25Okapi(tokenized_corpus_1)

In [6]:
results1 = []

for i, query in enumerate(queries):
    tokenized_query = tokenize1(query)
    
    doc_scores = bm25_1.get_scores(tokenized_query)
    top_n_indices = doc_scores.argsort()[::-1][:topk]
    
    correct_doc_id = correct_doc_ids[i]
    rank = 21  
    
    for rank_idx, doc_index in enumerate(top_n_indices):
        if doc_index == correct_doc_id:
            rank = rank_idx + 1  
            break
    
    results1.append({
        'query_id': i,
        'question': query,
        'correct_document_id': correct_doc_id,
        'rank': rank
    })


In [7]:
results1_df = pd.DataFrame(results1)
results1_df.to_csv('./exp_result/exp1.csv', index=False)

within_20 = sum(results1_df['rank'] <= 20)
within_10 = sum(results1_df['rank'] <= 10)
within_5 = sum(results1_df['rank'] <= 5)

within_20_ratio = within_20 / total_queries * 100
within_10_ratio = within_10 / total_queries * 100
within_5_ratio = within_5 / total_queries * 100

print(f"Experiment 1 - Base':")
print(f"topk = 20: {within_20}, {within_20_ratio:.2f}%")
print(f"topk = 10: {within_10}, {within_10_ratio:.2f}%")
print(f"topk =  5: {within_5}, {within_5_ratio:.2f}%")

OSError: Cannot save file into a non-existent directory: 'exp_result'

#### (2) Remove '?'
- 공백 토크나이징
- question의 ? 만 제거

In [32]:
questions_no_question_mark = [question.replace('?', '') for question in questions]

def tokenize2(text):
    return text.split(' ')

tokenized_corpus_2 = [tokenize2(doc) for doc in documents]
bm25_2 = BM25Okapi(tokenized_corpus_2)


In [33]:
results2 = []

for i, query in enumerate(queries):
    tokenized_query = tokenize2(query)
    
    doc_scores = bm25_2.get_scores(tokenized_query)
    top_n_indices = doc_scores.argsort()[::-1][:topk]
    
    correct_doc_id = correct_doc_ids[i]
    rank = 21  
    
    for rank_idx, doc_index in enumerate(top_n_indices):
        if doc_index == correct_doc_id:
            rank = rank_idx + 1  
            break
    
    results2.append({
        'query_id': i,
        'question': query,
        'correct_document_id': correct_doc_id,
        'rank': rank
    })

In [34]:
results2_df = pd.DataFrame(results2)
results2_df.to_csv('./exp_result/exp2.csv', index=False)

within_20_2 = sum(results2_df['rank'] <= 20)
within_10_2 = sum(results2_df['rank'] <= 10)
within_5_2 = sum(results2_df['rank'] <= 5)

within_20_ratio_2 = within_20_2 / total_queries * 100
within_10_ratio_2 = within_10_2 / total_queries * 100
within_5_ratio_2 = within_5_2 / total_queries * 100

print(f"Exp 2 - Remove '?':")
print(f"topk = 20: {within_20_2}, {within_20_ratio_2:.2f}%")
print(f"topk = 10: {within_10_2}, {within_10_ratio_2:.2f}%")
print(f"topk =  5: {within_5_2}, {within_5_ratio_2:.2f}%")

Exp 2 - Remove '?':
topk = 20: 2514, 63.61%
topk = 10: 2341, 59.24%
topk =  5: 2144, 54.25%


#### (3) Okt tokenizer
- okt tokenizer 사용

In [35]:
from konlpy.tag import Okt

okt = Okt()
def tokenize3(text):
    return okt.morphs(text)

tokenized_corpus_3 = [tokenize3(doc) for doc in documents]
bm25_3 = BM25Okapi(tokenized_corpus_3)

In [36]:
results3 = []

for i, query in enumerate(queries):
    tokenized_query = tokenize3(query)
    doc_scores = bm25_3.get_scores(tokenized_query)
    top_n_indices = doc_scores.argsort()[::-1][:topk]
    
    correct_doc_id = correct_doc_ids[i]
    rank = 21  
    
    for rank_idx, doc_index in enumerate(top_n_indices):
        if doc_index == correct_doc_id:
            rank = rank_idx + 1  
            break
    
    results3.append({
        'query_id': i,
        'question': query,
        'correct_document_id': correct_doc_id,
        'rank': rank
    })

In [37]:
results3_df = pd.DataFrame(results3)
results3_df.to_csv('./exp_result/exp3.csv', index=False)

within_20_3 = sum(results3_df['rank'] <= 20)
within_10_3 = sum(results3_df['rank'] <= 10)
within_5_3 = sum(results3_df['rank'] <= 5)

within_20_ratio_3 = within_20_3 / total_queries * 100
within_10_ratio_3 = within_10_3 / total_queries * 100
within_5_ratio_3 = within_5_3 / total_queries * 100

print(f"Experiment 3 - Okt Tokenizer:")
print(f"topk = 20: {within_20_3}, {within_20_ratio_3:.2f}%")
print(f"topk = 10: {within_10_3}, {within_10_ratio_3:.2f}%")
print(f"topk =  5: {within_5_3}, {within_5_ratio_3:.2f}%")

Experiment 3 - Okt Tokenizer:
topk = 20: 3545, 89.70%
topk = 10: 3418, 86.49%
topk =  5: 3222, 81.53%


#### (4) Okt tokentizer & Remove 'Josa'
- okt tokenizer 사용
- 형태소 분석기를 통한 조사 제거 전처리

In [38]:
def remove_josa(tokens):
    return [word for word, tag in okt.pos(' '.join(tokens)) if tag != 'Josa']

tokenized_corpus_4 = [remove_josa(tokens) for tokens in tokenized_corpus_3]
bm25_4 = BM25Okapi(tokenized_corpus_4)

In [40]:
results4 = []

for i, query in enumerate(queries):
    tokenized_query = remove_josa(tokenize3(query))  
    doc_scores = bm25_4.get_scores(tokenized_query)
    top_n_indices = doc_scores.argsort()[::-1][:topk]
    
    correct_doc_id = correct_doc_ids[i]
    rank = 21  
    
    for rank_idx, doc_index in enumerate(top_n_indices):
        if doc_index == correct_doc_id:
            rank = rank_idx + 1 
            break
    
    results4.append({
        'query_id': i,
        'question': query,
        'correct_document_id': correct_doc_id,
        'rank': rank
    })

In [41]:
results4_df = pd.DataFrame(results4)
results4_df.to_csv('./exp_result/exp4.csv', index=False)

within_20_4 = sum(results4_df['rank'] <= 20)
within_10_4 = sum(results4_df['rank'] <= 10)
within_5_4 = sum(results4_df['rank'] <= 5)

within_20_ratio_4 = within_20_4 / total_queries * 100
within_10_ratio_4 = within_10_4 / total_queries * 100
within_5_ratio_4 = within_5_4 / total_queries * 100

print(f"Experiment 4 - Okt Tokenizer + remove 'Josa':")
print(f"topk = 20: {within_20_4}, {within_20_ratio_4:.2f}%")
print(f"topk = 10: {within_10_4}, {within_10_ratio_4:.2f}%")
print(f"topk =  5: {within_5_4}, {within_5_ratio_4:.2f}%")

Experiment 4 - Okt Tokenizer + remove 'Josa':
topk = 20: 3570, 90.33%
topk = 10: 3434, 86.89%
topk =  5: 3248, 82.19%


In [42]:
import pickle
import json

with open('./pickle/exp3.pkl', 'wb') as f:
    pickle.dump(tokenized_corpus_3 , f)

with open('./pickle/exp4.pkl', 'wb') as f:
    pickle.dump(tokenized_corpus_4 , f)

#### (5) Okt tokentizer & Remove 'Josa' simple
- okt tokenizer 사용
- 간단한 방식의 조사 제거 전처리

In [44]:
with open('./pickle/exp3.pkl', 'rb') as f:
    tokenized_corpus_3 = pickle.load(f)

def remove_josa(tokens):
    josa_tags = ['은', '는', '이', '가', '을', '를', '에', '의', '도']
    return [token for token in tokens if token not in josa_tags]

tokenized_corpus_5 = [remove_josa(tokens) for tokens in tokenized_corpus_3]
bm25_5 = BM25Okapi(tokenized_corpus_5)

In [45]:
results5 = []

for i, query in enumerate(queries):
    tokenized_query = remove_josa(tokenize3(query))  
    doc_scores = bm25_5.get_scores(tokenized_query)
    top_n_indices = doc_scores.argsort()[::-1][:topk]  

    correct_doc_id = correct_doc_ids[i]  
    rank = 21 
    
    for rank_idx, doc_index in enumerate(top_n_indices):
        if doc_index == correct_doc_id:
            rank = rank_idx + 1  
            break
    
    results5.append({
        'query_id': i,
        'question': query,
        'correct_document_id': correct_doc_id,
        'rank': rank
    })

In [47]:
results5_df = pd.DataFrame(results_exp5)
results5_df.to_csv('./exp_result/exp5.csv', index=False)

within_20_5 = sum(results5_df['rank'] <= 20)
within_10_5 = sum(results5_df['rank'] <= 10)
within_5_5 = sum(results5_df['rank'] <= 5)

within_20_ratio_5 = within_20_5 / total_queries * 100
within_10_ratio_5 = within_10_5 / total_queries * 100
within_5_ratio_5 = within_5_5 / total_queries * 100

print(f"Experiment 5 - Okt Tokenizer + remove 'Josa' simple:")
print(f"topk = 20: {within_20_5}, {within_20_ratio_5:.2f}%")
print(f"topk = 10: {within_10_5}, {within_10_ratio_5:.2f}%")
print(f"topk =  5: {within_5_5}, {within_5_ratio_5:.2f}%")

Experiment 5 - Okt Tokenizer + remove 'Josa' simple:
topk = 20: 3542, 89.63%
topk = 10: 3408, 86.23%
topk =  5: 3223, 81.55%


#### (6) Char tokenizer & Trigram
- character 단위 토크나이징
- Trigram

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def char_ngram_tokenize(text, n=3):
    chars = list(text)
    ngrams = [''.join(chars[i:i+n]) for i in range(len(chars)-n+1)]
    return ngrams

n = 3
tokenized_corpus_6 = [char_ngram_tokenize(doc, n=n) for doc in documents]
bm25_6 = BM25Okapi(tokenized_corpus_6)

In [None]:
results_exp6 = []

for i, query in enumerate(queries):
    tokenized_query = char_ngram_tokenize(query, n=n)
    doc_scores = bm25_6.get_scores(tokenized_query)
    top_n_indices = doc_scores.argsort()[::-1][:topk]  
    
    correct_doc_id = correct_doc_ids[i]  
    rank = 21
    
    for rank_idx, doc_index in enumerate(top_n_indices):
        if doc_index == correct_doc_id:
            rank = rank_idx + 1  
            break
    
    results_exp6.append({
        'query_id': i,
        'question': query,
        'correct_document_id': correct_doc_id,
        'rank': rank
    })

In [None]:
results6_df = pd.DataFrame(results5)
results6_df.to_csv('./exp_result/exp6.csv', index=False)

within_20_6 = sum(results6_df['rank'] <= 20)
within_10_6 = sum(results6_df['rank'] <= 10)
within_5_6 = sum(results6_df['rank'] <= 5)

within_20_ratio_6 = within_20_6 / total_queries * 100
within_10_ratio_6 = within_10_6 / total_queries * 100
within_5_ratio_6 = within_5_6 / total_queries * 100

print(f"Experiment 6 - char tokenizing & trigram:")
print(f"topk = 20: {within_20_6}, {within_20_ratio_6:.2f}%")
print(f"topk = 10: {within_10_6}, {within_10_ratio_6:.2f}%")
print(f"topk =  5: {within_5_6}, {within_5_ratio_6:.2f}%")