In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from rank_bm25 import BM25Okapi
from konlpy.tag import Mecab
from collections import defaultdict

In [2]:
train_df = pd.read_csv('./data/labeled_train.csv')
print(len(train_df))
train_df = train_df[train_df['page'] != '없음']
print(len(train_df))

496
490


In [3]:
q_ids = train_df['SAMPLE_ID'].tolist()
q_source = train_df['Source'].tolist()
q_page = train_df['page'].tolist()
queries = train_df['Question'].tolist()

In [4]:
q_answers = []
for i, cell in enumerate(q_page):
    q_page[i] = [x.strip() for x in cell.split(",")]
    cell_answers = []
    for j, subcell in enumerate(q_page[i]):
        if '-' in subcell:
            q_page[i][j] = subcell.replace("-", "")
        else:
            q_page[i][j] = subcell + "0"
        cell_answers.append(q_source[i] + " " + q_page[i][j])
    q_answers.append(cell_answers)

In [5]:
doc_df = pd.read_csv('./data/processed_train.csv')

In [6]:
doc_ids = doc_df['doc_id'].tolist()
docs = doc_df['doc'].tolist()
print(len(doc_df))

1092


In [7]:
mecab = Mecab()
tokenized_doc = [mecab.morphs(doc) for doc in tqdm(docs)]
tokenized_q = [mecab.morphs(q) for q in tqdm(queries)]

  0%|          | 0/1092 [00:00<?, ?it/s]

100%|██████████| 1092/1092 [00:01<00:00, 983.85it/s] 
100%|██████████| 490/490 [00:00<00:00, 16149.56it/s]


In [8]:
grouped_docs = defaultdict(list)
grouped_tokens = defaultdict(list)
doc_id_mapping = defaultdict(list)

for doc_id, tokens, doc in zip(doc_ids, tokenized_doc, docs):
    key = doc_id.rsplit(' ', 1)[0]
    grouped_docs[key].append(doc)
    grouped_tokens[key].append(tokens)
    doc_id_mapping[key].append(doc_id)

bm25_dict = {key: BM25Okapi(grouped_tokens[key], b=1.0, k1=2.5) for key in grouped_tokens}


In [21]:
top_k = 20

recall1, recall5, recall10, recall20 = 0, 0, 0, 0

missed = []
for i, tq in enumerate(tqdm(tokenized_q)):

    source = q_source[i] # 어떤 pdf를 검색해라
    answers = q_answers[i]  # 그 pdf의 page list
    doc_scores = bm25_dict[source].get_scores(tq)
    top_indices = np.argsort(doc_scores)[-top_k:][::-1]

    for rank, idx in enumerate(top_indices):
        if doc_id_mapping[source][idx] in answers:
            if rank < 1:
                recall1 += 1
            if rank < 5:
                recall5 += 1
            if rank < 10:
                recall10 += 1
            if rank < 20:
                recall20 += 1
            break

print(recall1 / len(tokenized_q))
print(recall5 / len(tokenized_q))
print(recall10 / len(tokenized_q))
print(recall20 / len(tokenized_q))

print(missed)

100%|██████████| 490/490 [00:00<00:00, 880.21it/s] 

0.5489795918367347
0.8448979591836735
0.8938775510204081
0.9306122448979591
[]



