In [1]:
from datasets import concatenate_datasets, load_from_disk
from src.retrieval.retrieval import SparseRetrieval
from transformers import AutoTokenizer
from src.score.ranking import check_original_in_context, calculate_reverse_rank_score, calculate_linear_score
# Load the dataset saved on disk and show its splits.
org_dataset = load_from_disk('./data/train_dataset')
print(org_dataset)

# Merge the train and validation (dev) splits so that every question
# (4192 in total) is used as a retrieval test query.
query_splits = [
    org_dataset[split_name].flatten_indices()
    for split_name in ("train", "validation")
]
full_ds = concatenate_datasets(query_splits)

print("*" * 40, "query dataset", "*" * 40)
print(full_ds)

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['title', 'context', 'question', 'id', 'answers', 'document_id', '__index_level_0__'],
        num_rows: 3952
    })
    validation: Dataset({
        features: ['title', 'context', 'question', 'id', 'answers', 'document_id', '__index_level_0__'],
        num_rows: 240
    })
})
**************************************** query dataset ****************************************
Dataset({
    features: ['title', 'context', 'question', 'id', 'answers', 'document_id', '__index_level_0__'],
    num_rows: 4192
})


In [2]:
# Tokenizer whose `tokenize` method is handed to the retriever below;
# use_fast=False requests the non-"fast" tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-multilingual-cased",
    use_fast=False,
)

# Build the SparseRetrieval instance (imported above) in TF-IDF mode.
retriever_kwargs = {
    "tokenize_fn": tokenizer.tokenize,
    "data_path": "./data/",
    "context_path": "wikipedia_documents.json",
    "mode": "tfidf",
}
retriever = SparseRetrieval(**retriever_kwargs)

# Compute (or load) the sparse embedding before any retrieval call.
retriever.get_sparse_embedding()


Lengths of unique contexts : 56737
Building tfidf embedding...
Start Initializing...


Tokenizing...: 100%|██████████| 56737/56737 [01:59<00:00, 473.23it/s]


Building Vocab


빌딩 어휘: 100%|██████████| 56737/56737 [00:01<00:00, 50522.94it/s]


Calculate doc frequency


문서 빈도 계산: 100%|██████████| 56737/56737 [00:02<00:00, 20456.05it/s]


Current mode : tfidf
End Initialization
Finish Sklearn TF-IDF Embedding
New embeddings calculated and saved.
tfidf embedding shape: (56737, 50000)


In [3]:
# Retrieve the top-10 passages for every query in the merged dataset.
df = retriever.retrieve(full_ds, topk=10)

# Apply the row-wise scorers from src.score.ranking, one new column each.
# The order is kept as-is in case a later scorer reads an earlier column.
for score_column, score_fn in (
    ("correct", check_original_in_context),
    ("rmm_score", calculate_reverse_rank_score),
    ("linear_score", calculate_linear_score),
):
    df[score_column] = df.apply(score_fn, axis=1)

# Report each metric as the column sum divided by the number of rows.
for metric_label, score_column in (
    ("correct retrieval", "correct"),
    ("reverse rank retrieval", "rmm_score"),
    ("linear retrieval", "linear_score"),
):
    print(metric_label, df[score_column].sum() / len(df))

(4192, 50000) (56737, 50000)
result shape : (4192, 56737)
[query exhaustive search] done in 28.676 s


Sparse retrieval: 100%|██████████| 4192/4192 [00:01<00:00, 3929.99it/s]


correct retrieval 0.6063931297709924
reverse rank retrieval 0.23666993470962353
linear retrieval 0.4822570194053708


In [5]:
# Same evaluation as the top-10 run, but keeping only the best passage (topk=1).
# NOTE(review): the labels say "fiass search", yet this cell calls
# retriever.retrieve and the recorded log reports an exhaustive search --
# the labels look like a copy-paste leftover; confirm before relabeling.
# NOTE(review): the recorded output for this cell shows a 46680-wide
# vocabulary while the embedding build above reported 50000, so the retriever
# was presumably rebuilt in between; re-run from a fresh kernel to confirm.
df = retriever.retrieve(full_ds, topk=1)

# Row-wise scorers from src.score.ranking, each stored in its own column.
df["correct"] = df.apply(check_original_in_context, axis=1)
df["rmm_score"] = df.apply(calculate_reverse_rank_score, axis=1)
df["linear_score"] = df.apply(calculate_linear_score, axis=1)

# Each metric is the column sum divided by the number of rows.
num_queries = len(df)
correct_ratio = df["correct"].sum() / num_queries
reverse_rank_mean = df["rmm_score"].sum() / num_queries
linear_mean = df["linear_score"].sum() / num_queries

print("correct retrieval result by fiass search", correct_ratio)
print("mrr retrieval result by fiass search", reverse_rank_mean)
print("linear retrieval result by fiass search", linear_mean)

<class 'scipy.sparse._csr.csr_matrix'> (1, 46680)
(4192, 46680) <class 'scipy.sparse._csr.csr_matrix'>
유사도 계산
(4192, 46680) (56737, 46680)
유사도 후
result shape : (4192, 56737)
[query exhaustive search] done in 21.866 s


Sparse retrieval: 100%|██████████| 4192/4192 [00:00<00:00, 9681.34it/s]


correct retrieval result by fiass search 0.35854007633587787
mrr retrieval result by fiass search 0.35854007633587787
linear retrieval result by fiass search 0.35854007633587787


In [None]:
# Re-score whatever `df` currently holds in the kernel and print the metrics.
# NOTE(review): this cell has no execution count and performs no retrieval of
# its own, so its result depends on which earlier cell last assigned `df`;
# it also repeats the scoring code of the previous cell -- consider removing
# it or replacing the copies with one shared helper.
df["correct"] = df.apply(check_original_in_context, axis=1)
df["rmm_score"] = df.apply(calculate_reverse_rank_score, axis=1)
df["linear_score"] = df.apply(calculate_linear_score, axis=1)

# Printed label -> score column; each value is column sum / number of rows.
metric_columns = {
    "correct retrieval result by fiass search": "correct",
    "mrr retrieval result by fiass search": "rmm_score",
    "linear retrieval result by fiass search": "linear_score",
}
for metric_label, score_column in metric_columns.items():
    print(metric_label, df[score_column].sum() / len(df))