In [1]:
import os
import json
import time
import faiss
import pickle
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from contextlib import contextmanager
from typing import List, Tuple, NoReturn, Any, Optional, Union
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer
from datasets import (
    Dataset,
    load_from_disk,
    concatenate_datasets,
    Features,
    Value,
    DatasetDict,
)
from retrieval import *
from tqdm import tqdm

In [2]:
dataset = "../data/train_dataset"
org_dataset = load_from_disk(dataset)
tokenizer = AutoTokenizer.from_pretrained(
        "klue/bert-base",
        use_fast=False,
        )
retriever = SparseRetrieval(
        tokenize_fn=tokenizer.tokenize,
        data_path="../data/",
        context_path="wikipedia_documents.json",
    )
full_ds = concatenate_datasets(
        [
            org_dataset["train"].flatten_indices(),
            org_dataset["validation"].flatten_indices(),
        ]
    )

Loading cached processed dataset at ../data/train_dataset/train/cache-fbc57aa6e699fb0c.arrow
Loading cached processed dataset at ../data/train_dataset/validation/cache-d2fba0c42123b1d6.arrow


Lengths of unique contexts : 56737


In [7]:
def topk_experiment(topK_list):
    result_dict = {}
    retriever.get_sparse_embedding()
    for topK in tqdm(topK_list):
        result_retriever = retriever.retrieve(full_ds,topk = topK)
        correct = 0
        for index in range(len(result_retriever)):
            if  result_retriever['original_context'][index][:200] in result_retriever['context'][index]:
                correct += 1
        result_dict[topK] = correct/len(result_retriever)
    return result_dict

In [6]:
topK_list = [1,10,20]
result = topk_experiment(topK_list)
result

Embedding pickle load.


  0%|          | 0/3 [00:00<?, ?it/s]

[query exhaustive search] done in 44.737 s


Sparse retrieval:   0%|          | 0/4192 [00:00<?, ?it/s]

 33%|███▎      | 1/3 [00:45<01:30, 45.24s/it]

[query exhaustive search] done in 45.206 s


Sparse retrieval:   0%|          | 0/4192 [00:00<?, ?it/s]

 67%|██████▋   | 2/3 [01:31<00:45, 45.55s/it]

[query exhaustive search] done in 45.672 s


Sparse retrieval:   0%|          | 0/4192 [00:00<?, ?it/s]

100%|██████████| 3/3 [02:17<00:00, 45.82s/it]


{1: 0.2738549618320611, 10: 0.6383587786259542, 20: 0.7299618320610687}

In [8]:
topK_list = [1,10,20]
result = topk_experiment(topK_list)
result

Embedding pickle load.


  0%|          | 0/3 [00:00<?, ?it/s]

[query exhaustive search] done in 44.772 s


Sparse retrieval:   0%|          | 0/4192 [00:00<?, ?it/s]

 33%|███▎      | 1/3 [00:45<01:30, 45.26s/it]

[query exhaustive search] done in 45.169 s


Sparse retrieval:   0%|          | 0/4192 [00:00<?, ?it/s]

 67%|██████▋   | 2/3 [01:31<00:45, 45.75s/it]

[query exhaustive search] done in 45.637 s


Sparse retrieval:   0%|          | 0/4192 [00:00<?, ?it/s]

100%|██████████| 3/3 [02:17<00:00, 45.90s/it]


{1: 0.25190839694656486, 10: 0.6357347328244275, 20: 0.7278148854961832}

In [83]:
retriever.get_sparse_embedding()

Embedding pickle load.


In [18]:
df = retriever.retrieve(full_ds,topk = 20)

[query exhaustive search] done in 45.469 s


Sparse retrieval:   0%|          | 0/4192 [00:00<?, ?it/s]

In [19]:
df["correct"] = df["original_context"] == df["context"]
print("correct retrieval result by exhaustive search",
        df["correct"].sum() / len(df),)

correct retrieval result by exhaustive search 0.0


In [152]:
df["context"][1][:620] ==  df['original_context'][1][:620]

True

In [153]:
df["context"][1][620:630] ==  df['original_context'][1][620:630]

False

In [9]:
df['context'][1][500:630]

NameError: name 'df' is not defined

In [155]:
df['original_context'][1][500:630]

"해에서 다양한 기능을 인사조직관리의 목적, 경영의 목적을 위해서 다양한 분야를 통합하여 '유기적 기업 조직' 이해로 전환되었다. 이 통합적 접근방식은 과정, 시스템, 상황을 중심으로 하는 인사조직관리 방식을 형성했다."

In [126]:
correct = 0
for index in range(len(df)):
    if df["context"][index][:400] == df['original_context'][index][:400]:
        correct += 1
print(correct/len(df))

0.2555668016194332


In [156]:
df['context'][1][:200] in df['original_context'][1]

True

In [157]:
len(df["context"][1]), len(df['original_context'][1])

(70376, 621)

In [150]:
result = topk_experiment(topK_list)

Embedding pickle load.


  0%|          | 0/8 [00:00<?, ?it/s]

[query exhaustive search] done in 46.434 s


Sparse retrieval:   0%|          | 0/4192 [00:00<?, ?it/s]

 12%|█▎        | 1/8 [00:46<05:28, 46.91s/it]

[query exhaustive search] done in 46.876 s


Sparse retrieval:   0%|          | 0/4192 [00:00<?, ?it/s]

 25%|██▌       | 2/8 [01:34<04:43, 47.20s/it]

[query exhaustive search] done in 47.391 s


Sparse retrieval:   0%|          | 0/4192 [00:00<?, ?it/s]

 38%|███▊      | 3/8 [02:22<03:57, 47.58s/it]

[query exhaustive search] done in 46.819 s


Sparse retrieval:   0%|          | 0/4192 [00:00<?, ?it/s]

 50%|█████     | 4/8 [03:10<03:10, 47.69s/it]

[query exhaustive search] done in 46.514 s


Sparse retrieval:   0%|          | 0/4192 [00:00<?, ?it/s]

 62%|██████▎   | 5/8 [03:57<02:22, 47.50s/it]

[query exhaustive search] done in 45.790 s


Sparse retrieval:   0%|          | 0/4192 [00:00<?, ?it/s]

 75%|███████▌  | 6/8 [04:44<01:34, 47.21s/it]

[query exhaustive search] done in 45.502 s


Sparse retrieval:   0%|          | 0/4192 [00:00<?, ?it/s]

 88%|████████▊ | 7/8 [05:30<00:47, 47.08s/it]

[query exhaustive search] done in 44.790 s


Sparse retrieval:   0%|          | 0/4192 [00:00<?, ?it/s]

100%|██████████| 8/8 [06:18<00:00, 47.27s/it]


In [151]:
result

{1: 0.2738549618320611,
 5: 0.5415076335877863,
 10: 0.6383587786259542,
 20: 0.7299618320610687,
 30: 0.7736164122137404,
 60: 0.8358778625954199,
 100: 0.8730916030534351,
 200: 0.9131679389312977}