In [1]:
from transformers import AutoTokenizer
from datasets import load_from_disk

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd

import os
import time
import json
import pickle
from contextlib import contextmanager

In [2]:
with open('../../data/wikipedia_documents.json', 'r', encoding='utf-8') as f:
    wiki = json.load(f)

wiki_contents = list(dict.fromkeys([v['text'] for v in wiki.values()]))

In [3]:
print('wikipedia data length:', len(wiki.keys()))

print('wikipedia set data length:', len(wiki_contents))

wikipedia data length: 60613
wikipedia set data length: 56737


In [4]:
# MODEL_NAME = 'bert-base-multilingual-cased'
MODEL_NAME = 'monologg/kobigbird-bert-base'
MAX_FEATURES = 100000

# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

MODEL_NAME = MODEL_NAME.split('/')[-1]
sparse_embedding_path = f'../../data/{MODEL_NAME}_spare_embedding_{MAX_FEATURES}.bin'
tfidfv_path = f'../../data/{MODEL_NAME}_tfidfv_{MAX_FEATURES}.bin'

if os.path.isfile(sparse_embedding_path) and os.path.isfile(tfidfv_path):
    with open(sparse_embedding_path, "rb") as file:
        wiki_tfidf = pickle.load(file)
    with open(tfidfv_path, "rb") as file:
        tfidfv = pickle.load(file)
    print("Embedding pickle load.")
else:
    print("Build passage embedding")
    tfidfv = TfidfVectorizer(
        tokenizer=tokenizer.tokenize,
        ngram_range=(1, 2),
        max_features=MAX_FEATURES,
    )
    wiki_tfidf = tfidfv.fit_transform(wiki_contents)
    print(wiki_tfidf.shape)
    with open(sparse_embedding_path, "wb") as file:
        pickle.dump(wiki_tfidf, file)
    with open(tfidfv_path, "wb") as file:
        pickle.dump(tfidfv, file)
    print("Embedding pickle saved.")

Embedding pickle load.


In [5]:
print('wiki TF-IDF shape:', wiki_tfidf.shape)

wiki TF-IDF shape: (56737, 100000)


In [6]:
org_dataset = load_from_disk('../../data/train_dataset')

validation_query_tfidf = tfidfv.transform(org_dataset['validation']['question'])

In [7]:
result = validation_query_tfidf * wiki_tfidf.T

In [8]:
if not isinstance(result, np.ndarray):
    result = result.toarray()

In [9]:
result.shape

(240, 56737)

In [10]:
@contextmanager
def timer(name):
    t0 = time.time()
    yield
    print(f"[{name}] done in {time.time() - t0:.3f} s")

In [12]:
for i in range(1, 21):
    with timer(f'TOP K: {i}'):
        TOPK = i
        doc_scores = []
        doc_indices = []
        for j in range(result.shape[0]):
            sorted_result = np.argsort(result[j, :])[::-1]
            doc_scores.append(result[j, :][sorted_result].tolist()[:TOPK])
            doc_indices.append(sorted_result.tolist()[:TOPK])

        # Context Accuracy
        # correct = 0
        # for idx, doc_indice in enumerate(doc_indices):
        #     for jdx, indice in enumerate(doc_indice):
        #         if org_dataset['validation']['context'][idx] == wiki_contents[indice]:
        #             correct += 1
        
        # Label Accuracy
        correct = 0
        for idx, doc_indice in enumerate(doc_indices):
            for jdx, indice in enumerate(doc_indice):
                if org_dataset['validation']['answers'][idx]['text'][0] in wiki_contents[indice]:
                    correct += 1
                    break

        print(f"Total Validation Score: {correct/len(org_dataset['validation'])*100}%")

Total Validation Score: 42.083333333333336%
[TOP K: 1] done in 2.478 s
Total Validation Score: 50.416666666666664%
[TOP K: 2] done in 2.865 s
Total Validation Score: 60.416666666666664%
[TOP K: 3] done in 3.123 s
Total Validation Score: 65.41666666666667%
[TOP K: 4] done in 3.374 s
Total Validation Score: 69.16666666666667%
[TOP K: 5] done in 3.575 s
Total Validation Score: 71.66666666666667%
[TOP K: 6] done in 4.269 s
Total Validation Score: 73.33333333333333%
[TOP K: 7] done in 3.948 s
Total Validation Score: 75.41666666666667%
[TOP K: 8] done in 4.132 s
Total Validation Score: 77.08333333333334%
[TOP K: 9] done in 4.298 s
Total Validation Score: 77.91666666666667%
[TOP K: 10] done in 4.382 s
Total Validation Score: 78.33333333333333%
[TOP K: 11] done in 4.523 s
Total Validation Score: 78.75%
[TOP K: 12] done in 4.676 s
Total Validation Score: 78.75%
[TOP K: 13] done in 4.807 s
Total Validation Score: 78.75%
[TOP K: 14] done in 4.920 s
Total Validation Score: 80.41666666666667%
[TOP 

In [None]:
# train validation to csv

# df = pd.DataFrame(result)
# df.to_csv(f'Train-Validation_{MODEL_NAME}_TF-IDF_{MAX_FEATURES}.csv', index=False)
# df

In [None]:
# test validation to csv

# org_dataset_test = load_from_disk('../../data/test_dataset')
# print(org_dataset_test)

# test_query_tfidf = tfidfv.transform(org_dataset_test['validation']['question'])
# print('Test Validation Shape:', test_query_tfidf.shape)

# result_test = test_query_tfidf * wiki_tfidf.T
# if not isinstance(result_test, np.ndarray):
#     result_test = result_test.toarray()
# print('Scores Shape & Type:', result_test.shape, type(result_test))

# df = pd.DataFrame(result_test)
# df.to_csv(f'Test-Validation_{MODEL_NAME}_TF-IDF_{MAX_FEATURES}.csv', index=False)
# df