- https://huggingface.co/jhgan/ko-sroberta-multitask\
jhgan/ko-sroberta-multitask 모델을 활용

In [2]:
from datasets import load_from_disk

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import json
import pprint

## 기존 학습 데이터셋 내 context

In [4]:
# Load the previously saved training dataset from the local data directory.
datasets = load_from_disk('./data/train_dataset')

In [5]:
# Pool the context passages of the train and validation splits into one sequence.
train_contexts = datasets['train']['context']
validation_contexts = datasets['validation']['context']
total_contexts = train_contexts + validation_contexts

In [6]:
# Number of contexts collected from the train and validation splits.
len(total_contexts)

4192

## wiki passages

In [10]:
import json
import pprint

In [11]:
# Read the wiki document dump (UTF-8 JSON) into memory.
wiki_path = './data/wikipedia_documents.json'
with open(wiki_path, mode="r", encoding="utf-8") as wiki_file:
    wiki = json.load(wiki_file)

In [12]:
# Collect the 'text' field of every wiki document, in the mapping's iteration order.
total_wiki_passages = []
for doc in wiki.values():
    total_wiki_passages.append(doc['text'])

In [13]:
# Number of wiki passages extracted (duplicates are still included at this point).
len(total_wiki_passages)

60613

In [14]:
from transformers import AutoTokenizer, AutoModel

In [15]:
# Load the tokenizer published with the jhgan/ko-sroberta-multitask checkpoint.
tokenizer = AutoTokenizer.from_pretrained("jhgan/ko-sroberta-multitask")

In [29]:
# Load the encoder weights (cached under /data/ephemeral/tmp) and move the model to the GPU.
# NOTE(review): both the cache directory and the 'cuda' device are hard-coded for this machine.
model = AutoModel.from_pretrained("jhgan/ko-sroberta-multitask", cache_dir ='/data/ephemeral/tmp').to('cuda')

In [20]:
# Display the loaded model's configuration (hidden size, max positions, vocab size, ...).
model.config

RobertaConfig {
  "_name_or_path": "jhgan/ko-sroberta-multitask",
  "architectures": [
    "RobertaModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "tokenizer_class": "BertTokenizer",
  "torch_dtype": "float32",
  "transformers_version": "4.25.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 32000
}

In [13]:
import pandas as pd

In [17]:
# Summary statistics of the per-passage token count for the wiki passages.
# The tokenizer is called without truncation here, so long passages report their full length.
s = pd.Series(total_wiki_passages)
wiki_token_counts = s.apply(lambda passage: len(tokenizer(passage)['input_ids']))
wiki_token_counts.describe()

Token indices sequence length is longer than the specified maximum sequence length for this model (1133 > 512). Running this sequence through the model will result in indexing errors


count    60613.000000
mean       406.209955
std        430.679118
min         56.000000
25%        222.000000
50%        309.000000
75%        459.000000
max      27541.000000
dtype: float64

In [21]:
# Summary statistics of the per-context token count for the training/validation contexts.
# The tokenizer is called without truncation here, so long contexts report their full length.
s = pd.Series(total_contexts)
context_token_counts = s.apply(lambda context: len(tokenizer(context)['input_ids']))
context_token_counts.describe()

count    4192.000000
mean      497.244513
std       193.999246
min       241.000000
25%       347.000000
50%       444.000000
75%       594.000000
max      1174.000000
dtype: float64

## Wiki Embeddings

In [66]:
# Remove duplicate passages while keeping first-occurrence order.
# dict.fromkeys is used instead of set(): the iteration order of a set of
# strings can differ between interpreter runs (string hashing is randomized
# per process), which would make row i of the saved embeddings / FAISS index
# impossible to map back to its passage after a kernel restart.
total_wiki_passages = list(dict.fromkeys(total_wiki_passages))
len(total_wiki_passages)

56737

In [67]:
# Tokenize every passage into fixed-width PyTorch tensors: each row is
# truncated to, or padded up to, exactly 512 tokens.
tokenized = tokenizer(
    total_wiki_passages,
    truncation=True,
    max_length=512,
    padding='max_length',
    return_tensors="pt",
)

In [69]:
# Inspect the shape of the tokenized input_ids tensor.
tokenized['input_ids'].shape

torch.Size([56737, 512])

In [70]:
import torch
# Check whether PyTorch can see a CUDA device.
torch.cuda.is_available()

True

In [71]:
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

class CustomDataset(Dataset):
    """Map-style dataset over a dict of pre-tokenized, index-aligned tensors.

    Args:
        encodings: Mapping from field name to a tensor whose first dimension
            indexes examples. It must contain an ``'input_ids'`` entry, which
            defines the dataset length.
        device: Device each returned example is moved to. Defaults to
            ``'cuda'``, matching the previously hard-coded behavior; pass
            ``'cpu'`` to use the dataset without a GPU.
    """

    def __init__(self, encodings, device='cuda'):
        self.encodings = encodings
        self.device = device

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors placed on ``self.device``."""
        return {
            key: val[idx].to(self.device)
            for key, val in self.encodings.items()
        }

    def __len__(self):
        """Return the number of examples, taken from the ``'input_ids'`` entry."""
        return len(self.encodings['input_ids'])
    
# Wrap the tokenized passages and batch them 32 at a time.
# shuffle=False keeps batches in dataset order, so the embeddings produced
# from this loader stay aligned with the order of the tokenized passages.
dataset = CustomDataset(tokenized)
dataloader = DataLoader(dataset, batch_size=32, shuffle=False)

In [72]:
# Per-batch embeddings are collected here and concatenated in a later cell.
total_embeddings = []

# eval() is loop-invariant, so it is set once up front rather than once per
# batch; the same goes for the no_grad context.
model.eval()

with torch.no_grad():
    for batch in tqdm(dataloader):
        outputs = model(**batch)
        # Keep only the first-token vector of each sequence and move it to
        # the CPU so the accumulated results do not occupy GPU memory.
        # NOTE(review): this is first-token ([CLS]) pooling; the model card
        # for jhgan/ko-sroberta-multitask appears to describe mean pooling --
        # confirm which pooling the query-side encoder uses before changing.
        total_embeddings.append(outputs.last_hidden_state[:, 0].cpu())

100%|██████████| 1774/1774 [08:27<00:00,  3.50it/s]


In [73]:
# Merge the per-batch tensors into a single 2-D embedding tensor.
# This cell rebinds `total_embeddings` from a list to a tensor, so the
# isinstance guard makes a re-run a no-op instead of calling torch.cat on
# the already-merged tensor.
# NOTE(review): torch.cat on a 2-D tensor is expected to treat its rows as the
# sequence to concatenate, flattening the result -- confirm if the guard is ever removed.
if isinstance(total_embeddings, list):
    total_embeddings = torch.cat(total_embeddings)
total_embeddings.size()

torch.Size([56737, 768])

In [74]:
# Persist the embedding tensor to a file in the current working directory,
# so it can be reloaded without recomputing the embeddings.
emb_pt_name = 'index_to_vector.pt'
torch.save(total_embeddings, emb_pt_name)

## Build Faiss Index

In [75]:
import os

# File name and path (relative to the current working directory) of the FAISS index.
index_name = 'sent_emb.index'
index_path = os.path.join('./', index_name)

In [76]:
from copy import deepcopy

In [77]:
# Exact (flat) index: no quantization, no pruning.
import faiss

# Work on a private NumPy copy: normalize_L2 modifies its argument in place,
# and Tensor.numpy() shares memory with the source tensor.
emb = total_embeddings.numpy().copy()
faiss.normalize_L2(emb)  # unit-length rows, so inner product equals cosine similarity

d = emb.shape[1]  # embedding dimensionality
index = faiss.IndexFlatIP(d)  # inner-product (dot-product) index
index.add(emb)

faiss.write_index(index, index_path)

In [78]:
# Smoke test: reload the index from disk and query it with the first five
# stored vectors, retrieving the top-k neighbours of each.
import faiss

index = faiss.read_index(index_path)

k = 5
queries = emb[:5]
D, I = index.search(queries, k)

print(I, end="\n\n")  # indices of the nearest neighbours
print(D)  # cosine similarities of the nearest neighbours

[[    0 21886 32648 14790 18456]
 [    1 16354 46178 30760 51363]
 [    2 17101 45592 36065 51686]
 [    3 28450 37795 30320 27143]
 [    4 21243  5244 33678 46748]]

[[0.9999997  0.9092297  0.9029352  0.89236856 0.8832527 ]
 [1.0000007  0.860832   0.8401651  0.8376942  0.80558157]
 [1.0000004  0.87855345 0.8745349  0.87389517 0.8736162 ]
 [0.9999991  0.8559129  0.8513912  0.8447252  0.83801186]
 [1.0000011  0.68223965 0.6814203  0.67920333 0.67630756]]


In [79]:
# # PQ, Pruning Index
# index_path = os.path.join('./', 'sent_emb_pq.index')

# nlist = 100 # The number of cells (proportional to the number of centroids/vectors)
# m = 8 # The number of subquantizers

# quantizer = faiss.IndexFlatIP(d)
# index = faiss.IndexIVFPQ(quantizer, d, nlist, m, 8)

# index.train(emb)
# # faiss.normalize_L2(total_embeddings.numpy()) # normalize embeddings 
# index.add(emb)

# faiss.write_index(index, index_path)