In [1]:
import os
import json
from datasets import load_from_disk
import time
import faiss
import pickle
import numpy as np
import pandas as pd

import hashlib 
import pprint

from tqdm.auto import tqdm
from contextlib import contextmanager
from typing import List, Tuple, NoReturn, Any, Optional, Union
from torch.utils.data import DataLoader, TensorDataset

from sklearn.feature_extraction.text import TfidfVectorizer

from datasets import (
    Dataset,
    load_from_disk,
    concatenate_datasets,
)

from transformers import(
            BertModel, BertPreTrainedModel,
            AdamW, get_linear_schedule_with_warmup,
            TrainingArguments,AutoTokenizer
        )

In [2]:
class BertEncoder(BertPreTrainedModel):
    def __init__(self, config):
        super(BertEncoder, self).__init__(config)

        self.bert = BertModel(config)
        self.init_weights()
      
    def forward(
            self,
            input_ids, 
            attention_mask=None,
            token_type_ids=None
        ): 

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        
        pooled_output = outputs[1]
        return pooled_output



In [3]:
with open("../data/wikipedia_documents.json", "r", encoding="utf-8") as f:
    wiki = json.load(f)

wiki_contexts = list(
    dict.fromkeys([v["text"] for v in wiki.values()])
)  # set 은 매번 순서가 바뀌므로

In [4]:
per_device_eval_batch_size = 16
device = 'cuda'

In [5]:
model_checkpoint = "klue/bert-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
p_encoder = BertEncoder.from_pretrained('p_encoder_dir').to('cuda')

In [6]:
wiki_seqs = tokenizer(
    wiki_contexts,
    padding="max_length",
    truncation=True,
    return_tensors="pt"
)
print(f'tokenizer complete')
wiki_dataset = TensorDataset(
    wiki_seqs["input_ids"],
    wiki_seqs["attention_mask"],
    wiki_seqs["token_type_ids"]
)
print(f'dataset complete')

tokenizer complete
dataset complete


In [7]:
wiki_dataloader = DataLoader(
    wiki_dataset,
    batch_size=per_device_eval_batch_size
)

In [8]:
def create_embedding():
    print(f'start')
    p_embedding_set = []
    with tqdm(wiki_dataloader, unit="batch") as tepoch:
        for batch in tepoch:
            p_inputs = {
                "input_ids": batch[0].view(per_device_eval_batch_size, -1).to(device),
                "attention_mask": batch[1].view(per_device_eval_batch_size, -1).to(device),
                "token_type_ids": batch[2].view(per_device_eval_batch_size, -1).to(device)
            }
            p_outputs = p_encoder(**p_inputs)
            # print(f'1. p_outpus : {p_outputs}')
            p_outputs = p_outputs.view(per_device_eval_batch_size, 1, -1)
            # print(f'2. p_outpus : {p_outputs}')
            p_embedding_set.append(p_outputs.detach().cpu().numpy())
    return p_embedding_set

In [9]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Wed Oct 20 01:32:54 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.80.02    Driver Version: 450.80.02    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  Off  | 00000000:00:05.0 Off |                  Off |
| N/A   39C    P0    36W / 250W |   1513MiB / 32510MiB |      0%      Default |
|                               |            

In [10]:
p_embeding_set = create_embedding()

start


HBox(children=(FloatProgress(value=0.0, max=3547.0), HTML(value='')))




In [56]:
p_embeding_set[0].squeeze().shape

(16, 768)

In [57]:
test_arr = np.array(p_embeding_set)

In [58]:
test_arr = test_arr.squeeze()

In [60]:
test_arr.shape

(3547, 16, 768)

In [61]:
d3 = test_arr.shape[0]
d2 = test_arr.shape[1]
test_arr.shape[2]

768

In [62]:
re_test = np.resize(test_arr, (-1,test_arr.shape[2]))

In [64]:
re_test.shape

(56751, 768)

In [65]:
import pickle

In [66]:
with open('wiki_embedding.pikle', 'wb') as f:
    pickle.dump(re_test, f)

In [2]:
import code.retrieval

ModuleNotFoundError: No module named 'code.retrieval'; 'code' is not a package