In [1]:
import os
import json
import pickle
import random
import time
import random
from contextlib import contextmanager
import numpy as np
import pandas as pd
from tqdm import tqdm, trange
from pprint import pprint

from sklearn.feature_extraction.text import TfidfVectorizer

import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F

from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
from transformers import (
    AutoTokenizer,
    BertModel, BertPreTrainedModel,
    AdamW, get_linear_schedule_with_warmup,
    TrainingArguments,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
@contextmanager
def timer(name):
    """Print how long the enclosed ``with`` block took to run.

    Args:
        name: Label shown in square brackets at the start of the report line.

    Yields:
        None. Control is handed over to the body of the ``with`` statement.

    The line ``[name] done in X.XXX s`` is printed only when the body finishes
    without raising; an exception propagates and no line is printed.
    """
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # (or even become negative) when the wall clock is adjusted mid-run.
    started = time.perf_counter()
    yield
    print(f"[{name}] done in {time.perf_counter() - started:.3f} s")

In [3]:
from dpr import DenseRetrieval, BertEncoder, RobertaEncoder

In [4]:
from datasets import load_from_disk, concatenate_datasets, DatasetDict

# Build the train/validation splits used to fine-tune the retriever.
SEED = 42          # fixes the validation sample so a kernel restart reproduces it
VALID_SIZE = 5000  # number of KorQuAD train examples used for validation

# Load the MRC dataset from disk and drop the two bookkeeping columns from both
# splits; the remaining features are then passed to load_dataset for KorQuAD.
mrc = load_from_disk("/data/ephemeral/level2-mrc/data/train_dataset")
for split in ('train', 'validation'):
    mrc[split] = mrc[split].remove_columns(['document_id', '__index_level_0__'])

korquad = load_dataset('squad_kor_v1', features=mrc['train'].features)

# Draw the validation rows WITHOUT replacement from a seeded generator.
# np.random.randint samples with replacement, which put duplicate examples into
# the validation split and produced a different split on every run.
rng = np.random.default_rng(SEED)
valid_indices = rng.choice(
    len(korquad['train']),
    size=min(VALID_SIZE, len(korquad['train'])),
    replace=False,
)

datasets = DatasetDict({
    # Train on the MRC train split, the KorQuAD validation split and the MRC validation split.
    'train': concatenate_datasets([mrc['train'], korquad['validation'], mrc['validation']]),
    # Validate on the sampled subset of the KorQuAD train split.
    'validation': korquad['train'].select(valid_indices),
})
datasets

DatasetDict({
    train: Dataset({
        features: ['title', 'context', 'question', 'id', 'answers'],
        num_rows: 9966
    })
    validation: Dataset({
        features: ['title', 'context', 'question', 'id', 'answers'],
        num_rows: 5000
    })
})

In [5]:
# The dataset and the training configuration are set up as shown below.
train_dataset = datasets['train']

# Hyperparameters for TrainingArguments, collected in a single mapping.
training_kwargs = dict(
    output_dir="dense_retireval",
    evaluation_strategy="epoch",
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=1,
    learning_rate=1e-5,
    weight_decay=0.01,
)
args = TrainingArguments(**training_kwargs)

In [6]:
# Checkpoint identifiers for the tokenizer and the passage / question encoders.
tokenizer_path = 'kykim/bert-kor-base'
p_encoder_path = 'thingsu/koDPR_context'
q_encoder_path = 'thingsu/koDPR_question'

# One cache directory shared by every from_pretrained call in this cell.
hf_cache_dir = '/data/ephemeral/huggingface'

# If an encoder from an earlier cell is still loaded, comment it out before running this cell (CUDA ...)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, cache_dir=hf_cache_dir)

# Load each encoder first, then move it to the device chosen by the training arguments.
p_encoder = BertEncoder.from_pretrained(p_encoder_path, cache_dir=hf_cache_dir)
p_encoder = p_encoder.to(args.device)
q_encoder = BertEncoder.from_pretrained(q_encoder_path, cache_dir=hf_cache_dir)
q_encoder = q_encoder.to(args.device)

# RoBERTa-based alternative (disabled):
# p_encoder = RobertaEncoder.from_pretrained(p_encoder_path, cache_dir='/data/ephemeral/huggingface').to(args.device)
# q_encoder = RobertaEncoder.from_pretrained(q_encoder_path, cache_dir='/data/ephemeral/huggingface').to(args.device)

In [12]:
# # For Hub Upload
# PMODEL_SAVE_REPO = 'klue-bert-base-context'
# QMODEL_SAVE_REPO = 'klue-bert-base-question'
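# # SECURITY: a Hugging Face access token was hardcoded on this line; it has been redacted and must be revoked.
# API_KEY = os.environ['HF_TOKEN']  # read the token from the environment instead of committing it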

# p_encoder.bert.push_to_hub(
#     PMODEL_SAVE_REPO , 
#     use_temp_dir=True, 
#     use_auth_token=API_KEY
# )

# q_encoder.bert.push_to_hub(
#     QMODEL_SAVE_REPO , 
#     use_temp_dir=True, 
#     use_auth_token=API_KEY
# )

model.safetensors: 100%|██████████| 442M/442M [00:31<00:00, 14.0MB/s]   
model.safetensors: 100%|██████████| 442M/442M [00:16<00:00, 26.1MB/s] 


CommitInfo(commit_url='https://huggingface.co/CurtisJeon/klue-bert-base-question/commit/9b37d4f61fe9c85607459a986e04568fd12231af', commit_message='Upload model', commit_description='', oid='9b37d4f61fe9c85607459a986e04568fd12231af', pr_url=None, pr_revision=None, pr_num=None)

In [7]:
# Let's write the code so that the retriever can be used as shown below.
retriever_kwargs = {
    'args': args,
    'dataset': train_dataset,
    'num_neg': 2,
    'tokenizer': tokenizer,
    'p_encoder': p_encoder,
    'q_encoder': q_encoder,
    'do_train': True,
}
retriever = DenseRetrieval(**retriever_kwargs)

In [None]:
# Run DenseRetrieval.train (implemented in the local `dpr` module, which is not shown here).
# NOTE(review): presumably this is the training variant that uses the explicit negatives (num_neg) — confirm in dpr.
retriever.train()

In [27]:
# retriever.args.num_train_epochs = 1

In [None]:
# In-batch training: call DenseRetrieval.in_batch_train, passing the validation split as valid_dataset.
# NOTE(review): presumably the other passages of a batch act as negatives — confirm in dpr.
retriever.in_batch_train(valid_dataset=datasets['validation'])

In [9]:
# Hand the validation split to the retriever, then evaluate retrieval on it.
# In the recorded run, evaluate() printed and returned a (Top1, Top5, Top10) accuracy tuple.
retriever.prepare_validation(datasets['validation'])
retriever.evaluate()

Validation: 100%|██████████| 5000/5000 [02:38<00:00, 31.46it/s]


Top1, Top5, Top10 Accuracy : (0.3448, 0.6796, 0.7694)


(0.3448, 0.6796, 0.7694)

In [None]:
# Call DenseRetrieval.get_dense_embeddings with an embedding file path and a corpus path.
# NOTE(review): presumably '../data/dense.bin' is where the passage embeddings are cached and
# corpus_path is the wiki corpus that gets encoded — confirm in dpr.
# NOTE(review): both paths are relative to the working directory, unlike the absolute
# /data/ephemeral/... path used when loading the training data.
retriever.get_dense_embeddings('../data/dense.bin', corpus_path='../data/wiki_preprocessed_v2.json')

In [None]:
# Single-query check: request k=5 results for one Korean question
# (roughly: "What is the title of the work featuring Seong-a and Hui-min?").
query = "성아와 희민이 나오는 작품 제목은?"
results = retriever.get_relevant_doc(query=query, k=5)

In [None]:
# Display the raw value currently bound to `results` (the get_relevant_doc return value when cells run in order).
results

In [None]:
# Print the query, then every retrieved passage in the order it was returned.
print(f"[Search Query] {query}")

# The second element of `results` is iterated as indices into retriever.contexts.
# NOTE(review): results[0] presumably holds the matching scores — confirm in dpr.
indices = results[1]
for i, idx in enumerate(indices):
    # i is zero-based, so the displayed rank is i + 1.
    print(f"Top-{i + 1}th Passage (Index {idx})")
    print(retriever.contexts[idx])

In [None]:
# Bulk variant: request k=5 results for two queries in a single call.
# This rebinds `results`, replacing the single-query result from the earlier cell.
queries = ["이태영", '옥수수']
results = retriever.get_relevant_doc_bulk(queries=queries, k=5)

In [None]:
# Display the raw value currently bound to `results` (the get_relevant_doc_bulk return value when cells run in order).
results

In [None]:
# Load the test dataset from disk; the path is relative to the working directory.
test_dataset = load_from_disk('../data/test_dataset')

In [None]:
# Run retrieval over the 'validation' split of the test dataset, requesting topk=10.
# NOTE(review): judging by the name `df` the result is presumably a pandas DataFrame — confirm in dpr.
df = retriever.retrieve(test_dataset['validation'], topk=10)

In [10]:
# Save the `bert` submodule of the passage encoder to the local Hugging Face directory.
# NOTE(review): the directory name says "roberta", but the active loading code uses BertEncoder
# (the RobertaEncoder lines are commented out) — confirm the intended name.
retriever.p_encoder.bert.save_pretrained('/data/ephemeral/huggingface/p_encoder_roberta')

In [11]:
# Save the `bert` submodule of the question encoder to the local Hugging Face directory.
# NOTE(review): the directory name says "roberta", but the active loading code uses BertEncoder
# (the RobertaEncoder lines are commented out) — confirm the intended name.
retriever.q_encoder.bert.save_pretrained('/data/ephemeral/huggingface/q_encoder_roberta')