In [2]:
import os
import sys
import time
import pickle
import random
import logging

import wandb
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from torch import nn
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast, GradScaler
from datasets import load_metric, load_from_disk, load_dataset
from transformers import AutoConfig, AutoModelForQuestionAnswering, AutoTokenizer, AdamW
from transformers import (
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    TrainingArguments,
    set_seed,
)

from my_model import Mymodel
from utils_qa import postprocess_qa_predictions, check_no_error, tokenize, AverageMeter
from trainer_qa import QuestionAnsweringTrainer
from retrieval import SparseRetrieval
from arguments import ModelArguments, DataTrainingArguments
from data_processing import DataProcessor
from prepare_dataset import make_custom_dataset

In [3]:
def get_pickle(pickle_path):
    '''Custom Dataset을 Load하기 위한 함수'''
    f = open(pickle_path, "rb")
    dataset = pickle.load(f)
    f.close()

    return dataset

In [4]:
text_data = get_pickle("/opt/ml/input/data/train_concat5.pkl")

In [13]:
text_data["train"]

Dataset({
    features: ['answers', 'context', 'id', 'question'],
    num_rows: 3952
})

In [19]:
from konlpy.tag import Mecab 
import re
from datasets import Dataset
mecab = Mecab()

In [20]:
def random_masking(datasets):
    context_list = []
    question_list = []
    id_list = []
    answer_list = []
    
    # train 갯수만큼 iteration
    for i in tqdm(range(datasets["train"].num_rows)):
        text = datasets["train"][i]["question"]
        
        # 단어 기준 Masking
        for word, pos in mecab.pos(text):
            first_word = True
            # 첫번째 단어는 무조건 Masking(질문 중 가장 중요한 의미를 가지고 있다고 생각)
            # 두번째 단어부터는 20% 확률로 Masking
            # 하나의 단어만 Masking
            if pos in {"NNG", "NNP"} and (first_word or random.random() > 0.8):
                first_word = False
                context_list.append(datasets["train"][i]["context"])
                question_list.append(re.sub(word, "MASK", text)) # tokenizer.mask_token
                id_list.append(datasets["train"][i]["id"])
                answer_list.append(datasets["train"][i]["answers"])

    # list를 Dataset 형태로 변환
    datasets["train"] = Dataset.from_dict({"id" : id_list,
                                           "context": context_list, 
                                           "question": question_list,
                                           "answers": answer_list})

    return datasets["train"] # 3000 => 20000

In [21]:
random_masking(text_data)

100%|██████████| 3952/3952 [00:07<00:00, 544.21it/s]


Dataset({
    features: ['id', 'context', 'question', 'answers'],
    num_rows: 20778
})

In [28]:
from konlpy.tag import Mecab, Kkma, Hannanum

mecab = Mecab()
kkma = Kkma()
hannanum = Hannanum()

In [53]:
texts = ["스코틀랜드 계몽주의", "언더우드(Underwood)박사와", "조선의", "나의 노래의", "정원 초과와 과적이", "내각 회의"]

In [54]:
for text in texts:
    

[('스코틀랜드', 'NNP'), ('계몽주의', 'NNG')]
[('스코', 'UN'), ('틀', 'NNG'), ('랜드', 'NNG'), ('계몽주의', 'NNG')]
[('스코틀랜드', 'N'), ('계몽주의', 'N')]
4
스코틀랜드 계몽주의
[('언더우드', 'NNP'), ('(', 'SSO'), ('Underwood', 'SL'), (')', 'SSC'), ('박사', 'NNG'), ('와', 'JC')]
[('언더', 'NNG'), ('우드', 'NNG'), ('(', 'SS'), ('Underwood', 'OL'), (')', 'SS'), ('박사', 'NNG'), ('와', 'JC')]
[('언더우드', 'N'), ('(', 'S'), ('Underwood', 'F'), (')', 'S'), ('박사', 'N'), ('와', 'J')]
언더우드(Underwood)박사
[('조선', 'NNP'), ('의', 'JKB')]
[('조선', 'NNG'), ('의', 'JKG')]
[('조선', 'N'), ('의', 'J')]
1
조선
[('나', 'NP'), ('의', 'JKG'), ('노래', 'NNG'), ('의', 'NNG')]
[('나의', 'NNG'), ('노래', 'NNG'), ('의', 'JKG')]
[('나', 'N'), ('의', 'J'), ('노래', 'N'), ('의', 'J')]
1
나의 노래
[('정원', 'NNG'), ('초과', 'NNG'), ('와', 'JC'), ('과', 'NNG'), ('적', 'XSN'), ('이', 'VCP')]
[('정원', 'NNG'), ('초과', 'NNG'), ('와', 'JKM'), ('과적', 'NNG'), ('이', 'JKS')]
[('정원', 'N'), ('초과', 'N'), ('와', 'J'), ('과', 'N'), ('적', 'X'), ('이', 'J')]
정원 초과와 과적
[('내각', 'NNP'), ('회의', 'NNG')]
[('내각', 'NNG'), ('회의', 'NNG