In [1]:
"""
Feature-extraction pipeline for the named-entity-recognition (NER) data provided by
the National Institute of Korean Language's Modu Corpus (모두의 말뭉치).
"""
import json
import os
from typing import Dict, List

import pandas as pd


class MoongchiJsonDataIntegrateTransfer:
    """Flatten corpus JSON files into sentence-level records saved as parquet batches.

    Every input file must contain a top-level ``"document"`` list. Each
    document carries an ``"id"`` and a ``"sentence"`` list whose items have
    the keys ``"id"``, ``"form"``, ``"word"`` and ``"NE"``.
    """

    def __init__(self):
        pass

    def __call__(self, *args, **kwargs):
        pass

    def etl(self, input_json_data_dir_path: str, output_dir_path: str, batch_size=100000):
        """Read every JSON file, flatten it to sentence records and write parquet batches.

        Args:
            input_json_data_dir_path: Directory that holds the ``*.json`` input files.
            output_dir_path: Directory that receives ``0.parquet``, ``1.parquet``, ...
            batch_size: Number of sentence records per parquet file. Every file
                except possibly the last one holds exactly this many records.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

        documents = self.__extract(input_json_data_dir_path)

        batch_idx = 0
        batch_buffer = []

        for document in documents:
            sentence_features = self.__parse_doc_to_sentences(document)

            # A single document may straddle one or more batch boundaries, so
            # keep consuming its sentences until none are left. Nothing that
            # does not fit into the current batch may be dropped.
            consumed = 0
            while consumed < len(sentence_features):
                free_slots = batch_size - len(batch_buffer)
                chunk = sentence_features[consumed:consumed + free_slots]
                batch_buffer.extend(chunk)
                consumed += len(chunk)

                # Flush as soon as the batch is full.
                if len(batch_buffer) == batch_size:
                    self.__load(output_dir_path, batch_idx, batch_buffer)
                    batch_idx += 1
                    # Start a fresh list instead of clearing in place so a
                    # batch already handed to the loader is never mutated.
                    batch_buffer = []

        # Write whatever is left over as the final (short) batch.
        if batch_buffer:
            self.__load(output_dir_path, batch_idx, batch_buffer)

    def __extract(self, json_data_dir_path: str) -> List[Dict]:
        """Collect the ``"document"`` entries of every JSON file in a directory.

        Only files whose name ends with ``.json`` (case-insensitive) are read.
        Files are visited in sorted name order so repeated runs yield the same
        batches.

        Args:
            json_data_dir_path: Directory to scan (not recursive).

        Returns:
            All documents of all files, concatenated into one list.
        """
        documents = []
        # os.listdir returns entries in arbitrary order; sort for reproducibility.
        for file_name in sorted(os.listdir(json_data_dir_path)):
            # str.find returns -1 (truthy) on a miss, so it cannot be used as a
            # boolean test; check the extension explicitly instead.
            if not file_name.lower().endswith(".json"):
                continue
            file_path = os.path.join(json_data_dir_path, file_name)
            with open(file_path, "rb") as json_file:
                file_documents = json.load(json_file)["document"]
                documents.extend(file_documents)
        return documents

    def __parse_doc_to_sentences(self, document: Dict) -> List[Dict]:
        """Turn one document into a list of sentence records tagged with its id.

        Args:
            document: Mapping with the keys ``"id"`` and ``"sentence"``.

        Returns:
            One renamed record per sentence, each with an added ``"document_id"``.
        """
        document_id = document['id']
        document_sentence_infos = document['sentence']

        sentence_infos = list(map(self.__change_sentence_info_names, document_sentence_infos))

        for sentence_info in sentence_infos:
            sentence_info["document_id"] = document_id

        return sentence_infos

    def __change_sentence_info_names(self, sentence_info: Dict) -> Dict:
        """Rename the raw sentence keys to the ``sentence_*`` output column names.

        Args:
            sentence_info: Mapping with the keys ``"id"``, ``"form"``, ``"word"`` and ``"NE"``.

        Returns:
            A new dict with the keys ``sentence_id``, ``sentence_text``,
            ``sentence_word_infos`` and ``sentence_ne_infos``.
        """
        sentence_id = sentence_info['id']
        sentence_text = sentence_info['form']
        sentence_word_infos = sentence_info['word']
        sentence_ne_infos = sentence_info['NE']

        sentence_info_of_lucy = {
            "sentence_id": sentence_id,
            "sentence_text": sentence_text,
            "sentence_word_infos": sentence_word_infos,
            "sentence_ne_infos": sentence_ne_infos,
        }

        return sentence_info_of_lucy

    def __load(self, output_dir_path: str, idx: int, sentence_features: List[Dict]):
        """Write one batch of sentence records to ``<output_dir_path>/<idx>.parquet``.

        Args:
            output_dir_path: Target directory; created if it does not exist yet.
            idx: Batch number, used as the file name stem.
            sentence_features: Records that become the rows of the parquet file.
        """
        # An empty path means "current directory", which needs no creation
        # (os.makedirs("") would raise).
        if output_dir_path:
            os.makedirs(output_dir_path, exist_ok=True)

        save_path = os.path.join(output_dir_path, str(idx) + ".parquet")

        print(f"saving...{save_path}")
        sentence_features_table = pd.DataFrame(sentence_features)
        sentence_features_table.to_parquet(save_path, engine="fastparquet")


In [2]:
# Instantiate the JSON-to-parquet ETL helper.
transfer = MoongchiJsonDataIntegrateTransfer()

In [3]:
# Directory holding the raw corpus JSON files, and the directory that receives
# the parquet batches.
# NOTE(review): absolute, machine-specific paths — consider deriving them from
# a configurable base directory.
json_dir_path = '/home/woohyun/NER/data/k_corpus/raw_data/all/'
output_dir_path = '/home/woohyun/NER/data/k_corpus/parquet_data/'

In [4]:
# Run the whole extract -> transform -> load pass with the default batch size;
# one "saving..." line is printed per parquet file written.
transfer.etl(json_dir_path, output_dir_path)

saving.../home/woohyun/NER/data/k_corpus/parquet_data/0.parquet
saving.../home/woohyun/NER/data/k_corpus/parquet_data/1.parquet
saving.../home/woohyun/NER/data/k_corpus/parquet_data/2.parquet
saving.../home/woohyun/NER/data/k_corpus/parquet_data/3.parquet
saving.../home/woohyun/NER/data/k_corpus/parquet_data/4.parquet
saving.../home/woohyun/NER/data/k_corpus/parquet_data/5.parquet
saving.../home/woohyun/NER/data/k_corpus/parquet_data/6.parquet
saving.../home/woohyun/NER/data/k_corpus/parquet_data/7.parquet
saving.../home/woohyun/NER/data/k_corpus/parquet_data/8.parquet
saving.../home/woohyun/NER/data/k_corpus/parquet_data/9.parquet
saving.../home/woohyun/NER/data/k_corpus/parquet_data/10.parquet
saving.../home/woohyun/NER/data/k_corpus/parquet_data/11.parquet
saving.../home/woohyun/NER/data/k_corpus/parquet_data/12.parquet
saving.../home/woohyun/NER/data/k_corpus/parquet_data/13.parquet
saving.../home/woohyun/NER/data/k_corpus/parquet_data/14.parquet
saving.../home/woohyun/NER/data/k_c

In [6]:
# Read batch 0 back from disk to sanity-check what was written.
import pandas as pd
temp = pd.read_parquet('/home/woohyun/NER/data/k_corpus/parquet_data/0.parquet', engine='fastparquet')

In [7]:
# Display the loaded batch as the cell output.
temp

Unnamed: 0,sentence_id,sentence_text,sentence_word_infos,sentence_ne_infos,document_id
0,SBRW1900007716.1.1.1,책으로 세상을 바라보고 책으로 삶을 살찌우는 시간 티비 책방 북소리 북마스터 김혜집니다.,"[{'id': 1, 'form': '책으로', 'begin': 0, 'end': 3...","[{'id': 1, 'form': '티비 책방 북소리', 'label': 'AFA_...",SBRW1900007716.1
1,SBRW1900007716.1.1.2,네. 안녕하세요?,"[{'id': 1, 'form': '네.', 'begin': 0, 'end': 2}...",[],SBRW1900007716.1
2,SBRW1900007716.1.1.3,한 주 동안 잘 지내셨죠?,"[{'id': 1, 'form': '한', 'begin': 0, 'end': 1},...","[{'id': 1, 'form': '한 주 동안', 'label': 'DT_DURA...",SBRW1900007716.1
3,SBRW1900007716.1.1.4,북마스터 국민대 이창현 교숩니다.,"[{'id': 1, 'form': '북마스터', 'begin': 0, 'end': ...","[{'id': 1, 'form': '북마스터', 'label': 'CV_OCCUPA...",SBRW1900007716.1
4,SBRW1900007716.1.1.5,네. 책 읽고 글 쓰는 문화 평론가 허흽니다.,"[{'id': 1, 'form': '네.', 'begin': 0, 'end': 2}...","[{'id': 1, 'form': '문화 평론가', 'label': 'CV_OCCU...",SBRW1900007716.1
...,...,...,...,...,...
99995,ESRW1907002383.1500.2.1,그리웠어..,"[{'id': 1, 'form': '그리웠어..', 'begin': 0, 'end'...",[],ESRW1907002383.1500
99996,ESRW1907002383.1502.1.1,보자마자 눈물부터 나더라구요..,"[{'id': 1, 'form': '보자마자', 'begin': 0, 'end': ...",[],ESRW1907002383.1502
99997,ESRW1907002383.1505.1.1,선배 ...,"[{'id': 1, 'form': '선배', 'begin': 0, 'end': 2}...",[],ESRW1907002383.1505
99998,ESRW1907002383.1505.2.1,기억나요.....?,"[{'id': 1, 'form': '기억나요.....?', 'begin': 0, '...",[],ESRW1907002383.1505
