In [None]:
import os
import json
import pandas as pd
import numpy as np
from typing import List

from huggingface_hub import login
from transformers import AutoTokenizer
import datasets
from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets, load_from_disk
from dotenv import load_dotenv

# Cache directory settings
DATA_CACHE_DIR = "/mnt/t7/.cache/huggingface/datasets"
MODEL_CACHE_DIR = "/mnt/t7/.cache/huggingface/models"

# Read the local .env file first so HF_TOKEN is available in os.environ.
load_dotenv()

# Log in to the Hugging Face Hub; a missing HF_TOKEN raises KeyError here.
login(
    token=os.environ["HF_TOKEN"],
)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [3]:
# Source dataset on the Hugging Face Hub (term / english / korean columns).
dataset_name = "PrompTartLAB/PTT_advanced_en_ko"

# Download (or reuse from the cache directory) every split of the dataset.
ds = load_dataset(
    path=dataset_name,
    cache_dir=DATA_CACHE_DIR,
)

README.md:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

PTT_advanced.json:   0%|          | 0.00/2.41M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1063 [00:00<?, ? examples/s]

In [5]:
# Display the train split's summary (its feature names and row count).
ds["train"]

Dataset({
    features: ['term', 'english', 'korean'],
    num_rows: 1063
})

In [8]:
# Inspect the row at index 1062 -- the last row of the train split, given the
# 1063 rows reported for it.
ds["train"][1062]

{'term': 'neural task-driven modeling',
 'english': 'Neural task-driven modeling plays a crucial role in understanding complex events (CEs) from sensor data, enabling robots and autonomous systems to interact effectively with their environments and humans. Recent advancements in neuro-symbolic methods, which combine neural task-driven modeling with symbolic reasoning, have demonstrated superior performance in complex event detection (CED) by leveraging human knowledge and reducing the need for extensive data. Our research highlights the significant potential of neuro-symbolic architectures over traditional neural task-driven models, particularly in recognizing CE patterns from multimodal data streams with improved temporal reasoning capabilities.',
 'korean': '신경 과제 주도 모델링(neural task-driven modeling)은 센서 데이터로부터 복잡한 사건(CEs)을 이해하는 데 중요한 역할을 하며, 로봇과 자율 시스템이 환경 및 인간과 효과적으로 상호작용할 수 있도록 합니다. 신경 과제 주도 모델링(neural task-driven modeling)과 상징적 추론을 결합한 신경-상징적 방법의 최근 발전은 인간의 지식을 활용하고 광범위한 데이터의 필요성을

In [11]:
# Path to the locally generated JSON file holding the additional rows.
jason_filepath = "/mnt/t7/dnn/llm_practicing/06_synthetic_data/00_PTT_with_Latex/dataset_new_turn_1/dataset_new_turn_1_1.json"

# Decode explicitly as UTF-8: the file contains Korean text, and without an
# explicit encoding open() falls back to the platform's locale encoding, which
# can fail or mis-decode on non-UTF-8 systems.
with open(jason_filepath, "r", encoding="utf-8") as f:
    data = json.load(f)

In [15]:
# Columns every kept record must provide; the order here is the key order of
# each resulting row.
_ROW_FIELDS = ("term", "english", "korean")

# Keep only records that carry all required fields, and project each kept
# record down to exactly those fields.
filtered_rows = [
    {field: record[field] for field in _ROW_FIELDS}
    for record in data
    if all(field in record for field in _ROW_FIELDS)
]

In [22]:
# 4. Turn the filtered rows into a new Dataset object
new_ds = Dataset.from_list(filtered_rows)

# 5. Merge with the existing train split (concat): original rows first,
#    then the newly loaded rows
train_parts = [ds["train"], new_ds]
updated_train = concatenate_datasets(train_parts)

In [23]:
# 6. Wrap the merged split back into a DatasetDict
dataset = DatasetDict()
dataset["train"] = updated_train

In [24]:
# Target dataset repository on the Hugging Face Hub.
repo_name = "aeolian83/PTT_with_Latex_1"

# Upload the merged dataset as a public (non-private) repository.
dataset.push_to_hub(
    repo_id=repo_name,
    private=False,
)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/aeolian83/PTT_wit_Latex_1/commit/c0a20c5a53382aa9dcd9f5bae24dd9698720be44', commit_message='Upload dataset', commit_description='', oid='c0a20c5a53382aa9dcd9f5bae24dd9698720be44', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/aeolian83/PTT_wit_Latex_1', endpoint='https://huggingface.co', repo_type='dataset', repo_id='aeolian83/PTT_wit_Latex_1'), pr_revision=None, pr_num=None)

In [25]:
# Spot-check the last row that came from the original train split. The index
# is derived from the original split's length rather than hard-coded, so the
# check stays correct if the upstream dataset changes size.
dataset["train"][len(ds["train"]) - 1]

{'term': 'neural task-driven modeling',
 'english': 'Neural task-driven modeling plays a crucial role in understanding complex events (CEs) from sensor data, enabling robots and autonomous systems to interact effectively with their environments and humans. Recent advancements in neuro-symbolic methods, which combine neural task-driven modeling with symbolic reasoning, have demonstrated superior performance in complex event detection (CED) by leveraging human knowledge and reducing the need for extensive data. Our research highlights the significant potential of neuro-symbolic architectures over traditional neural task-driven models, particularly in recognizing CE patterns from multimodal data streams with improved temporal reasoning capabilities.',
 'korean': '신경 과제 주도 모델링(neural task-driven modeling)은 센서 데이터로부터 복잡한 사건(CEs)을 이해하는 데 중요한 역할을 하며, 로봇과 자율 시스템이 환경 및 인간과 효과적으로 상호작용할 수 있도록 합니다. 신경 과제 주도 모델링(neural task-driven modeling)과 상징적 추론을 결합한 신경-상징적 방법의 최근 발전은 인간의 지식을 활용하고 광범위한 데이터의 필요성을

In [26]:
# Spot-check the first appended row: concatenation places the new rows
# directly after the original split, so its index equals the original
# split's length (derived here rather than hard-coded).
dataset["train"][len(ds["train"])]

{'term': 'adaptive neural networks',
 'english': 'The development of adaptive neural networks has demonstrated significant potential in addressing the exponential complexity associated with cyclic scheduling in real-world applications. By employing a neural network approach to model manufacturing as a cyclic job shop problem, the study effectively minimized the cycle time of a schedule, showcasing the flexibility and adaptability of the system to integrate with other techniques. Notably, the experimental results validated the approach, confirming the hypothesis that adaptive neural networks can efficiently optimize the scheduling process, particularly when minimizing the cycle time, \\( T_{\\text{cycle}} \\), is critical.',
 'korean': '적응형 신경망(adaptive neural networks)의 개발은 실제 응용에서 순환 일정의 지수적 복잡성을 해결하는 데 상당한 잠재력을 보여주었습니다. 제조를 순환 작업장 문제로 모델링하기 위해 신경망 접근 방식을 사용함으로써 연구는 일정의 주기 시간을 효과적으로 최소화하였으며, 시스템이 다른 기법과 통합될 수 있는 유연성과 적응성을 입증했습니다. 특히, 실험 결과는 이 접근 방식을 검증하여 적응형 신경망(adaptive neural networ