# In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict, load_from_disk, Features, Sequence, Value
import numpy as np
import os

def get_default_features():
    """
    Build the Features schema shared by all converted datasets.

    Returns:
        Features with string columns ``id``, ``title``, ``context`` and
        ``question``, plus an ``answers`` sequence of ``text`` (string) and
        ``answer_start`` (int32).
    """
    # Plain string columns first, in the order they should appear.
    schema = {
        column: Value("string")
        for column in ("id", "title", "context", "question")
    }
    # The answers column is a sequence of (text, answer_start) records.
    schema["answers"] = Sequence(
        {"text": Value("string"), "answer_start": Value("int32")}
    )
    return Features(schema)

def _sequence_to_list(value):
    """Return an ndarray or tuple as a plain list; pass anything else through."""
    if isinstance(value, np.ndarray):
        # reshape(-1) flattens and also turns a 0-d array into one element.
        return value.reshape(-1).tolist()
    if isinstance(value, tuple):
        return list(value)
    return value


def transform_answers(answers):
    """
    Normalize an ``answers`` record so ``answer_start`` is a list of ints.

    Existing ``answer_start`` values are preserved; they are only passed
    through an int32 conversion. NumPy arrays and tuples (for example what a
    pandas conversion of a sequence column may hand over) are treated as
    sequences rather than as a single scalar, so multi-answer and empty
    records keep their offsets and their length.

    Args:
        answers: Mapping with optional ``text`` and ``answer_start`` entries.
            Anything that is not a dict yields an empty record.

    Returns:
        dict with keys ``text`` and ``answer_start``. ``answer_start`` is
        always a list of Python ints; ``None`` entries become 0. ``text`` is
        returned as a list when it was an array or tuple, otherwise unchanged.
    """
    if not isinstance(answers, dict):
        return {"text": [], "answer_start": []}

    text = _sequence_to_list(answers.get("text", []))
    answer_start = _sequence_to_list(answers.get("answer_start", []))

    # A lone scalar offset is wrapped so the result is always a list.
    if not isinstance(answer_start, list):
        answer_start = [answer_start]

    # Missing offsets default to 0.
    answer_start = [0 if a is None else a for a in answer_start]

    # Round-trip through int32 to validate the range, then back to Python int.
    try:
        answer_start = [int(np.int32(a)) for a in answer_start]
    except (TypeError, ValueError, OverflowError) as e:
        print(f"Error converting answer_start values: {e}")
        # Best effort: keep the length but fall back to zeros.
        answer_start = [0] * len(answer_start)

    return {
        "text": text,
        "answer_start": answer_start,
    }

def transform_dataset(dataset_path, desired_features, output_path):
    """
    Load a dataset from disk, reshape every split to the target schema, and save it.

    For each split the columns are reduced to those named in
    ``desired_features``, absent columns are filled in (``id`` from the row
    index, other columns with an empty string), ``answers`` is normalized via
    ``transform_answers`` or defaulted to an empty record, and the columns are
    put in schema order before converting back to a Dataset.

    Args:
        dataset_path: Directory passed to ``load_from_disk``.
        desired_features: Target schema; its keys define the output columns.
        output_path: Directory the converted DatasetDict is saved to.

    Any exception is caught and reported with ``print``; nothing is raised.
    """
    try:
        source = load_from_disk(dataset_path)
        converted = {}

        for split_name in source.keys():
            frame = source[split_name].to_pandas()
            present = frame.columns.tolist()
            wanted = list(desired_features.keys())

            # Drop every column that is not part of the target schema.
            frame = frame[[name for name in wanted if name in present]]

            # Fill in schema columns the source lacks (answers handled below).
            for name in wanted:
                if name in present or name == 'answers':
                    continue
                # A missing id is synthesized from the row index as a string.
                frame[name] = frame.index.astype(str) if name == 'id' else ""

            # Normalize existing answers, or supply an empty record per row.
            if 'answers' not in frame.columns:
                frame['answers'] = [
                    {"text": [], "answer_start": []} for _ in range(len(frame))
                ]
            else:
                frame['answers'] = frame['answers'].apply(transform_answers)

            # Put the columns in schema order and convert back to a Dataset.
            frame = frame[wanted]
            converted[split_name] = Dataset.from_pandas(
                frame, features=desired_features, preserve_index=False
            )

        bundle = DatasetDict(converted)

        # Make sure the destination exists, then write the result.
        os.makedirs(output_path, exist_ok=True)
        bundle.save_to_disk(output_path)
        print(f"Transformed dataset saved to {output_path}")

    except Exception as e:
        print(f"Error processing dataset at {dataset_path}: {e}")

def main():
    """Convert each configured dataset and store it under a common output directory."""
    # Datasets to convert.
    source_dirs = [
        "/data/ephemeral/home/sangyeop/level2-mrc-nlp-11/data/squad_kor_v1_filtered",
        "/data/ephemeral/home/sangyeop/level2-mrc-nlp-11/data/etri_mrc_filtered",
        "/data/ephemeral/home/sangyeop/level2-mrc-nlp-11/data/aug_new_filtered.0",
        "/data/ephemeral/home/sangyeop/level2-mrc-nlp-11/data/paraphrased_filtered"
    ]

    # Base directory that receives one sub-directory per dataset.
    target_root = "/data/ephemeral/home/sangyeop/level2-mrc-nlp-11/data/all_filtered_transformed"
    os.makedirs(target_root, exist_ok=True)

    # Target schema applied to every dataset.
    schema = get_default_features()

    separator = "-" * 50
    for source_dir in source_dirs:
        # The output sub-directory is named after the last path component.
        name = os.path.basename(source_dir.rstrip('/'))
        destination = os.path.join(target_root, name)
        print(f"Processing dataset: {name}")
        transform_dataset(source_dir, schema, destination)
        print(separator)

# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()
