In [14]:
import pickle
import json

# Input pickle and the intended JSON destination
# (json_file_path is assigned here but not used by this cell)
pickle_file_path = 'document.pkl'
json_file_path = 'pdf_info.json'

# Deserialize the pickled documents.
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
with open(pickle_file_path, mode='rb') as f:
    documents = pickle.load(f)

# TODO: the JSON export is not implemented in this cell yet; once it is,
# report success with a message that includes json_file_path.


KeyError: '__fields_set__'

In [13]:
import pickle
import zipfile
import os
import shutil
from pydantic import BaseModel

def load_document_from_zip(zip_filename, extraction_dir="data"):
    """Extract a zip archive and return the object pickled in its ``document.pkl``.

    The archive is unpacked into ``<extraction_dir>/temp_files``. That
    directory is removed again before the function returns *or* raises, so a
    failed load no longer leaves extracted files behind.

    Args:
        zip_filename: Path to the zip archive. ``document.pkl`` must sit at the
            top level of the archive.
        extraction_dir: Directory under which the temporary ``temp_files``
            folder is created (created if it does not exist).

    Returns:
        The unpickled object. It is iterated and each item is printed, so it
        must be iterable.

    Raises:
        FileNotFoundError: ``document.pkl`` is not present after extraction.
        KeyError: Unpickling failed on a missing state key such as
            ``'__fields_set__'``. The original exception is re-raised after a
            diagnostic is printed; loading the same file a second time cannot
            succeed, so no retry is attempted.
    """
    temp_dir = os.path.join(extraction_dir, "temp_files")
    os.makedirs(temp_dir, exist_ok=True)

    try:
        # Step 1: Extract the zip file
        with zipfile.ZipFile(zip_filename, 'r') as zipf:
            zipf.extractall(temp_dir)

        # Step 2: Load the document.pkl file
        document_path = os.path.join(temp_dir, "document.pkl")
        if not os.path.exists(document_path):
            raise FileNotFoundError(f"Document file not found: {document_path}")

        try:
            with open(document_path, "rb") as f:
                # SECURITY: pickle.load can execute arbitrary code embedded in
                # the file; only use this on archives from a trusted source.
                documents = pickle.load(f)
        except KeyError as e:
            # The failure happens inside the unpickler, before any object is
            # returned, so there is nothing to patch afterwards and re-reading
            # the same bytes fails identically.
            print(f"KeyError encountered while loading document: {e}")
            print(
                "Cannot repair this by re-loading. The pickle was most likely "
                "written with a different pydantic version than the one "
                "installed; install the matching version and load again."
            )
            raise

        # Step 3: Print the contents of the documents
        print("Documents Contents:")
        for doc in documents:
            print(doc)  # Relies on each document's own string representation

        return documents
    finally:
        # Step 4: Always clean up the temporary directory, even on failure.
        # Best-effort: a cleanup problem must not mask the real error or
        # discard documents that were loaded successfully.
        shutil.rmtree(temp_dir, ignore_errors=True)


# Example usage: the archive to read and the directory used for extraction
zip_filename = 'vector_store_large.zip'  # replace with the actual ZIP file path
extraction_dir = 'data'

# Load document.pkl from the ZIP; the function prints each document as it goes
# and returns the loaded object.
documents = load_document_from_zip(zip_filename, extraction_dir=extraction_dir)


KeyError encountered while loading document: '__fields_set__'
Attempting to fix missing `__fields_set__`


KeyError: '__fields_set__'

In [15]:
import json
import pandas as pd
from pathlib import Path

# 1) Location of the JSON file
json_path = Path("gemini_combined.json")   # adjust if the file lives elsewhere

# 2) Stop early when the file is missing
if not json_path.exists():
    raise FileNotFoundError(f"{json_path} Not found. Please check the path.")

# 3) Parse the JSON
with json_path.open("r", encoding="utf-8") as jf:
    data = json.load(jf)      # data is expected to be a list[dict]

# 4) Rough look at the overall structure: the first record, pretty-printed
#    and capped at 1000 characters
first_item_preview = json.dumps(data[0], ensure_ascii=False, indent=2)[:1000]
print("첫 항목 예시:", first_item_preview, "...\n")



첫 항목 예시: {
  "Drawing_Type": "Elevation",
  "Purpose_of_Building": "Residential",
  "Client_Name": "둔촌주공아파트주택 재건축정비사업조합",
  "Project_Title": "둔촌주공아파트 주택재건축정비사업",
  "Drawing_Title": "분산상가-1 입면도-2 (근린생활시설-3)",
  "Space_Classification": {
    "Communal": [],
    "Private": [],
    "Service": []
  },
  "Details": {
    "Drawing_Number": "A51-2012",
    "Project_Number": "N/A",
    "Revision_Number": 0,
    "Scale": "A1 : 1/100, A3 : 1/200",
    "Architects": [
      "Unknown"
    ]
  },
  "Additional_Details": {
    "Number_of_Units": 0,
    "Number_of_Stairs": 0,
    "Number_of_Elevators": 0,
    "Number_of_Hallways": 0,
    "Unit_Details": [],
    "Stairs_Details": [],
    "Elevator_Details": [],
    "Hallways": [],
    "Other_Common_Areas": []
  },
  "Notes_on_Drawing": "N/A",
  "Table_on_Drawing": "N/A"
} ...



In [16]:

# 5) Convert to pandas to view the records as a table

# Display settings: show every column, truncate cell text at 120 characters
pd.set_option("display.max_colwidth", 120)
pd.set_option("display.max_columns", None)

# Nested fields are flattened into dotted column names; with many nested
# fields it is fine to select only a subset of the columns afterwards.
df = pd.json_normalize(data)

df.head(n=5)   # rendered as a table in Jupyter

Unnamed: 0,Drawing_Type,Purpose_of_Building,Client_Name,Project_Title,Drawing_Title,Notes_on_Drawing,Table_on_Drawing,Space_Classification.Communal,Space_Classification.Private,Space_Classification.Service,Details.Drawing_Number,Details.Project_Number,Details.Revision_Number,Details.Scale,Details.Architects,Additional_Details.Number_of_Units,Additional_Details.Number_of_Stairs,Additional_Details.Number_of_Elevators,Additional_Details.Number_of_Hallways,Additional_Details.Unit_Details,Additional_Details.Stairs_Details,Additional_Details.Elevator_Details,Additional_Details.Hallways,Additional_Details.Other_Common_Areas
0,Elevation,Residential,둔촌주공아파트주택 재건축정비사업조합,둔촌주공아파트 주택재건축정비사업,분산상가-1 입면도-2 (근린생활시설-3),,,[],[],[],A51-2012,,0,"A1 : 1/100, A3 : 1/200",[Unknown],0,0,0,0,[],[],[],[],[]
1,Floor_Plan,Commercial,둔촌주공아파트주택 재건축정비사업조합,둔촌주공아파트 주택재건축정비사업,분산상가-1 배치도 (근린생활시설-3),1. 담장설치는 단지내부가보이는 투시형구조나 생울타리로 하고 지구단위계획 등에 따라 미설치 될 수 있음.\n2. 단지내 및 단지주변 계획고는 공사시 실측현황에 따라 변경될 수 있음.\n3. 소방차 주차공간 내...,,[],[],[],A51-2000,,0,"A1 : 1/100, A3 : 1/200",[Unknown],0,0,0,0,[],[],[],[],[]
2,Section_View,Commercial,둔촌주공아파트주택 재건축정비사업조합,둔촌주공아파트 주택재건축정비사업,분산 상가-1 단면도-4 (근린생활시설-3),"1. 옥상 줄눈의 간격 등은 실시공시 변경될 수 있음.\n2. 옥상 줄눈 틈에 실란트 시공되지 않음.\n3. 지붕의 재료, 형태, 구조는 실시공시 변경될 수 있음.\n4. 지붕층 난간의 형태와 설치 위치는 안...",,[],[],[],A51-2024,,0,"A1 : 1/100, A3 : 1/200",[Unknown],0,0,0,0,[],[],[],[],[]
3,Elevation,Residential,둔촌주공아파트주택 재건축정비사업조합,둔촌주공아파트 주택재건축정비사업,분산상가-1 입면도-1 (근린생활시설-3),,,[],[],[],A51-2011,,0,"A1 : 1/100, A3 : 1/200",[Unknown],0,0,0,0,[],[],[],[],[]
4,Floor_Plan,Residential,둔촌주공아파트주택 재건축정비사업조합,둔촌주공아파트 주택재건축정비사업,분산상가-1 지붕 평면도 (근린생활시설-3),1. 층별 LEVEL 기준 - 지붕층 LEVEL - SL±0 = FL±0 = EL+38.70\n2. 옥상 쭁눌의 간격 등은 실시공시 변경될 수 있음.\n3. 옥상 쭁눌 등에는 실란드 시공되지 않음.\n4. 지...,,[],[],[],A51-2009,,0,"A1 : 1/100, A3 : 1/200",[Unknown],0,0,0,0,[],[],[],[],[]


In [17]:
# Save as CSV (UTF-8 with BOM, without the index column)
df.to_csv(
    path_or_buf="gemini_combined.csv",
    encoding="utf-8-sig",
    index=False,
)