In [11]:
import json

import numpy as np

def extract_first_json_object(text):
    """
    Return the substring of ``text`` holding the first brace-delimited JSON object.

    Scanning begins at the first '{' and stops at the '}' that balances it.
    Braces that appear while inside a double-quoted string are not counted,
    and a double quote preceded by an unescaped backslash does not open or
    close a string.

    Args:
        text: The string to search.

    Returns:
        The slice of ``text`` from the first '{' through its matching '}'.
        The slice is returned as-is; it is not validated as JSON.

    Raises:
        ValueError: If ``text`` has no '{', or the braces never balance.
    """
    first_brace = text.find('{')
    if first_brace < 0:
        raise ValueError("JSON 객체를 찾을 수 없습니다.")

    depth = 0
    inside_string = False
    escaped = False

    for position, symbol in enumerate(text[first_brace:], start=first_brace):
        # Remember whether this character was escaped, then compute the flag
        # for the next one: a backslash escapes only if it is not itself escaped.
        previously_escaped = escaped
        escaped = symbol == '\\' and not previously_escaped

        # An unescaped double quote opens or closes a string literal.
        if symbol == '"' and not previously_escaped:
            inside_string = not inside_string
            continue

        # Braces inside string literals are plain text.
        if inside_string:
            continue

        if symbol == '{':
            depth += 1
        elif symbol == '}':
            depth -= 1
            if depth == 0:
                # The opening brace has been balanced: the object is complete.
                return text[first_brace:position + 1]

    raise ValueError("완전한 JSON 객체를 찾지 못했습니다.")

# Path to the JSON file
json_file_path = 'database/crawler_output_page1.json'

# Read the whole file into memory.
with open(json_file_path, 'r', encoding='utf-8') as f:
    content = f.read().strip()

# Extract and parse the first JSON object.
# Both extract_first_json_object and json.loads signal bad input with
# ValueError (json.JSONDecodeError is a subclass). Report the problem and
# re-raise so the cell fails at the real cause, rather than continuing and
# hitting a NameError on the undefined `book1` below.
try:
    first_book_str = extract_first_json_object(content)
    first_book = json.loads(first_book_str)
except ValueError as e:
    print("JSON 추출/파싱 오류:", e)
    raise

# `first_book` is the parsed dict; `book1` is its pretty-printed JSON text.
print("첫 번째 책 데이터:")
book1 = json.dumps(first_book, ensure_ascii=False, indent=2)
print(book1)


첫 번째 책 데이터:
{
  "book_name": "The Hunger Games",
  "book_image": "https://images-na.ssl-images-amazon.com/images/S/compressed.photo.goodreads.com/books/1586722975i/2767052.jpg",
  "author": "Suzanne Collins",
  "total_star": "4.34",
  "published": "First published September 14, 2008",
  "pages": "First published September 14, 2008",
  "description": "Could you survive on your own in the wild, with every one out to make sure you don't live to see the morning?\n\nIn the ruins of a place once known as North America lies the nation of Panem, a shining Capitol surrounded by twelve outlying districts. The Capitol is harsh and cruel and keeps the districts in line by forcing them all to send one boy and one girl between the ages of twelve and eighteen to participate in the annual Hunger Games, a fight to the death on live TV.\n\nSixteen-year-old Katniss Everdeen, who lives alone with her mother and younger sister, regards it as a death sentence when she steps forward to take her sister's plac

In [16]:
from sentence_transformers import SentenceTransformer

# `book1` is assigned earlier as a JSON-formatted string (json.dumps output),
# which has no .get(); decode it into a dict here. A dict is accepted
# unchanged in case the kernel already holds a decoded value.
book_data = json.loads(book1) if isinstance(book1, str) else book1

# Load the SentenceTransformer model (example: all-MiniLM-L6-v2)
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embeddings of the individual reviews, in encounter order
review_embeddings = []

print("각 리뷰의 임베딩:")
for i, review_item in enumerate(book_data.get("reviews", [])):
    review_text = review_item.get("review", "")
    # Reviews with empty or missing text are skipped; the printed number
    # still reflects the review's position in the original list.
    if review_text:
        embedding = model.encode(review_text)
        review_embeddings.append(embedding)
        # The full vector can be long, so only the first 5 values are shown.
        print(f"review{i+1} : {embedding[:5]} ...")

# Average the review embeddings to obtain a single book embedding (book vector)
if review_embeddings:
    review_embeddings = np.array(review_embeddings)
    book_embedding = np.mean(review_embeddings, axis=0)
    print("\n책 임베딩 (평균):")
    print(book_embedding)
else:
    print("리뷰 임베딩을 계산할 수 없습니다.")

각 리뷰의 임베딩:
review1 : [-0.01234674 -0.02381009  0.10658932  0.00543345 -0.03751432] ...
review2 : [ 0.02034232 -0.01608867  0.02126512 -0.06065462 -0.03981457] ...
review3 : [ 0.04129183 -0.00793601  0.01407586 -0.08976207 -0.0519838 ] ...
review4 : [ 0.01787456 -0.00039276  0.03605605 -0.04279175 -0.05622929] ...
review5 : [ 0.00076623 -0.05236714  0.04906528 -0.00307945 -0.01747171] ...
review6 : [ 0.02617341  0.01006066 -0.07070003 -0.02034634  0.00938551] ...
review7 : [-0.06688815 -0.01043285  0.04848652 -0.04208755  0.04908629] ...
review8 : [-0.0948926   0.06352112  0.05926307  0.03750743  0.03037364] ...
review9 : [-0.08600448  0.04299655 -0.00097909 -0.06538235 -0.1433155 ] ...
review10 : [ 0.04592688 -0.01154701 -0.0273668  -0.02738163 -0.02892835] ...
review11 : [ 0.01479283  0.01023902  0.05634894 -0.01207443 -0.03094055] ...
review12 : [ 0.0267567   0.0099439  -0.01821836 -0.00947593  0.00349268] ...
review13 : [ 0.00757937 -0.07706585  0.0435085  -0.00267675  0.04713159] .