In [5]:
# prompt: drive mount  할래

from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
# prompt: /content/drive/MyDrive/학회/ybigta/신기프/code/nonfiction_embedding.csv
# 파일을 df로 가져올래

import pandas as pd

file_path = '/content/drive/MyDrive/학회/ybigta/신기프/code/nonfiction_embedding.csv'
df = pd.read_csv(file_path)


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81 entries, 0 to 80
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   book_name  81 non-null     object
 1   genre      81 non-null     object
 2   embedding  81 non-null     object
dtypes: object(3)
memory usage: 2.0+ KB


In [14]:
df['embedding'] = df['embedding'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
df

Unnamed: 0,book_name,genre,embedding
0,A Short History of Nearly Everything,"['Nonfiction', 'Science', 'History', 'Audioboo...","[-0.033360451459884644, 0.01089590322226286, 0..."
1,"Thinking, Fast and Slow","['Nonfiction', 'Psychology', 'Self Help', 'Bus...","[0.011774116195738316, -0.01793689653277397, -..."
2,Quiet: The Power of Introverts in a World That...,"['Nonfiction', 'Psychology', 'Self Help', 'Aud...","[-0.028620585799217224, -0.060751207172870636,..."
3,Factfulness: Ten Reasons We're Wrong About the...,"['Nonfiction', 'Science', 'Psychology', 'Econo...","[0.010631420649588108, 0.0004959274083375931, ..."
4,The Power of Habit: Why We Do What We Do in Li...,"['Nonfiction', 'Self Help', 'Psychology', 'Bus...","[-0.013105737045407295, -0.01798441633582115, ..."
...,...,...,...
76,The Poisoner's Handbook: Murder and the Birth ...,"['Nonfiction', 'History', 'Science', 'True Cri...","[-0.00828291941434145, 0.005662049166858196, -..."
77,Born a Crime: Stories From a South African Chi...,"['Nonfiction', 'Memoir', 'Biography', 'Audiobo...","[-0.03137453272938728, 0.03304319828748703, -0..."
78,The Day of Battle: The War in Sicily and Italy...,"['History', 'Nonfiction', 'Military History', ...","[-0.07813172787427902, 0.014277449809014797, -..."
79,Dead Wake: The Last Crossing of the Lusitania,"['Nonfiction', 'History', 'Audiobook', 'Histor...","[-0.01616678014397621, -0.03029453381896019, -..."


In [15]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import ast
# --- 1. 전처리: 'Nonfiction' 제거 ---
def remove_nonfiction(genres):
    if isinstance(genres, list):
        return [g for g in genres if g != 'Nonfiction']
    return genres

# df는 이미 'book_name', 'genre', 'embedding' 컬럼을 갖고 있다고 가정합니다.
df['genre'] = df['genre'].apply(remove_nonfiction)

# --- 2. Contrastive Loss 학습 데이터 구성 ---
def convert_embedding_str_to_list(s):
    """
    embedding 문자열 s가 쉼표 없이 공백으로 구분된 숫자 문자열일 경우,
    이를 Python 리스트로 변환합니다.
    """
    # 양쪽 공백 및 대괄호 제거
    s = s.strip().strip("[]")
    # 공백을 기준으로 분리
    elements = s.split()
    # 각 원소를 float으로 변환
    return [float(x) for x in elements]


class BookPairsDataset(Dataset):
    def __init__(self, df, num_pairs=2000):
        self.df = df.reset_index(drop=True)
        self.num_books = len(self.df)
        self.pairs = []
        self.labels = []
        np.random.seed(42)
        for _ in range(num_pairs):
            i, j = np.random.choice(self.num_books, 2, replace=False)
            genres_i = set(self.df.loc[i, 'genre'])
            genres_j = set(self.df.loc[j, 'genre'])
            # 두 책이 공유하는 장르가 있으면 positive, 없으면 negative
            label = 1 if len(genres_i.intersection(genres_j)) > 0 else 0
            self.pairs.append((i, j))
            self.labels.append(label)

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        i, j = self.pairs[idx]
        emb_i = self.df.loc[i, 'embedding']
        emb_j = self.df.loc[j, 'embedding']

        # embedding 값이 문자열인 경우 처리
        if isinstance(emb_i, str):
            # 먼저 ast.literal_eval 시도, 실패하면 커스텀 함수 사용
            try:
                emb_i = ast.literal_eval(emb_i)
            except Exception:
                emb_i = convert_embedding_str_to_list(emb_i)

        if isinstance(emb_j, str):
            try:
                emb_j = ast.literal_eval(emb_j)
            except Exception:
                emb_j = convert_embedding_str_to_list(emb_j)

        label = self.labels[idx]
        # tensor 변환 (리스트 내부의 값들이 float여야 함)
        emb_i = torch.tensor(emb_i, dtype=torch.float)
        emb_j = torch.tensor(emb_j, dtype=torch.float)
        # CosineEmbeddingLoss는 positive면 1, negative면 -1 사용
        target = torch.tensor(1.0 if label == 1 else -1.0, dtype=torch.float)
        return emb_i, emb_j, target


dataset = BookPairsDataset(df, num_pairs=2000)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# --- 3. 모델 정의 및 학습 ---
class EmbeddingRefiner(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(EmbeddingRefiner, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, output_dim)
        )

    def forward(self, x):
        return self.fc(x)

# 원래 임베딩 차원 (예: df['embedding']가 numpy array이므로 길이를 구함)
input_dim = len(df.loc[0, 'embedding'])
output_dim = input_dim  # 차원 축소 혹은 확장을 원하면 변경 가능

model = EmbeddingRefiner(input_dim, output_dim)
model.train()

# Contrastive loss 역할을 하는 CosineEmbeddingLoss 사용
criterion = nn.CosineEmbeddingLoss(margin=0.5)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

num_epochs = 10
for epoch in range(num_epochs):
    epoch_loss = 0.0
    for emb_i, emb_j, target in dataloader:
        optimizer.zero_grad()
        out_i = model(emb_i)
        out_j = model(emb_j)
        loss = criterion(out_i, out_j, target)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item() * emb_i.size(0)
    epoch_loss /= len(dataset)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")

# --- 4. 클러스터링 및 평가 ---
# 모든 책에 대한 refined embedding 계산
model.eval()
refined_embeddings = []
with torch.no_grad():
    for i in range(len(df)):
        emb = torch.tensor(df.loc[i, 'embedding'], dtype=torch.float).unsqueeze(0)
        refined = model(emb).squeeze(0).numpy()
        refined_embeddings.append(refined)

# 예시로 KMeans를 이용해 5개의 클러스터로 분할 (클러스터 수는 조정 가능)
kmeans = KMeans(n_clusters=5, random_state=42)
cluster_labels = kmeans.fit_predict(refined_embeddings)

# 실루엣 점수 계산
sil_score = silhouette_score(refined_embeddings, cluster_labels)
print("Silhouette Score:", sil_score)


Epoch 1/10, Loss: 0.0010
Epoch 2/10, Loss: 0.0000
Epoch 3/10, Loss: 0.0000
Epoch 4/10, Loss: 0.0000
Epoch 5/10, Loss: 0.0000
Epoch 6/10, Loss: 0.0000
Epoch 7/10, Loss: 0.0000
Epoch 8/10, Loss: 0.0000
Epoch 9/10, Loss: 0.0000
Epoch 10/10, Loss: 0.0000
Silhouette Score: 0.46406475


In [2]:
# %% [markdown]
# ## Nonfiction Book Embedding 분석
# 1. 모듈 임포트 설정
# 2. JSON 데이터 로드
# 3. 임베딩 생성 및 검증

# %%
import sys
import os
import json
import numpy as np
from pathlib import Path

# 모듈 경로 설정 (Windows 경로 정규화)
module_path = os.path.abspath('app/BE/models')
if module_path not in sys.path:
    sys.path.insert(0, module_path)

# %%
try:
    from app.BE.models.book_embedding import generate_book_embedding
    print("✅ 모듈 임포트 성공!")
except ImportError as e:
    print(f"❌ 모듈 임포트 실패: {e}")
    raise

# %%
# JSON 데이터 로드
json_path = os.path.abspath('database/nonfiction_data.json')

try:
    with open(json_path, 'r', encoding='utf-8') as f:
        books = json.load(f)
    print(f"📚 {len(books)}권의 책 데이터 로드 완료")
except Exception as e:
    print(f"❗ JSON 파일 로드 오류: {e}")
    raise

# %%
# 첫 번째 책 처리 예시
sample_book = books[0]
print(f"\n🔍 분석 대상: {sample_book['book_name']}")

# 임베딩 생성
embedding_vector = generate_book_embedding(sample_book)

# 결과 출력
print(f"\n📐 임베딩 차원: {embedding_vector.shape}")
print(f"📊 처음 5개 값: {embedding_vector[:5].round(4)}")
print(f"📈 L2 Norm: {np.linalg.norm(embedding_vector):.4f}")

# %%
# 전체 책 처리 (옵션)
print("\n⏳ 전체 책 처리 시작...")
for idx, book in enumerate(books[:3]):  # 상위 3개만 시연
    emb = generate_book_embedding(book)
    print(f"\n{idx+1}. {book['book_name'][:30]}...")
    print(f"   → 리뷰 수: {len(book.get('reviews', []))}")
    print(f"   → 임베딩 Norm: {np.linalg.norm(emb):.4f}")
    
print("\n✅ 처리 완료!")


❌ 모듈 임포트 실패: No module named 'app'


ModuleNotFoundError: No module named 'app'