In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
import faiss
from sklearn.preprocessing import normalize

# 1️⃣ 데이터 로드
df = pd.read_excel("kurly.xlsx")

# Keep only the columns needed for retrieval and for displaying results.
data = df[['브랜드', '상품이름', 'URL', '가격', '리뷰수', '태깅', '키워드']].copy()

# '청크' is the text every retriever below is built from. Empty spreadsheet
# cells load as NaN (a float), which TfidfVectorizer rejects and which has no
# .split() for the BM25 tokenization, so coerce them to empty strings.
data['청크'] = data['키워드'].fillna('').astype(str)

In [None]:
# 2️⃣ TF-IDF 인코딩 생성
# The default token_pattern r"(?u)\b\w\w+\b" keeps only tokens of two or more
# characters, which silently drops one-syllable Korean keywords (e.g. "빵").
# Accept single-character tokens so they stay in the vocabulary, as they do in
# the whitespace-split tokens used for BM25.
tfidf_vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
tfidf_matrix = tfidf_vectorizer.fit_transform(data['청크'])

In [None]:
# 3️⃣ BM25 준비
# BM25Okapi takes a pre-tokenized corpus: one list of whitespace-separated
# tokens per product chunk, in the same row order as `data`.
tokenized_corpus = [document.split() for document in data['청크']]
bm25 = BM25Okapi(tokenized_corpus)

In [None]:
# 4️⃣ Semantic Embedding 준비
embedding_model = SentenceTransformer("BAAI/bge-m3")

# Encode the whole corpus in one call so the model processes it in batches,
# instead of running a separate forward pass per row through Series.apply.
chunk_embeddings = embedding_model.encode(data['청크'].tolist())

# Store one 1-D vector per row (assigned by position) so downstream cells can
# keep reading data['embedding'] exactly as before.
data['embedding'] = list(chunk_embeddings)

In [None]:
# 5️⃣ FAISS 인덱스 생성
# Stack the per-row vectors into a single (n_rows, dim) matrix.
embedding_matrix = np.vstack(data['embedding'].to_numpy())

# search_products L2-normalizes the query vector before searching, so the
# indexed vectors must be unit-length as well; with both sides normalized,
# ranking by L2 distance is equivalent to ranking by cosine similarity.
# FAISS also requires C-contiguous float32 input, so enforce that explicitly.
embedding_matrix = np.ascontiguousarray(
    normalize(embedding_matrix, axis=1), dtype=np.float32
)

# Take the dimensionality from the matrix shape rather than from the label
# lookup data['embedding'][0], which depends on the index containing label 0.
d = embedding_matrix.shape[1]
faiss_index = faiss.IndexFlatL2(d)
faiss_index.add(embedding_matrix)

In [None]:
# 6️⃣ 상품 추천 함수
def _min_max_scale(scores):
    """Rescale a 1-D score array to [0, 1]; a constant or empty array maps to zeros."""
    scores = np.asarray(scores, dtype=float)
    if scores.size == 0:
        return scores
    low, high = scores.min(), scores.max()
    if high == low:
        # No signal separates the documents, so contribute nothing to the fusion.
        return np.zeros_like(scores)
    return (scores - low) / (high - low)


def search_products(query, top_k=5):
    """Return the top_k products for a free-text query using hybrid retrieval.

    Three signals are computed for every row of `data`:
      1. TF-IDF dot product between the query and each chunk,
      2. BM25 score of the whitespace-tokenized query,
      3. a rank-based score from the FAISS nearest-neighbour search
         (best match gets the highest value).

    Each signal is min-max scaled to [0, 1] before being summed. The raw
    values live on very different scales (TF-IDF in 0..1, BM25 unbounded,
    semantic rank in 1..N), so an unscaled sum is dominated by the semantic
    rank and the lexical signals barely affect the ordering.

    Parameters
    ----------
    query : str
        Free-text search query.
    top_k : int, default 5
        Number of products to return.

    Returns
    -------
    pandas.DataFrame
        Up to top_k rows of `data`, best first, with the product columns and
        a '점수' column holding the fused score (between 0 and 3).
    """
    n_docs = len(data)

    # (1) TF-IDF: sparse matrix-vector product gives one score per document.
    tfidf_query_vec = tfidf_vectorizer.transform([query])
    tfidf_scores = (tfidf_matrix @ tfidf_query_vec.T).toarray().ravel()

    # (2) BM25 over the whitespace-tokenized query.
    bm25_scores = np.asarray(bm25.get_scores(query.split()), dtype=float)

    # (3) Dense retrieval: FAISS returns row positions ordered nearest first.
    query_embedding = embedding_model.encode(query)
    query_embedding = np.ascontiguousarray(
        normalize(query_embedding.reshape(1, -1), axis=1), dtype=np.float32
    )
    _, semantic_indices = faiss_index.search(query_embedding, n_docs)
    hits = semantic_indices[0]
    hits = hits[hits >= 0]  # FAISS pads with -1 when it has fewer results than asked
    semantic_scores = np.zeros(n_docs)
    semantic_scores[hits] = n_docs - np.arange(len(hits))

    # (4) Fuse the three signals on a common [0, 1] scale.
    combined_scores = (
        _min_max_scale(tfidf_scores)
        + _min_max_scale(bm25_scores)
        + _min_max_scale(semantic_scores)
    )
    # Stable sort on the negated scores: descending order, ties keep row order.
    ranked_indices = np.argsort(-combined_scores, kind="stable")

    # Return the top-k rows together with their fused score.
    top_indices = ranked_indices[:top_k]
    results = data.iloc[top_indices].copy()
    results['점수'] = combined_scores[top_indices]
    return results[['브랜드', '상품이름', 'URL', '가격', '리뷰수', '태깅', '점수']]

In [None]:
# 7️⃣ 상품 추천 결과 보기
# Example query (Korean): asks for a filling sandwich to eat as a snack.
query = "간식으로 먹을 수 있는 든든한 샌드위치 추천해줘"

# Fetch the five best matches and display them as the cell output.
recommendations = search_products(query, top_k=5)
recommendations