In [26]:
import json
from pathlib import Path

import faiss
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel

In [27]:
# Run configuration: input/output locations and the model identifier used by
# the cells below.
embedding_json_path = "./embedding/emb_v1.json"  # JSON file holding the precomputed corpus embeddings
input_csv_path = "./test_dataset/own_dataset_trans_v1.csv"  # CSV of queries; the search loop reads its 'query' column
output_csv_path = "./result/result_v3.csv"  # destination CSV for the retrieval results
model_name = "BAAI/bge-m3"  # identifier passed to AutoTokenizer / AutoModel.from_pretrained

# Load the query table into a DataFrame.
query_data = pd.read_csv(input_csv_path)

In [28]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the tokenizer and the encoder for `model_name`; the encoder is moved to
# `device` so it sits on the same device as the tensors built per query.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)

In [29]:
# Read the precomputed corpus records; every record carries its vector under
# the "embedding" key alongside its other fields.
with open(embedding_json_path, "r", encoding="utf-8") as embedding_file:
    embedded_data = json.load(embedding_file)

# `metadata` keeps the full records and `embeddings` stacks their vectors in
# the same order, so row i of `embeddings` belongs to metadata[i].
metadata = list(embedded_data)
embeddings = np.array(
    [record["embedding"] for record in metadata],
    dtype="float32",
)

In [30]:
# Build a flat L2 FAISS index over the corpus vectors. Vectors are added in
# array order, and the search helper uses the returned ids as positions into
# `metadata`.
# NOTE(review): the index ranks by L2 distance; if the stored vectors were meant
# for cosine / inner-product similarity, confirm they were normalised upstream.
index = faiss.IndexFlatL2(embeddings.shape[1])  # dimensionality = width of the embedding matrix
index.add(embeddings)

In [31]:
def get_query_embedding(query, tokenizer, model, device):
    """Encode `query` and return the hidden state of its first token.

    Args:
        query: Text handed directly to `tokenizer`.
        tokenizer: Callable tokenizer; invoked with padding and truncation
            enabled, `max_length=512`, and PyTorch tensors as output.
        model: Encoder whose output exposes `last_hidden_state`.
        device: Device the tokenized tensors are moved to before the
            forward pass.

    Returns:
        A NumPy array (on the CPU) with the hidden state at sequence
        position 0; the leading batch axis is squeezed away when it has
        size 1.

    NOTE(review): the vector is returned as-is, without L2 normalisation —
    confirm this matches how the indexed embeddings were produced.
    """
    encoded = tokenizer(
        query,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )

    # Move every tokenizer output tensor onto the target device.
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(device)

    # Inference only: no autograd bookkeeping for the forward pass.
    with torch.no_grad():
        hidden_states = model(**on_device).last_hidden_state

    first_token = hidden_states[:, 0, :]
    return first_token.squeeze(0).cpu().numpy()

In [32]:
def search_similar_faiss(query_embedding, index, metadata, top_k=5):
    """Look up the nearest indexed vectors for a single query embedding.

    Args:
        query_embedding: NumPy array holding one query vector; it is reshaped
            to a single row and cast to float32 before searching.
        index: Object exposing `search(queries, k)` that returns a
            `(distances, indices)` pair (e.g. a FAISS index).
        metadata: Sequence indexed by the ids returned from `index.search`.
        top_k: Maximum number of neighbours to request.

    Returns:
        A list of `{"item": metadata[idx], "distance": distance}` dicts in the
        order returned by the index. The list is shorter than `top_k` when
        the index could not supply that many neighbours.
    """
    query_matrix = query_embedding.reshape(1, -1).astype("float32")
    distances, indices = index.search(query_matrix, top_k)  # run the search

    results = []
    for distance, idx in zip(distances[0], indices[0]):
        # FAISS fills missing result slots with id -1 when fewer than top_k
        # neighbours exist. metadata[-1] would silently return the last entry
        # as a bogus hit, so those slots are dropped.
        if idx < 0:
            continue
        results.append({"item": metadata[idx], "distance": distance})
    return results

In [35]:
# Fail fast: DataFrame.to_csv does not create missing directories, so make sure
# the output folder exists before spending time on the embedding loop.
Path(output_csv_path).parent.mkdir(parents=True, exist_ok=True)

results = []

# Only the 'query' column is used, so iterate it directly instead of building
# a Series per row with iterrows().
for query in query_data["query"]:
    query_embedding = get_query_embedding(query, tokenizer, model, device)
    top_results = search_similar_faiss(query_embedding, index, metadata, top_k=5)
    retrieved_items = [result["item"] for result in top_results]

    # One output row per query; each list column is ordered by search rank,
    # so position i in every list refers to the same retrieved segment.
    results.append({
        "original_query": query,
        "texts": [item["description"] for item in retrieved_items],
        "video_ids": [item["video_id"] for item in retrieved_items],
        "starts": [item["start"] for item in retrieved_items],
        "ends": [item["end"] for item in retrieved_items],
    })

results_df = pd.DataFrame(results)

# The list-valued cells are written as their string form. "utf-8-sig" prepends
# a BOM to the file (presumably for spreadsheet tools — confirm if it matters).
results_df.to_csv(output_csv_path, index=False, encoding="utf-8-sig")