In [None]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# --- Model / runtime -------------------------------------------------------
# Hugging Face model id passed to SentenceTransformer.
# NOTE(review): the prompt format must match the checkpoint — per the model
# cards, plain E5 models expect "query: " / "passage: " prefixes while the
# "-instruct" variants expect "Instruct: ...\nQuery: ..." prompts. Keep the
# model id and the prompt-building code in sync.
EMBEDDING_MODEL_NAME = "intfloat/multilingual-e5-large"

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Number of texts handed to the encoder per forward pass.
BATCH_SIZE = 32

# --- Prompt ----------------------------------------------------------------
# Task description placed in front of each question by get_detailed_instruct.
TASK = (
    "Given a Turkish search query, retrieve relevant passages "
    "written in Turkish that best answer the query"
)

# --- Files -----------------------------------------------------------------
# Input CSV with the question / good-answer / bad-answer columns.
CSV_FILE = "soru_cevap_dataset_500.csv"
# Pickle file the DataFrame (including the embedding columns) is written to.
OUTPUT_FILE = "e5_large_embeddings.pkl"

In [None]:
def get_detailed_instruct(task_description: str, query: str) -> str:
    """Build a two-line instruction prompt for a single query.

    Args:
        task_description: Text placed after the ``Instruct: `` label.
        query: Text placed after the ``Query: `` label on the second line.

    Returns:
        The string ``"Instruct: <task_description>\\nQuery: <query>"``.
    """
    prompt_template = 'Instruct: {}\nQuery: {}'
    return prompt_template.format(task_description, query)

In [None]:


def _is_instruct_model(model_name: str) -> bool:
    """Return True when ``model_name`` denotes an instruction-tuned E5 checkpoint.

    The decision is a simple name heuristic: the instruction-tuned E5 releases
    carry "instruct" in their Hugging Face model id.
    """
    return "instruct" in model_name.lower()


def _encode_normalized(model, texts):
    """Encode ``texts`` with ``model`` using the notebook-wide encode settings.

    Args:
        model: A loaded ``SentenceTransformer``.
        texts: List of strings to embed.

    Returns:
        Whatever ``model.encode`` returns for ``convert_to_tensor=False`` with
        ``normalize_embeddings=True`` (one embedding per input text).
    """
    return model.encode(
        texts,
        convert_to_tensor=False,
        normalize_embeddings=True,
        show_progress_bar=True,
        batch_size=BATCH_SIZE,
    )


# Text columns the CSV must provide: question, good answer, bad answer.
_TEXT_COLUMNS = ['Soru', 'Iyi_Cevap', 'Kotu_Cevap']

print(f"Veri seti yükleniyor: {CSV_FILE}")
df = pd.read_csv(CSV_FILE)

# Fail early with a clear message instead of crashing inside the encoder
# (or silently embedding the literal text "nan" for an empty question).
_missing_columns = [col for col in _TEXT_COLUMNS if col not in df.columns]
if _missing_columns:
    raise KeyError(f"{CSV_FILE} is missing required column(s): {_missing_columns}")
_null_counts = df[_TEXT_COLUMNS].isna().sum()
if _null_counts.any():
    raise ValueError(
        f"{CSV_FILE} contains empty cells in text columns: "
        f"{_null_counts[_null_counts > 0].to_dict()}"
    )

print(f"Toplam {len(df)} kayıt işlenecek. Kullanılacak cihaz: {DEVICE}")
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME, device=DEVICE)

# The prompt format has to match what the checkpoint was trained with:
#   * instruction-tuned E5 ("...-instruct"): "Instruct: ...\nQuery: ..." for
#     queries, documents are encoded without any prefix;
#   * plain E5 (e.g. multilingual-e5-large): "query: " for queries and
#     "passage: " for documents.
# Feeding the instruct prompt to a plain E5 model (as this cell did before)
# degrades retrieval quality, so the format is chosen from the model id.
if _is_instruct_model(EMBEDDING_MODEL_NAME):
    formatted_questions = [get_detailed_instruct(TASK, q) for q in df['Soru']]
    _passage_prefix = ''
else:
    formatted_questions = [f'query: {q}' for q in df['Soru']]
    _passage_prefix = 'passage: '

question_embeddings = _encode_normalized(embedding_model, formatted_questions)
df['Soru_Embedding_E5Large'] = list(question_embeddings)

good_answer_embeddings = _encode_normalized(
    embedding_model,
    [f'{_passage_prefix}{answer}' for answer in df['Iyi_Cevap']],
)
df['Iyi_Cevap_Embedding_E5Large'] = list(good_answer_embeddings)

bad_answer_embeddings = _encode_normalized(
    embedding_model,
    [f'{_passage_prefix}{answer}' for answer in df['Kotu_Cevap']],
)
df['Kotu_Cevap_Embedding_E5Large'] = list(bad_answer_embeddings)

# Persist the full frame (texts + one embedding per row and column) as a pickle.
df.to_pickle(OUTPUT_FILE)
print(f"Başarıyla kaydedildi. Çıktı dosyası: {OUTPUT_FILE}")
