In [1]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import torch
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Model name/path handed to SentenceTransformer. Kept under its own name so that
# `embedding_model` only ever refers to the loaded model object, never a string.
EMBEDDING_MODEL_NAME = "ytu-ce-cosmos/turkish-e5-large"

# Use the GPU when torch reports CUDA as available; otherwise run on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME, device=device)

In [3]:
csv_file = "soru_cevap_dataset_500.csv"

if not os.path.exists(csv_file):
    # Raise instead of calling exit(): inside a notebook exit() asks the kernel
    # to shut down, whereas an exception stops the run with a visible traceback
    # and leaves the kernel (and the already-loaded model) alive.
    raise FileNotFoundError(
        f"HATA: '{csv_file}' dosyası bulunamadı.\n"
        "Lütfen önce soru üretim kodunu (generate_batch_questions.py) çalıştırın."
    )

print(f"Veri seti yükleniyor: {csv_file}")
df = pd.read_csv(csv_file)

# Fail early, with a clear message, if a column read by the later cells is absent.
required_columns = ["Soru", "Iyi_Cevap", "Kotu_Cevap"]
missing_columns = [name for name in required_columns if name not in df.columns]
if missing_columns:
    raise KeyError(
        f"HATA: '{csv_file}' dosyasında eksik sütun(lar): {missing_columns}"
    )

print(f"Toplam {len(df)} kayıt işlenecek.")

Veri seti yükleniyor: soru_cevap_dataset_500.csv
Toplam 500 kayıt işlenecek.


In [4]:
def get_detailed_instruct(task_description: str, query: str) -> str:
    """Build the instruction-prefixed text for a single query.

    Args:
        task_description: Text placed after the ``Instruct: `` label.
        query: Text placed after the ``Query: `` label on the second line.

    Returns:
        A two-line string: ``Instruct: <task_description>``, a newline, then
        ``Query: <query>``.
    """
    template = 'Instruct: {}\nQuery: {}'
    return template.format(task_description, query)

# Instruction text shared by every question; it is combined with each entry of
# the 'Soru' column through get_detailed_instruct.
task = (
    'Given a Turkish search query, retrieve relevant passages '
    'written in Turkish that best answer the query'
)
formatted_questions = list(
    map(lambda question: get_detailed_instruct(task, question), df['Soru'])
)

In [5]:
# Batch size used for every encode call in this cell.
ENCODE_BATCH_SIZE = 32


def _encode_normalized(texts):
    """Embed ``texts`` with ``embedding_model`` using this cell's shared settings.

    All three text columns are encoded with exactly the same options, so they
    are defined once here instead of being repeated per call.

    Args:
        texts: List of strings to embed.

    Returns:
        The result of ``embedding_model.encode`` called with
        ``convert_to_tensor=False`` and ``normalize_embeddings=True``.
    """
    return embedding_model.encode(
        texts,
        convert_to_tensor=False,
        normalize_embeddings=True,
        show_progress_bar=True,
        batch_size=ENCODE_BATCH_SIZE,
    )


# Questions are passed in their instruction-formatted form; both answer columns
# are passed as the raw column text.
question_embeddings = _encode_normalized(formatted_questions)
good_answer_embeddings = _encode_normalized(df['Iyi_Cevap'].tolist())
bad_answer_embeddings = _encode_normalized(df['Kotu_Cevap'].tolist())


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches: 100%|██████████| 16/16 [00:02<00:00,  5.44it/s]
Batches: 100%|██████████| 16/16 [00:03<00:00,  4.85it/s]
Batches: 100%|██████████| 16/16 [00:01<00:00, 11.24it/s]


In [6]:

# Destination of the pickled DataFrame (original columns plus the three
# embedding columns added below).
output_file = "soru_cevap_embeddings.pkl"

# Store one embedding per row: iterating each encode result yields its items in
# order, and each item becomes the cell value for the matching row.
df['Soru_Embedding'] = [vector for vector in question_embeddings]
df['Iyi_Cevap_Embedding'] = [vector for vector in good_answer_embeddings]
df['Kotu_Cevap_Embedding'] = [vector for vector in bad_answer_embeddings]

df.to_pickle(output_file)