# In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import euclidean
from tqdm import tqdm

# Names of the computed feature columns, in the order they appear in the
# output CSV (between "question_id" and "label").
FEATURE_COLUMNS = (
    "cosine_similarity",
    "euclidean_distance",
    "dot_product",
    "l1_difference",
    "length_difference",
    "mean_difference",
    "max_difference",
)


def clean_text_column(column):
    """Return a pandas text column as a list of ``str``.

    ``pd.read_csv`` parses empty cells (and tokens such as "NA" or "null")
    as float NaN, which would break both ``len()`` and text encoding.
    Missing values are therefore mapped to the empty string.
    """
    return column.fillna("").astype(str).tolist()


def compute_pair_features(q_embs, a_embs, questions, answers):
    """Compute pairwise similarity features for question/answer pairs.

    All embedding features are computed row-wise with vectorized NumPy
    operations instead of one library call per pair. The values match the
    per-pair ``cosine_similarity`` / ``euclidean`` results up to
    floating-point rounding.

    Args:
        q_embs: 2-D array-like, one question embedding per row.
        a_embs: 2-D array-like with the same shape as ``q_embs``.
        questions: Sequence of question strings, one per row.
        answers: Sequence of answer strings, one per row.

    Returns:
        A dict mapping each name in ``FEATURE_COLUMNS`` (in that order) to a
        1-D NumPy array with one value per pair.

    Raises:
        ValueError: If the inputs disagree on the number of pairs, or the
            embedding arrays are not 2-D arrays of identical shape.
    """
    q_mat = np.asarray(q_embs)
    a_mat = np.asarray(a_embs)
    n_pairs = len(questions)

    if not (len(answers) == len(q_mat) == len(a_mat) == n_pairs):
        raise ValueError(
            "questions, answers and both embedding arrays must have the "
            "same number of rows"
        )

    # Absolute difference of the raw text lengths (in characters).
    length_diffs = np.array(
        [abs(len(q_txt) - len(a_txt)) for q_txt, a_txt in zip(questions, answers)],
        dtype=np.int64,
    )

    if n_pairs == 0:
        # Nothing to compute; an empty encode() result may not even be 2-D.
        features = {name: np.zeros(0) for name in FEATURE_COLUMNS}
        features["length_difference"] = length_diffs
        return features

    if q_mat.ndim != 2 or q_mat.shape != a_mat.shape:
        raise ValueError(
            "q_embs and a_embs must be 2-D arrays of identical shape, got "
            f"{q_mat.shape} and {a_mat.shape}"
        )

    diff = q_mat - a_mat
    dot_prods = (q_mat * a_mat).sum(axis=1)

    # A zero vector gets norm 1 so its cosine similarity is 0 instead of NaN
    # (the same convention sklearn's cosine_similarity applies).
    q_norms = np.linalg.norm(q_mat, axis=1)
    a_norms = np.linalg.norm(a_mat, axis=1)
    q_norms[q_norms == 0] = 1
    a_norms[a_norms == 0] = 1

    return {
        "cosine_similarity": dot_prods / (q_norms * a_norms),
        "euclidean_distance": np.linalg.norm(diff, axis=1),
        "dot_product": dot_prods,
        "l1_difference": np.abs(diff).sum(axis=1),
        "length_difference": length_diffs,
        "mean_difference": np.abs(q_mat.mean(axis=1) - a_mat.mean(axis=1)),
        "max_difference": np.abs(q_mat.max(axis=1) - a_mat.max(axis=1)),
    }


# The pipeline runs when this file is executed as a script or notebook cell
# (where __name__ is "__main__"), but not when it is merely imported.
if __name__ == "__main__":
    # 1. Load the raw WikiQA data together with question_id.
    df = pd.read_csv("wikiqa_train.csv")
    # df columns: ['question_id', 'question', 'document_title', 'answer', 'label']

    # 2. Reset the index. All rows are used; take a sample here if desired.
    sample_df = df.reset_index(drop=True)

    # 3. Load the Sentence-BERT model.
    model = SentenceTransformer("all-MiniLM-L6-v2")

    # 4. Collect the texts as lists (missing cells become empty strings).
    questions = clean_text_column(sample_df["question"])
    answers = clean_text_column(sample_df["answer"])

    # 5. Compute the embeddings.
    print("Embedding hesaplanıyor...")
    q_embs = model.encode(questions, convert_to_numpy=True, show_progress_bar=True)
    a_embs = model.encode(answers, convert_to_numpy=True, show_progress_bar=True)

    # 6. Compute the features.
    print("Özellikler hesaplanıyor...")
    features = compute_pair_features(q_embs, a_embs, questions, answers)

    # 7. Combine question_id, all features and the label into one frame.
    features_df = pd.DataFrame({
        "question_id": sample_df["question_id"],
        **features,
        "label": sample_df["label"],
    })

    # 8. Save under a train-specific name (use a separate name for test data).
    features_df.to_csv("advanced_embedding_features_train.csv", index=False)
    print("✅ advanced_embedding_features_train.csv başarıyla oluşturuldu.")
