In [None]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load spaCy's "en_core_web_sm" pipeline once at module level so that
# extract_keywords_spacy can reuse the same `nlp` object on every call.
nlp = spacy.load("en_core_web_sm")

# Extract the keywords (nouns, verbs, adjectives) of a text
def extract_keywords_spacy(text):
    """Return the set of lowercased keywords found in ``text``.

    The text is processed with the module-level spaCy pipeline ``nlp``.
    A token is kept when its part-of-speech tag is NOUN, VERB or ADJ and
    it is not flagged as a stop word. Because the result is a set, repeated
    words are reported once.
    """
    wanted_pos = ["NOUN", "VERB", "ADJ"]
    found = set()
    for tok in nlp(text):
        if tok.pos_ not in wanted_pos:
            continue
        if not tok.is_stop:
            found.add(tok.text.lower())
    return found

# Keyword overlap ratio
def keyword_overlap_spacy(q, a):
    """Return the Jaccard overlap between the keywords of ``q`` and ``a``.

    Keywords for both texts come from ``extract_keywords_spacy``. The result
    is ``|shared keywords| / |all keywords|``, or ``0.0`` when neither text
    yields any keyword (which avoids a division by zero).
    """
    question_terms = extract_keywords_spacy(q)
    answer_terms = extract_keywords_spacy(a)

    all_terms = question_terms | answer_terms
    if not all_terms:
        return 0.0

    shared_terms = question_terms & answer_terms
    return len(shared_terms) / len(all_terms)

# Load the data; both frames get a fresh 0..n-1 index so they can be joined
# column-wise by row position further down.
# NOTE(review): assumes both CSVs hold the same rows in the same order - confirm.
text_df = pd.read_csv("wikiqa_test.csv")
text_df = text_df.reset_index(drop=True)
embed_df = pd.read_csv("advanced_embedding_features_test.csv")
embed_df = embed_df.reset_index(drop=True)

# Compute the keyword overlap for every question/answer pair
print("🔁 SpaCy keyword overlap hesaplanıyor...")
text_df["keyword_overlap"] = text_df.apply(
    lambda pair: keyword_overlap_spacy(pair["question"], pair["answer"]),
    axis=1,
)

# Compute the TF-IDF cosine similarity
print("🔁 TF-IDF cosine similarity hesaplanıyor...")
# Fit on questions and answers together so both sides share one vocabulary.
tfidf = TfidfVectorizer()
tfidf.fit(text_df["question"].tolist() + text_df["answer"].tolist())
q_vecs = tfidf.transform(text_df["question"])
a_vecs = tfidf.transform(text_df["answer"])
# Compare each question row only with the answer row at the same position.
text_df["tfidf_cosine"] = [
    cosine_similarity(q_row, a_row)[0][0]
    for q_row, a_row in zip(q_vecs, a_vecs)
]

# Combine the features: embedding columns first, then the two new columns
final_df = pd.concat(
    [embed_df, text_df[["keyword_overlap", "tfidf_cosine"]]],
    axis=1,
)

# Save as CSV
final_df.to_csv("final_feature_set_test.csv", index=False)
print("✅ final_feature_set_test.csv başarıyla oluşturuldu.")