In [None]:

#  GEREKLİ KÜTÜPHANELER

import re, string
from pathlib import Path
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
nlp = spacy.load("en_core_web_sm")


#  CONSTANTS

# Input locations, all relative to the working directory.
DOCS_PATH = Path("docsutf8")            # directory globbed for *.txt documents
KEYS_PATH = Path("keys")                # directory holding the matching *.key files
STOPWORD_PATH = Path("stopwords.txt")   # one stop word per line

# Number of top-ranked terms kept as predicted keywords per document.
TOP_K_KEYWORDS = 10

# ASCII punctuation characters stripped from text before lemmatization.
PUNCTUATION = string.punctuation


#  YARDIMCI FONKSİYONLAR

def get_stopwords_list(path: Path):
    """Read a stop-word file and return its non-blank lines as a list.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are skipped. The file is decoded as UTF-8.
    """
    words = []
    with open(path, encoding="utf-8") as handle:
        for raw_line in handle:
            word = raw_line.strip()
            if word:
                words.append(word)
    return words


def lemmatize_text(text):
    """Return *text* with every token replaced by its lemma.

    The text is run through the module-level spaCy pipeline ``nlp``; tokens
    flagged as punctuation are dropped and the remaining lemmas are joined
    with single spaces.
    """
    lemmas = []
    for token in nlp(text):
        if token.is_punct:
            continue
        lemmas.append(token.lemma_)
    return " ".join(lemmas)

def clean_text(text: str) -> str:
    """Normalize *text* and return its lemmatized form.

    Steps, in order: lowercase, delete every character listed in
    ``PUNCTUATION`` (deleted, not replaced by a space, so hyphenated words
    are fused), collapse whitespace runs to one space, trim, lemmatize.
    """
    lowered = text.lower()
    # A deletion-only translation table removes all punctuation in one pass.
    without_punct = lowered.translate(str.maketrans("", "", PUNCTUATION))
    collapsed = re.sub(r"\s+", " ", without_punct).strip()
    return lemmatize_text(collapsed)

def read_key_file(path: Path):
    """Read a ``.key`` file and return its unique, cleaned keyphrases.

    Every line is passed through ``clean_text``; lines that come out empty
    are discarded and duplicates are removed. The result keeps the order in
    which each keyphrase first appears in the file.

    Args:
        path: Location of the UTF-8 encoded key file, one keyphrase per line.

    Returns:
        A list of distinct cleaned keyphrases in file order.
    """
    with open(path, encoding="utf-8") as f:
        cleaned = (clean_text(line) for line in f)
        # dict.fromkeys de-duplicates while preserving insertion order; going
        # through a set would make the order vary between interpreter runs.
        return list(dict.fromkeys(key for key in cleaned if key))

def sort_coo(coo_matrix):
    """Return the ``(col, data)`` pairs of *coo_matrix*, highest value first.

    Pairs are ordered by descending ``data`` value; ties are broken by
    descending column index.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first *topn* ``(index, score)`` pairs to their feature names.

    *sorted_items* is taken as already ranked; only its leading *topn*
    entries are used, and the score part of each pair is ignored.
    """
    top_terms = []
    for feature_idx, _score in sorted_items[:topn]:
        top_terms.append(feature_names[feature_idx])
    return top_terms

def get_keywords(vectorizer, feature_names, doc):
    """Return the ``TOP_K_KEYWORDS`` highest-weighted terms of *doc*.

    *doc* is transformed with the given (already fitted) *vectorizer*; the
    resulting weights are ranked with ``sort_coo`` and the leading column
    indices are looked up in *feature_names*.
    """
    doc_vector = vectorizer.transform([doc]).tocoo()
    ranked_items = sort_coo(doc_vector)
    return extract_topn_from_vector(feature_names, ranked_items, topn=TOP_K_KEYWORDS)


#  READ THE DOCUMENTS

# Sort the paths: glob yields entries in a filesystem-dependent order, and the
# order here determines the row order of every later result.
file_paths = sorted(DOCS_PATH.glob("*.txt"))
if not file_paths:
    raise FileNotFoundError(f"{DOCS_PATH} içinde .txt dosyası yok!")

# corpora[i] is the cleaned text of the document whose stem is basenames[i].
corpora, basenames = [], []
for fp in file_paths:
    corpora.append(clean_text(fp.read_text(encoding="utf-8")))
    basenames.append(fp.stem)


#  TF-IDF & KEYWORD PREDICTION

stopwords = get_stopwords_list(STOPWORD_PATH)

vectorizer = TfidfVectorizer(
    stop_words=stopwords,
    smooth_idf=True,
    use_idf=True,
    ngram_range=(1, 3),  # unigrams, bigrams and trigrams are all candidates
)

# Only the fitted vocabulary and IDF weights are needed at this point: each
# document is transformed individually in get_keywords, so building the full
# corpus matrix with fit_transform and discarding it would be wasted work.
vectorizer.fit(corpora)
feature_names = vectorizer.get_feature_names_out()

# predictions[i] holds the top-ranked terms for corpora[i].
predictions = [get_keywords(vectorizer, feature_names, doc) for doc in corpora]


#  READ THE REFERENCE .key FILES

# ground_truth[i] is the keyphrase list for basenames[i]; a document without a
# matching key file aborts the run.
ground_truth = []
for base in basenames:
    key_fp = KEYS_PATH.joinpath(f"{base}.key")
    if key_fp.exists():
        ground_truth.append(read_key_file(key_fp))
        continue
    raise FileNotFoundError(f"{key_fp} bulunamadı!")




#  EVALUATION

# One result row per document: exact-match overlap between the predicted
# keywords and the reference keyphrases, with precision@k and recall@k where
# k is TOP_K_KEYWORDS (used instead of a repeated literal so the cut-off,
# the denominator and the column labels cannot drift apart).
rows = []
for fname, pred, true in zip(basenames, predictions, ground_truth):
    top_pred = pred[:TOP_K_KEYWORDS]
    pred_set = set(top_pred)
    true_set = set(true)
    intersect = pred_set & true_set

    # A document with no reference keyphrases gets recall 0 rather than a
    # division by zero.
    recall_at_k = len(intersect) / len(true_set) if true_set else 0.0
    # Precision is taken over the fixed cut-off k, even if fewer terms were
    # predicted.
    precision_at_k = len(intersect) / TOP_K_KEYWORDS

    rows.append({
        "dosya": fname,
        "true_keywords": len(true_set),
        "doğru": len(intersect),
        f"precision@{TOP_K_KEYWORDS}": round(precision_at_k, 3),
        f"recall@{TOP_K_KEYWORDS}": round(recall_at_k, 3),
        "→ tahmin": ", ".join(top_pred),
        "→ gerçek": ", ".join(true),
    })

df_eval = pd.DataFrame(rows)


# Persist the per-document results; the BOM-prefixed UTF-8 encoding is kept
# as in the original export.
df_eval.to_csv("sonuc.csv", index=False, encoding="utf-8-sig")


# `display` is provided by the IPython/Jupyter environment.
display(df_eval)

# Macro average: unweighted mean of the per-document scores. The column names
# must match the ones written by the evaluation step ("precision@10" and
# "recall@10" for TOP_K_KEYWORDS == 10); the bare names "precision"/"recall"
# do not exist in df_eval and raise KeyError.
print("\nORTALAMA:")
for m in (f"precision@{TOP_K_KEYWORDS}", f"recall@{TOP_K_KEYWORDS}"):
    print(f"  {m:12}: {df_eval[m].mean():.3f}")




Unnamed: 0,dosya,true_keywords,doğru,precision@10,recall@10,→ tahmin,→ gerçek
0,C-1,19,4,0.4,0.211,"uddi, registry, dht, service, uddi registry, p...","bamboo dht code, uddi registry, query, dht bas..."
1,C-14,14,3,0.3,0.214,"sensor, exposure, deploy, minimum exposure, se...","path exposure, number of sensor, value fusion,..."
2,C-17,16,1,0.1,0.062,"nmax, packet, client, css, stream, sip, audio,...","packetswitche network, audio service framework..."
3,C-18,14,2,0.2,0.143,"worm, swarm, password, swarm worm, host, zachi...","countermeasure system, internet worm, emergent..."
4,C-19,13,5,0.5,0.385,"protocol module, module, protocol, service int...","dynamic protocol replacement, service interfac..."
...,...,...,...,...,...,...,...
238,J-72,19,2,0.2,0.105,"valuation, elicitation, query, equivalence que...","combinatorial auction, xor bid, polynomial com..."
239,J-73,14,2,0.2,0.143,"vwap, limit order, price, sell, competitive ra...","online model, stock trading, volume weight ave..."
240,J-74,12,4,0.4,0.333,"θi, biθi, auction, bid, fθi, cheat, agent, sel...","bidsecondprice auction, sealedbid, auction, se..."
241,J-8,23,2,0.2,0.087,"player, connection game, strong equilibrium, g...","player number, graph topology, fair connection..."



MAKRO ORTALAMA:
  precision@10: 0.166
  recall@10   : 0.114
