# In [None]:
import os
import numpy as np
import spacy
from collections import Counter, OrderedDict
from spacy.lang.en.stop_words import STOP_WORDS

# Load the 'en_core_web_lg' spaCy pipeline once, at module level. The
# TextRank4Keyword methods below read this shared `nlp` object (its vocab for
# stop-word flags and the pipeline itself for parsing text).
nlp = spacy.load('en_core_web_lg')

class TextRank4Keyword:
    """Score the candidate words of a text with a TextRank-style iteration.

    Typical use: create an instance, call :meth:`analyze` with a text, then
    read the resulting ``word -> score`` mapping from ``node_weight``.

    The class relies on the module-level ``nlp`` spaCy pipeline and on
    ``STOP_WORDS`` being available in the enclosing module.
    """

    def __init__(self):
        self.d = 0.85  # damping coefficient used in the score update
        self.min_diff = 1e-5  # stop iterating once the total score moves less than this
        self.steps = 10  # maximum number of update iterations
        self.node_weight = None  # dict word -> score, filled in by analyze()

    def set_stopwords(self, stopwords):
        """Flag the default stop words plus ``stopwords`` in the shared vocab.

        Every word in ``STOP_WORDS`` united with ``stopwords`` gets
        ``is_stop = True`` on its lexeme in ``nlp.vocab``. Because ``nlp`` is a
        module-level object, the flags persist across instances and calls.

        Args:
            stopwords: Iterable of additional words to treat as stop words.
        """
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True

    def sentence_segment(self, doc, candidate_pos, lower):
        """Collect the candidate words of each sentence of ``doc``.

        Args:
            doc: Parsed document exposing ``.sents``; each token must provide
                ``pos_``, ``is_stop`` and ``text``.
            candidate_pos: Container of part-of-speech tags to keep.
            lower: When true, the kept words are lower-cased.

        Returns:
            A list with one list of kept words per sentence (possibly empty).
        """
        sentences = []
        for sent in doc.sents:
            selected_words = [
                token.text.lower() if lower else token.text
                for token in sent
                if token.pos_ in candidate_pos and not token.is_stop
            ]
            sentences.append(selected_words)
        return sentences

    def get_vocab(self, sentences):
        """Map every distinct word to an index in order of first appearance.

        Args:
            sentences: Iterable of word lists.

        Returns:
            An ``OrderedDict`` ``word -> index`` with indices ``0..n-1``.
        """
        vocab = OrderedDict()
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    # The next free index is simply the current vocab size.
                    vocab[word] = len(vocab)
        return vocab

    def get_token_pairs(self, window_size, sentences):
        """Build the unique ordered word pairs that co-occur within a window.

        For each word at position ``i`` in a sentence, it is paired with the
        words at positions ``i + 1 .. i + window_size - 1`` of that sentence.

        Args:
            window_size: Window width, counting the word itself.
            sentences: Iterable of word lists.

        Returns:
            List of ``(word, later_word)`` tuples without duplicates, in order
            of first occurrence.
        """
        token_pairs = []
        # Set mirror of token_pairs: constant-time duplicate checks instead of
        # scanning the growing list for every candidate pair.
        seen = set()
        for sentence in sentences:
            for i, word in enumerate(sentence):
                stop = min(i + window_size, len(sentence))
                for j in range(i + 1, stop):
                    pair = (word, sentence[j])
                    if pair not in seen:
                        seen.add(pair)
                        token_pairs.append(pair)
        return token_pairs

    def symmetrize(self, a):
        """Return ``a + a.T`` with the diagonal of ``a`` counted only once."""
        return a + a.T - np.diag(a.diagonal())

    def get_matrix(self, vocab, token_pairs):
        """Build the column-normalized, symmetrized co-occurrence matrix.

        Args:
            vocab: Mapping ``word -> index``.
            token_pairs: Iterable of ``(word1, word2)`` pairs; both words must
                be keys of ``vocab``.

        Returns:
            A ``(len(vocab), len(vocab))`` float array in which every column
            with a non-zero sum is divided by that sum. Columns whose sum is
            zero (words that take part in no pair) are all zeros.
        """
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            g[vocab[word1], vocab[word2]] = 1
        g = self.symmetrize(g)
        norm = np.sum(g, axis=0)
        # `where` leaves masked-out entries untouched, so an explicit zeroed
        # `out` buffer is required; without it those entries are uninitialized.
        return np.divide(g, norm, out=np.zeros_like(g), where=norm != 0)

    def analyze(self, text, candidate_pos=('NOUN', 'PROPN'), window_size=4, lower=True, stopwords=()):
        """Compute a score for every candidate word of ``text``.

        The result is stored in ``self.node_weight`` as a ``word -> score``
        dict; the method itself returns ``None``.

        Args:
            text: Raw text to analyze.
            candidate_pos: Part-of-speech tags of the words to keep.
            window_size: Co-occurrence window passed to get_token_pairs().
            lower: Lower-case the kept words when true.
            stopwords: Extra stop words, in addition to ``STOP_WORDS``.
        """
        self.set_stopwords(stopwords)
        doc = nlp(text)
        sentences = self.sentence_segment(doc, candidate_pos, lower)
        vocab = self.get_vocab(sentences)
        token_pairs = self.get_token_pairs(window_size, sentences)
        g = self.get_matrix(vocab, token_pairs)

        # Every word starts with a score of 1.
        pr = np.ones(len(vocab))
        previous_total = 0
        for _ in range(self.steps):
            pr = (1 - self.d) + self.d * np.dot(g, pr)
            total = pr.sum()
            # Converged: the summed score barely changed since the last step.
            if abs(previous_total - total) < self.min_diff:
                break
            previous_total = total

        self.node_weight = {word: pr[index] for word, index in vocab.items()}

def generate_ngrams(words, n):
    """Return every run of ``n`` consecutive words joined by single spaces.

    Args:
        words: Sequence of strings.
        n: Number of words per n-gram.

    Returns:
        List of space-joined n-grams in order of their starting position;
        empty when ``words`` holds fewer than ``n`` items.
    """
    ngrams = []
    window_count = len(words) - n + 1
    for start in range(window_count):
        chunk = words[start:start + n]
        ngrams.append(' '.join(chunk))
    return ngrams

def load_ground_truth_keywords(key_path):
    """Read the reference keywords stored one per line in ``key_path``.

    Each line is stripped of surrounding whitespace and lower-cased. Lines
    that are empty after stripping are skipped, so blank lines in the file do
    not turn into an empty-string keyword.

    Args:
        key_path: Path of a UTF-8 encoded text file.

    Returns:
        List of keywords in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(key_path, 'r', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        return [keyword.lower() for keyword in stripped_lines if keyword]

def evaluate_keywords(predicted, ground_truth):
    """Compare predicted keywords with reference keywords as sets.

    Duplicates within either input are ignored.

    Args:
        predicted: Iterable of predicted keywords.
        ground_truth: Iterable of reference keywords.

    Returns:
        Tuple ``(precision, recall, f1)``. A measure is 0 when its
        denominator would be zero.
    """
    predicted_unique = set(predicted)
    truth_unique = set(ground_truth)
    hit_count = len(predicted_unique & truth_unique)

    precision = 0
    if predicted_unique:
        precision = hit_count / len(predicted_unique)

    recall = 0
    if truth_unique:
        recall = hit_count / len(truth_unique)

    # Guard the harmonic mean against a zero denominator.
    if precision + recall == 0:
        return precision, recall, 0

    f1 = 2 * precision * recall / (precision + recall)
    return precision, recall, f1

def evaluate_all_files(text_folder='docsutf8', key_folder='keys', top_n=10):
    """Extract keywords from every ``.txt`` file and score them against keys.

    For each ``<name>.txt`` in ``text_folder`` (in sorted order) the text is
    analyzed with ``TextRank4Keyword``. Candidate keywords are the scored
    words plus the 2- and 3-grams built from the ordered keys of
    ``node_weight``; an n-gram is scored with the mean weight of its words.
    The ``top_n`` best candidates are compared with the keywords loaded from
    ``<name>.key`` in ``key_folder``.

    Per-file precision/recall/F1 and their averages are printed; for the
    first evaluated file the predicted and reference keywords are printed as
    well. Text files without a matching key file are reported and skipped.

    Args:
        text_folder: Directory holding the UTF-8 ``.txt`` documents.
        key_folder: Directory holding the matching ``.key`` files.
        top_n: Number of top-ranked candidates used as the prediction.

    Returns:
        None. All results are written to standard output.
    """
    scores = []
    first_file_done = False

    for filename in sorted(os.listdir(text_folder)):
        if not filename.endswith('.txt'):
            continue

        base_name = os.path.splitext(filename)[0]
        text_path = os.path.join(text_folder, filename)
        key_path = os.path.join(key_folder, base_name + '.key')

        # A document without a reference key file cannot be scored. Skip it
        # before the analysis rather than aborting the whole run on open().
        if not os.path.isfile(key_path):
            print(f"\n Anahtar dosyası bulunamadı, atlanıyor: {key_path}")
            continue

        with open(text_path, 'r', encoding='utf-8') as f:
            text = f.read()

        tr4w = TextRank4Keyword()
        tr4w.analyze(text)
        node_weight = tr4w.node_weight

        keywords_1gram = list(node_weight.keys())

        all_keywords = Counter()
        for kw in keywords_1gram:
            all_keywords[kw] += node_weight.get(kw, 1)
        # An n-gram is scored with the mean weight of its words; a word
        # missing from node_weight counts as 1.
        for n in (2, 3):
            for kw in generate_ngrams(keywords_1gram, n):
                all_keywords[kw] += sum([node_weight.get(w, 1) for w in kw.split()]) / n

        predicted_keywords = [kw for kw, _ in all_keywords.most_common(top_n)]
        ground_truth = load_ground_truth_keywords(key_path)

        # Show the detailed keyword lists only once, for the first scored file.
        if not first_file_done:
            print(f"\n İlk dosya: {filename}")
            print("\n Tahmin Edilen Anahtar Kelimeler:")
            for kw in predicted_keywords:
                print(f"  - {kw}")
            print("\n Gerçek Anahtar Kelimeler:")
            for kw in ground_truth:
                print(f"  - {kw}")
            first_file_done = True

        precision, recall, f1 = evaluate_keywords(predicted_keywords, ground_truth)
        scores.append((precision, recall, f1))
        print(f"\n📄 {filename} -> Precision: {precision:.2f}, Recall: {recall:.2f}, F1: {f1:.2f}")

    if scores:
        avg_precision = sum([s[0] for s in scores]) / len(scores)
        avg_recall = sum([s[1] for s in scores]) / len(scores)
        avg_f1 = sum([s[2] for s in scores]) / len(scores)
        print(f"\n Ortalama Sonuçlar -> Precision: {avg_precision:.2f}, Recall: {avg_recall:.2f}, F1: {avg_f1:.2f}")
    else:
        print("Hiç uygun eşleşme bulunamadı.")


# Run the evaluation as soon as this code executes: documents are read from
# 'docsutf8', reference keys from 'keys', and the 10 best candidates per
# document are scored.
evaluate_all_files('docsutf8', 'keys', top_n=10)
