In [None]:
import os
import numpy as np
import spacy
from collections import Counter, OrderedDict
from spacy.lang.en.stop_words import STOP_WORDS

# Load the spaCy 'en_core_web_lg' pipeline once at module level; the class and
# function below use this shared `nlp` object for parsing and stop-word flags.
nlp = spacy.load('en_core_web_lg')

class TextRank4KeywordGlobal():
    """Score words with a TextRank-style iteration over a co-occurrence graph.

    Typical use: build word lists with ``sentence_segment``, pass them to
    ``analyze``, then read the resulting ``word -> score`` mapping from
    ``self.node_weight``.
    """

    def __init__(self):
        self.d = 0.85  # damping factor used in the rank update
        self.min_diff = 1e-5  # stop iterating once the total score moves less than this
        self.steps = 10  # upper bound on the number of update iterations
        self.node_weight = None  # dict word -> score; populated by analyze()

    def set_stopwords(self, stopwords):
        """Flag spaCy's STOP_WORDS plus ``stopwords`` as stop words.

        The flag is set on lexemes of the module-level ``nlp`` vocabulary, so
        the effect is shared by everything that uses that ``nlp`` object, not
        just this instance.

        Args:
            stopwords: Iterable of extra words to treat as stop words.
        """
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True

    def sentence_segment(self, doc, candidate_pos, lower):
        """Collect the candidate words of every sentence in ``doc``.

        Args:
            doc: Parsed document exposing ``sents``; each token must provide
                ``pos_``, ``is_stop`` and ``text``.
            candidate_pos: Container of part-of-speech tags to keep.
            lower: When true, kept words are lower-cased.

        Returns:
            A list of word lists, one per sentence that has at least one
            kept word (sentences with none are dropped).
        """
        sentences = []
        for sent in doc.sents:
            selected_words = [
                token.text.lower() if lower else token.text
                for token in sent
                if token.pos_ in candidate_pos and not token.is_stop
            ]
            if selected_words:
                sentences.append(selected_words)
        return sentences

    def get_vocab(self, sentences):
        """Map each distinct word to an index in order of first appearance.

        Args:
            sentences: Iterable of word lists.

        Returns:
            An ``OrderedDict`` of ``word -> index`` with indices 0..n-1.
        """
        vocab = OrderedDict()
        for sentence in sentences:
            for word in sentence:
                # setdefault only assigns on first sight, so indices stay dense.
                vocab.setdefault(word, len(vocab))
        return vocab

    def get_token_pairs(self, window_size, sentences):
        """Return the distinct ordered word pairs that co-occur in a window.

        Each word is paired with the (up to) ``window_size - 1`` words that
        follow it in the same sentence.

        Args:
            window_size: Size of the co-occurrence window, counting the word
                itself.
            sentences: Iterable of word lists.

        Returns:
            A list of ``(word, following_word)`` tuples without duplicates,
            in order of first occurrence.
        """
        token_pairs = []
        # A set gives O(1) duplicate checks; scanning the result list made
        # this quadratic in the number of pairs on large corpora.
        seen = set()
        for sentence in sentences:
            sentence_len = len(sentence)
            for i, word in enumerate(sentence):
                for j in range(i + 1, min(i + window_size, sentence_len)):
                    pair = (word, sentence[j])
                    if pair not in seen:
                        seen.add(pair)
                        token_pairs.append(pair)
        return token_pairs

    def symmetrize(self, a):
        """Return ``a + a.T`` with the diagonal counted only once."""
        return a + a.T - np.diag(a.diagonal())

    def get_matrix(self, vocab, token_pairs):
        """Build the column-normalised, symmetric adjacency matrix.

        Args:
            vocab: Mapping ``word -> index`` as produced by ``get_vocab``.
            token_pairs: Iterable of ``(word1, word2)`` pairs; both words
                must be keys of ``vocab``.

        Returns:
            A ``(len(vocab), len(vocab))`` float array in which every column
            with at least one edge sums to 1 and every other column is 0.
        """
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            g[vocab[word1], vocab[word2]] = 1
        g = self.symmetrize(g)
        norm = np.sum(g, axis=0)
        # Without `out`, entries where the mask is False are left as
        # uninitialised memory; start from zeros so isolated words get 0.
        return np.divide(g, norm, out=np.zeros_like(g), where=norm != 0)

    def analyze(self, sentences, window_size=4):
        """Compute a score for every word and store it in ``self.node_weight``.

        Args:
            sentences: List of word lists (e.g. from ``sentence_segment``).
            window_size: Co-occurrence window passed to ``get_token_pairs``.

        Returns:
            None. The ``word -> score`` dict is stored on ``self.node_weight``
            (an empty dict when ``sentences`` contains no words).
        """
        vocab = self.get_vocab(sentences)
        token_pairs = self.get_token_pairs(window_size, sentences)
        g = self.get_matrix(vocab, token_pairs)
        pr = np.ones(len(vocab))
        previous_total = 0
        for _ in range(self.steps):
            pr = (1 - self.d) + self.d * np.dot(g, pr)
            total = pr.sum()
            # Converged once the summed score barely changes between steps.
            if abs(previous_total - total) < self.min_diff:
                break
            previous_total = total
        self.node_weight = {word: pr[index] for word, index in vocab.items()}


def global_textrank_local_keywords(text_folder='docsutf8', top_n=10):
    """Print each file's top keywords, ranked by a corpus-wide TextRank.

    Every ``.txt`` file in ``text_folder`` is parsed with the module-level
    ``nlp`` pipeline and reduced to its non-stop-word NOUN/PROPN tokens
    (lower-cased). One graph is built over the words of all files; each file
    is then reported with its own words ordered by their global score.

    Args:
        text_folder: Directory containing UTF-8 encoded ``.txt`` files.
        top_n: Maximum number of keywords printed per file.

    Returns:
        None. Results are written to stdout.
    """
    # set_stopwords flags lexemes on the shared `nlp` vocab, so one scorer and
    # one call are enough; there is no need to repeat this for every file.
    global_tr = TextRank4KeywordGlobal()
    global_tr.set_stopwords([])

    all_sentences = []
    file_sentences = {}
    for filename in sorted(os.listdir(text_folder)):
        if not filename.endswith(".txt"):
            continue
        with open(os.path.join(text_folder, filename), 'r', encoding='utf-8') as f:
            text = f.read()
        # Parse after the file is closed so the handle is not held during NLP.
        sents = global_tr.sentence_segment(
            nlp(text), candidate_pos=['NOUN', 'PROPN'], lower=True
        )
        file_sentences[filename] = sents
        all_sentences.extend(sents)

    print("🔄 Global TextRank modeli oluşturuluyor...")
    global_tr.analyze(all_sentences)
    scores = global_tr.node_weight

    for filename, sents in file_sentences.items():
        # Every local word was also fed to analyze(), so it has a score.
        local_words = {word for sent in sents for word in sent}
        # Break score ties alphabetically; set iteration order would otherwise
        # make the printed ranking differ between runs.
        ranked_keywords = sorted(
            ((word, scores[word]) for word in local_words),
            key=lambda item: (-item[1], item[0]),
        )[:top_n]

        print(f"\n📄 {filename} için anahtar kelimeler:")
        for word, score in ranked_keywords:
            print(f"  - {word} ({score:.4f})")


# Run the extraction on the 'docsutf8' folder and print the top 10 keywords per file.
global_textrank_local_keywords('docsutf8', top_n=10)


Processing file: docsutf8\C-1.txt
Top 10 phrases (max 3 words, highest ranked):
UDDI registry
query service registry
multiple UDDI registry
service proxy registry
network UDDI registry
federated UDDI service
UDDI DHT
similar UDDI registry
UDDI V3 registry
private UDDI registry
----------------------------------------
Processing file: docsutf8\C-14.txt
Top 10 phrases (max 3 words, highest ranked):
deploy sensor
sensor high exposure
cost sensor
number sensor
deploy sensor time
sensor
increase number sensor
energy sensor
optimal number sensor
vary number sensor
----------------------------------------
Processing file: docsutf8\C-17.txt
Top 10 phrases (max 3 words, highest ranked):
client conference
packet client
domain conference
CS conference
stream client
audio conference time
NMax audio packet
conferencing conference
packet remote domain
number active client
----------------------------------------
Processing file: docsutf8\C-18.txt


KeyboardInterrupt: 