In [5]:
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

import math

try:
    import numpy
except ImportError:
    numpy = None

from sumy.summarizers._summarizer import AbstractSummarizer
from sumy._compat import Counter


class ConceptSummarizer(AbstractSummarizer):

    threshold = 0.1
    epsilon = 0.1
    _stop_words = frozenset()

    @property
    def stop_words(self):
        return self._stop_words

    @stop_words.setter
    def stop_words(self, words):
        self._stop_words = frozenset(map(self.normalize_word, words))

    def __call__(self, document, sentences_count):
        self._ensure_dependencies_installed()

        sentences_words = [self._to_words_set(s) for s in document.sentences]
        if not sentences_words:
            return tuple()

        tf_metrics = self._compute_tf(sentences_words)
        idf_metrics = self._compute_idf(sentences_words)

        matrix = self._create_matrix(sentences_words, self.threshold, tf_metrics, idf_metrics)
        scores = self.power_method(matrix, self.epsilon)
        ratings = dict(zip(document.sentences, scores))

        return self._get_best_sentences(document.sentences, sentences_count, ratings)

    @staticmethod
    def _ensure_dependencies_installed():
        if numpy is None:
            raise ValueError("Concept summarizer requires NumPy. Please, install it by command 'pip install numpy'.")

    def _to_words_set(self, sentence):
        words = map(self.normalize_word, sentence.words)
        return [self.stem_word(w) for w in words if w not in self._stop_words]

    def _compute_tf(self, sentences):
        tf_values = map(Counter, sentences)

        tf_metrics = []
        for sentence in tf_values:
            metrics = {}
            max_tf = self._find_tf_max(sentence)

            for term, tf in sentence.items():
                metrics[term] = tf / max_tf

            tf_metrics.append(metrics)

        return tf_metrics

    @staticmethod
    def _find_tf_max(terms):
        return max(terms.values()) if terms else 1

    @staticmethod
    def _compute_idf(sentences):
        idf_metrics = {}
        sentences_count = len(sentences)

        for sentence in sentences:
            for term in sentence:
                if term not in idf_metrics:
                    n_j = sum(1 for s in sentences if term in s)
                    idf_metrics[term] = math.log(sentences_count / (1 + n_j))

        return idf_metrics

    def _create_matrix(self, sentences, threshold, tf_metrics, idf_metrics):
        """
        Creates matrix of shape |sentences|×|sentences|.
        """
        # create matrix |sentences|×|sentences| filled with zeroes
        sentences_count = len(sentences)
        matrix = numpy.zeros((sentences_count, sentences_count))
        degrees = numpy.zeros((sentences_count, ))

        for row, (sentence1, tf1) in enumerate(zip(sentences, tf_metrics)):
            for col, (sentence2, tf2) in enumerate(zip(sentences, tf_metrics)):
                matrix[row, col] = self.compute_distance(sentence1, sentence2, tf1, tf2, idf_metrics)

                if matrix[row, col] > threshold:
                    matrix[row, col] = 1.0
                    degrees[row] += 1
                else:
                    matrix[row, col] = 0

        for row in range(sentences_count):
            for col in range(sentences_count):
                if degrees[row] == 0:
                    degrees[row] = 1

                matrix[row][col] = matrix[row][col] / degrees[row]

        return matrix

    #@staticmethod

    @staticmethod
    def power_method(matrix, epsilon):
        transposed_matrix = matrix.T
        sentences_count = len(matrix)
        p_vector = numpy.array([1.0 / sentences_count] * sentences_count)
        lambda_val = 1.0

        while lambda_val > epsilon:
            next_p = numpy.dot(transposed_matrix, p_vector)
            lambda_val = numpy.linalg.norm(numpy.subtract(next_p, p_vector))
            p_vector = next_p

        return p_vector

    @staticmethod
    def compute_distance(sentence1, sentence2, tf1, tf2, idf_metrics):   
        EPSILON = 0.0000000000000001
        result = 0

        # identify common words
        common_words = frozenset(sentence1) & frozenset(sentence2)

        if len(sentence1) > len(sentence2): 
            maxLen = len(sentence1); 
            minLen = len(sentence2) 
        else: 
            maxLen = len(sentence2); 
            minLen = len(sentence1) 

        # calculates similarity
        wordWeightMax = 0; wordWeightMin = 0;
        for term in common_words:
            if wordWeightMax < len(term): wordWeightMax = len(term)
            if wordWeightMin > len(term): wordWeightMin = len(term)
            negationWordWeightMax = 1 - wordWeightMax;
            negationWordWeightMin = 1 - wordWeightMin;

            c1 = 1 if wordWeightMin == 0 else wordWeightMax / wordWeightMin;
            c2 = 1 if wordWeightMax == 0 else wordWeightMin / wordWeightMax;
            c3 = 1 if negationWordWeightMin == 0 else negationWordWeightMax / negationWordWeightMin;
            c4 = 1 if negationWordWeightMax == 0 else negationWordWeightMin / negationWordWeightMax;

            m1 = min(min(c1, c2), 1);
            m2 = min(min(c3, c4), 1);

            result += 0.5*(m1+m2);

        result = math.fabs(result / (minLen + maxLen - len(common_words) + EPSILON));
        return result;

    

In [7]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

LANGUAGE = "english"
# line = open('output/experiments/C1_Mundo_AviaoCongo/sumario_automatico_CSTSumm.txt').read().decode('utf-8') 
line = open('experimentos/01/C1_sumario_automatico_CSTSumm.txt').read().decode('utf-8') 
SENTENCES_COUNT = line.count('.')
print(SENTENCES_COUNT)

# parser = PlaintextParser.from_file("output/experiments/C1_Mundo_AviaoCongo/original.txt", Tokenizer(LANGUAGE))
parser = PlaintextParser.from_file('experimentos/01/C1_original.txt', Tokenizer(LANGUAGE))
stemmer = Stemmer(LANGUAGE)

summarizer = ConceptSummarizer(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)

for sentence in summarizer(parser.document, SENTENCES_COUNT):
    print(sentence)

3
Segundo uma porta-voz da ONU, o avião, de fabricação russa, estava tentando aterrissar no aeroporto de Bukavu em meio a uma tempestade.
Um acidente aéreo na localidade de Bukavu, no leste da República Democrática do Congo (RDC), matou 17 pessoas na quinta-feira à tarde, informou nesta sexta-feira um porta-voz das Nações Unidas.
Um acidente aéreo na localidade de Bukavu, no leste da República Democrática do Congo, matou 17 pessoas na quinta-feira à tarde, informou hoje um porta-voz das Nações Unidas.


In [68]:
# import os, fnmatch
# from sumy.parsers.plaintext import PlaintextParser
# from sumy.nlp.tokenizers import Tokenizer
# from sumy.nlp.stemmers import Stemmer
# from sumy.utils import get_stop_words

# LANGUAGE = "english"
# rootdir = 'output'

# for subdirs, dirs, files in os.walk(rootdir):
#   for dir in dirs:
#     for file in os.listdir(os.path.join(subdirs, dir)):
#       if fnmatch.fnmatch(file, '*original.txt'): 
#         line = open(os.path.join(subdirs, dir, 'sumario_automatico_CSTSumm.txt')).read().decode('latin-1')
#         SENTENCES_COUNT = line.count('.')
#         parser = PlaintextParser.from_file(os.path.join(subdirs, dir, 'original.txt'), Tokenizer(LANGUAGE))
#         stemmer = Stemmer(LANGUAGE)
#         summarizer = ConceptSummarizer(stemmer)
#         summarizer.stop_words = get_stop_words(LANGUAGE)
#         f = open(os.path.join(subdirs, dir, 'concept_summ.txt'), 'w+')
#         for sentence in summarizer(parser.document, SENTENCES_COUNT):
#           f.write(str(sentence))