LSA

In [None]:
import math
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from numpy.linalg import svd as singular_value_decomposition
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import string
import docx
import re

# Download the NLTK resources this cell relies on: 'punkt' backs
# sent_tokenize/word_tokenize and 'stopwords' backs stopwords.words().
nltk.download('punkt')
nltk.download('stopwords')

class LsaSummarizer:
    """Keyword-aware extractive summarizer based on Latent Semantic Analysis.

    A term-by-sentence matrix is built from the raw document text, occurrences
    of the caller's important words get extra weight, and sentences are ranked
    from the singular value decomposition of that matrix.  Sentences that
    contain an important word are always selected before ranked ones.
    """

    MIN_DIMENSIONS = 3
    REDUCTION_RATIO = 1 / 1
    # Amount added to a matrix cell per occurrence of an important word
    # (every other word adds 1).
    IMPORTANT_WORD_WEIGHT = 2.0

    def __init__(self):
        self._stop_words = list(stopwords.words('english'))

    @property
    def stop_words(self):
        """Collection of words that are left out of the term dictionary."""
        return self._stop_words

    @stop_words.setter
    def stop_words(self, words):
        self._stop_words = words

    def __call__(self, document_text, sentences_count, important_words):
        """Summarize ``document_text``.

        Args:
            document_text: Full text of the document.
            sentences_count: Number of sentences wanted in the summary.
            important_words: Iterable of keywords that receive extra weight
                and whose sentences are selected first.

        Returns:
            A list of sentence strings, or ``[]`` when the document contains
            no usable words.
        """
        keywords = self._normalize_keywords(important_words)
        dictionary = self._create_dictionary(document_text, keywords)
        if not dictionary:
            return []

        sentences = sent_tokenize(document_text)

        matrix = self._create_matrix(document_text, dictionary, keywords)
        matrix = self._compute_term_frequency(matrix)
        _, sigma, v = singular_value_decomposition(matrix, full_matrices=False)

        # One rank per matrix column, i.e. per sentence in document order;
        # _get_best_sentences rates the sentences in that same order.
        ranks = iter(self._compute_ranks(sigma, v))
        return self._get_best_sentences(sentences, sentences_count, keywords, lambda s: next(ranks))

    def _normalize_keywords(self, words):
        """Return the non-empty normalized forms of ``words`` as a frozenset."""
        return frozenset(filter(None, map(self.normalize_word, words or ())))

    def _sentence_terms(self, sentence):
        """Return the set of normalized whitespace-separated tokens of ``sentence``."""
        return {self.normalize_word(token) for token in sentence.split()}

    def _create_dictionary(self, document, important_words=()):
        """Map every distinct normalized, non-stop word of ``document`` to a row index.

        Words listed in ``important_words`` are kept even when they are stop
        words; otherwise keywords such as "have" or "should" would be dropped
        here and could never be weighted in the matrix.
        """
        # A set makes the per-token stop-word test O(1) instead of a list scan.
        stop_words = frozenset(self._stop_words) - self._normalize_keywords(important_words)
        unique_words = frozenset(
            w for w in map(self.normalize_word, word_tokenize(document))
            if w and w not in stop_words
        )
        return {w: i for i, w in enumerate(unique_words)}

    def _create_matrix(self, document, dictionary, important_words):
        """Build the (terms x sentences) occurrence matrix.

        Each occurrence of a dictionary word adds 1 to its cell, or
        ``IMPORTANT_WORD_WEIGHT`` when the word is an important word.  The
        weight is additive per occurrence, so repeated keywords grow linearly
        rather than being doubled again on every hit.
        """
        keywords = self._normalize_keywords(important_words)
        sentences = sent_tokenize(document)
        matrix = np.zeros((len(dictionary), len(sentences)))
        for col, sentence in enumerate(sentences):
            for word in word_tokenize(sentence):
                normalized = self.normalize_word(word)
                row = dictionary.get(normalized)
                if row is None:
                    continue
                matrix[row, col] += self.IMPORTANT_WORD_WEIGHT if normalized in keywords else 1.0
        return matrix

    def _compute_term_frequency(self, matrix, smooth=0.4):
        """Scale each column by its maximum and apply smoothing, in place.

        Every cell of a column whose maximum is non-zero becomes
        ``smooth + (1 - smooth) * cell / column_max``; columns whose maximum
        is zero are left untouched.  The same array is returned.
        """
        max_word_frequencies = np.max(matrix, axis=0)
        usable = max_word_frequencies != 0
        matrix[:, usable] = smooth + (1.0 - smooth) * (
            matrix[:, usable] / max_word_frequencies[usable]
        )
        return matrix

    def _compute_ranks(self, sigma, v_matrix):
        """Return one rank per column of ``v_matrix``.

        The rank is the square root of the sum of ``sigma_i**2 * v_i**2`` over
        the first ``dimensions`` singular values, where ``dimensions`` is at
        least ``MIN_DIMENSIONS``.
        """
        dimensions = max(self.MIN_DIMENSIONS, int(len(sigma) * self.REDUCTION_RATIO))
        powered_sigma = [s ** 2 if i < dimensions else 0.0 for i, s in enumerate(sigma)]

        ranks = []
        for column_vector in v_matrix.T:
            rank = sum(s * value ** 2 for s, value in zip(powered_sigma, column_vector))
            ranks.append(math.sqrt(rank))
        return ranks

    def _get_best_sentences(self, sentences, sentences_count, important_words, rank_fn):
        """Pick ``sentences_count`` sentences, keyword sentences first.

        ``rank_fn`` is called once per sentence, in order.  Sentences that
        contain an important word as a whole token (case-insensitive) are taken
        first, in document order; any remaining slots are filled with the
        highest-ranked other sentences.
        """
        keywords = self._normalize_keywords(important_words)
        ranked_sentences = [(rank_fn(sentence), sentence) for sentence in sentences]
        ranked_sentences.sort(reverse=True, key=lambda pair: pair[0])
        if sentences_count <= 0:
            return []

        # Token matching avoids substring hits such as "has" inside "chase".
        relevant_sentences = [
            sentence for sentence in sentences
            if not keywords.isdisjoint(self._sentence_terms(sentence))
        ]
        selected_sentences = relevant_sentences[:sentences_count]

        additional_sentences_needed = sentences_count - len(selected_sentences)
        if additional_sentences_needed > 0:
            already_selected = set(selected_sentences)
            remaining_sentences = [
                sentence for _, sentence in ranked_sentences
                if sentence not in already_selected
            ]
            selected_sentences.extend(remaining_sentences[:additional_sentences_needed])

        return selected_sentences

    def normalize_word(self, word):
        """Lower-case ``word`` and drop every non-alphanumeric character."""
        return ''.join(e for e in word if e.isalnum()).lower()

# Helper that reads a .docx file and returns its text
def read_docx_text(path):
    """Return the non-blank paragraphs of the .docx at ``path`` joined by newlines."""
    document = docx.Document(path)
    kept = []
    for paragraph in document.paragraphs:
        content = paragraph.text
        # Paragraphs that are empty or whitespace-only are skipped.
        if content.strip():
            kept.append(content)
    return "\n".join(kept)

# Script entry point
if __name__ == '__main__':
    ruta_docx = "Case Study 3.docx"
    text = read_docx_text(ruta_docx)

    keywords = ["need", "want", "have", "has", "should", "require", "demand", "must"]
    lower_keywords = [k.lower() for k in keywords]

    # Count the sentences that contain at least one keyword as a whole token.
    all_sentences = sent_tokenize(text)
    count = 0
    for s in all_sentences:
        words = {w.strip(string.punctuation).lower() for w in word_tokenize(s)}
        if any(k in words for k in lower_keywords):
            count += 1

    print(f"Se encontraron {count} oraciones que contienen alguna de las palabras clave: {keywords}")

    # Build the summary with as many sentences as keyword sentences were found.
    summarizer = LsaSummarizer()
    resumen = summarizer(text, count, important_words=lower_keywords)

    # Kept inside the guard: `resumen` only exists when the script actually
    # ran, so joining it at module level raised NameError on import.
    summary_textLsaMod = ' '.join(resumen)
    print(summary_textLsaMod)



LexRank

In [None]:
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

import math
from collections import Counter, namedtuple
from operator import attrgetter

try:
    import numpy
except ImportError:
    numpy = None

def null_stemmer(word):
    """Identity stemmer: return ``word`` unchanged."""
    unchanged = word
    return unchanged

def to_unicode(text, encoding='utf-8', errors='strict'):
    """Decode ``text`` when it is ``bytes``; return any other value untouched.

    ``encoding`` and ``errors`` are forwarded to ``bytes.decode``.
    """
    return text.decode(encoding, errors=errors) if isinstance(text, bytes) else text

class ItemsCount(object):
    """Callable that keeps the leading ``count`` items of a sequence."""

    def __init__(self, count):
        self.count = count

    def __call__(self, seq):
        # Plain prefix slice, so the result has the same type as ``seq``.
        head = slice(None, self.count)
        return seq[head]

# Record pairing a sentence with its position in the document and its rating.
SentenceInfo = namedtuple("SentenceInfo", ["sentence", "order", "rating"])

class AbstractSummarizer(object):
    """Base class providing word normalisation, stemming and sentence selection."""

    def __init__(self, stemmer=null_stemmer):
        """Store ``stemmer``; raise ``ValueError`` when it is not callable."""
        if callable(stemmer):
            self._stemmer = stemmer
        else:
            raise ValueError("Stemmer has to be a callable object")

    def __call__(self, document, sentences_count):
        """Summarize ``document``; concrete summarizers must override this."""
        raise NotImplementedError("This method should be overriden in subclass")

    def stem_word(self, word):
        """Normalize ``word`` and pass the result through the stemmer."""
        normalized = self.normalize_word(word)
        return self._stemmer(normalized)

    @staticmethod
    def normalize_word(word):
        """Convert ``word`` with ``to_unicode`` and lower-case it."""
        return to_unicode(word).lower()

    @staticmethod
    def _get_best_sentences(sentences, count, rating, *args, **kwargs):
        """Return the best-rated sentences as a tuple in document order.

        ``rating`` is either a dict mapping sentence to score, or a callable
        invoked as ``rating(sentence, *args, **kwargs)``.  ``count`` is either
        a callable applied to the rating-sorted list, or a value wrapped in
        ``ItemsCount``.
        """
        if isinstance(rating, dict):
            assert not args and not kwargs

            def rate(s):
                return rating[s]
        else:
            rate = rating

        infos = [
            SentenceInfo(sentence, position, rate(sentence, *args, **kwargs))
            for position, sentence in enumerate(sentences)
        ]
        # Highest rating first; the sort is stable, so ties keep document order.
        infos.sort(key=attrgetter("rating"), reverse=True)

        select = count if callable(count) else ItemsCount(count)
        chosen = select(infos)

        in_document_order = sorted(chosen, key=attrgetter("order"))
        return tuple(info.sentence for info in in_document_order)

class LexRankSummarizer(AbstractSummarizer):
    """LexRank summarizer whose TF and IDF values are boosted for keywords.

    Sentences are nodes of a graph; two sentences are linked when their
    TF-IDF cosine similarity exceeds ``threshold``.  Scores come from power
    iteration over the degree-normalised adjacency matrix.
    """
    threshold = 0.1
    epsilon = 0.01
    _stop_words = frozenset()
    _important_words = {"need", "want", "have", "has", "should", "require", "demand", "must"}

    @property
    def stop_words(self):
        """Normalized words that are ignored when sentences are tokenised."""
        return self._stop_words

    @stop_words.setter
    def stop_words(self, words):
        self._stop_words = frozenset(map(self.normalize_word, words))

    def __call__(self, document, sentences_count):
        """Return the best ``sentences_count`` sentences of ``document``.

        Reads ``document.sentences`` and each sentence's ``words``.  Returns
        an empty tuple when the document has no sentences.

        Raises:
            ValueError: when NumPy is not installed.
        """
        self._ensure_dependencies_installed()

        sentences_words = [self._to_words_set(s) for s in document.sentences]
        if not sentences_words:
            return tuple()

        tf_metrics = self._compute_tf(sentences_words)
        idf_metrics = self._compute_idf(sentences_words)

        matrix = self._create_matrix(sentences_words, self.threshold, tf_metrics, idf_metrics)
        scores = self.power_method(matrix, self.epsilon)
        ratings = dict(zip(document.sentences, scores))

        return self._get_best_sentences(document.sentences, sentences_count, ratings)

    @staticmethod
    def _ensure_dependencies_installed():
        if numpy is None:
            raise ValueError("LexRank summarizer requires NumPy. Please, install it by command 'pip install numpy'.")

    def _to_words_set(self, sentence):
        """Return the stemmed, non-stop words of ``sentence`` as a list."""
        words = map(self.normalize_word, sentence.words)
        return [self.stem_word(w) for w in words if w not in self._stop_words]

    def _important_terms(self):
        """Return the important words in both raw and stemmed form.

        Sentence terms are stemmed, so comparing them with the raw keywords
        alone would never match once a real stemmer is configured.
        """
        raw = frozenset(self._important_words)
        return raw | frozenset(self.stem_word(w) for w in raw)

    def _compute_tf(self, sentences):
        """Return, per sentence, a dict of term -> weighted relative frequency.

        A term's count is divided by the highest count in its sentence;
        important terms are multiplied by 2 first.
        """
        important = self._important_terms()
        tf_metrics = []

        for counts in map(Counter, sentences):
            max_tf = self._find_tf_max(counts)
            tf_metrics.append({
                term: (tf * (2.0 if term in important else 1.0)) / max_tf  # extra weight
                for term, tf in counts.items()
            })

        return tf_metrics

    @staticmethod
    def _find_tf_max(terms):
        return max(terms.values()) if terms else 1

    def _compute_idf(self, sentences):
        """Return a dict of term -> ``boost * log(N / (1 + n_j))``.

        ``N`` is the number of sentences, ``n_j`` the number of sentences that
        contain the term, and ``boost`` is 1.5 for important terms.
        """
        important = self._important_terms()
        sentences_count = len(sentences)

        # One pass over the sentences instead of rescanning all of them per term.
        sentence_frequency = Counter()
        for sentence in sentences:
            sentence_frequency.update(set(sentence))

        return {
            term: (1.5 if term in important else 1.0) * math.log(sentences_count / (1 + n_j))
            for term, n_j in sentence_frequency.items()
        }

    def _create_matrix(self, sentences, threshold, tf_metrics, idf_metrics):
        """Build the row-normalised adjacency matrix of the similarity graph.

        A cell is 1 when the similarity of the two sentences is above
        ``threshold``; every row is then divided by its number of links
        (rows without links are left as zeros).
        """
        sentences_count = len(sentences)
        matrix = numpy.zeros((sentences_count, sentences_count))

        for row, (sentence1, tf1) in enumerate(zip(sentences, tf_metrics)):
            for col, (sentence2, tf2) in enumerate(zip(sentences, tf_metrics)):
                similarity = self.cosine_similarity(sentence1, sentence2, tf1, tf2, idf_metrics)
                if similarity > threshold:
                    matrix[row, col] = 1.0

        degrees = matrix.sum(axis=1)
        degrees[degrees == 0] = 1  # avoid dividing an all-zero row by zero
        return matrix / degrees[:, numpy.newaxis]

    @staticmethod
    def cosine_similarity(sentence1, sentence2, tf1, tf2, idf_metrics):
        """Return the TF-IDF cosine similarity of two tokenised sentences.

        Returns 0.0 when either sentence has a zero-length TF-IDF vector.
        """
        unique_words1 = frozenset(sentence1)
        unique_words2 = frozenset(sentence2)
        common_words = unique_words1 & unique_words2

        numerator = 0.0
        for term in common_words:
            numerator += tf1[term] * tf2[term] * idf_metrics[term] ** 2

        denominator1 = sum((tf1[t] * idf_metrics[t]) ** 2 for t in unique_words1)
        denominator2 = sum((tf2[t] * idf_metrics[t]) ** 2 for t in unique_words2)

        if denominator1 > 0 and denominator2 > 0:
            return numerator / (math.sqrt(denominator1) * math.sqrt(denominator2))
        return 0.0

    @staticmethod
    def power_method(matrix, epsilon):
        """Iterate ``p <- normalize(matrix.T @ p)`` until the change is <= ``epsilon``.

        Starts from the uniform vector and returns the L2-normalised score
        vector.  When an iterate collapses to the zero vector (no sentence is
        linked to any other) that zero vector is returned instead of dividing
        by a zero norm, which would fill the scores with NaN.
        """
        transposed_matrix = matrix.T
        sentences_count = len(matrix)
        p_vector = numpy.array([1.0 / sentences_count] * sentences_count)
        lambda_val = 1.0

        while lambda_val > epsilon:
            next_p = numpy.dot(transposed_matrix, p_vector)
            norm = numpy.linalg.norm(next_p)
            if norm == 0:
                return next_p
            next_p /= norm
            lambda_val = numpy.linalg.norm(numpy.subtract(next_p, p_vector))
            p_vector = next_p

        return p_vector


Edmundson

In [None]:
class EdmundsonSummarizer(AbstractSummarizer):
    """Edmundson-style summarizer combining cue, key, title and location scores.

    NOTE(review): EdmundsonCueMethod, EdmundsonKeyMethod, EdmundsonTitleMethod
    and EdmundsonLocationMethod are not defined in this file; they are
    assumed to be provided elsewhere with the setter/call interface used
    below - confirm.
    """
    _stop_words = frozenset()

    def __init__(self, stemmer=None):
        """Create the summarizer; ``stemmer=None`` means "no stemming"."""
        # The base class rejects non-callable stemmers, so the None default
        # has to be replaced by the identity stemmer before delegating.
        super().__init__(stemmer=null_stemmer if stemmer is None else stemmer)
        self._bonus_words = []
        self._stigma_words = []
        self._null_words = []
        self._keywords = []
        self._cue_method = EdmundsonCueMethod()
        self._key_method = EdmundsonKeyMethod()
        self._title_method = EdmundsonTitleMethod()
        self._location_method = EdmundsonLocationMethod()

    def __call__(self, document, sentences_count=5):
        """Return the ``sentences_count`` highest-rated sentences, best first.

        Each sentence's rating is the weighted sum of the four method scores,
        with the keyword score weighted 4x and the others 1x.
        """
        self._cue_method.set_bonus_words(self._bonus_words)
        self._cue_method.set_stigma_words(self._stigma_words)
        self._cue_method.set_null_words(self._null_words)
        self._key_method.set_keywords(self._keywords)
        self._title_method.set_headings(document.headings)

        rated_sentences = []
        for sentence in document.sentences:
            rating = (
                1.0 * self._cue_method(sentence)
                + 4.0 * self._key_method(sentence)  # keywords weigh more
                + 1.0 * self._title_method(sentence)
                + 1.0 * self._location_method(sentence)
            )
            rated_sentences.append((rating, sentence))
        rated_sentences.sort(key=lambda x: x[0], reverse=True)
        return [s for _, s in rated_sentences[:sentences_count]]

    def _prepare_words(self, words):
        """Stem ``words`` and drop stop words, returning a reusable list.

        A list (not a lazy ``filter`` object) is stored because the word
        lists are handed to the scoring methods on every call.
        """
        return [w for w in map(self.stem_word, words) if w not in self._stop_words]

    @property
    def stop_words(self):
        """Normalized words removed from every word list assigned afterwards."""
        return self._stop_words

    @stop_words.setter
    def stop_words(self, words):
        self._stop_words = frozenset(map(self.normalize_word, words))

    @property
    def bonus_words(self):
        return self._bonus_words

    @bonus_words.setter
    def bonus_words(self, words):
        self._bonus_words = self._prepare_words(words)

    @property
    def stigma_words(self):
        return self._stigma_words

    @stigma_words.setter
    def stigma_words(self, words):
        self._stigma_words = self._prepare_words(words)

    @property
    def null_words(self):
        return self._null_words

    @null_words.setter
    def null_words(self, words):
        self._null_words = self._prepare_words(words)

    @property
    def keywords(self):
        return self._keywords

    @keywords.setter
    def keywords(self, words):
        self._keywords = self._prepare_words(words)

TextRank

In [None]:
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

import math
from collections import Counter, namedtuple
from operator import attrgetter

try:
    import numpy
except ImportError:
    numpy = None

def null_stemmer(word):
    """No-op stemmer; the argument is returned as-is."""
    result = word
    return result

def to_unicode(text, encoding='utf-8', errors='strict'):
    """Return ``text`` decoded with ``encoding`` if it is ``bytes``, else as given."""
    if not isinstance(text, bytes):
        return text
    decoded = text.decode(encoding, errors=errors)
    return decoded

class ItemsCount(object):
    """Selector that truncates a sequence to its first ``count`` entries."""

    def __init__(self, count):
        self.count = count

    def __call__(self, seq):
        limit = self.count
        return seq[:limit]

# (sentence, order, rating) triple used while ranking sentences.
SentenceInfo = namedtuple("SentenceInfo", "sentence order rating")


class AbstractSummarizer(object):
    """Shared helpers for summarizers: stemming, normalisation, top-N selection."""

    def __init__(self, stemmer=null_stemmer):
        """Remember ``stemmer``; a non-callable value raises ``ValueError``."""
        is_usable = callable(stemmer)
        if not is_usable:
            raise ValueError("Stemmer has to be a callable object")

        self._stemmer = stemmer

    def __call__(self, document, sentences_count):
        """Abstract entry point; always raises ``NotImplementedError`` here."""
        raise NotImplementedError("This method should be overriden in subclass")

    def stem_word(self, word):
        """Return the stem of the normalized form of ``word``."""
        stem = self._stemmer
        return stem(self.normalize_word(word))

    @staticmethod
    def normalize_word(word):
        """Return ``to_unicode(word)`` in lower case."""
        text = to_unicode(word)
        return text.lower()

    @staticmethod
    def _get_best_sentences(sentences, count, rating, *args, **kwargs):
        """Select the top-rated sentences and return them in original order.

        ``rating`` may be a dict (sentence -> score) or a callable that takes
        the sentence plus ``*args``/``**kwargs``.  ``count`` may be a callable
        applied to the sorted records; otherwise it is wrapped in
        ``ItemsCount``.
        """
        rate = rating
        if isinstance(rating, dict):
            assert not args and not kwargs

            def rate(s):
                return rating[s]

        records = []
        for order, sentence in enumerate(sentences):
            records.append(SentenceInfo(sentence, order, rate(sentence, *args, **kwargs)))

        # best rating first
        by_rating = sorted(records, key=lambda record: record.rating, reverse=True)

        # keep only the requested amount
        if not callable(count):
            count = ItemsCount(count)
        kept = count(by_rating)

        # restore document order
        by_position = sorted(kept, key=lambda record: record.order)

        return tuple(record.sentence for record in by_position)



class TextRankSummarizer(AbstractSummarizer):
    """An implementation of TextRank algorithm for summarization.

    Edges between sentences that share words get a fixed bonus when either
    sentence contains one of the important words.

    Source: https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf
    """
    epsilon = 1e-4
    damping = 0.85
    # small number to prevent zero-division error, see https://github.com/miso-belica/sumy/issues/112
    _ZERO_DIVISION_PREVENTION = 1e-7
    _stop_words = frozenset()
    # Words whose presence in either sentence earns an edge the bonus below.
    _important_words = frozenset(
        {"need", "want", "have", "has", "should", "require", "demand", "must"}
    )
    _keyword_bonus = 2  # tune to strengthen or weaken the keyword effect

    @property
    def stop_words(self):
        """Normalized words ignored when sentences are tokenised."""
        return self._stop_words

    @stop_words.setter
    def stop_words(self, words):
        self._stop_words = frozenset(map(self.normalize_word, words))

    def __call__(self, document, sentences_count):
        """Return the best ``sentences_count`` sentences of ``document``.

        Returns an empty tuple when the document has no sentences.

        Raises:
            ValueError: when NumPy is not installed.
        """
        self._ensure_dependencies_installed()
        if not document.sentences:
            return ()

        ratings = self.rate_sentences(document)
        return self._get_best_sentences(document.sentences, sentences_count, ratings)

    @staticmethod
    def _ensure_dependencies_installed():
        if numpy is None:
            raise ValueError("TextRank summarizer requires NumPy. Please, install it by command 'pip install numpy'.")

    def rate_sentences(self, document):
        """Return a dict mapping each sentence of ``document`` to its rank."""
        matrix = self._create_matrix(document)
        ranks = self.power_method(matrix, self.epsilon)
        return {sent: rank for sent, rank in zip(document.sentences, ranks)}

    def _keyword_terms(self):
        """Return the important words in raw and stemmed form.

        Sentence words are stemmed before comparison, so the raw keywords
        alone would stop matching once a real stemmer is configured.
        """
        raw = frozenset(self._important_words)
        return raw | frozenset(self.stem_word(w) for w in raw)

    def _create_matrix(self, document):
        """Build the damped transition matrix of the sentence graph.

        Edge weights are symmetric, each row is divided by its sum (plus a
        tiny constant against division by zero), and the result is mixed with
        a uniform matrix according to ``damping``.
        """
        sentences_as_words = [self._to_words_set(sent) for sent in document.sentences]
        sentences_count = len(sentences_as_words)
        weights = numpy.zeros((sentences_count, sentences_count))
        keywords = self._keyword_terms()

        for i, words_i in enumerate(sentences_as_words):
            for j in range(i, sentences_count):
                rating = self._rate_sentences_edge(words_i, sentences_as_words[j], keywords)
                weights[i, j] = rating
                weights[j, i] = rating

        weights /= (weights.sum(axis=1)[:, numpy.newaxis] + self._ZERO_DIVISION_PREVENTION)

        return numpy.full((sentences_count, sentences_count), (1.-self.damping) / sentences_count) \
               + self.damping * weights

    def _to_words_set(self, sentence):
        """Return the stemmed, non-stop words of ``sentence`` as a list."""
        words = map(self.normalize_word, sentence.words)
        return [self.stem_word(w) for w in words if w not in self._stop_words]

    @staticmethod
    def _rate_sentences_edge(words1, words2, keywords=None):
        """Return the edge weight between two tokenised sentences.

        The weight is the number of word matches (counting repetitions) plus
        the keyword bonus, divided by ``log(len(words1)) + log(len(words2))``
        unless that sum is close to zero.  Sentences without a common word
        get 0.0 and no bonus.

        Args:
            words1, words2: Lists of (stemmed) words.
            keywords: Optional collection of words that trigger the bonus;
                defaults to the class-level important words.
        """
        if keywords is None:
            keywords = TextRankSummarizer._important_words
        keywords = frozenset(keywords)

        # Counting words2 once replaces a list.count() scan per word of words1.
        occurrences = Counter(words2)
        rank = sum(occurrences[w] for w in words1)
        if rank == 0:
            return 0.0

        # Bonus when either sentence contains an important word.
        keyword_bonus = 0
        if not (keywords.isdisjoint(words1) and keywords.isdisjoint(words2)):
            keyword_bonus = TextRankSummarizer._keyword_bonus

        norm = math.log(len(words1)) + math.log(len(words2))
        if numpy.isclose(norm, 0.):
            return float(rank + keyword_bonus)
        return (rank + keyword_bonus) / norm

    @staticmethod
    def power_method(matrix, epsilon):
        """Iterate ``p <- matrix.T @ p`` from the uniform vector until the
        Euclidean change between iterates is at most ``epsilon``."""
        transposed_matrix = matrix.T
        sentences_count = len(matrix)
        p_vector = numpy.array([1.0 / sentences_count] * sentences_count)
        lambda_val = 1.0

        while lambda_val > epsilon:
            next_p = numpy.dot(transposed_matrix, p_vector)
            lambda_val = numpy.linalg.norm(numpy.subtract(next_p, p_vector))
            p_vector = next_p

        return p_vector



Luhn

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals
from collections import namedtuple
from operator import attrgetter

# Asegúrate de tener instalada la librería 'nltk' si planeas usar stemming
# Puedes instalarla con: pip install nltk
from nltk.stem import SnowballStemmer

# Settings read by the term-frequency model class (TfDocumentModel) below.
# Words that receive a reinforced count.
SPECIAL_TERMS = {"need", "want", "have", "has", "should", "require", "demand", "must"}
SPECIAL_TERM_WEIGHT = 3  # Amount counted per occurrence of a special term; tune to taste

class TfDocumentModel:
    """Weighted term-frequency table over a sequence of words.

    Words are lower-cased; each occurrence counts 1, or ``special_term_weight``
    when the word is one of ``special_terms``.
    """

    def __init__(self, words, special_terms=None, special_term_weight=None):
        """Count ``words``.

        Args:
            words: Iterable of word strings.
            special_terms: Optional collection of boosted words; defaults to
                the module-level ``SPECIAL_TERMS``.  Callers that stem their
                words can pass stemmed terms here.
            special_term_weight: Optional count added per occurrence of a
                boosted word; defaults to ``SPECIAL_TERM_WEIGHT``.
        """
        if special_terms is None:
            special_terms = SPECIAL_TERMS
        if special_term_weight is None:
            special_term_weight = SPECIAL_TERM_WEIGHT
        boosted = frozenset(term.lower() for term in special_terms)

        self._term_counts = {}
        for word in words:
            normalized = word.lower()
            weight = special_term_weight if normalized in boosted else 1
            self._term_counts[normalized] = self._term_counts.get(normalized, 0) + weight

    def term_frequency(self, term):
        """Return the weighted count of ``term`` (0 when it never occurred).

        The lookup is case-insensitive because the stored keys are lower-cased.
        """
        return self._term_counts.get(term.lower(), 0)

    def most_frequent_terms(self, count):
        """Return up to ``count`` terms, highest weighted count first.

        Terms with equal counts keep their first-seen order.
        """
        sorted_terms = sorted(self._term_counts.items(), key=lambda item: item[1], reverse=True)
        return [term for term, _ in sorted_terms[:count]]

# Fallback stemmer used when no stemmer is supplied
def null_stemmer(word):
    """Return ``word`` without modifying it."""
    same_word = word
    return same_word

# Helper that converts a value to text (bytes are decoded as UTF-8)
def to_unicode(text):
    """Return ``text`` as ``str``: bytes are UTF-8 decoded, anything else goes through ``str()``."""
    if not isinstance(text, bytes):
        return str(text)
    return text.decode('utf-8')

# Helper that limits a sequence to a fixed number of items
class ItemsCount:
    """Callable returning the first ``count`` items of the sequence it is given."""

    def __init__(self, count):
        self.count = count

    def __call__(self, items):
        upper_bound = self.count
        return items[0:upper_bound]

# Ranking record: the sentence, its index in the document and its rating.
SentenceInfo = namedtuple(
    "SentenceInfo",
    ("sentence", "order", "rating"),
)

class AbstractSummarizer(object):
    """Base summarizer: keeps the stemmer and offers sentence-selection helpers."""

    def __init__(self, stemmer=null_stemmer):
        """Keep ``stemmer`` for later use; it must be callable or ``ValueError`` is raised."""
        if not callable(stemmer):
            raise ValueError("Stemmer has to be a callable object")
        self._stemmer = stemmer

    def __call__(self, document, sentences_count):
        """Placeholder that subclasses replace with the actual summarisation."""
        raise NotImplementedError("This method should be overriden in subclass")

    def stem_word(self, word):
        """Apply the stemmer to the normalized ``word``."""
        prepared = self.normalize_word(word)
        return self._stemmer(prepared)

    @staticmethod
    def normalize_word(word):
        """Lower-case the ``to_unicode`` form of ``word``."""
        as_text = to_unicode(word)
        return as_text.lower()

    @staticmethod
    def _get_best_sentences(sentences, count, rating, *args, **kwargs):
        """Rate every sentence, keep the best ones and return them in document order.

        ``rating`` is a dict of sentence -> score or a callable receiving the
        sentence followed by ``*args`` and ``**kwargs``.  ``count`` is applied
        directly when callable and wrapped in ``ItemsCount`` otherwise.  The
        result is a tuple of sentences.
        """
        if isinstance(rating, dict):
            assert not args and not kwargs

            def rate(s):
                return rating[s]
        else:
            rate = rating

        scored = [
            SentenceInfo(s, o, rate(s, *args, **kwargs))
            for o, s in enumerate(sentences)
        ]

        # Order by descending rating.
        scored.sort(key=attrgetter("rating"), reverse=True)

        # Keep the first `count` entries.
        picker = count if callable(count) else ItemsCount(count)
        best = picker(scored)

        # Put the survivors back into document order.
        ordered = sorted(best, key=attrgetter("order"))
        return tuple(entry.sentence for entry in ordered)

class LuhnSummarizer(AbstractSummarizer):
    """Luhn-style summarizer that rates sentences by clusters of significant words."""
    max_gap_size = 4
    # TODO: better recognition of significant words (automatic)
    significant_percentage = 1
    _stop_words = frozenset()

    @property
    def stop_words(self):
        """Normalized words excluded from the significant-word computation."""
        return self._stop_words

    @stop_words.setter
    def stop_words(self, words):
        self._stop_words = frozenset(map(self.normalize_word, words))

    def __call__(self, document, sentences_count):
        """Return the best ``sentences_count`` sentences of ``document``.

        Reads ``document.words`` to find significant words and
        ``document.sentences`` for the candidates.
        """
        significant = self._get_significant_words(document.words)
        return self._get_best_sentences(
            document.sentences, sentences_count, self.rate_sentence, significant
        )

    def _get_significant_words(self, words):
        """Return a tuple of the significant stems among ``words``.

        Stop words are removed, the rest is stemmed and counted with
        ``TfDocumentModel``; the most frequent ``significant_percentage``
        share is kept, restricted to terms whose frequency exceeds 1.
        """
        stems = tuple(
            self.stem_word(w)
            for w in map(self.normalize_word, words)
            if w not in self._stop_words
        )

        model = TfDocumentModel(stems)

        # keep only the best `significant_percentage` share of the words
        limit = int(len(stems) * self.significant_percentage)
        candidates = model.most_frequent_terms(limit)

        # keep only terms whose model frequency is above one
        return tuple(term for term in candidates if model.term_frequency(term) > 1)

    def rate_sentence(self, sentence, significant_stems):
        """Return the highest chunk rating of ``sentence``, or 0 without chunks."""
        ratings = self._get_chunk_ratings(sentence, significant_stems)
        if not ratings:
            return 0
        return max(ratings)

    def _get_chunk_ratings(self, sentence, significant_stems):
        """Split ``sentence.words`` into chunks and return a tuple of their ratings.

        A chunk opens at a significant word and records 1/0 for every
        following word; it closes once its last ``max_gap_size`` entries are
        all non-significant.
        """
        gap = self.max_gap_size
        closing_run = [0] * gap
        chunks = []
        chunk_open = False

        for word in sentence.words:
            is_significant = self.stem_word(word) in significant_stems
            if chunk_open:
                # extend the running chunk
                chunks[-1].append(int(is_significant))
            elif is_significant:
                # start a new chunk
                chunk_open = True
                chunks.append([1])

            # close the chunk after a full gap of non-significant words
            if chunks and chunks[-1][-gap:] == closing_run:
                chunk_open = False

        return tuple(self._get_chunk_rating(chunk) for chunk in chunks)

    def _get_chunk_rating(self, chunk):
        """Return ``significant**2 / length`` of the trimmed chunk, or 0 for a lone word."""
        trimmed = self.__remove_trailing_zeros(chunk)
        words_count = len(trimmed)
        assert words_count > 0

        significant_words = sum(trimmed)
        return 0 if significant_words == 1 else significant_words**2 / words_count

    def __remove_trailing_zeros(self, collection):
        """Removes trailing zeroes from indexable collection of numbers"""
        end = len(collection)
        while end > 0 and collection[end - 1] == 0:
            end -= 1
        return collection[:end]
