In [1]:
%load_ext autoreload
%autoreload 2

# Text Summarizer

In [40]:
import nltk
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx
import codecs

In [41]:
def read_article(file_name):
    with codecs.open(file_name, "r", encoding='utf-8') as file:  # handles accentuated characters
        filedata = file.readlines()
    
    article = " ".join(filedata).split(". ")    # split the text by sentences using ". "
    
    sentences = []
    for sentence in article:             # iterate thru sentences, printing each and generate list of wards for each sentence
        # print(sentence)
        sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))    # replace any non character by " "
    #sentences.pop()   ##### systematically eliminate last sentence of the text from the returned sentences??
    
    return sentences

In [42]:
def sentence_similarity(sentence_1, sentence_2, stopwords=None):
    if stopwords is None:
        stopwords = []     # create an empty list to avoid error below
 
    sentence_1 = [w.lower() for w in sentence_1]
    sentence_2 = [w.lower() for w in sentence_2]

    all_words = list(set(sentence_1 + sentence_2))  # create total vocabulary of unique words for the two sentences compared

    vector1 = [0] * len(all_words)                  # prepare one-hot vectors for each sentence over all vocab
    vector2 = [0] * len(all_words)

    # build the vector for the first sentence
    for w in sentence_1:
        if w in stopwords:
            continue 
        vector1[all_words.index(w)] += 1           # list.index(element) returns the index of the given element in the list

    # build the vector for the second sentence
    for w in sentence_2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1

    return 1 - cosine_distance(vector1, vector2)   # Cosine = 0 for similar sentences => returns 1 if perfectly similar

In [43]:
def build_similarity_matrix(sentences, stop_words):
    # Create an empty similarity matrix
    similarity_matrix = np.zeros((len(sentences), len(sentences)))  # create a square matrix with dim the num of sentences
 
    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if idx1 == idx2: #ignore if both are same sentences (diagonal of the square matrix)
                continue
            # similarity of each sentence to all other sentences in the text is measured and logged in the matrix
            similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)

    return similarity_matrix

In [44]:
def generate_summary(file_name, top_n=5, show=False):
    stop_words = stopwords.words('english')
    summarize_text = []
    
    # Step 1 - Read text and tokenize
    sentences =  read_article(file_name)
    print("number of sentences in text : ", len(sentences))
    
    # Step 2 - Generate Similary Matrix across sentences
    sentence_similarity_matrix = build_similarity_matrix(sentences, stop_words)
    
    # Step 3 - Rank sentences in similarity matrix. let’s convert the similarity matrix into a graph. 
    # The nodes of this graph will represent the sentences and the edges will represent the similarity scores between
    # the sentences. On this graph, we will apply the PageRank algorithm to arrive at the sentence rankings.
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
    scores = nx.pagerank(sentence_similarity_graph)
    
    # Step 4 - Sort the rank and pick top sentences extract the top N sentences based on their rankings for summary generation
    ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)
    if show :
        print("Indexes of top ranked_sentence order are ", ranked_sentence)
    # extract the top N sentences based on their rankings for summary generation
    if len(ranked_sentence) < top_n:
        top_n = len(ranked_sentence)
    for i in range(top_n):
        summarize_text.append(" ".join(ranked_sentence[i][1]))
    
    # Step 5 - Output the summarize text
    print("Summarize Text: \n", ". ".join(summarize_text)+'.')

# Bert

In [61]:
from sentence_transformers import SentenceTransformer, util
import torch

model = SentenceTransformer("all-MiniLM-L12-v2")

embeddings = model.encode([
    'I like the trains',
    'trains are better than cars',
    'airplaines are better than trains'
])

util.dot_score(embeddings, embeddings)

tensor([[1.0000, 0.6359, 0.5088],
        [0.6359, 1.0000, 0.6196],
        [0.5088, 0.6196, 1.0000]])

In [11]:
from summary.rouge import evaluate_rouge

sum1 = "today it was sunny and hot. The skay was blue. All was right"
exp11 = "today it was sunny and hot. The skay was blue. All was right"
exp12 = "today it was sunny and hot. All was right"

sum2 = "today it was cold and snowy. The skay was blue. All was right"
exp21 = "today it was sunny and hot. The skay was blue. All was right"
exp22 = "today it was sunny and hot. All was right"

evaluate_rouge(
    [sum1.split(". "), sum2.split(". ")],
    [[exp11.split(". "), exp12.split(". ")], [exp21.split(". "), exp22.split(". ")]],
)


temp\D2GIVW5LLF
temp\D2GIVW5LLF temp\D2GIVW5LLF\system temp\D2GIVW5LLF\model


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\Zumo\\AppData\\Roaming\\pyrouge\\settings.ini'

In [76]:
model = SentenceTransformer("all-MiniLM-L12-v2")

def read_article_bert(file_name):
    with codecs.open(file_name, "r", encoding='utf-8') as file:
        filedata = file.readlines()
    
    article = " ".join(filedata).split(". ")
    return [sentence.replace("[^a-zA-Z]", " ") for sentence in article]

def generate_summary_bert(file_name, top_n=5, show=False):
    summarize_text = []
    
    # Step 1 - Read text and tokenize
    sentences =  read_article_bert(file_name)
    print("number of sentences in text : ", len(sentences))
    
    # Step 2 - Generate Similary Matrix across sentences
    encodings = model.encode(sentences, convert_to_tensor=True)
    sentence_similarity_matrix = util.dot_score(encodings, encodings) # type: ignore
    sentence_similarity_matrix = np.asarray(sentence_similarity_matrix)
    
    # Step 3 - Rank sentences in similarity matrix. let’s convert the similarity matrix into a graph. 
    # The nodes of this graph will represent the sentences and the edges will represent the similarity scores between
    # the sentences. On this graph, we will apply the PageRank algorithm to arrive at the sentence rankings.
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
    scores = nx.pagerank(sentence_similarity_graph)
    
    # Step 4 - Sort the rank and pick top sentences extract the top N sentences based on their rankings for summary generation
    ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)
    if show :
        print("Indexes of top ranked_sentence order are ", ranked_sentence)
    # extract the top N sentences based on their rankings for summary generation
    if len(ranked_sentence) < top_n:
        top_n = len(ranked_sentence)
        
        
    summarize_text = [ranked_sentence[i][1] for i in range(top_n)]

    
    # Step 5 - Output the summarize text
    print("Summarize Text: \n", ". ".join(summarize_text)+'.')

In [77]:
# let's begin
generate_summary("input/feedback-prize-2021/train/0000D23A521A.txt", 3)

number of sentences in text :  11
Summarize Text: 
 There is no good reason that NASA would hide life on Mars from the rest of the world.
 
 So, NASA is not hiding life on Mars from us, and they are not trying to trick us into thinking that the "face" on mars is just a mesa, because it actually is. Some people belive that the so called "face" on mars was created by life on mars. NASA hiding life would be illogical, because if they found life on Mars, they would make a lot of money, and we all know that the people at NASA aren't illogical people..


In [78]:
# let's begin
generate_summary_bert("input/feedback-prize-2021/train/0000D23A521A.txt", 3)

number of sentences in text :  11
Summarize Text: 
 There is no good reason that NASA would hide life on Mars from the rest of the world.
 
 So, NASA is not hiding life on Mars from us, and they are not trying to trick us into thinking that the "face" on mars is just a mesa, because it actually is. Some people belive that the so called "face" on mars was created by life on mars. This "face" on mars only looks like a face because humans tend to see faces wherever we look, humans are obviously extremely social, which is why our brain is designed to recognize faces.
 
 Many conspiracy theorists believe that NASA is hiding life on Mars from the rest of the world.


# Prove

In [81]:
# Step 1 - Read text and tokenize
sentences =  read_article_bert("input/feedback-prize-2021/train/0000D23A521A.txt")
print("number of sentences in text : ", len(sentences))

number of sentences in text :  11


In [83]:
# Step 2 - Generate Similary Matrix across sentences
encodings = model.encode(sentences, convert_to_tensor=True)
sentence_similarity_matrix = util.dot_score(encodings, encodings) # type: ignore
sentence_similarity_matrix = np.asarray(sentence_similarity_matrix)

sentence_similarity_matrix.shape

(11, 11)

In [84]:
# Step 3 - Rank sentences in similarity matrix. let’s convert the similarity matrix into a graph. 
# The nodes of this graph will represent the sentences and the edges will represent the similarity scores between
# the sentences. On this graph, we will apply the PageRank algorithm to arrive at the sentence rankings.
sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
scores = nx.pagerank(sentence_similarity_graph)

In [None]:
top_n = 3

# Step 4 - Sort the rank and pick top sentences extract the top N sentences based on their rankings for summary generation
ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)

# print("Indexes of top ranked_sentence order are ", ranked_sentence)
# extract the top N sentences based on their rankings for summary generation
if len(ranked_sentence) < top_n:
    top_n = len(ranked_sentence)

summarize_text = [ranked_sentence[i][1] for i in range(top_n)]

# Step 5 - Output the summarize text
print("Summarize Text: \n", ". ".join(summarize_text)+'.')

# PacSum

In [None]:
from summary.pacsum import PacSumExtractorWithBert
from summary.rouge import evaluate_rouge

def extract_summary(extractor: PacSumExtractorWithBert, data_iterator):

    summaries = []
    references = []

    for item in data_iterator:
        article, abstract, inputs = item
        
        if len(article) <= self.extract_num:
            summaries.append(article)
            references.append([abstract])
            continue
        
        summary = extractor.extract_summary(article)
        summaries.append(summary)
        references.append([abstract])

    result = evaluate_rouge(summaries, references, remove_temp=True, rouge_args=[])