In [83]:
# !pip install sacrebleu --quiet
# !pip install sentence-transformers --quiet
# !pip install rouge-score --quiet
# !pip install streamlit --quiet
# !pip install flask --quiet
# !pip install tf-keras --quiet

In [84]:

import heapq
import re
from collections import Counter
from string import punctuation

import networkx as nx
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline

# Fetch the NLTK resources this notebook relies on: the 'punkt' sentence/word
# tokenizer data and the 'stopwords' corpus read by stopwords.words('english').
# NOTE(review): newer NLTK releases may also need 'punkt_tab' — confirm against
# the installed version.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/mac/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/mac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [85]:
paragraph = '''
Pennsylvania Gov. Josh Shapiro said former President Donald Trump has left the Butler, Pennsylvania, area following the rally shooting Saturday.

“Under the protection of US Secret Service and with the assistance of the Pennsylvania State Police, former President Trump has now left the Butler area,” Shapiro said in a statement posted to X. “Lori and I are thankful that his team reports that he is fine and we continue to wish him a full and speedy recovery.”

“We mourn the loss of life and pray for the two victims who are being treated at this time,” he added. “I am grateful for all law enforcement who responded, protected the former president, and worked to bring the situation under control.”

Federal law enforcement officials will continue to lead on the investigation into the shooting, Shapiro said. Meanwhile, Pennsylvania State Police will lead the investigation into the shooting of the other victims. Shapiro said he has been communicating with law enforcement on the ground in Pennsylvania and has spoken with President Joe Biden, who “offered his full support.”

Shapiro said he knows “how painful and shocking this event is to so many of our fellow Pennsylvanians.” He asked “that we treat our fellow Americans with respect and join together to universally condemn the unacceptable violence we witnessed earlier today in Butler.”
'''

In [86]:
gpt_summary = '''Pennsylvania Governor Josh Shapiro announced that former President Donald Trump has safely left the Butler area following a rally shooting. Shapiro expressed gratitude for Trump’s protection by the Secret Service and Pennsylvania State Police and mourned the loss of two victims. He noted that federal and state authorities are investigating the incident and emphasized the need for respect and condemnation of the violence.'''

In [87]:
# Tokens ignored when scoring: NLTK's English stopwords plus each ASCII punctuation character.
stop_words = set(stopwords.words('english') + list(punctuation))

In [88]:
def clean_text(text):
    """Normalize raw text before sentence tokenization.

    Curly quotation marks (“ ” ‘ ’) are removed, every run of whitespace
    (newlines included) is collapsed to a single space, and leading/trailing
    whitespace is stripped.

    Args:
        text: Raw input string.

    Returns:
        The cleaned, single-line string.
    """
    # Drop the quotes first: removing them after the whitespace has been
    # collapsed can leave doubled or leading/trailing spaces behind
    # (e.g. '“ Hello ” world').
    text = re.sub(r'[“”‘’]', '', text)

    # `\s` already matches '\n', so newlines need no separate handling.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

### Extractive text summarization using word frequency

In [89]:
def summarize_text(text, summary_ratio=0.3):
    """Extractive summary that ranks sentences by summed word frequency.

    Each sentence is scored with the sum of the document-level frequencies of
    its tokens (stopwords and punctuation carry no weight), and the
    highest-scoring sentences are joined with single spaces, best first.

    Args:
        text: Raw text to summarize.
        summary_ratio: Fraction of the sentences to keep (default 0.3). At
            least one sentence is always kept when the text is not empty.

    Returns:
        The summary string, or "" when the text contains no sentences.
    """
    text = clean_text(text)
    sentences = sent_tokenize(text)
    if not sentences:
        return ""

    # Frequency of every non-stopword, non-punctuation token in the whole text.
    word_frequencies = Counter(
        word for word in word_tokenize(text.lower()) if word not in stop_words
    )

    # Sentences without any scored token are left out of the ranking; a
    # sentence that occurs more than once accumulates its score.
    sentence_scores = {}
    for sentence in sentences:
        score = sum(word_frequencies.get(word, 0) for word in word_tokenize(sentence.lower()))
        if score:
            sentence_scores[sentence] = sentence_scores.get(sentence, 0) + score

    # max(1, ...) keeps short texts (fewer than 1 / summary_ratio sentences)
    # from producing an empty summary.
    num_summary_sentences = max(1, int(len(sentences) * summary_ratio))
    summary_sentences = heapq.nlargest(num_summary_sentences, sentence_scores, key=sentence_scores.get)
    return ' '.join(summary_sentences)


In [90]:
freq_summary = summarize_text(paragraph)
freq_summary

'Under the protection of US Secret Service and with the assistance of the Pennsylvania State Police, former President Trump has now left the Butler area, Shapiro said in a statement posted to X. Lori and I are thankful that his team reports that he is fine and we continue to wish him a full and speedy recovery. Josh Shapiro said former President Donald Trump has left the Butler, Pennsylvania, area following the rally shooting Saturday. Shapiro said he has been communicating with law enforcement on the ground in Pennsylvania and has spoken with President Joe Biden, who offered his full support.'

### Extractive text summarization using TF-IDF

In [91]:
def compute_tfidf_scores(text):
    """Map each vocabulary term of `text` to its TF-IDF weight.

    The vectorizer is fitted on `text` as a single document, using the
    module-level `stop_words` as its stop-word list.

    Args:
        text: The document to analyse.

    Returns:
        dict of {term: weight} for every feature the vectorizer extracted.
    """
    tfidf = TfidfVectorizer(stop_words=list(stop_words))
    document_row = tfidf.fit_transform([text]).toarray()[0].tolist()
    vocabulary = tfidf.get_feature_names_out()
    return dict(zip(vocabulary, document_row))


def summarize_text_with_tfidf(text, summary_ratio=0.3):
    """Extractive summary that ranks sentences by mean TF-IDF weight per token.

    Args:
        text: Raw text to summarize.
        summary_ratio: Fraction of the sentences to keep; at least one is kept.

    Returns:
        The selected sentences joined by spaces (highest score first), or ""
        when the text contains no sentences.
    """
    cleaned = clean_text(text)
    sentences = sent_tokenize(cleaned)
    if not sentences:
        return ""

    term_weights = compute_tfidf_scores(cleaned)

    def length_normalized_score(sentence):
        # Dividing by the token count keeps long sentences from winning on length alone.
        tokens = word_tokenize(sentence.lower())
        return sum(term_weights.get(token, 0) for token in tokens) / len(tokens)

    sentence_scores = {sentence: length_normalized_score(sentence) for sentence in sentences}

    top_k = max(1, int(len(sentences) * summary_ratio))
    return ' '.join(heapq.nlargest(top_k, sentence_scores, key=sentence_scores.get))

In [92]:
tfidf_summary = summarize_text_with_tfidf(paragraph)
tfidf_summary

'Pennsylvania Gov. Josh Shapiro said former President Donald Trump has left the Butler, Pennsylvania, area following the rally shooting Saturday. Federal law enforcement officials will continue to lead on the investigation into the shooting, Shapiro said.'

### Extractive text summarization using transformer

In [93]:
# Sentence-embedding model, loaded once here and reused by the embedding-based summarizers below.
model = SentenceTransformer('all-MiniLM-L6-v2')


def compute_sentence_embeddings(sentences):
    """Encode `sentences` with the module-level `model`, requesting tensor output."""
    return model.encode(sentences, convert_to_tensor=True)


def summarize_text_with_transformer(text, summary_ratio=0.3):
    """Extractive summary that favours sentences similar to the whole text.

    A sentence's score is the sum of its cosine similarities to every sentence
    embedding (itself included), divided by the number of sentences.

    Args:
        text: Raw text to summarize.
        summary_ratio: Fraction of the sentences to keep; at least one is kept.

    Returns:
        The selected sentences joined by spaces (highest score first), or ""
        when the text contains no sentences.
    """
    cleaned = clean_text(text)
    sentences = sent_tokenize(cleaned)
    if not sentences:
        return ""

    embeddings = compute_sentence_embeddings(sentences)
    sentence_count = len(sentences)

    # One row of similarities per sentence, summed and averaged over all sentences.
    sentence_scores = {
        sentence: util.pytorch_cos_sim(embeddings[idx], embeddings).sum().item() / sentence_count
        for idx, sentence in enumerate(sentences)
    }

    top_k = max(1, int(sentence_count * summary_ratio))
    ranked = heapq.nlargest(top_k, sentence_scores, key=sentence_scores.get)
    return ' '.join(ranked)



In [94]:
transformer_summary = summarize_text_with_transformer(paragraph)
transformer_summary

'Under the protection of US Secret Service and with the assistance of the Pennsylvania State Police, former President Trump has now left the Butler area, Shapiro said in a statement posted to X. Lori and I are thankful that his team reports that he is fine and we continue to wish him a full and speedy recovery. Shapiro said he has been communicating with law enforcement on the ground in Pennsylvania and has spoken with President Joe Biden, who offered his full support. Shapiro said he knows how painful and shocking this event is to so many of our fellow Pennsylvanians.'

### Extractive text summarization using NER

In [95]:
# spaCy pipeline used for named-entity counting in score_with_ner / summarize_text_with_ner.
nlp = spacy.load('en_core_web_sm')


def score_with_ner(sentence, nlp_model):
    """Count the entities `nlp_model` reports for `sentence`.

    Args:
        sentence: Text to analyse.
        nlp_model: Callable returning an object with an iterable `ents` attribute.

    Returns:
        The number of items in `ents`.
    """
    analysed = nlp_model(sentence)
    return sum(1 for _ in analysed.ents)


def summarize_text_with_ner(text, summary_ratio=0.3):
    """Extractive summary combining embedding similarity with entity counts.

    A sentence's score is (sum of its cosine similarities to all sentence
    embeddings + its entity count from `score_with_ner`) divided by the number
    of sentences.

    Args:
        text: Raw text to summarize.
        summary_ratio: Fraction of the sentences to keep; at least one is kept.

    Returns:
        The selected sentences joined by spaces (highest score first), or ""
        when the text contains no sentences.
    """
    cleaned = clean_text(text)
    sentences = sent_tokenize(cleaned)
    if not sentences:
        return ""

    embeddings = compute_sentence_embeddings(sentences)
    sentence_count = len(sentences)

    sentence_scores = {}
    for idx, sentence in enumerate(sentences):
        entity_count = score_with_ner(sentence, nlp)
        similarity_total = util.pytorch_cos_sim(embeddings[idx], embeddings).sum().item()
        sentence_scores[sentence] = (similarity_total + entity_count) / sentence_count

    top_k = max(1, int(sentence_count * summary_ratio))
    return ' '.join(heapq.nlargest(top_k, sentence_scores, key=sentence_scores.get))


In [96]:
ner_summary = summarize_text_with_ner(paragraph)
ner_summary

'Under the protection of US Secret Service and with the assistance of the Pennsylvania State Police, former President Trump has now left the Butler area, Shapiro said in a statement posted to X. Lori and I are thankful that his team reports that he is fine and we continue to wish him a full and speedy recovery. Josh Shapiro said former President Donald Trump has left the Butler, Pennsylvania, area following the rally shooting Saturday. Shapiro said he has been communicating with law enforcement on the ground in Pennsylvania and has spoken with President Joe Biden, who offered his full support.'

### Extractive text summarization using TextRank

In [97]:
def summarize_text_with_textrank(text, summary_ratio=0.3):
    """Extractive summary that ranks sentences with PageRank over a similarity graph.

    Sentences are embedded with the module-level `model`; their pairwise cosine
    similarities become the edge weights of a graph scored by `nx.pagerank`.

    Args:
        text: Raw text to summarize.
        summary_ratio: Fraction of the sentences to keep; at least one is kept.

    Returns:
        The selected sentences joined by spaces (highest score first), or ""
        when the text contains no sentences.
    """
    cleaned = clean_text(text)
    sentences = sent_tokenize(cleaned)
    if not sentences:
        return ""

    # embeddings -> cosine similarity matrix -> weighted graph
    graph = nx.from_numpy_array(cosine_similarity(model.encode(sentences)))
    pagerank_scores = nx.pagerank(graph)

    # (score, sentence) pairs sorted descending; equal scores fall back to the sentence text.
    ranked = sorted(
        ((pagerank_scores[idx], sentence) for idx, sentence in enumerate(sentences)),
        reverse=True,
    )

    top_k = max(1, int(len(sentences) * summary_ratio))
    return ' '.join(sentence for _, sentence in ranked[:top_k])


In [98]:
textrank_summary = summarize_text_with_textrank(paragraph)
textrank_summary

'Under the protection of US Secret Service and with the assistance of the Pennsylvania State Police, former President Trump has now left the Butler area, Shapiro said in a statement posted to X. Lori and I are thankful that his team reports that he is fine and we continue to wish him a full and speedy recovery. Shapiro said he knows how painful and shocking this event is to so many of our fellow Pennsylvanians. Shapiro said he has been communicating with law enforcement on the ground in Pennsylvania and has spoken with President Joe Biden, who offered his full support.'

### Extractive text summarization using redundancy removal

In [99]:
def redundancy_removal(selected_sentences, new_sentence, model, threshold=0.7):
    """Tell whether `new_sentence` may be added without repeating selected content.

    Args:
        selected_sentences: Sentences already chosen for the summary.
        new_sentence: Candidate sentence.
        model: Object whose `encode(list_of_str)` returns one embedding per string.
        threshold: Cosine similarity above which two sentences count as redundant.

    Returns:
        True when the candidate's similarity to every selected sentence is at
        most `threshold` (or nothing is selected yet), otherwise False.
    """
    # Nothing selected yet: the candidate cannot be redundant, so skip the encoder.
    if not selected_sentences:
        return True

    # One batched encode call instead of one call per already-selected sentence.
    embeddings = model.encode([new_sentence, *selected_sentences])
    similarities = cosine_similarity(embeddings[:1], embeddings[1:])[0]
    return not (similarities > threshold).any()


def summarize_text_with_redundancy_removal(text, summary_ratio=0.3, redundancy_threshold=0.7):
    """PageRank-ranked extractive summary that skips near-duplicate sentences.

    Sentences are ranked as in the TextRank summarizer, then taken in rank
    order; a candidate is kept only if `redundancy_removal` accepts it against
    the sentences already kept.

    Args:
        text: Raw text to summarize.
        summary_ratio: Fraction of the sentences to aim for; the quota is at least one.
        redundancy_threshold: Similarity threshold passed to `redundancy_removal`.

    Returns:
        The kept sentences joined by spaces, or "" when the text contains no
        sentences. Fewer sentences than the quota may be returned when
        candidates are rejected as redundant.
    """
    cleaned = clean_text(text)
    sentences = sent_tokenize(cleaned)
    if not sentences:
        return ""

    graph = nx.from_numpy_array(cosine_similarity(model.encode(sentences)))
    pagerank_scores = nx.pagerank(graph)
    ranked = sorted(
        ((pagerank_scores[idx], sentence) for idx, sentence in enumerate(sentences)),
        reverse=True,
    )

    quota = max(1, int(len(sentences) * summary_ratio))
    summary_sentences = []
    for _, candidate in ranked:
        if len(summary_sentences) >= quota:
            break
        if redundancy_removal(summary_sentences, candidate, model, redundancy_threshold):
            summary_sentences.append(candidate)

    return ' '.join(summary_sentences)


In [100]:
redundancy_removal_summary = summarize_text_with_redundancy_removal(paragraph)
redundancy_removal_summary

'Under the protection of US Secret Service and with the assistance of the Pennsylvania State Police, former President Trump has now left the Butler area, Shapiro said in a statement posted to X. Lori and I are thankful that his team reports that he is fine and we continue to wish him a full and speedy recovery. Shapiro said he knows how painful and shocking this event is to so many of our fellow Pennsylvanians. Shapiro said he has been communicating with law enforcement on the ground in Pennsylvania and has spoken with President Joe Biden, who offered his full support.'

### Abstractive text summarization using DistilBART

In [102]:
# Hugging Face "summarization" pipeline loaded with the sshleifer/distilbart-cnn-12-6 checkpoint;
# created once and reused by summarize_text_with_bert.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")


def summarize_text_with_bert(text, summary_ratio=0.3):
    """Summarize `text` with the module-level `summarizer` pipeline.

    The length budget is derived from the whitespace-separated word count of
    `text`; the pipeline itself interprets `max_length`/`min_length` in model
    tokens, so the budget is only an approximation of the requested ratio.

    Args:
        text: Raw text to summarize.
        summary_ratio: Target summary length as a fraction of the word count.

    Returns:
        The generated summary with surrounding whitespace stripped, or "" when
        `text` contains no words.
    """
    word_count = len(text.split())
    if word_count == 0:
        return ""

    # Floor the budget so min_length (never below 5) always stays under
    # max_length; for very short inputs the raw ratio would give
    # max_length < min_length, or even max_length == 0.
    max_length = max(6, int(word_count * summary_ratio))
    min_length = min(max_length - 1, max(5, int(max_length * 0.3)))

    # truncation=True clips inputs longer than the model accepts instead of
    # failing inside the model.
    summary = summarizer(
        text,
        max_length=max_length,
        min_length=min_length,
        do_sample=False,
        truncation=True,
    )
    return summary[0]['summary_text'].strip()

In [103]:
bert_summary = summarize_text_with_bert(paragraph)
bert_summary

'Gov. Josh Shapiro says former President Trump has left the Butler, Pennsylvania, area following the shooting . Shapiro: "Under the protection of US Secret Service and with the assistance of the Pennsylvania State Police, the former president has now left the area . Shapiro said he has been communicating with law enforcement on the ground in'

### Evaluation of the summaries

In [104]:
def rouge_evaluate_summary(reference, summary) -> dict:
    """Score `summary` against `reference` with ROUGE-1, ROUGE-2 and ROUGE-L.

    Args:
        reference: Reference (target) text.
        summary: Candidate summary text.

    Returns:
        The mapping produced by `RougeScorer.score`, keyed by 'rouge1',
        'rouge2' and 'rougeL'; each value exposes `.fmeasure` (among others).
    """
    _scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    return _scorer.score(reference, summary)


from nltk.translate.bleu_score import sentence_bleu


def blue_evaluate_summary(reference, summary):
    """Sentence-level BLEU of `summary` against a single `reference` text.

    `sentence_bleu` expects a list of tokenized references and a tokenized
    hypothesis. Passing the raw strings makes it compare single characters,
    so both texts are word-tokenized first.

    Args:
        reference: Reference text.
        summary: Candidate summary text.

    Returns:
        The BLEU score as a float.
    """
    reference_tokens = word_tokenize(reference)
    summary_tokens = word_tokenize(summary)
    return sentence_bleu([reference_tokens], summary_tokens)


def evaluate_all_summaries(reference, summaries):
    """Compute rounded ROUGE F-measures for every summary.

    Args:
        reference: Reference text each summary is scored against.
        summaries: Mapping of method name -> summary text.

    Returns:
        A list with one dict per method, holding 'method', 'rouge1', 'rouge2'
        and 'rougeL' (F-measures rounded to 4 decimals), in the iteration
        order of `summaries`.
    """
    def _result_row(method, summary):
        rouge = rouge_evaluate_summary(reference, summary)
        row = {'method': method}
        for metric_name in ('rouge1', 'rouge2', 'rougeL'):
            row[metric_name] = round(rouge[metric_name].fmeasure, 4)
        return row

    return [_result_row(method, summary) for method, summary in summaries.items()]

In [105]:
# Method name -> summary text produced earlier in the notebook for `paragraph`.
all_summaries = {
    'Frequency Summary': freq_summary,
    'Tfidf Summary': tfidf_summary,
    'Transformer Summary': transformer_summary,
    'Ner Summary': ner_summary,
    'Textrank Summary': textrank_summary,
    'Redundancy Removal Summary': redundancy_removal_summary,
    'Bert Summary': bert_summary,
}

# Method name -> function that produces that summary. The keys mirror
# `all_summaries` so a winning method name can be mapped back to its function.
all_summaries_functions = {
    'Frequency Summary': summarize_text,
    'Tfidf Summary': summarize_text_with_tfidf,
    'Transformer Summary': summarize_text_with_transformer,
    'Ner Summary': summarize_text_with_ner,
    'Textrank Summary': summarize_text_with_textrank,
    'Redundancy Removal Summary': summarize_text_with_redundancy_removal,
    'Bert Summary': summarize_text_with_bert,
}

In [106]:
# Score every summary with ROUGE, using the full source `paragraph` as the reference text.
# NOTE(review): `gpt_summary` is defined earlier but is not used in the visible code —
# confirm whether it was meant to be the reference here instead of `paragraph`.
scores_rouge = evaluate_all_summaries(paragraph, all_summaries)


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [107]:
import pandas as pd

# One row per summarization method; columns are the keys built by evaluate_all_summaries.
df = pd.DataFrame(scores_rouge)

df

Unnamed: 0,method,rouge1,rouge2,rougeL,bleu
0,Frequency Summary,0.6312,0.6164,0.5563,0.0
1,Tfidf Summary,0.2824,0.2688,0.2824,0.0
2,Transformer Summary,0.6312,0.6226,0.6312,0.0
3,Ner Summary,0.6312,0.6164,0.5563,0.0
4,Textrank Summary,0.6312,0.6164,0.5437,0.0
5,Redundancy Removal Summary,0.6312,0.6164,0.5437,0.0
6,Bert Summary,0.3883,0.3469,0.3736,0.0


In [108]:
def get_best_method(metric):
    """Look up the method with the highest value in column `metric` of `df`.

    Args:
        metric: Name of a score column in the module-level `df`.

    Returns:
        (method_name, summary_function); the function comes from
        `all_summaries_functions` and is None when the name has no entry there.
    """
    winning_row = df[metric].idxmax()
    method_name = df.loc[winning_row, 'method']
    return method_name, all_summaries_functions.get(method_name)


# Score column of `df` used to pick the winning summarizer.
chosen_metric = 'rougeL'
best_extractive_method_name, best_extractive_summary_func = get_best_method(chosen_metric)

print(f"Best method based on {chosen_metric}: {best_extractive_method_name}")

Best method based on rougeL: Transformer Summary


In [119]:
best_extractive_method_name

'Transformer Summary'

In [2]:
%%writefile main.py
import heapq
import re  # used by clean_text(); without it every /process request fails with NameError

from flask import Flask, request, jsonify
from nltk.tokenize import sent_tokenize, word_tokenize
from sentence_transformers import SentenceTransformer, util

app = Flask(__name__)

# Sentence-embedding model, loaded once at import time and shared by all requests.
model = SentenceTransformer('all-MiniLM-L6-v2')


def clean_text(text):
    """Normalize raw text before sentence tokenization.

    Curly quotation marks (“ ” ‘ ’) are removed, every run of whitespace
    (newlines included) is collapsed to a single space, and leading/trailing
    whitespace is stripped.

    Args:
        text: Raw input string.

    Returns:
        The cleaned, single-line string.
    """
    # Drop the quotes first: removing them after the whitespace has been
    # collapsed can leave doubled or leading/trailing spaces behind
    # (e.g. '“ Hello ” world').
    text = re.sub(r'[“”‘’]', '', text)

    # `\s` already matches '\n', so newlines need no separate handling.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()


def compute_sentence_embeddings(sentences):
    """Encode `sentences` with the module-level `model`, requesting tensor output."""
    return model.encode(sentences, convert_to_tensor=True)


def summarize_text_with_transformer(text, summary_ratio=0.3):
    """Extractive summary that favours sentences similar to the whole text.

    A sentence's score is the sum of its cosine similarities to every sentence
    embedding (itself included), divided by the number of sentences.

    Args:
        text: Raw text to summarize.
        summary_ratio: Fraction of the sentences to keep; at least one is kept.

    Returns:
        The selected sentences joined by spaces (highest score first), or ""
        when the text contains no sentences.
    """
    cleaned = clean_text(text)
    sentences = sent_tokenize(cleaned)
    if not sentences:
        return ""

    embeddings = compute_sentence_embeddings(sentences)
    sentence_count = len(sentences)

    # One row of similarities per sentence, summed and averaged over all sentences.
    sentence_scores = {
        sentence: util.pytorch_cos_sim(embeddings[idx], embeddings).sum().item() / sentence_count
        for idx, sentence in enumerate(sentences)
    }

    top_k = max(1, int(sentence_count * summary_ratio))
    ranked = heapq.nlargest(top_k, sentence_scores, key=sentence_scores.get)
    return ' '.join(ranked)


@app.route("/")
def hello_world():
    """Root route: returns a fixed HTML greeting."""
    greeting = "<p>Hello, World!</p>"
    return greeting


@app.route('/process', methods=['POST'])
def process_text():
    """Summarize the text sent in the 'paragraph' field of a JSON POST body.

    Returns:
        200 with {'summary': <str>} on success, or 400 with {'error': <str>}
        when the body is not a JSON object or has no non-empty string under
        'paragraph'.
    """
    # silent=True yields None instead of raising on a missing or malformed JSON
    # body, so bad requests get the JSON 400 below rather than an HTML error page.
    data = request.get_json(silent=True)
    text = data.get('paragraph') if isinstance(data, dict) else None

    if not isinstance(text, str) or not text:
        return jsonify({'error': "Missing 'paragraph' text"}), 400

    return jsonify({
        'summary': summarize_text_with_transformer(text),
    })


# Start Flask's built-in server when this file is executed directly.
# NOTE(review): debug=True turns on the interactive debugger and auto-reloader;
# keep it to local development and never expose this server publicly.
if __name__ == '__main__':
    app.run(debug=True)

Overwriting main.py


In [120]:
# !flask --app main run

In [143]:
%%writefile app.py
import streamlit as st
import requests
import PyPDF2
from io import BytesIO
import re


# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of an uploaded PDF as one single-line string.

    Args:
        pdf_file: File-like object whose `read()` returns the PDF bytes.

    Returns:
        The concatenated page texts. Runs of newlines inside a page are
        collapsed to one, and every newline is finally replaced by a space.
        Pages for which `extract_text()` returns nothing are skipped.
    """
    pdf_reader = PyPDF2.PdfReader(BytesIO(pdf_file.read()))

    page_chunks = []
    for page in pdf_reader.pages:
        page_text = page.extract_text()
        if page_text:
            # Collapse consecutive line breaks, then terminate the page with one.
            page_chunks.append(re.sub(r'\n+', '\n', page_text) + "\n")

    return "".join(page_chunks).replace('\n', ' ')


# Page title.
st.title("Text Summarization App")

# Let the user choose between uploading a PDF and typing/pasting text.
option = st.radio("Choose input method", ["Upload PDF", "Enter Text"])

# PDF upload: extract the text and show it; `text_to_process` stays empty until a file is chosen.
if option == "Upload PDF":
    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
    if uploaded_file is not None:
        # Extract text from PDF
        pdf_text = extract_text_from_pdf(uploaded_file)
        # NOTE(review): the text area's return value is discarded, so edits made in
        # this box are not what gets summarized — `pdf_text` is used as extracted.
        st.text_area("Extracted Text", pdf_text, height=130, )
        text_to_process = pdf_text
    else:
        text_to_process = ""

# Direct text input: the text area's current content is what gets summarized.
elif option == "Enter Text":
    text_to_process = st.text_area("Enter your paragraph to summarize here:", height=100)

# Button to process the text: send it to the local Flask backend and show the result.
if st.button("Summarize Text"):
    if text_to_process:
        # A stopped or slow backend must not crash the page with a traceback:
        # bound the wait and report connection problems in the UI instead.
        try:
            response = requests.post(
                'http://127.0.0.1:5000/process',
                json={'paragraph': text_to_process},
                timeout=60,
            )
        except requests.RequestException as exc:
            response = None
            st.error(f"Error: could not reach the summarization service ({exc})")

        if response is not None:
            if response.status_code == 200:
                result = response.json()
                st.subheader("Extractive Summary:")
                st.write(result.get('summary'))
            else:
                # Error responses are not guaranteed to be JSON (e.g. an HTML 500 page).
                try:
                    payload = response.json()
                except ValueError:
                    payload = None
                detail = payload.get('error') if isinstance(payload, dict) else None
                if not detail:
                    detail = f"HTTP {response.status_code}"
                st.error(f"Error: {detail}")

        # Placeholder: no abstractive summarizer is wired in yet.
        abstractive_text = "This is the abstractive summary based on the input text."

        st.subheader("Abstractive Text")
        st.write(abstractive_text)
    else:
        st.error("Please enter some text before processing.")


Overwriting app.py


In [136]:
# !streamlit run app.py