Summary

The provided code script reads blog text data, applies the TextRank algorithm for text summarization with different compression rates, and generates summaries for each blog. It then evaluates the generated summaries using Rouge metrics by comparing them to reference summaries. The script saves the generated summaries as a new CSV file and computes the minimum, mean, and maximum Rouge scores, providing an analysis of the summarization quality. Overall, the script performs text summarization and evaluates the effectiveness of the approach using Rouge metrics.

In [1]:
import pandas as pd
import random 
import os
from nltk.tokenize import sent_tokenize
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer


In [2]:
!pip install datasets
import os
import re

from datasets import load_dataset
import pandas as pd

import requests
import json

from statistics import mean

import random
import csv
import nltk
from nltk.corpus import gutenberg

import random



  from .autonotebook import tqdm as notebook_tqdm


In [3]:
df = pd.read_csv("../data/Blogs_result/dataset.csv")
df = df.drop('Class', axis=1)

In [4]:
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [5]:
text_rank_summarizer = TextRankSummarizer()

compression_rates = [0.1, 0.3, 0.5, 0.7]  

approach_counts = {summarizer.__class__.__name__: 0 for summarizer in [text_rank_summarizer]}
compression_rate_counts = {rate: 0 for rate in compression_rates}

for index, row in df.iterrows():
    approach = random.choice([text_rank_summarizer])
    compression_rate = random.choice(compression_rates)
    
    approach_counts[approach.__class__.__name__] += 1
    compression_rate_counts[compression_rate] += 1

    tokenizer = Tokenizer("english")
    sentences = tokenizer.to_sentences(row['Text'])

    num_sentences = int(len(sentences) * compression_rate)

    summarizer = approach
    parser = PlaintextParser.from_string(row['Text'], tokenizer)
    summary = summarizer(parser.document, num_sentences)

    df.loc[index, 'Summary'] = ' '.join(str(sentence) for sentence in summary)


total_summaries = len(df)

approach_percentages = {approach: (count / total_summaries) * 100 for approach, count in approach_counts.items()}
compression_rate_percentages = {rate: (count / total_summaries) * 100 for rate, count in compression_rate_counts.items()}

for approach, percentage in approach_percentages.items():
    print(f"The approach {approach} was used for {percentage:.2f}% of the summaries.")

for rate, percentage in compression_rate_percentages.items():
    print(f"The compression rate {rate} was used for {percentage:.2f}% of the summaries.")

The approach TextRankSummarizer was used for 100.00% of the summaries.
The compression rate 0.1 was used for 24.51% of the summaries.
The compression rate 0.3 was used for 24.14% of the summaries.
The compression rate 0.5 was used for 25.24% of the summaries.
The compression rate 0.7 was used for 26.10% of the summaries.


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4528 entries, 0 to 4527
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Text     4528 non-null   object
 1   Summary  4528 non-null   object
dtypes: object(2)
memory usage: 70.9+ KB


In [8]:
df.head()

Unnamed: 0,Text,Summary
0,I have heard nothing from the Ambassador about...,"Q. Mr. President, in answering an earlier ques..."
1,I think it is in the public interest to procee...,Intensive negotiation--day and night negotiati...
2,The A-11 aircraft now at Edwards Air force Bas...,"for example, one of the most important technic..."
3,It is one of the most comprehensive bills in t...,It is one of the most comprehensive bills in t...
4,"So long as there remains a man without a job, ...","So long as there remains a man without a job, ..."


In [9]:
import pandas as pd

folder_path = '../data/Blogs_result/'

file_name = 'blogsresult_just_textrank.csv'

file_path = folder_path + '/' + file_name

df.to_csv(file_path, index=False)

print(f"DataFrame saved as CSV file at: {file_path}")

DataFrame saved as CSV file at: ../data/Blogs_result//blogsresult_just_textrank.csv


# Evaluierung

In [10]:
df1 = pd.read_csv("../data/Blogs_result/blogsresult_just_textrank.csv")
df2 = pd.read_csv("../data/Blogs_result/blogsresult_refsummarys.csv")


In [18]:
from rouge import Rouge

rouge = Rouge()

rouge_1_scores = []
rouge_2_scores = []
rouge_l_scores = []

for idx, row1 in df1.iterrows():
    summary1 = str(row1['Summary'])
    
    row2 = df2.loc[idx]
    summary2 = str(row2['Summary'])
    
    scores = rouge.get_scores(summary1, summary2)[0]
    
    rouge_1 = scores['rouge-1']['f']
    rouge_2 = scores['rouge-2']['f']
    rouge_l = scores['rouge-l']['f']
    
    rouge_1_scores.append(rouge_1)
    rouge_2_scores.append(rouge_2)
    rouge_l_scores.append(rouge_l)

min_rouge_1 = min(rouge_1_scores)
mean_rouge_1 = sum(rouge_1_scores) / len(rouge_1_scores)
max_rouge_1 = max(rouge_1_scores)

min_rouge_2 = min(rouge_2_scores)
mean_rouge_2 = sum(rouge_2_scores) / len(rouge_2_scores)
max_rouge_2 = max(rouge_2_scores)

min_rouge_l = min(rouge_l_scores)
mean_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)
max_rouge_l = max(rouge_l_scores)

print("Rouge-1 scores:")
print(f"Min: {min_rouge_1}")
print(f"Mean: {mean_rouge_1}")
print(f"Max: {max_rouge_1}")

print("Rouge-2 scores:")
print(f"Min: {min_rouge_2}")
print(f"Mean: {mean_rouge_2}")
print(f"Max: {max_rouge_2}")

print("Rouge-L scores:")
print(f"Min: {min_rouge_l}")
print(f"Mean: {mean_rouge_l}")
print(f"Max: {max_rouge_l}")

Rouge-1 scores:
Min: 0.0
Mean: 0.6212928899612717
Max: 0.999999995
Rouge-2 scores:
Min: 0.0
Mean: 0.5479318105729233
Max: 0.999999995
Rouge-L scores:
Min: 0.0
Mean: 0.615398270271077
Max: 0.999999995
