In [1]:
import pandas as pd
!pip install sumy
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.utils import get_stop_words
from sumy.summarizers.text_rank import TextRankSummarizer



Source:
- sumy (library for automatic text summarization): https://pypi.org/project/sumy/

In [2]:
# Load the dataset from the result folder into a dataframe
df = pd.read_csv(filepath_or_buffer='../data/Result/dataset.csv')

# Preview the first five rows
df.head(n=5)

Unnamed: 0,Text,Class
0,I have heard nothing from the Ambassador about...,Political speech
1,I think it is in the public interest to procee...,Political speech
2,The A-11 aircraft now at Edwards Air force Bas...,Political speech
3,It is one of the most comprehensive bills in t...,Political speech
4,"So long as there remains a man without a job, ...",Political speech


In [3]:
# Balance the dataset: draw the same number of rows from every class.

# Number of rows available per class
category_counts = df['Class'].value_counts()

# Size of the smallest class
min_count = int(category_counts.min())

# Rows drawn per class: 100, capped at the smallest class size so that
# .sample() cannot raise when a class holds fewer than 100 rows.
samples_per_class = min(100, min_count)

# Sample every class with a fixed seed. The pieces are collected in a list and
# concatenated once instead of growing a dataframe inside the loop.
category_subsets = [
    df[df['Class'] == category].sample(n=samples_per_class, random_state=42)
    for category in category_counts.index
]

# Concatenate in reverse so the last class of category_counts comes first,
# which is the row order produced by prepending each subset in turn.
balanced_dataset = pd.concat(category_subsets[::-1], ignore_index=True)

# Check distribution of the categories
balanced_category_counts = balanced_dataset['Class'].value_counts()
print(balanced_category_counts)


Class
Jurisdiction        100
Political speech    100
Literature          100
Blog                100
News                100
Name: count, dtype: int64


In [4]:
# Preview the first five rows of the balanced dataset
balanced_dataset.head(n=5)

Unnamed: 0,Text,Class
0,"In other words, the notion of a category of hy...",Jurisdiction
1,(5) An offence the sentence for which is impos...,Jurisdiction
2,The Court of Justice rejected the contention. ...,Jurisdiction
3,In London Underground Ltd v Edwards (No 2) [19...,Jurisdiction
4,"On the question of jury tampering (to which, m...",Jurisdiction


In [5]:
import random

# Initialize the summarizers
text_rank_summarizer = TextRankSummarizer()
lsa_summarizer = LsaSummarizer()
lex_rank_summarizer = LexRankSummarizer()
summarizers = [text_rank_summarizer, lsa_summarizer, lex_rank_summarizer]
# NOTE(review): no stop words or stemmer are set on the summarizers, although
# get_stop_words is imported at the top of the notebook - confirm this is intended.

# Define the compression rates (fraction of a text's sentences kept in its summary)
compression_rates = [0.2, 0.4, 0.6, 0.8]

# Dedicated, seeded generator: the approach and rate drawn for each row are the
# same on every run, and the global random state is left untouched.
rng = random.Random(42)

# Initialize counter
approach_counts = {summarizer.__class__.__name__: 0 for summarizer in summarizers}
compression_rate_counts = {rate: 0 for rate in compression_rates}

# The tokenizer is the same for every row, so create it once outside the loop
tokenizer = Tokenizer("english")

# Summaries are collected here and written to the dataframe in one assignment
summaries = []

# Iterate over the texts of the dataframe
for text in balanced_dataset['Text']:
    approach = rng.choice(summarizers)
    compression_rate = rng.choice(compression_rates)

    # Update counter
    approach_counts[approach.__class__.__name__] += 1
    compression_rate_counts[compression_rate] += 1

    # Split the text into sentences
    sentences = tokenizer.to_sentences(text)

    # Number of sentences for the chosen compression rate. int() truncates, so a
    # short text would otherwise request a zero-sentence summary; keep at least one.
    num_sentences = max(1, int(len(sentences) * compression_rate))

    # Generate the summary
    summarizer = approach
    parser = PlaintextParser.from_string(text, tokenizer)
    summary = summarizer(parser.document, num_sentences)

    summaries.append(' '.join(str(sentence) for sentence in summary))

# Store the summaries in the Summary column
balanced_dataset['Summary'] = summaries

total_summaries = len(balanced_dataset)


def _percentage(count):
    """Return count as a percentage of total_summaries (0.0 when there are no rows)."""
    return (count / total_summaries) * 100 if total_summaries else 0.0


# Calculate the percentages
approach_percentages = {approach: _percentage(count) for approach, count in approach_counts.items()}
compression_rate_percentages = {rate: _percentage(count) for rate, count in compression_rate_counts.items()}

# results
for approach, percentage in approach_percentages.items():
    print(f"The approach {approach} was used for {percentage:.2f}% of the summaries.")

for rate, percentage in compression_rate_percentages.items():
    print(f"The compression rate {rate} was used for {percentage:.2f}% of the summaries.")

The approach TextRankSummarizer was used for 36.60% of the summaries.
The approach LsaSummarizer was used for 32.00% of the summaries.
The approach LexRankSummarizer was used for 31.40% of the summaries.
The compression rate 0.2 was used for 27.20% of the summaries.
The compression rate 0.4 was used for 25.00% of the summaries.
The compression rate 0.6 was used for 24.80% of the summaries.
The compression rate 0.8 was used for 23.00% of the summaries.


In [6]:
# Preview the first five rows, which now include the 'Summary' column
balanced_dataset.head(n=5)

Unnamed: 0,Text,Class,Summary
0,"In other words, the notion of a category of hy...",Jurisdiction,"In other words, the notion of a category of hy..."
1,(5) An offence the sentence for which is impos...,Jurisdiction,Schedule 15 contained a very substantial list ...
2,The Court of Justice rejected the contention. ...,Jurisdiction,It noted in the first place that the expressio...
3,In London Underground Ltd v Edwards (No 2) [19...,Jurisdiction,In London Underground Ltd v Edwards (No 2) [19...
4,"On the question of jury tampering (to which, m...",Jurisdiction,"On the question of jury tampering (to which, m..."


In [20]:
# Write the resulting dataframe to a CSV file in the 'data/Result/' folder, without the index
balanced_dataset.to_csv(path_or_buf='../data/Result/reference_summaries_dataset.csv', index=False)