In [None]:
import pandas as pd
!pip install sumy
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.utils import get_stop_words
from sumy.summarizers.text_rank import TextRankSummarizer

In [60]:
# Read Dataframe
df = pd.read_csv('/content/sample_data/datensatz.csv')

# Show Dataframe
df.head()

Unnamed: 0,Text,Zusammenfassung,Kategorie
0,Ad sales boost Time Warner profit\n\nQuarterly...,TimeWarner said fourth quarter sales rose 2% t...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,The dollar has hit its highest level against t...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,Yukos' owner Menatep Group says it will ask Ro...,business
3,High fuel prices hit BA's profits\n\nBritish A...,"Rod Eddington, BA's chief executive, said the ...",business
4,Pernod takeover talk lifts Domecq\n\nShares in...,Pernod has reduced the debt it took on to fund...,business


In [61]:
# balancing dataset

import pandas as pd

# Count each categorie
category_counts = df['Kategorie'].value_counts()

# Min number of the categories
min_count = min(category_counts)

balanced_dataset = pd.DataFrame()

# Iteration over each category
for category in category_counts.index:
    # Filter the rows of the dataset that belong to the current category
    category_subset = df[df['Kategorie'] == category].sample(n=min_count, random_state=42)
    
    # Add the selected rows to the balanced dataset
    balanced_dataset = balanced_dataset.append(category_subset)



# Check distribution of the categories
balanced_category_counts = balanced_dataset['Kategorie'].value_counts()
print(balanced_category_counts)


sport            386
business         386
politics         386
tech             386
entertainment    386
Name: Kategorie, dtype: int64


  balanced_dataset = balanced_dataset.append(category_subset)
  balanced_dataset = balanced_dataset.append(category_subset)
  balanced_dataset = balanced_dataset.append(category_subset)
  balanced_dataset = balanced_dataset.append(category_subset)
  balanced_dataset = balanced_dataset.append(category_subset)


In [62]:
df = df.drop(columns=['Zusammenfassung'])
df.head()

Unnamed: 0,Text,Kategorie
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [67]:
import random 

# Initialize the summarizers
text_rank_summarizer = TextRankSummarizer()
lsa_summarizer = LsaSummarizer()
lex_rank_summarizer = LexRankSummarizer()

# Define the compression rates
compression_rates = [0.1, 0.3, 0.5, 0.7]  # Adjust the values as desired

# Initialize counter
approach_counts = {summarizer.__class__.__name__: 0 for summarizer in [text_rank_summarizer, lsa_summarizer, lex_rank_summarizer]}
compression_rate_counts = {rate: 0 for rate in compression_rates}

# Iterate over dataframe
for index, row in df.iterrows():
    approach = random.choice([text_rank_summarizer, lsa_summarizer, lex_rank_summarizer])
    compression_rate = random.choice(compression_rates)
    
    # Update counter
    approach_counts[approach.__class__.__name__] += 1
    compression_rate_counts[compression_rate] += 1

    # Tokenize the text
    tokenizer = Tokenizer("english")
    sentences = tokenizer.to_sentences(row['Text'])

    # Calculate the number of sentences for the chosen compression rate
    num_sentences = int(len(sentences) * compression_rate)

    # Generate the summary
    summarizer = approach
    parser = PlaintextParser.from_string(row['Text'], tokenizer)
    summary = summarizer(parser.document, num_sentences)

    # Store the summary in the "Summary" column
    df.loc[index, 'Summary'] = ' '.join(str(sentence) for sentence in summary)

# Calculate total summaries
total_summaries = len(df)

# Calculate the percentages
approach_percentages = {approach: (count / total_summaries) * 100 for approach, count in approach_counts.items()}
compression_rate_percentages = {rate: (count / total_summaries) * 100 for rate, count in compression_rate_counts.items()}

# Print the results
for approach, percentage in approach_percentages.items():
    print(f"The approach {approach} was used for {percentage:.2f}% of the summaries.")

for rate, percentage in compression_rate_percentages.items():
    print(f"The compression rate {rate} was used for {percentage:.2f}% of the summaries.")


The approach TextRankSummarizer was used for 35.55% of the summaries.
The approach LsaSummarizer was used for 31.82% of the summaries.
The approach LexRankSummarizer was used for 32.63% of the summaries.
The compression rate 0.1 was used for 24.00% of the summaries.
The compression rate 0.3 was used for 23.42% of the summaries.
The compression rate 0.5 was used for 26.07% of the summaries.
The compression rate 0.7 was used for 26.52% of the summaries.


In [68]:
df.head()

Unnamed: 0,Text,Kategorie,Summary
0,Ad sales boost Time Warner profit\n\nQuarterly...,business,"The firm, which is now one of the biggest inve..."
1,Dollar gains on Greenspan speech\n\nThe dollar...,business,The dollar has hit its highest level against t...
2,Yukos unit buyer faces loan claim\n\nThe owner...,business,The owners of embattled Russian oil giant Yuko...
3,High fuel prices hit BA's profits\n\nBritish A...,business,Reporting its results for the three months to ...
4,Pernod takeover talk lifts Domecq\n\nShares in...,business,Shares in UK drinks and food firm Allied Domec...
