In [None]:
!pip install -U pip setuptools wheel
!pip install -U 'spacy[cuda-autodetect,transformers,lookups]'
!python -m spacy download en_core_web_sm

In [None]:
!pip install en_core_web_sm

In [3]:
import spacy

# Load Spacy model
nlp = spacy.load("en_core_web_sm")

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import heapq




In [5]:
# Function for Tokenization, Remove stopwords, Lowercasing, Lemmatization, Remove punctuation 
def preprocess_text(text):
    # Tokenization
    doc = nlp(text)
    
    # Remove stopwords, Lowercasing, Lemmatization, Remove punctuation 
    preprocessed_tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]
    
    # Return the preprocessed tokens as text
    preprocessed_text = " ".join(preprocessed_tokens)
    return preprocessed_text




In [23]:
def lsa_summarizer(text, compression_rate):
    # Split text sentence
    sentences = text.split('. ') 
    
    # Text Vectorization
    vectorizer = CountVectorizer()
    term_document_matrix = vectorizer.fit_transform(sentences)
    
    # LSA-Model
    num_components = max(int(len(sentences) * compression_rate), 1)
    lsa_model = TruncatedSVD(n_components=num_components)
    lsa_matrix = lsa_model.fit_transform(term_document_matrix)
    
    # Ranking sentences
    sentence_scores = lsa_matrix.sum(axis=1)
    
    # Select sentences
    num_sentences = max(int(len(sentences) * compression_rate), 1)
    top_sentences = heapq.nlargest(num_sentences, range(len(sentences)), key=sentence_scores.__getitem__)
    
    # Create summary
    summary_sentences = [sentences[idx] for idx in top_sentences]
    summary = '. '.join(summary_sentences)
    
    return summary


In [7]:
# Test
text = """
Ad sales boost Time Warner profit

Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.

The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.

Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sign up AOL's existing customers for high-speed broadband. TimeWarner also has to restate 2000 and 2003 results following a probe by the US Securities Exchange Commission (SEC), which is close to concluding.

Time Warner's fourth quarter profits were slightly better than analysts' expectations. But its film division saw profits slump 27% to $284m, helped by box-office flops Alexander and Catwoman, a sharp contrast to year-earlier, when the third and final film in the Lord of the Rings trilogy boosted results. For the full-year, TimeWarner posted a profit of $3.36bn, up 27% from its 2003 performance, while revenues grew 6.4% to $42.09bn. "Our financial performance was strong, meeting or exceeding all of our full-year objectives and greatly enhancing our flexibility," chairman and chief executive Richard Parsons said. For 2005, TimeWarner is projecting operating earnings growth of around 5%, and also expects higher revenue and wider profit margins.

TimeWarner is to restate its accounts as part of efforts to resolve an inquiry into AOL by US market regulators. It has already offered to pay $300m to settle charges, in a deal that is under review by the SEC. The company said it was unable to estimate the amount it needed to set aside for legal reserves, which it previously set at $500m. It intends to adjust the way it accounts for a deal with German music publisher Bertelsmann's purchase of a stake in AOL Europe, which it had reported as advertising revenue. It will now book the sale of its stake in AOL Europe as a loss on the value of that stake.

"""



compression_rate = 0.5

summary = lsa_summarizer(text, compression_rate)
print(summary)

But its film division saw profits slump 27% to $284m, helped by box-office flops Alexander and Catwoman, a sharp contrast to year-earlier, when the third and final film in the Lord of the Rings trilogy boosted results. The company said it was unable to estimate the amount it needed to set aside for legal reserves, which it previously set at $500m. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.

Time Warner said on Friday that it now owns 8% of search-engine Google. "Our financial performance was strong, meeting or exceeding all of our full-year objectives and greatly enhancing our flexibility," chairman and chief executive Richard Parsons said. 
Ad sales boost Time Warner profit

Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.

The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connect

In [14]:
# Evaluation

# Data
import pandas as pd

# Read Dataframe
df = pd.read_csv('/content/sample_data/BBC_Dataset_concatenated.csv')

# Show Dataframe
df.head()
     

Unnamed: 0,Text,Zusammenfassung,Kategorie
0,Ad sales boost Time Warner profit\n\nQuarterly...,TimeWarner said fourth quarter sales rose 2% t...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,The dollar has hit its highest level against t...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,Yukos' owner Menatep Group says it will ask Ro...,business
3,High fuel prices hit BA's profits\n\nBritish A...,"Rod Eddington, BA's chief executive, said the ...",business
4,Pernod takeover talk lifts Domecq\n\nShares in...,Pernod has reduced the debt it took on to fund...,business


In [None]:
# Data set for evaluation

!pip install datasets
import os
import re

from datasets import load_dataset
import pandas as pd

import requests
import json

from statistics import mean

import random
import csv
import nltk
from nltk.corpus import gutenberg

import random

In [None]:
!pip install nltk
!pip install rouge-score
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

In [19]:
from rouge_score import rouge_scorer


In [26]:
def evaluate_rouge_scores(data_path):
    data = pd.read_csv(data_path)

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'])

    rouge_scores = {
        'rouge1': [],
        'rouge2': [],
        'rougeL': []
    }

    for _, row in data.iterrows():
        text = row['Text']
        reference_summary = row['Zusammenfassung']

        # Vorverarbeitung des Textes
        preprocessed_text = preprocess_text(text)

        # Generierung der Zusammenfassung
        summary = lsa_summarizer(preprocessed_text, compression_rate=0.5)

        # Berechnung des ROUGE-Scores
        rouge = scorer.score(summary, reference_summary)
        
        # ROUGE-Werte hinzufügen
        rouge_scores['rouge1'].append(rouge['rouge1'].fmeasure)
        rouge_scores['rouge2'].append(rouge['rouge2'].fmeasure)
        rouge_scores['rougeL'].append(rouge['rougeL'].fmeasure)

    return rouge_scores


In [27]:
data_path = '/content/sample_data/BBC_Dataset_concatenated.csv'
scores = evaluate_rouge_scores(data_path)

# Ausgabe der ROUGE-Scores
for metric, values in scores.items():
    print(f"ROUGE-{metric} Score:")
    print("Mean:", np.mean(values))
    print("Min:", np.min(values))
    print("Max:", np.max(values))
    print()

  self.explained_variance_ratio_ = exp_var / full_var


ROUGE-rouge1 Score:
Mean: 0.3668717059858801
Min: 0.16666666666666666
Max: 0.7031250000000001

ROUGE-rouge2 Score:
Mean: 0.13547478812314928
Min: 0.0
Max: 0.5846153846153846

ROUGE-rougeL Score:
Mean: 0.22866885625422445
Min: 0.08860759493670886
Max: 0.5536332179930795

