In [None]:
!pip install -U pip setuptools wheel
!pip install -U 'spacy[cuda-autodetect,transformers,lookups]'
!python -m spacy download en_core_web_sm

In [None]:
!pip install en_core_web_sm

sources: https://iq.opengenus.org/latent-semantic-analysis-for-text-summarization/


In [3]:
import spacy

# Load Spacy model
nlp = spacy.load("en_core_web_sm")

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import heapq




In [5]:
# Function for Tokenization, Remove stopwords, Lowercasing, Lemmatization, Remove punctuation
def preprocess_text(text):
    # Tokenization
    doc = nlp(text)

    # Remove stopwords, Lowercasing, Lemmatization, Remove punctuation
    preprocessed_tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]

    # Return the preprocessed tokens as text
    preprocessed_text = " ".join(preprocessed_tokens)
    return preprocessed_text




In [6]:
def lsa_summarizer(text, compression_rate):
    # Split text sentence
    sentences = text.split('. ')

    # Text Vectorization
    vectorizer = CountVectorizer()
    term_document_matrix = vectorizer.fit_transform(sentences)

    # LSA-Model
    num_components = max(int(len(sentences) * compression_rate), 1)
    lsa_model = TruncatedSVD(n_components=num_components)
    lsa_matrix = lsa_model.fit_transform(term_document_matrix)

    # Ranking sentences
    sentence_scores = lsa_matrix.sum(axis=1)

    # Select sentences
    num_sentences = max(int(len(sentences) * compression_rate), 1)
    top_sentences = heapq.nlargest(num_sentences, range(len(sentences)), key=sentence_scores.__getitem__)

    # Create summary
    summary_sentences = [sentences[idx] for idx in top_sentences]
    summary = '. '.join(summary_sentences)

    return summary


In [7]:
# Test
text = """
Ad sales boost Time Warner profit

Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.

The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.

Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sign up AOL's existing customers for high-speed broadband. TimeWarner also has to restate 2000 and 2003 results following a probe by the US Securities Exchange Commission (SEC), which is close to concluding.

Time Warner's fourth quarter profits were slightly better than analysts' expectations. But its film division saw profits slump 27% to $284m, helped by box-office flops Alexander and Catwoman, a sharp contrast to year-earlier, when the third and final film in the Lord of the Rings trilogy boosted results. For the full-year, TimeWarner posted a profit of $3.36bn, up 27% from its 2003 performance, while revenues grew 6.4% to $42.09bn. "Our financial performance was strong, meeting or exceeding all of our full-year objectives and greatly enhancing our flexibility," chairman and chief executive Richard Parsons said. For 2005, TimeWarner is projecting operating earnings growth of around 5%, and also expects higher revenue and wider profit margins.

TimeWarner is to restate its accounts as part of efforts to resolve an inquiry into AOL by US market regulators. It has already offered to pay $300m to settle charges, in a deal that is under review by the SEC. The company said it was unable to estimate the amount it needed to set aside for legal reserves, which it previously set at $500m. It intends to adjust the way it accounts for a deal with German music publisher Bertelsmann's purchase of a stake in AOL Europe, which it had reported as advertising revenue. It will now book the sale of its stake in AOL Europe as a loss on the value of that stake.

"""



compression_rate = 0.5

summary = lsa_summarizer(text, compression_rate)
print(summary)

But its film division saw profits slump 27% to $284m, helped by box-office flops Alexander and Catwoman, a sharp contrast to year-earlier, when the third and final film in the Lord of the Rings trilogy boosted results. The company said it was unable to estimate the amount it needed to set aside for legal reserves, which it previously set at $500m. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.

Time Warner said on Friday that it now owns 8% of search-engine Google. "Our financial performance was strong, meeting or exceeding all of our full-year objectives and greatly enhancing our flexibility," chairman and chief executive Richard Parsons said. 
Ad sales boost Time Warner profit

Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.

The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connect

In [19]:
from rouge import Rouge
import pandas as pd

def evaluate_rouge_scores(df):
    rouge = Rouge()

    rouge_scores = {
        'all_classes': {
            'rouge-1': {
                'f': [],
                'p': [],
                'r': []
            }
        }
    }

    # Get unique classes
    classes = df['Class'].unique()

    for cls in classes:
        rouge_scores[cls] = {
            'rouge-1': {
                'f': [],
                'p': [],
                'r': []
            }
        }

    for _, row in df.iterrows():
        text = row['Text']
        reference_summary = row['Summary']
        cls = row['Class']

        # preprocessing
        preprocessed_text = preprocess_text(text)
        preprocessed_reference_summary = preprocess_text(reference_summary)

        # Generate summary with compression rate
        compression_rate = len(preprocessed_reference_summary.split()) / len(preprocessed_text.split())
        summary = lsa_summarizer(preprocessed_text, compression_rate=compression_rate)

        # calculate rouge 1
        rouge_scores_all = rouge.get_scores(summary, reference_summary)[0]
        rouge_scores_cls = rouge_scores[cls]

        # Add rouge score values
        rouge_scores_all_cls = rouge_scores_cls['rouge-1']
        rouge_scores_all_cls['f'].append(rouge_scores_all['rouge-1']['f'])
        rouge_scores_all_cls['p'].append(rouge_scores_all['rouge-1']['p'])
        rouge_scores_all_cls['r'].append(rouge_scores_all['rouge-1']['r'])

        # Add rouge score values for all classes
        rouge_scores_all_all = rouge_scores['all_classes']['rouge-1']
        rouge_scores_all_all['f'].append(rouge_scores_all['rouge-1']['f'])
        rouge_scores_all_all['p'].append(rouge_scores_all['rouge-1']['p'])
        rouge_scores_all_all['r'].append(rouge_scores_all['rouge-1']['r'])

    return rouge_scores


In [15]:
# Data
import pandas as pd

# Read Dataframe
df = pd.read_csv('/content/sample_data/Dataset_reference_summaries.csv')

# Show Dataframe
df.head()



Unnamed: 0,Text,Class,Summary
0,I have heard nothing from the Ambassador about...,Political speech,"I fully covered, in my conference last week, m..."
1,I think it is in the public interest to procee...,Political speech,I think it is in the public interest to procee...
2,The A-11 aircraft now at Edwards Air force Bas...,Political speech,"for example, one of the most important technic..."
3,It is one of the most comprehensive bills in t...,Political speech,I hope that we can work toward the goal of som...
4,"So long as there remains a man without a job, ...",Political speech,The contest in which South Viet-Nam is now eng...


In [23]:
# make sure all strings
df['Text'] = df['Text'].astype(str)
df['Summary'] = df['Summary'].astype(str)



In [None]:
rouge_scores = evaluate_rouge_scores(df)

# calculate ROUGE-1-Scores for all classes
print("ROUGE-1 Scores für alle Klassen:")
print(rouge_scores['all_classes']['rouge-1'])

# calculate ROUGE-1-Scores for each class
for cls, scores in rouge_scores.items():
    if cls != 'all_classes':
        print(f"ROUGE-1 Scores für Klasse {cls}:")
        print(scores['rouge-1'])


In [29]:
# calculate avg, max, min for all classes
print("ROUGE-1 Scores für alle Klassen:")
all_classes_scores = rouge_scores['all_classes']['rouge-1']
all_classes_f1_avg = sum(all_classes_scores['f']) / len(all_classes_scores['f'])
all_classes_p_avg = sum(all_classes_scores['p']) / len(all_classes_scores['p'])
all_classes_r_avg = sum(all_classes_scores['r']) / len(all_classes_scores['r'])
all_classes_f1_min = min(all_classes_scores['f'])
all_classes_p_min = min(all_classes_scores['p'])
all_classes_r_min = min(all_classes_scores['r'])
all_classes_f1_max = max(all_classes_scores['f'])
all_classes_p_max = max(all_classes_scores['p'])
all_classes_r_max = max(all_classes_scores['r'])
print(f"F1-Score Durchschnitt für alle Klassen: {all_classes_f1_avg:.2f} (Min: {all_classes_f1_min:.2f}, Max: {all_classes_f1_max:.2f})")
print(f"Precision Durchschnitt für alle Klassen: {all_classes_p_avg:.2f} (Min: {all_classes_p_min:.2f}, Max: {all_classes_p_max:.2f})")
print(f"Recall Durchschnitt für alle Klassen: {all_classes_r_avg:.2f} (Min: {all_classes_r_min:.2f}, Max: {all_classes_r_max:.2f})")

# calculate avg, max, min for each class
for cls, scores in rouge_scores.items():
    if cls != 'all_classes':
        print(f"ROUGE-1 Scores für Klasse {cls}:")
        class_scores = scores['rouge-1']
        class_f1_avg = sum(class_scores['f']) / len(class_scores['f'])
        class_p_avg = sum(class_scores['p']) / len(class_scores['p'])
        class_r_avg = sum(class_scores['r']) / len(class_scores['r'])
        class_f1_min = min(class_scores['f'])
        class_p_min = min(class_scores['p'])
        class_r_min = min(class_scores['r'])
        class_f1_max = max(class_scores['f'])
        class_p_max = max(class_scores['p'])
        class_r_max = max(class_scores['r'])
        print(f"F1-Score Durchschnitt für Klasse {cls}: {class_f1_avg:.2f} (Min: {class_f1_min:.2f}, Max: {class_f1_max:.2f})")
        print(f"Precision Durchschnitt für Klasse {cls}: {class_p_avg:.2f} (Min: {class_p_min:.2f}, Max: {class_p_max:.2f})")
        print(f"Recall Durchschnitt für Klasse {cls}: {class_r_avg:.2f} (Min: {class_r_min:.2f}, Max: {class_r_max:.2f})")


ROUGE-1 Scores für alle Klassen:
F1-Score Durchschnitt für alle Klassen: 0.26 (Min: 0.00, Max: 0.73)
Precision Durchschnitt für alle Klassen: 0.28 (Min: 0.00, Max: 0.82)
Recall Durchschnitt für alle Klassen: 0.28 (Min: 0.00, Max: 0.84)
ROUGE-1 Scores für Klasse Political speech:
F1-Score Durchschnitt für Klasse Political speech: 0.28 (Min: 0.00, Max: 0.59)
Precision Durchschnitt für Klasse Political speech: 0.29 (Min: 0.00, Max: 0.66)
Recall Durchschnitt für Klasse Political speech: 0.31 (Min: 0.00, Max: 0.53)
ROUGE-1 Scores für Klasse News:
F1-Score Durchschnitt für Klasse News: 0.22 (Min: 0.00, Max: 0.43)
Precision Durchschnitt für Klasse News: 0.23 (Min: 0.00, Max: 0.56)
Recall Durchschnitt für Klasse News: 0.23 (Min: 0.00, Max: 0.53)
ROUGE-1 Scores für Klasse Jurisdiction:
F1-Score Durchschnitt für Klasse Jurisdiction: 0.29 (Min: 0.03, Max: 0.53)
Precision Durchschnitt für Klasse Jurisdiction: 0.29 (Min: 0.02, Max: 0.67)
Recall Durchschnitt für Klasse Jurisdiction: 0.32 (Min: 0.11,