In [None]:
# https://spacy.io/usage
!python -m spacy download en_core_web_sm

sources used in this file: 
- https://iq.opengenus.org/latent-semantic-analysis-for-text-summarization/
- https://towardsdatascience.com/document-summarization-using-latent-semantic-indexing-b747ef2d2af6
- https://github.com/luisfredgs/LSA-Text-Summarization

In [6]:
import spacy

# Load Spacy model
nlp = spacy.load("en_core_web_sm")

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

In [8]:
# Function for Tokenization, Remove stopwords, Lowercasing, Lemmatization, Remove punctuation
def preprocess_text(text):
    # Tokenization
    doc = nlp(text)

    # Remove stopwords, Lowercasing, Lemmatization, Remove punctuation
    preprocessed_tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]

    # Return the preprocessed tokens as text
    preprocessed_text = " ".join(preprocessed_tokens)
    return preprocessed_text

In [14]:
def lsa_summarizer(text, compression_rate):
    # Split text sentence
    sentences = sent_tokenize(text) # https://www.guru99.com/tokenize-words-sentences-nltk.html
    sort_dict = {}
    for i in range(len(sentences)):
        sort_dict[i] = sentences[i]

    # Text Vectorization
    vectorizer = CountVectorizer()
    term_document_matrix = vectorizer.fit_transform(sentences)

    # LSA-Model
    num_components = max(int(len(sentences) * compression_rate), 1)
    lsa_model = TruncatedSVD(n_components=num_components)
    lsa_matrix = lsa_model.fit_transform(term_document_matrix)

    # Ranking sentences
    sentence_scores = lsa_matrix.sum(axis=1)
    ranking = sorted(range(len(sentence_scores)), key=lambda x: sentence_scores[x], reverse=True)

    ranking_texts = [sort_dict[index] for index in ranking]

    max_words = len(word_tokenize(text)) * (1-compression_rate)
    words=0
    chosen_texts = []
    for text in ranking_texts:
        if words<=max_words:
            sen_length = len(word_tokenize(text)) # https://www.guru99.com/tokenize-words-sentences-nltk.html
            words += sen_length
            chosen_texts.append(text)
    
    chosen_dict = {}
    for i in range (len(sort_dict)):
        for text in chosen_texts:
            if sort_dict[i] == text:
                chosen_dict[i] = text

    summary_sentences = chosen_dict.values()
    summary = ' '.join(summary_sentences)

    return summary

In [16]:
from rouge import Rouge
import pandas as pd

def evaluate_rouge_scores(df):
    rouge = Rouge()

    rouge_scores = {
        'all_classes': {
            'rouge-1': {
                'f': [],
                'p': [],
                'r': []
            }
        }
    }

    # Get unique classes
    classes = df['Class'].unique()

    for cls in classes:
        rouge_scores[cls] = {
            'rouge-1': {
                'f': [],
                'p': [],
                'r': []
            }
        }

    for _, row in df.iterrows():
        text = row['Text']
        reference_summary = row['Summary']
        cls = row['Class']


        # Generate summary with compression rate
        compression_rate = len(reference_summary.split()) / len(text.split())
        summary = lsa_summarizer(text, compression_rate=compression_rate)

        # calculate rouge 1
        rouge_scores_all = rouge.get_scores(summary, reference_summary)[0]
        rouge_scores_cls = rouge_scores[cls]

        # Add rouge score values
        rouge_scores_all_cls = rouge_scores_cls['rouge-1']
        rouge_scores_all_cls['f'].append(rouge_scores_all['rouge-1']['f'])
        rouge_scores_all_cls['p'].append(rouge_scores_all['rouge-1']['p'])
        rouge_scores_all_cls['r'].append(rouge_scores_all['rouge-1']['r'])

        # Add rouge score values for all classes
        rouge_scores_all_all = rouge_scores['all_classes']['rouge-1']
        rouge_scores_all_all['f'].append(rouge_scores_all['rouge-1']['f'])
        rouge_scores_all_all['p'].append(rouge_scores_all['rouge-1']['p'])
        rouge_scores_all_all['r'].append(rouge_scores_all['rouge-1']['r'])

    return rouge_scores


In [17]:
# Data
import pandas as pd

# Read Dataframe
df = pd.read_csv('../data/Result/reference_summaries_dataset.csv')

# Show Dataframe
df.head()

Unnamed: 0,Text,Class,Summary
0,I have heard nothing from the Ambassador about...,Political speech,I have heard nothing from the Ambassador about...
1,I think it is in the public interest to procee...,Political speech,I think it is in the public interest to procee...
2,The A-11 aircraft now at Edwards Air force Bas...,Political speech,"for example, one of the most important technic..."
3,It is one of the most comprehensive bills in t...,Political speech,It is one of the most comprehensive bills in t...
4,"So long as there remains a man without a job, ...",Political speech,No American conscience can be at peace while a...


In [18]:
# make sure all strings
df['Text'] = df['Text'].astype(str)
df['Summary'] = df['Summary'].astype(str)

In [19]:
rouge_scores = evaluate_rouge_scores(df)

# calculate ROUGE-1-Scores for all classes
print("ROUGE-1 Scores für alle Klassen:")
print(rouge_scores['all_classes']['rouge-1'])

# calculate ROUGE-1-Scores for each class
for cls, scores in rouge_scores.items():
    if cls != 'all_classes':
        print(f"ROUGE-1 Scores für Klasse {cls}:")
        print(scores['rouge-1'])

  self.explained_variance_ratio_ = exp_var / full_var


In [13]:
# calculate avg, max, min for all classes
print("ROUGE-1 Scores für alle Klassen:")
all_classes_scores = rouge_scores['all_classes']['rouge-1']
all_classes_f1_avg = sum(all_classes_scores['f']) / len(all_classes_scores['f'])
all_classes_p_avg = sum(all_classes_scores['p']) / len(all_classes_scores['p'])
all_classes_r_avg = sum(all_classes_scores['r']) / len(all_classes_scores['r'])
all_classes_f1_min = min(all_classes_scores['f'])
all_classes_p_min = min(all_classes_scores['p'])
all_classes_r_min = min(all_classes_scores['r'])
all_classes_f1_max = max(all_classes_scores['f'])
all_classes_p_max = max(all_classes_scores['p'])
all_classes_r_max = max(all_classes_scores['r'])
print(f"F1-Score Durchschnitt für alle Klassen: {all_classes_f1_avg:.2f} (Min: {all_classes_f1_min:.2f}, Max: {all_classes_f1_max:.2f})")
print(f"Precision Durchschnitt für alle Klassen: {all_classes_p_avg:.2f} (Min: {all_classes_p_min:.2f}, Max: {all_classes_p_max:.2f})")
print(f"Recall Durchschnitt für alle Klassen: {all_classes_r_avg:.2f} (Min: {all_classes_r_min:.2f}, Max: {all_classes_r_max:.2f})")

# calculate avg, max, min for each class
for cls, scores in rouge_scores.items():
    if cls != 'all_classes':
        print(f"ROUGE-1 Scores für Klasse {cls}:")
        class_scores = scores['rouge-1']
        class_f1_avg = sum(class_scores['f']) / len(class_scores['f'])
        class_p_avg = sum(class_scores['p']) / len(class_scores['p'])
        class_r_avg = sum(class_scores['r']) / len(class_scores['r'])
        class_f1_min = min(class_scores['f'])
        class_p_min = min(class_scores['p'])
        class_r_min = min(class_scores['r'])
        class_f1_max = max(class_scores['f'])
        class_p_max = max(class_scores['p'])
        class_r_max = max(class_scores['r'])
        print(f"F1-Score Durchschnitt für Klasse {cls}: {class_f1_avg:.2f} (Min: {class_f1_min:.2f}, Max: {class_f1_max:.2f})")
        print(f"Precision Durchschnitt für Klasse {cls}: {class_p_avg:.2f} (Min: {class_p_min:.2f}, Max: {class_p_max:.2f})")
        print(f"Recall Durchschnitt für Klasse {cls}: {class_r_avg:.2f} (Min: {class_r_min:.2f}, Max: {class_r_max:.2f})")


ROUGE-1 Scores für alle Klassen:
F1-Score Durchschnitt für alle Klassen: 0.30 (Min: 0.00, Max: 0.72)
Precision Durchschnitt für alle Klassen: 0.34 (Min: 0.00, Max: 0.82)
Recall Durchschnitt für alle Klassen: 0.29 (Min: 0.00, Max: 0.81)
ROUGE-1 Scores für Klasse Political speech:
F1-Score Durchschnitt für Klasse Political speech: 0.33 (Min: 0.00, Max: 0.58)
Precision Durchschnitt für Klasse Political speech: 0.37 (Min: 0.00, Max: 0.66)
Recall Durchschnitt für Klasse Political speech: 0.32 (Min: 0.00, Max: 0.54)
ROUGE-1 Scores für Klasse News:
F1-Score Durchschnitt für Klasse News: 0.25 (Min: 0.00, Max: 0.50)
Precision Durchschnitt für Klasse News: 0.29 (Min: 0.00, Max: 0.62)
Recall Durchschnitt für Klasse News: 0.24 (Min: 0.00, Max: 0.42)
ROUGE-1 Scores für Klasse Jurisdiction:
F1-Score Durchschnitt für Klasse Jurisdiction: 0.33 (Min: 0.05, Max: 0.52)
Precision Durchschnitt für Klasse Jurisdiction: 0.36 (Min: 0.04, Max: 0.67)
Recall Durchschnitt für Klasse Jurisdiction: 0.33 (Min: 0.06,