In [None]:
# https://spacy.io/usage
!python -m spacy download en_core_web_sm

sources used in this file: 
- https://iq.opengenus.org/latent-semantic-analysis-for-text-summarization/
- https://towardsdatascience.com/document-summarization-using-latent-semantic-indexing-b747ef2d2af6
- https://github.com/luisfredgs/LSA-Text-Summarization

In [2]:
import spacy

# Load Spacy model
nlp = spacy.load("en_core_web_sm")



In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

In [4]:
# Function for Tokenization, Remove stopwords, Lowercasing, Lemmatization, Remove punctuation
def preprocess_text(text):
    # Tokenization
    doc = nlp(text)
    # Source: https://stackoverflow.com/questions/64185831/am-i-missing-the-preprocessing-function-in-spacys-lemmatization
    # Remove stopwords, Lowercasing, Lemmatization, Remove punctuation
    preprocessed_tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]

    # Return the preprocessed tokens as text
    preprocessed_text = " ".join(preprocessed_tokens)
    return preprocessed_text

In [6]:
def lsa_summarizer(text, compression_rate):
    # Split text sentence
    sentences = sent_tokenize(text) # https://www.guru99.com/tokenize-words-sentences-nltk.html
    sort_dict = {}
    for i in range(len(sentences)):
        sort_dict[i] = sentences[i]

    preprocessed_sentences = []
    for sentence in sentences:
        preprocessed_sentences.append(preprocess_text(sentence))

    # Text Vectorization
    vectorizer = CountVectorizer()
    term_document_matrix = vectorizer.fit_transform(preprocessed_sentences)
    # LSA-Model
    num_components = max(int(len(preprocessed_sentences) * compression_rate), 1)
    lsa_model = TruncatedSVD(n_components=num_components)
    lsa_matrix = lsa_model.fit_transform(term_document_matrix)

    # Ranking sentences
    sentence_scores = lsa_matrix.sum(axis=1)
    print(sentence_scores)
    
    # Print the sentence scores
    ranking = sorted(range(len(sentence_scores)), key=lambda x: sentence_scores[x], reverse=True)

    ranking_texts = [sort_dict[index] for index in ranking]

    max_words = len(word_tokenize(text)) * (1-compression_rate)
    words=0
    chosen_texts = []
    for text in ranking_texts:
        if words<=max_words:
            sen_length = len(word_tokenize(text)) # https://www.guru99.com/tokenize-words-sentences-nltk.html
            words += sen_length
            chosen_texts.append(text)
    
    chosen_dict = {}
    for i in range (len(sort_dict)):
        for text in chosen_texts:
            if sort_dict[i] == text:
                chosen_dict[i] = text

    summary_sentences = chosen_dict.values()
    summary = ' '.join(summary_sentences)

    return summary

In [7]:
# Example text
text = '''The World Cup co-host looked to have earned itself a route back into the game midway through the second half when Jacqui Hand’s looping header floated over a despairing Olivia McDaniel in goal, but it was later ruled out by the video assistant referee (VAR) for offside.

The World Cup debutant was able to withstand New Zealand pressure, including a truly remarkable diving save from McDaniel in added time at the end of the game, to earn a historic victory, sparking scenes of jubilant celebrations.

“I literally can’t put it into words,” Bolden said afterwards. “This has been a dream of mine as a little kid to just be here at the World Cup, let alone even score.

“I couldn’t have done it without my teammates, the staff, the fans, the Philippines as a whole. It’s just amazing right now to feel this win and this energy in this stadium right now, so it’s just amazing.”
'''
summary = lsa_summarizer(text, 0.5)

print(summary)

[ 1.37899731e+00  7.06378646e+00 -3.99322995e-16  7.28730785e-01
 -5.06539796e-18  3.46410162e+00]
The World Cup co-host looked to have earned itself a route back into the game midway through the second half when Jacqui Hand’s looping header floated over a despairing Olivia McDaniel in goal, but it was later ruled out by the video assistant referee (VAR) for offside. The World Cup debutant was able to withstand New Zealand pressure, including a truly remarkable diving save from McDaniel in added time at the end of the game, to earn a historic victory, sparking scenes of jubilant celebrations. It’s just amazing right now to feel this win and this energy in this stadium right now, so it’s just amazing.”


In [23]:
from rouge import Rouge
import pandas as pd

def evaluate_rouge_scores(df):
    rouge = Rouge()

    rouge_scores = {
        'all_classes': {
            'rouge-1': {
                'f': [],
                'p': [],
                'r': []
            }
        }
    }

    # Get unique classes
    classes = df['Class'].unique()

    for cls in classes:
        rouge_scores[cls] = {
            'rouge-1': {
                'f': [],
                'p': [],
                'r': []
            }
        }

    for _, row in df.iterrows():
        text = row['Text']
        reference_summary = row['Summary']
        cls = row['Class']


        # Generate summary with compression rate
        compression_rate = len(reference_summary.split()) / len(text.split())
        print(f'compression_rate: {compression_rate}')
        summary = lsa_summarizer(text, compression_rate=compression_rate)

        # calculate rouge 1
        rouge_scores_all = rouge.get_scores(summary, reference_summary)[0]
        rouge_scores_cls = rouge_scores[cls]

        # Add rouge score values
        rouge_scores_all_cls = rouge_scores_cls['rouge-1']
        rouge_scores_all_cls['f'].append(rouge_scores_all['rouge-1']['f'])
        rouge_scores_all_cls['p'].append(rouge_scores_all['rouge-1']['p'])
        rouge_scores_all_cls['r'].append(rouge_scores_all['rouge-1']['r'])

        # Add rouge score values for all classes
        rouge_scores_all_all = rouge_scores['all_classes']['rouge-1']
        rouge_scores_all_all['f'].append(rouge_scores_all['rouge-1']['f'])
        rouge_scores_all_all['p'].append(rouge_scores_all['rouge-1']['p'])
        rouge_scores_all_all['r'].append(rouge_scores_all['rouge-1']['r'])

    return rouge_scores


In [24]:
# Data
import pandas as pd

# Read Dataframe
df = pd.read_csv('../data/Result/reference_summaries_dataset.csv')

# Show Dataframe
df.head()

Unnamed: 0,Text,Class,Summary
0,"In other words, the notion of a category of hy...",Jurisdiction,"In other words, the notion of a category of hy..."
1,(5) An offence the sentence for which is impos...,Jurisdiction,(2) If at the time when that offence was commi...
2,The Court of Justice rejected the contention. ...,Jurisdiction,It noted in the first place that the expressio...
3,In London Underground Ltd v Edwards (No 2) [19...,Jurisdiction,In London Underground Ltd v Edwards (No 2) [19...
4,"On the question of jury tampering (to which, m...",Jurisdiction,These are often difficult to detect and may ro...


In [25]:
# make sure all strings
df['Text'] = df['Text'].astype(str)
df['Summary'] = df['Summary'].astype(str)

In [26]:
rouge_scores = evaluate_rouge_scores(df)

# calculate ROUGE-1-Scores for all classes
print("ROUGE-1 Scores für alle Klassen:")
print(rouge_scores['all_classes']['rouge-1'])

# calculate ROUGE-1-Scores for each class
for cls, scores in rouge_scores.items():
    if cls != 'all_classes':
        print(f"ROUGE-1 Scores für Klasse {cls}:")
        print(scores['rouge-1'])

compression_rate: 0.6592
[ 5.54245038  6.28097801  6.82041718  6.2008054  -0.8391405   1.47472066
 -2.69980698  1.45583901 -0.35993879  3.80603498  2.24296279  5.41762243
  8.79005814  1.95766349  5.5390182 ]
compression_rate: 0.33086876155268025
[2.41258332 2.21119575 2.80281145 3.43268911 1.52852431 0.23978242
 0.64236739 9.19565383 6.67866908 1.06216226 8.83090489 4.86779308]
compression_rate: 0.6416938110749185
[ 7.75380853e-01  6.64086341e+00  2.63260158e-01  4.09295053e+00
  2.27393188e+00  3.95888078e-17 -1.53601793e+00 -4.42296465e+00
 -2.50088665e+00  1.54992125e+00  3.55281845e-17  3.27678045e-01
  2.95942846e+00  3.20147380e+00  5.12527667e-01  3.30563950e-01
  7.94134946e-02  2.27278885e+00  1.20666131e+01 -3.72728614e+00
  2.87320303e+00  3.21530621e+00]
compression_rate: 0.5772357723577236
[ 3.22884978e+00  8.20515754e+00  1.79445250e+00  6.99936639e-01
  3.00224883e+00  7.82755604e+00 -1.84751764e-03 -3.77573207e-01
 -9.66235846e-01 -7.75634783e-02 -6.49268047e-01  2.274

In [27]:
# calculate avg, max, min for all classes
print("ROUGE-1 Scores für alle Klassen:")
all_classes_scores = rouge_scores['all_classes']['rouge-1']
all_classes_f1_avg = sum(all_classes_scores['f']) / len(all_classes_scores['f'])
all_classes_p_avg = sum(all_classes_scores['p']) / len(all_classes_scores['p'])
all_classes_r_avg = sum(all_classes_scores['r']) / len(all_classes_scores['r'])
all_classes_f1_min = min(all_classes_scores['f'])
all_classes_p_min = min(all_classes_scores['p'])
all_classes_r_min = min(all_classes_scores['r'])
all_classes_f1_max = max(all_classes_scores['f'])
all_classes_p_max = max(all_classes_scores['p'])
all_classes_r_max = max(all_classes_scores['r'])
print(f"F1-Score Durchschnitt für alle Klassen: {all_classes_f1_avg:.2f} (Min: {all_classes_f1_min:.2f}, Max: {all_classes_f1_max:.2f})")
print(f"Precision Durchschnitt für alle Klassen: {all_classes_p_avg:.2f} (Min: {all_classes_p_min:.2f}, Max: {all_classes_p_max:.2f})")
print(f"Recall Durchschnitt für alle Klassen: {all_classes_r_avg:.2f} (Min: {all_classes_r_min:.2f}, Max: {all_classes_r_max:.2f})")

# calculate avg, max, min for each class
for cls, scores in rouge_scores.items():
    if cls != 'all_classes':
        print(f"ROUGE-1 Scores für Klasse {cls}:")
        class_scores = scores['rouge-1']
        class_f1_avg = sum(class_scores['f']) / len(class_scores['f'])
        class_p_avg = sum(class_scores['p']) / len(class_scores['p'])
        class_r_avg = sum(class_scores['r']) / len(class_scores['r'])
        class_f1_min = min(class_scores['f'])
        class_p_min = min(class_scores['p'])
        class_r_min = min(class_scores['r'])
        class_f1_max = max(class_scores['f'])
        class_p_max = max(class_scores['p'])
        class_r_max = max(class_scores['r'])
        print(f"F1-Score Durchschnitt für Klasse {cls}: {class_f1_avg:.2f} (Min: {class_f1_min:.2f}, Max: {class_f1_max:.2f})")
        print(f"Precision Durchschnitt für Klasse {cls}: {class_p_avg:.2f} (Min: {class_p_min:.2f}, Max: {class_p_max:.2f})")
        print(f"Recall Durchschnitt für Klasse {cls}: {class_r_avg:.2f} (Min: {class_r_min:.2f}, Max: {class_r_max:.2f})")


ROUGE-1 Scores für alle Klassen:
F1-Score Durchschnitt für alle Klassen: 0.55 (Min: 0.16, Max: 0.91)
Precision Durchschnitt für alle Klassen: 0.71 (Min: 0.16, Max: 1.00)
Recall Durchschnitt für alle Klassen: 0.59 (Min: 0.09, Max: 1.00)
ROUGE-1 Scores für Klasse Jurisdiction:
F1-Score Durchschnitt für Klasse Jurisdiction: 0.59 (Min: 0.29, Max: 0.91)
Precision Durchschnitt für Klasse Jurisdiction: 0.70 (Min: 0.24, Max: 1.00)
Recall Durchschnitt für Klasse Jurisdiction: 0.65 (Min: 0.17, Max: 1.00)
ROUGE-1 Scores für Klasse Political speech:
F1-Score Durchschnitt für Klasse Political speech: 0.55 (Min: 0.17, Max: 0.81)
Precision Durchschnitt für Klasse Political speech: 0.71 (Min: 0.23, Max: 1.00)
Recall Durchschnitt für Klasse Political speech: 0.59 (Min: 0.09, Max: 1.00)
ROUGE-1 Scores für Klasse Literature:
F1-Score Durchschnitt für Klasse Literature: 0.55 (Min: 0.16, Max: 0.83)
Precision Durchschnitt für Klasse Literature: 0.72 (Min: 0.24, Max: 1.00)
Recall Durchschnitt für Klasse Lite