In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from gensim.models import Word2Vec
from scipy import spatial
from scipy import sparse
import networkx as nx
import re

In [10]:
import spacy

# Load Spacy model
nlp = spacy.load("en_core_web_sm")

Source: https://medium.com/data-science-in-your-pocket/text-summarization-using-textrank-in-nlp-4bce52c5b390

In [11]:
# Function for Tokenization, Remove stopwords, Lowercasing, Lemmatization, Remove punctuation
def preprocess_text(text):
    # Tokenization
    doc = nlp(text)
    # Source: https://stackoverflow.com/questions/64185831/am-i-missing-the-preprocessing-function-in-spacys-lemmatization
    # Remove stopwords, Lowercasing, Lemmatization, Remove punctuation
    preprocessed_tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]

    # Return the preprocessed tokens as text
    preprocessed_text = " ".join(preprocessed_tokens)
    return preprocessed_text

In [33]:
def textrank_summarizer(text, compression_rate):
    # Split text sentence
    sentences = sent_tokenize(text) # https://www.guru99.com/tokenize-words-sentences-nltk.html
    sort_dict = {}
    for i in range(len(sentences)):
        sort_dict[i] = sentences[i]

    preprocessed_sentences = []
    for sentence in sentences:
        preprocessed_sentences.append(preprocess_text(sentence))

    sentence_tokens = []
    for sentence in preprocessed_sentences:
        sentence_tokens.append(word_tokenize(sentence))

    w2v=Word2Vec(sentence_tokens,vector_size=1,min_count=1, epochs=1000)
    sentence_embeddings = [[w2v.wv[word][0] for word in words] for words in sentence_tokens]
    max_len=max([len(tokens) for tokens in sentence_tokens])
    sentence_embeddings=[np.pad(embedding,(0,max_len-len(embedding)),'constant') for embedding in sentence_embeddings]
    similarity_matrix = np.zeros([len(sentence_tokens), len(sentence_tokens)])
    for i,row_embedding in enumerate(sentence_embeddings):
        for j,column_embedding in enumerate(sentence_embeddings):
            similarity_matrix[i][j]=1-spatial.distance.cosine(row_embedding,column_embedding)
    nx_graph = nx.from_numpy_array(similarity_matrix)
    try:
        scores = nx.pagerank(nx_graph, max_iter=2000, tol=1e-6)
    except:
        return 'Error'
    sentence_scores = list(scores.values())
    
    # Print the sentence scores
    ranking = sorted(range(len(sentence_scores)), key=lambda x: sentence_scores[x], reverse=True)

    ranking_texts = [sort_dict[index] for index in ranking]

    max_words = len(word_tokenize(text)) * (1-compression_rate)
    words=0
    chosen_texts = []
    for text in ranking_texts:
        if words<=max_words:
            sen_length = len(word_tokenize(text)) # https://www.guru99.com/tokenize-words-sentences-nltk.html
            words += sen_length
            chosen_texts.append(text)
    
    chosen_dict = {}
    for i in range (len(sort_dict)):
        for text in chosen_texts:
            if sort_dict[i] == text:
                chosen_dict[i] = text

    summary_sentences = chosen_dict.values()
    summary = ' '.join(summary_sentences)

    return summary

In [35]:
# Example text
text = '''The World Cup co-host looked to have earned itself a route back into the game midway through the second half when Jacqui Hand’s looping header floated over a despairing Olivia McDaniel in goal, but it was later ruled out by the video assistant referee (VAR) for offside.

The World Cup debutant was able to withstand New Zealand pressure, including a truly remarkable diving save from McDaniel in added time at the end of the game, to earn a historic victory, sparking scenes of jubilant celebrations.

“I literally can’t put it into words,” Bolden said afterwards. “This has been a dream of mine as a little kid to just be here at the World Cup, let alone even score.

“I couldn’t have done it without my teammates, the staff, the fans, the Philippines as a whole. It’s just amazing right now to feel this win and this energy in this stadium right now, so it’s just amazing.”
'''

# Print the Number of words of the summary with the compression rate of 0.4(40%)
summary_0 = textrank_summarizer(text, 0.4)
print(len(word_tokenize(summary_0)))

# Print the Number of words of the summary with the compression rate of 0.7(70%)
summary_1 = textrank_summarizer(text, 0.7)
print(len(word_tokenize(summary_1)))

# The compression rate changes the length of the summary

136
78


In [24]:
from rouge import Rouge
import pandas as pd

def evaluate_rouge_scores(df):
    rouge = Rouge()

    rouge_scores = {
        'all_classes': {
            'rouge-1': {
                'f': [],
                'p': [],
                'r': []
            }
        }
    }

    # Get unique classes
    classes = df['Class'].unique()

    for cls in classes:
        rouge_scores[cls] = {
            'rouge-1': {
                'f': [],
                'p': [],
                'r': []
            }
        }

    for _, row in df.iterrows():
        text = row['Text']
        reference_summary = row['Summary']
        cls = row['Class']


        # Generate summary with compression rate
        compression_rate = len(reference_summary.split()) / len(text.split())
        print(f'compression_rate: {compression_rate}')
        summary = textrank_summarizer(text, compression_rate=compression_rate)

        # calculate rouge 1
        if summary != 'Error':
            rouge_scores_all = rouge.get_scores(summary, reference_summary)[0]
            rouge_scores_cls = rouge_scores[cls]
        else:
            continue

        # Add rouge score values
        rouge_scores_all_cls = rouge_scores_cls['rouge-1']
        rouge_scores_all_cls['f'].append(rouge_scores_all['rouge-1']['f'])
        rouge_scores_all_cls['p'].append(rouge_scores_all['rouge-1']['p'])
        rouge_scores_all_cls['r'].append(rouge_scores_all['rouge-1']['r'])

        # Add rouge score values for all classes
        rouge_scores_all_all = rouge_scores['all_classes']['rouge-1']
        rouge_scores_all_all['f'].append(rouge_scores_all['rouge-1']['f'])
        rouge_scores_all_all['p'].append(rouge_scores_all['rouge-1']['p'])
        rouge_scores_all_all['r'].append(rouge_scores_all['rouge-1']['r'])

    return rouge_scores


In [25]:
# Data
import pandas as pd

# Read Dataframe
df = pd.read_csv('../data/Result/reference_summaries_dataset.csv')

# Show Dataframe
df.head()

Unnamed: 0,Text,Class,Summary
0,"In other words, the notion of a category of hy...",Jurisdiction,"In other words, the notion of a category of hy..."
1,(5) An offence the sentence for which is impos...,Jurisdiction,Schedule 15 contained a very substantial list ...
2,The Court of Justice rejected the contention. ...,Jurisdiction,It noted in the first place that the expressio...
3,In London Underground Ltd v Edwards (No 2) [19...,Jurisdiction,In London Underground Ltd v Edwards (No 2) [19...
4,"On the question of jury tampering (to which, m...",Jurisdiction,"On the question of jury tampering (to which, m..."


In [26]:
# make sure all strings
df['Text'] = df['Text'].astype(str)
df['Summary'] = df['Summary'].astype(str)

In [27]:
rouge_scores = evaluate_rouge_scores(df)

# calculate ROUGE-1-Scores for all classes
print("ROUGE-1 Scores für alle Klassen:")
print(rouge_scores['all_classes']['rouge-1'])

# calculate ROUGE-1-Scores for each class
for cls, scores in rouge_scores.items():
    if cls != 'all_classes':
        print(f"ROUGE-1 Scores für Klasse {cls}:")
        print(scores['rouge-1'])

compression_rate: 0.6592
15
[1.         0.99102962 0.74328047 0.88193256 0.4084954  0.71746463
 0.80386686 0.87780148 0.74068278 0.65555388 0.58421248 0.80123061
 0.88360268 0.51949626 0.82038724]
compression_rate: 0.18853974121996303
12
[1.         0.8764962  0.70123887 0.53767198 0.54889226 0.67739755
 0.9842304  0.50909823 0.4843564  0.81800246 0.55232996 0.6470564 ]
compression_rate: 0.6416938110749185
22
[ 1.          0.44228774  0.8897385   0.68995512  0.54000926 -0.54150021
  0.47311547  0.6929701   0.82233065  0.64679891  0.54150021  0.63417786
  0.79833108  0.51902968  0.87493354  0.78302705  0.50398922  0.47952908
  0.24525282  0.32036301  0.31463596  0.76917857]
compression_rate: 0.7710027100271003
29
[1.         0.62417507 0.5440973  0.53995746 0.47878194 0.78454232
 0.30238724 0.62438738 0.5382899  0.54007751 0.53651673 0.47615835
 0.3020986  0.43235657 0.54849356 0.6370371  0.97575289 0.72012067
 0.62035149 0.47405359 0.92530811 0.393971   0.43600708 0.7896359
 0.5202176 

  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


18
[1.         0.41909617 0.36626101 0.85994655 0.77832311 0.86011159
 0.8912214  0.98677582 0.71338701 0.57089049 0.83552074 0.61797261
 0.93454331 0.52393794 0.96134567 0.9167006  0.86264926 0.76060361]
compression_rate: 0.5729166666666666
33
[1.         0.85859174 0.85431522 0.49521729 0.95658576 0.84897649
 0.74716514 0.76670581 0.50306177 0.67844909 0.45380768 0.72928625
 0.71894425 0.92006946 0.60457641 0.80759186 0.96145296 0.7317701
 0.75616193 0.86710489 0.82896024 0.91194081 0.5066281  0.50808334
 0.85417569 0.84506875 0.70259571 0.83197576 0.55942404 0.8917734
 0.60089195 0.50779885 0.83955353]
compression_rate: 0.8967495219885278
20
[1.         0.94566596 0.51893115 0.65467173 0.65474373 0.87436569
 0.84707552 0.55900282 0.5191884  0.77503622 0.80680662 0.8100453
 0.91268712 0.60449636 0.73749691 0.60948628 0.56037188 0.50136501
 0.69908935 0.96416736]
compression_rate: 0.4463840399002494
26
[1.         0.5303126  0.6805408  0.61198318 0.70207798 0.94716471
 0.85027194 0.64

  dist = 1.0 - uv / np.sqrt(uu * vv)


25
[1.         0.73493373 0.95523363 0.59292459 0.73645216 0.96697438
 0.90608543 0.86799467 0.73785913 0.88055187 0.67555946 0.79464287
 0.79098278 0.75085175 0.91634744 0.8380909  0.58052438 0.82797933
 0.91834456 0.82650733 0.86414188 0.6965872  0.47966871 0.77480799
 0.87135333]
compression_rate: 0.8476331360946746
18
[1.         0.67681175 0.37142494 0.56159461 0.83743477 0.84816462
 0.96731263 0.52027935 0.55409187 0.58267993 0.89567244 0.82627493
 0.28727773 0.70113111 0.44904098 0.93670148 0.44897956 0.69871545]
compression_rate: 0.657487091222031
18
[1.         0.57910913 0.82416409 0.58318466 0.44302672 0.60061103
 0.82993096 0.39428037 0.7933659  0.63922441 0.55766124 0.6271714
 0.64433885 0.43425447 0.61442775 0.58322847 0.47968826 0.22668293]
compression_rate: 0.19753086419753085
29
[1.         0.71519673 0.90792936 0.91033787 0.95148569 0.99376166
 0.90406889 0.74806637 0.62822735 0.62074423 0.62896347 0.92578763
 0.99572223 0.68155205 0.84806567 0.76674438 0.72134656 0.7

In [28]:
# calculate avg, max, min for all classes
print("ROUGE-1 Scores für alle Klassen:")
all_classes_scores = rouge_scores['all_classes']['rouge-1']
all_classes_f1_avg = sum(all_classes_scores['f']) / len(all_classes_scores['f'])
all_classes_p_avg = sum(all_classes_scores['p']) / len(all_classes_scores['p'])
all_classes_r_avg = sum(all_classes_scores['r']) / len(all_classes_scores['r'])
all_classes_f1_min = min(all_classes_scores['f'])
all_classes_p_min = min(all_classes_scores['p'])
all_classes_r_min = min(all_classes_scores['r'])
all_classes_f1_max = max(all_classes_scores['f'])
all_classes_p_max = max(all_classes_scores['p'])
all_classes_r_max = max(all_classes_scores['r'])
print(f"F1-Score Durchschnitt für alle Klassen: {all_classes_f1_avg:.2f} (Min: {all_classes_f1_min:.2f}, Max: {all_classes_f1_max:.2f})")
print(f"Precision Durchschnitt für alle Klassen: {all_classes_p_avg:.2f} (Min: {all_classes_p_min:.2f}, Max: {all_classes_p_max:.2f})")
print(f"Recall Durchschnitt für alle Klassen: {all_classes_r_avg:.2f} (Min: {all_classes_r_min:.2f}, Max: {all_classes_r_max:.2f})")

# calculate avg, max, min for each class
for cls, scores in rouge_scores.items():
    if cls != 'all_classes':
        print(f"ROUGE-1 Scores für Klasse {cls}:")
        class_scores = scores['rouge-1']
        class_f1_avg = sum(class_scores['f']) / len(class_scores['f'])
        class_p_avg = sum(class_scores['p']) / len(class_scores['p'])
        class_r_avg = sum(class_scores['r']) / len(class_scores['r'])
        class_f1_min = min(class_scores['f'])
        class_p_min = min(class_scores['p'])
        class_r_min = min(class_scores['r'])
        class_f1_max = max(class_scores['f'])
        class_p_max = max(class_scores['p'])
        class_r_max = max(class_scores['r'])
        print(f"F1-Score Durchschnitt für Klasse {cls}: {class_f1_avg:.2f} (Min: {class_f1_min:.2f}, Max: {class_f1_max:.2f})")
        print(f"Precision Durchschnitt für Klasse {cls}: {class_p_avg:.2f} (Min: {class_p_min:.2f}, Max: {class_p_max:.2f})")
        print(f"Recall Durchschnitt für Klasse {cls}: {class_r_avg:.2f} (Min: {class_r_min:.2f}, Max: {class_r_max:.2f})")

ROUGE-1 Scores für alle Klassen:
F1-Score Durchschnitt für alle Klassen: 0.49 (Min: 0.12, Max: 0.79)
Precision Durchschnitt für alle Klassen: 0.64 (Min: 0.14, Max: 1.00)
Recall Durchschnitt für alle Klassen: 0.52 (Min: 0.06, Max: 1.00)
ROUGE-1 Scores für Klasse Jurisdiction:
F1-Score Durchschnitt für Klasse Jurisdiction: 0.50 (Min: 0.16, Max: 0.79)
Precision Durchschnitt für Klasse Jurisdiction: 0.69 (Min: 0.21, Max: 1.00)
Recall Durchschnitt für Klasse Jurisdiction: 0.50 (Min: 0.09, Max: 1.00)
ROUGE-1 Scores für Klasse Political speech:
F1-Score Durchschnitt für Klasse Political speech: 0.49 (Min: 0.12, Max: 0.79)
Precision Durchschnitt für Klasse Political speech: 0.65 (Min: 0.23, Max: 1.00)
Recall Durchschnitt für Klasse Political speech: 0.53 (Min: 0.06, Max: 1.00)
ROUGE-1 Scores für Klasse Literature:
F1-Score Durchschnitt für Klasse Literature: 0.47 (Min: 0.15, Max: 0.70)
Precision Durchschnitt für Klasse Literature: 0.65 (Min: 0.19, Max: 1.00)
Recall Durchschnitt für Klasse Lite