In [1]:
import numpy as np
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
from utils import calculate_scores, sum_metrices
from datasets import load_dataset

  from tqdm.autonotebook import tqdm, trange


# Load data

In [3]:
# Fetch the CNN/DailyMail training split (config "3.0.0") and keep only its
# first ten records (presumably to keep the demo run small), then expose them
# as a pandas DataFrame for the rest of the notebook.
train_split = load_dataset("cnn_dailymail", "3.0.0", split="train")
dataset = train_split.select(range(10))
data = pd.DataFrame(dataset)

# GUSUM summarizer

In [4]:
def preprocess_text(df):
    """
    Split every article into sentences.

    Applies ``sent_tokenize`` to each value of ``df['article']`` and stores the
    resulting lists in a new ``'sentences'`` column. The frame is modified in
    place and the same object is returned.
    """
    sentence_lists = df['article'].apply(sent_tokenize)
    df['sentences'] = sentence_lists
    return df

def embed_sentences(row, model):
    """
    Encode the sentences of one row with ``model``.

    ``row['sentences']`` is passed to ``model.encode`` with
    ``convert_to_tensor=True``; the returned tensor is moved to the CPU and
    converted with ``.numpy()``.
    Presumably the result is one embedding vector per sentence — confirm
    against the model in use.
    """
    sentence_tensor = model.encode(row['sentences'], convert_to_tensor=True)
    host_tensor = sentence_tensor.cpu()
    return host_tensor.numpy()

def compute_sentence_features(sentences):
    """
    Compute one combined feature score per sentence.

    Each score is the sum of four terms:

    1. Length: number of tokens in the sentence divided by the number of
       tokens in the longest sentence of the document.
    2. Position: 1 for the first and the last sentence (these are typically
       important); otherwise ``(N - idx) / N`` where ``N`` is the number of
       sentences and ``idx`` the 0-based sentence index.
    3. Proper nouns: fraction of tokens whose first character is upper case
       (a capitalisation heuristic, so sentence-initial words count too).
    4. Numerical tokens: fraction of tokens for which ``str.isdigit()`` is
       true.

    Args:
        sentences: sequence of sentence strings belonging to one document.

    Returns:
        A list with one score per sentence, in input order. An empty input
        yields an empty list.
    """
    total = len(sentences)
    if total == 0:
        # max() over an empty sequence would raise; there is nothing to score.
        return []

    # Tokenise each sentence exactly once and reuse the tokens for all features.
    tokenized = [word_tokenize(sentence) for sentence in sentences]
    max_length = max(len(words) for words in tokenized)
    last_idx = total - 1

    features = []
    for idx, words in enumerate(tokenized):
        n_words = len(words)

        # (1) Guard against a document whose sentences all tokenise to nothing.
        length_score = n_words / max_length if max_length else 0

        # (2) First and last sentences get the maximum position score.
        if idx == 0 or idx == last_idx:
            position_score = 1
        else:
            position_score = (total - idx) / total

        if n_words:
            # (3) word[:1] avoids an IndexError on an empty token.
            proper_nouns = sum(1 for word in words if word[:1].isupper())
            proper_noun_score = proper_nouns / n_words
            # (4)
            numerical_tokens = sum(1 for word in words if word.isdigit())
            numerical_token_score = numerical_tokens / n_words
        else:
            proper_noun_score = 0
            numerical_token_score = 0

        feature_score = length_score + position_score + proper_noun_score + numerical_token_score
        features.append(feature_score)
    return features

def create_sentence_graph(embeddings):
    """
    Build a symmetric sentence-similarity matrix.

    Entry ``[i, j]`` (for ``i != j``) is the cosine similarity of
    ``embeddings[i]`` and ``embeddings[j]``, computed as one minus SciPy's
    cosine distance. The diagonal stays at zero, so a sentence contributes no
    similarity with itself.

    Returns:
        A ``(n, n)`` NumPy array where ``n = len(embeddings)``.
    """
    size = len(embeddings)
    adjacency = np.zeros((size, size))
    # Walk the strict upper triangle once and mirror each weight.
    upper_rows, upper_cols = np.triu_indices(size, k=1)
    for row, col in zip(upper_rows, upper_cols):
        weight = 1 - cosine(embeddings[row], embeddings[col])
        adjacency[row, col] = weight
        adjacency[col, row] = weight
    return adjacency

def rank_sentences(sentences, graph, features):
    """
    Order sentences from most to least important.

    A sentence's centrality is the sum of its row in ``graph``; its rank value
    is that centrality multiplied by its entry in ``features``. The sentences
    are returned sorted by rank value, highest first.
    """
    centrality = np.sum(graph, axis=1)

    weighted = []
    for position in range(len(sentences)):
        weighted.append(features[position] * centrality[position])

    # argsort is ascending, so reverse it to get the best sentence first.
    best_first = np.argsort(weighted)[::-1]

    ordered = []
    for position in best_first:
        ordered.append(sentences[position])
    return ordered

def extract_summary(ranked_sentences, k=3):
    """
    Build the summary text from the first ``k`` ranked sentences.

    The sentences are kept in the order given and joined with single spaces.
    """
    leading = ranked_sentences[:k]
    return ' '.join(leading)

In [5]:
# Step 1: sentence-split every article. preprocess_text adds the 'sentences'
# column to `data` in place, so `train_data` and `data` are the same frame.
train_data = preprocess_text(data)

# Step 2: embed each row's sentences with a pretrained sentence-transformer.
model = SentenceTransformer('all-MiniLM-L6-v2')
data['embeddings'] = data.apply(embed_sentences, axis=1, model=model)

# Step 3: per-sentence feature scores.
data['sentence_features'] = data['sentences'].apply(compute_sentence_features)

# Step 4: pairwise similarity graph built from the embeddings.
data['graph'] = data['embeddings'].apply(create_sentence_graph)


# Step 5: rank the sentences of each row by centrality times feature score.
def _rank_row(row):
    """Rank one row's sentences from its graph and feature columns."""
    return rank_sentences(row['sentences'], row['graph'], row['sentence_features'])


data['ranked_sentences'] = data.apply(_rank_row, axis=1)

# Step 6: the three top-ranked sentences form the summary.
data['gusum_summary'] = data['ranked_sentences'].apply(extract_summary, k=3)

# Calculate metrics

In [8]:
# Score every generated summary ('gusum_summary') against its reference ('highlights').
# NOTE(review): calculate_scores lives in the local utils module; the displayed frame
# has a 'rouge_scores' column, so it presumably stores per-row ROUGE results there — confirm.
df = calculate_scores(data, 'gusum_summary', 'highlights')
# Aggregate the 'rouge_scores' column into mean scores.
# NOTE(review): 'Results' and 'GUMSUM.csv' look like an output directory and file name —
# confirm against utils.sum_metrices ("GUMSUM" may be a typo for "GUSUM").
mean_scores = sum_metrices(df, 'rouge_scores', 'Results', 'GUMSUM.csv')

In [7]:
# Inspect the working frame, including the intermediate columns added by the pipeline cells.
data

Unnamed: 0,article,highlights,id,sentences,embeddings,sentence_features,graph,ranked_sentences,gusum_summary,rouge_scores
0,"LONDON, England (Reuters) -- Harry Potter star...",Harry Potter star Daniel Radcliffe gets £20M f...,42c027e4ff9730fbb3de84c1af0d2c506e41c3e4,"[LONDON, England (Reuters) -- Harry Potter sta...","[[0.011218078, 0.05443934, 0.008227166, -0.032...","[2.1374999999999997, 2.1458333333333335, 1.863...","[[0.0, 0.6444466710090637, 0.2955407500267029,...","[Daniel Radcliffe as Harry Potter in ""Harry Po...","Daniel Radcliffe as Harry Potter in ""Harry Pot...","{'rouge1': (0.6923076923076923, 0.226890756302..."
1,Editor's note: In our Behind the Scenes series...,Mentally ill inmates in Miami are housed on th...,ee8871b15c50d0db17b0179a6d2beab35065f1e9,[Editor's note: In our Behind the Scenes serie...,"[[-0.047513135, -0.004193974, 0.014303041, 0.1...","[1.8130921619293712, 2.0129596501689524, 1.722...","[[0.0, 0.09076032042503357, 0.1470263600349426...","[Here, Soledad O'Brien takes users inside a ja...","Here, Soledad O'Brien takes users inside a jai...","{'rouge1': (0.3877551020408163, 0.2, 0.2638888..."
2,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...","NEW: ""I thought I was going to die,"" driver sa...",06352019a19ae31e527f37f7571c6dd7f0c5da37,"[MINNEAPOLIS, Minnesota (CNN) -- Drivers who w...","[[0.027638346, 0.05944389, 0.06226873, 0.07558...","[1.6683716965046889, 1.748725296570241, 1.2657...","[[0.0, 0.5616572499275208, 0.33688774704933167...","[""So I stayed in my car until the cars quit fa...","""So I stayed in my car until the cars quit fal...","{'rouge1': (0.36585365853658536, 0.16853932584..."
3,WASHINGTON (CNN) -- Doctors removed five small...,"Five small polyps found during procedure; ""non...",24521a2abb2e1f5e34e6824e0f9e56904a2b0e88,[WASHINGTON (CNN) -- Doctors removed five smal...,"[[0.02162505, 0.066371895, 0.10687812, -0.0043...","[2.1191756272401436, 2.041666666666667, 1.2777...","[[0.0, 0.6739037036895752, 0.1325961947441101,...",[WASHINGTON (CNN) -- Doctors removed five smal...,WASHINGTON (CNN) -- Doctors removed five small...,"{'rouge1': (0.4166666666666667, 0.129870129870..."
4,(CNN) -- The National Football League has ind...,"NEW: NFL chief, Atlanta Falcons owner critical...",7fe70cc8b12fab2d0a258fababf7d9c6b5e1262a,[(CNN) -- The National Football League has in...,"[[-0.0826094, 0.04766336, -0.02958666, 0.04218...","[1.894419306184012, 1.5468883205456094, 1.2751...","[[0.0, 0.553372323513031, 0.03839101642370224,...",[Vick said he would plead guilty to one count ...,Vick said he would plead guilty to one count o...,"{'rouge1': (0.34146341463414637, 0.15555555555..."
5,"BAGHDAD, Iraq (CNN) -- Dressed in a Superman s...","Parents beam with pride, can't stop from smili...",a1ebb8bb4d370a1fdf28769206d572be60642d70,"[BAGHDAD, Iraq (CNN) -- Dressed in a Superman ...","[[0.014583753, 0.06445515, -0.046164952, -0.00...","[1.797113752122241, 1.4475209080047788, 1.3290...","[[0.0, 0.23196357488632202, 0.5103440284729004...","[Nearby, his parents talked about the new futu...","Nearby, his parents talked about the new futur...","{'rouge1': (0.3023255813953488, 0.116071428571..."
6,"BAGHDAD, Iraq (CNN) -- The women are too afrai...","Aid workers: Violence, increased cost of livin...",7c0e61ac829a3b3b653e2e3e7536cc4881d1f264,"[BAGHDAD, Iraq (CNN) -- The women are too afra...","[[0.08501559, -0.03930717, 0.0004374161, 0.081...","[1.703478260869565, 1.6421626712849027, 1.3796...","[[0.0, 0.05817382037639618, 0.1769544035196304...",[I have to do anything that I can to preserve ...,I have to do anything that I can to preserve m...,"{'rouge1': (0.3829787234042553, 0.189473684210..."
7,"BOGOTA, Colombia (CNN) -- A key rebel commande...",Tomas Medina Caracas was a fugitive from a U.S...,f0d73bdab711763e745cdc75850861c9018f235d,"[BOGOTA, Colombia (CNN) -- A key rebel command...","[[0.018569957, -0.057575922, -0.078808166, -0....","[1.9754055468341183, 1.642857142857143, 2.0567...","[[0.0, 0.49595144391059875, 0.5196391344070435...","[Tomas Medina Caracas, known popularly as ""El ...","Tomas Medina Caracas, known popularly as ""El N...","{'rouge1': (0.8333333333333334, 0.402298850574..."
8,WASHINGTON (CNN) -- White House press secretar...,"President Bush says Tony Snow ""will battle can...",5e22bbfc7232418b8d2dd646b952e404df5bd048,[WASHINGTON (CNN) -- White House press secreta...,"[[-0.030546034, 0.028852273, 0.075175665, 0.00...","[2.302325581395349, 1.7167899929527837, 1.5614...","[[0.0, 0.8266218304634094, 0.5004574656486511,...",[WASHINGTON (CNN) -- White House press secreta...,WASHINGTON (CNN) -- White House press secretar...,"{'rouge1': (0.6363636363636364, 0.295774647887..."
9,(CNN) -- Police and FBI agents are investigati...,Empty anti-tank weapon turns up in front of Ne...,613d6311ec2c1985bd44707d1796d275452fe156,[(CNN) -- Police and FBI agents are investigat...,"[[-0.0131788235, 0.06419857, -0.013403098, -0....","[2.1679197994987467, 1.6927380952380953, 1.963...","[[0.0, 0.529964804649353, 0.43643730878829956,...",[(CNN) -- Police and FBI agents are investigat...,(CNN) -- Police and FBI agents are investigati...,"{'rouge1': (0.5, 0.15217391304347827, 0.233333..."


In [10]:
# Show the aggregated scores returned by sum_metrices.
mean_scores

{'rouge1': {'precision': 0.4859047808682481,
  'recall': 0.2036648291858373,
  'fmeasure': 0.28492687398909694},
 'rouge2': {'precision': 0.18632431183266282,
  'recall': 0.07537800003619965,
  'fmeasure': 0.10639779907908524},
 'rougeL': {'precision': 0.3235557096090743,
  'recall': 0.1326671774145243,
  'fmeasure': 0.18667836754501602},
 'rougeLsum': {'precision': 0.3947209616201103,
  'recall': 0.1620626518440607,
  'fmeasure': 0.22799460334018748}}