In [3]:
import numpy as np
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
from utils import calculate_scores, sum_metrices, calculate_mean
from datasets import load_dataset
import os

  from tqdm.autonotebook import tqdm, trange


# Load data

In [5]:
# Number of leading training articles kept for this experiment.
NUM_ARTICLES = 10000

# Load the CNN/DailyMail (config "3.0.0") training split, keep the first
# NUM_ARTICLES rows, and materialise them as a pandas DataFrame.
cnn_dm_train = load_dataset("cnn_dailymail", "3.0.0", split="train")
dataset = cnn_dm_train.select(range(NUM_ARTICLES))
data = pd.DataFrame(dataset)

# GUSUM summarizer

In [7]:
def preprocess_text(df):
    """
    Split every article into sentences.

    Adds a 'sentences' column to ``df`` (the frame passed in is modified in
    place) holding the result of ``sent_tokenize`` applied to each value of
    the 'article' column, then returns that same frame.
    """
    articles = df['article']
    sentence_lists = articles.apply(sent_tokenize)
    df['sentences'] = sentence_lists
    return df

def embed_sentences(row, model):
    """
    Encode the sentences of one row into a NumPy array.

    ``row['sentences']`` is passed to ``model.encode`` with
    ``convert_to_tensor=True``; the returned tensor is moved to the CPU and
    converted with ``.numpy()``.
    """
    sentence_list = row['sentences']
    encoded = model.encode(sentence_list, convert_to_tensor=True)
    on_cpu = encoded.cpu()
    return on_cpu.numpy()

def compute_sentence_features(sentences, tokenizer=None):
    """
    Compute one surface-feature score per sentence.

    The score of sentence ``i`` is the sum of four features:
      (1) length:       number of words in the sentence divided by the number
                        of words in the longest sentence
      (2) position:     1 for the first and the last sentence, otherwise
                        (N - i) / N, where N is the number of sentences
      (3) proper nouns: fraction of words starting with an uppercase character
      (4) numerals:     fraction of words made up of digits only

    Parameters
    ----------
    sentences : sequence of str
        Sentences of a single document, in document order.
    tokenizer : callable, optional
        Function mapping a sentence to a list of word tokens. Defaults to
        ``word_tokenize``.

    Returns
    -------
    list
        One score per sentence, in input order. Empty input gives ``[]``.
    """
    if tokenizer is None:
        tokenizer = word_tokenize

    # Tokenize each sentence exactly once and reuse the tokens below.
    tokenized = [tokenizer(sentence) for sentence in sentences]
    if not tokenized:
        # max() would raise on an empty document (e.g. an empty article).
        return []

    num_sentences = len(tokenized)
    max_length = max(len(words) for words in tokenized)

    features = []
    for idx, words in enumerate(tokenized):
        num_words = len(words)

        # (1) Guard against a document whose sentences all tokenize to nothing.
        length_score = num_words / max_length if max_length else 0

        # (2) The first and the last sentence of a document are typically important.
        if idx == 0 or idx == num_sentences - 1:
            position_score = 1
        else:
            position_score = (num_sentences - idx) / num_sentences

        # (3) word[:1] instead of word[0] so an empty token cannot raise.
        proper_nouns = sum(1 for word in words if word[:1].isupper())
        proper_noun_score = proper_nouns / num_words if words else 0

        # (4)
        numerical_tokens = sum(1 for word in words if word.isdigit())
        numerical_token_score = numerical_tokens / num_words if words else 0

        feature_score = length_score + position_score + proper_noun_score + numerical_token_score
        features.append(feature_score)
    return features

def create_sentence_graph(embeddings):
    """
    Build the pairwise cosine-similarity graph of a document's sentences.

    Parameters
    ----------
    embeddings : array-like
        One embedding vector per sentence (``len(embeddings)`` sentences).

    Returns
    -------
    numpy.ndarray
        Symmetric float64 matrix of shape (num_sentences, num_sentences).
        Entry [i, j] is the cosine similarity of sentences i and j; the
        diagonal is 0 (no self-loops). Empty input gives a (0, 0) matrix.

    Notes
    -----
    All similarities come from a single matrix product of the row-normalised
    embeddings instead of a Python loop over every sentence pair. A sentence
    whose embedding has zero norm gets similarity 0 to every other sentence
    rather than NaN, so it cannot corrupt row sums or sorting downstream.
    """
    num_sentences = len(embeddings)
    if num_sentences == 0:
        return np.zeros((0, 0))

    vectors = np.asarray(embeddings, dtype=np.float64).reshape(num_sentences, -1)

    # Row-normalise; dividing zero rows by 1 keeps them at zero.
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    unit_vectors = vectors / np.where(norms > 0, norms, 1.0)

    # Clip to absorb floating-point rounding just outside [-1, 1].
    graph = np.clip(unit_vectors @ unit_vectors.T, -1.0, 1.0)
    np.fill_diagonal(graph, 0.0)
    return graph

def rank_sentences(sentences, graph, features):
    """
    Order sentences from highest to lowest rank.

    The rank of sentence ``i`` is ``features[i]`` multiplied by the sum of
    row ``i`` of ``graph``. Returns the sentences re-ordered by descending
    rank.
    """
    # Row sums of the graph serve as the per-sentence centrality.
    row_totals = np.sum(graph, axis=1)

    combined = []
    for pos in range(len(sentences)):
        combined.append(features[pos] * row_totals[pos])

    # argsort is ascending, so walk it backwards for highest rank first.
    ascending = np.argsort(combined)
    return [sentences[pos] for pos in ascending[::-1]]

def extract_summary(ranked_sentences, k=3):
    """Join the first ``k`` ranked sentences into one space-separated string."""
    top_sentences = ranked_sentences[:k]
    separator = ' '
    return separator.join(top_sentences)

In [None]:
# Step 1 - sentence-split every article. `preprocess_text` adds the
# 'sentences' column to `data` itself and returns that same frame.
train_data = preprocess_text(data)

# Step 2 - encode the sentences of each row with a pretrained model.
# Extra keyword arguments given to `apply` are forwarded to the function.
model = SentenceTransformer('all-MiniLM-L6-v2')
data['embeddings'] = data.apply(embed_sentences, axis=1, model=model)

# Step 3 - per-sentence feature scores.
sentence_column = data['sentences']
data['sentence_features'] = sentence_column.apply(compute_sentence_features)

# Step 4 - pairwise sentence-similarity graph per article.
embedding_column = data['embeddings']
data['graph'] = embedding_column.apply(create_sentence_graph)

# Step 5 - rank sentences by centrality combined with the feature scores.
def _rank_row(row):
    return rank_sentences(row['sentences'], row['graph'], row['sentence_features'])

data['ranked_sentences'] = data.apply(_rank_row, axis=1)

# Step 6 - keep the three top-ranked sentences as the summary.
ranked_column = data['ranked_sentences']
data['gusum_summary'] = ranked_column.apply(extract_summary, k=3)

# Calculate metrics

In [24]:
# Score each generated summary ('gusum_summary') against its reference
# ('highlights'); presumably ROUGE, given the 'rouge_scores' column shown in
# the output below - confirm against utils.calculate_scores.
df = calculate_scores(data, 'gusum_summary', 'highlights')

# Persist the scored frame. exist_ok=True creates the folder when it is
# missing and is a no-op when it already exists, so no separate existence
# check (with its check-then-create race) is needed.
directory = 'Results_df'
os.makedirs(directory, exist_ok=True)
df.to_csv(os.path.join(directory, 'GUSUM_3.csv'))

# NOTE(review): a later cell displays `mean_scores`, but the only visible
# assignment is the disabled call below, so that cell fails on a fresh kernel.
# Confirm the intended arguments and output file before re-enabling it.
#mean_scores = sum_metrices(df, 'rouge_scores', 'Results', 'GUMSUM_l1.csv')

In [25]:
# Preview the first five rows of the scored frame.
df.head(n=5)

Unnamed: 0,article,highlights,id,sentences,embeddings,sentence_features,graph,ranked_sentences,gusum_summary,rouge_scores
0,"LONDON, England (Reuters) -- Harry Potter star...",Harry Potter star Daniel Radcliffe gets £20M f...,42c027e4ff9730fbb3de84c1af0d2c506e41c3e4,"[LONDON, England (Reuters) -- Harry Potter sta...","[[0.011218078, 0.05443934, 0.008227166, -0.032...","[2.1374999999999997, 2.1458333333333335, 1.863...","[[0.0, 0.6444466710090637, 0.2955407500267029,...","[Daniel Radcliffe as Harry Potter in ""Harry Po...","Daniel Radcliffe as Harry Potter in ""Harry Pot...","{'rouge1': (0.48717948717948717, 0.44186046511..."
1,Editor's note: In our Behind the Scenes series...,Mentally ill inmates in Miami are housed on th...,ee8871b15c50d0db17b0179a6d2beab35065f1e9,[Editor's note: In our Behind the Scenes serie...,"[[-0.047513135, -0.004193974, 0.014303041, 0.1...","[1.8130921619293712, 2.0129596501689524, 1.722...","[[0.0, 0.09076032042503357, 0.1470263600349426...","[Here, Soledad O'Brien takes users inside a ja...","Here, Soledad O'Brien takes users inside a jai...","{'rouge1': (0.30612244897959184, 0.42857142857..."
2,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...","NEW: ""I thought I was going to die,"" driver sa...",06352019a19ae31e527f37f7571c6dd7f0c5da37,"[MINNEAPOLIS, Minnesota (CNN) -- Drivers who w...","[[0.027638346, 0.05944389, 0.06226873, 0.07558...","[1.6683716965046889, 1.748725296570241, 1.2657...","[[0.0, 0.5616572499275208, 0.33688774704933167...","[""So I stayed in my car until the cars quit fa...","""So I stayed in my car until the cars quit fal...","{'rouge1': (0.24390243902439024, 0.22222222222..."
3,WASHINGTON (CNN) -- Doctors removed five small...,"Five small polyps found during procedure; ""non...",24521a2abb2e1f5e34e6824e0f9e56904a2b0e88,[WASHINGTON (CNN) -- Doctors removed five smal...,"[[0.02162505, 0.066371895, 0.10687812, -0.0043...","[2.1191756272401436, 2.041666666666667, 1.2777...","[[0.0, 0.6739037036895752, 0.1325961947441101,...",[WASHINGTON (CNN) -- Doctors removed five smal...,WASHINGTON (CNN) -- Doctors removed five small...,"{'rouge1': (0.3333333333333333, 0.347826086956..."
4,(CNN) -- The National Football League has ind...,"NEW: NFL chief, Atlanta Falcons owner critical...",7fe70cc8b12fab2d0a258fababf7d9c6b5e1262a,[(CNN) -- The National Football League has in...,"[[-0.0826094, 0.04766336, -0.02958666, 0.04218...","[1.894419306184012, 1.5468883205456094, 1.2751...","[[0.0, 0.553372323513031, 0.03839101642370224,...",[Vick said he would plead guilty to one count ...,Vick said he would plead guilty to one count o...,"{'rouge1': (0.17073170731707318, 0.15909090909..."


In [26]:
# NOTE(review): `mean_scores` is not assigned by any visible cell - the only
# candidate assignment (a `sum_metrices(...)` call in the metrics cell) is
# commented out - so this cell presumably relies on leftover kernel state and
# would raise NameError after Restart & Run All. TODO confirm how it is meant
# to be produced.
mean_scores

{'rouge1': {'precision': {'mean': 0.2659518170217687,
   'std': 0.11824426217869947},
  'recall': {'mean': 0.3372090696670768, 'std': 0.16030237450778773},
  'fmeasure': {'mean': 0.2889083373310017, 'std': 0.12359901807264481}},
 'rouge2': {'precision': {'mean': 0.09061609468753651,
   'std': 0.09404416351785494},
  'recall': {'mean': 0.12172200066865466, 'std': 0.13817986436778332},
  'fmeasure': {'mean': 0.1003756388720454, 'std': 0.10416603751310029}},
 'rougeL': {'precision': {'mean': 0.18490166033845368,
   'std': 0.09912902965118185},
  'recall': {'mean': 0.2375618156746447, 'std': 0.14138525712370906},
  'fmeasure': {'mean': 0.20158810137786126, 'std': 0.10603219972052907}},
 'rougeLsum': {'precision': {'mean': 0.22385536977225554,
   'std': 0.10884894186975048},
  'recall': {'mean': 0.28618626381131185, 'std': 0.153100041239931},
  'fmeasure': {'mean': 0.24380029371833808, 'std': 0.11593141830546562}}}