In [None]:
import numpy as np
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
from utils import calculate_scores, sum_metrices, calculate_mean
from datasets import load_dataset
import os

# Load data

In [None]:
# Fetch the "train" split of cnn_dailymail (config "3.0.0"), keep only its
# first 1000 rows, and materialise them as a pandas DataFrame.
dataset = load_dataset(
    "cnn_dailymail",
    "3.0.0",
    split="train",
).select(range(1000))
data = pd.DataFrame(dataset)

# GUSUM summarizer

In [3]:
def preprocess_text(df):
    """
    Add a 'sentences' column holding the sentence-tokenized 'article' text.

    The frame is modified in place and the same object is returned.
    """
    sentence_lists = df['article'].apply(sent_tokenize)
    df['sentences'] = sentence_lists
    return df

def embed_sentences(row, model):
    """
    Encode the sentences stored in ``row['sentences']`` with ``model``.

    The model is asked for a tensor, which is then moved to the CPU and
    returned as whatever its ``.numpy()`` method produces.
    """
    sentence_list = row['sentences']
    encoded = model.encode(sentence_list, convert_to_tensor=True)
    host_tensor = encoded.cpu()
    return host_tensor.numpy()

def compute_sentence_features(sentences):
    """
    Compute one combined feature score per sentence of a single document.

    The score is the unweighted sum of four features:
      (1) length: number of tokens in the sentence divided by the number of
          tokens in the longest sentence;
      (2) position: 1 for the first and the last sentence (typically the
          important ones), otherwise (n - idx) / n so earlier sentences
          score higher;
      (3) proper nouns: fraction of tokens whose first character is
          upper-case (a capitalisation heuristic, so sentence-initial
          words are counted too);
      (4) numerical tokens: fraction of tokens for which ``str.isdigit()``
          is true.

    Args:
        sentences: sentence strings of one document, in document order.

    Returns:
        A list with one numeric score per sentence, in the same order.
        An empty input yields an empty list.
    """
    # Tokenize every sentence exactly once and reuse the tokens for all features.
    tokenized = [word_tokenize(sentence) for sentence in sentences]
    if not tokenized:
        # max() below would raise on an empty document.
        return []

    num_sentences = len(tokenized)
    max_length = max(len(words) for words in tokenized)

    features = []
    for idx, words in enumerate(tokenized):
        num_words = len(words)

        # max_length is 0 only when no sentence produced any token.
        length_score = num_words / max_length if max_length else 0  # (1)

        if idx == 0 or idx == num_sentences - 1:
            position_score = 1  # (2)
        else:
            position_score = (num_sentences - idx) / num_sentences

        if num_words:
            # word[:1] instead of word[0] so an empty token cannot raise.
            proper_nouns = sum(1 for word in words if word[:1].isupper())
            numerical_tokens = sum(1 for word in words if word.isdigit())
            proper_noun_score = proper_nouns / num_words  # (3)
            numerical_token_score = numerical_tokens / num_words  # (4)
        else:
            proper_noun_score = 0
            numerical_token_score = 0

        feature_score = length_score + position_score + proper_noun_score + numerical_token_score
        features.append(feature_score)
    return features

def create_sentence_graph(embeddings):
    """
    Build the symmetric sentence-similarity matrix of one document.

    Entry (i, j) with i != j is the cosine similarity between embedding i
    and embedding j; the diagonal is 0 (no self-similarity).

    Args:
        embeddings: 2-D array-like with one embedding vector per row.

    Returns:
        A float64 ``numpy.ndarray`` of shape (n, n), where n is the number
        of rows in ``embeddings``. An empty input yields a (0, 0) array.
    """
    vectors = np.asarray(embeddings, dtype=np.float64)
    num_sentences = len(vectors)
    if num_sentences == 0:
        return np.zeros((0, 0))

    # Scale each row to unit length so a single matrix product yields all
    # pairwise cosine similarities at once.
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    # A zero-norm row has no direction: leave it as zeros so its similarity
    # to everything is 0 instead of NaN (NaN would propagate into every
    # row sum computed from this matrix).
    unit = np.divide(vectors, norms, out=np.zeros_like(vectors), where=norms > 0)

    graph = unit @ unit.T
    # Rounding can push values marginally outside the valid cosine range.
    np.clip(graph, -1.0, 1.0, out=graph)
    np.fill_diagonal(graph, 0.0)
    return graph

def rank_sentences(sentences, graph, features):
    """
    Order sentences from highest to lowest combined score.

    A sentence's score is its feature value multiplied by its centrality,
    where centrality is the sum of its row in ``graph``.
    """
    row_sums = np.sum(graph, axis=1)
    combined = []
    for position in range(len(sentences)):
        combined.append(features[position] * row_sums[position])
    ascending = np.argsort(combined)
    # Reverse the ascending order to get the best-scoring sentence first.
    return [sentences[position] for position in ascending[::-1]]

def extract_summary(ranked_sentences, k=3):
    """Join the first ``k`` ranked sentences into one space-separated string."""
    top_sentences = ranked_sentences[:k]
    separator = ' '
    return separator.join(top_sentences)

In [4]:
# Step 1: split every article into sentences. preprocess_text adds the
# 'sentences' column to `data` in place and returns that same frame.
train_data = preprocess_text(data)

# Step 2: embed each article's sentences with a pretrained SentenceTransformer.
model = SentenceTransformer('all-MiniLM-L6-v2')
data['embeddings'] = data.apply(embed_sentences, axis=1, model=model)

# Step 3: per-sentence feature scores.
data['sentence_features'] = data['sentences'].apply(compute_sentence_features)

# Step 4: pairwise sentence-similarity graph built from the embeddings.
data['graph'] = data['embeddings'].apply(create_sentence_graph)

# Step 5: rank sentences by centrality combined with the feature scores.
data['ranked_sentences'] = data.apply(
    lambda article: rank_sentences(
        article['sentences'],
        article['graph'],
        article['sentence_features'],
    ),
    axis=1,
)

# Step 6: the summary is the three top-ranked sentences joined together.
data['gusum_summary'] = data['ranked_sentences'].apply(extract_summary, k=3)

# Calculate metrics

In [5]:
# Compare the 'gusum_summary' column with the 'highlights' column.
# NOTE(review): calculate_scores comes from the project's utils module;
# presumably it returns a DataFrame with per-article scores — confirm.
df = calculate_scores(data, 'gusum_summary', 'highlights')

# Write the results to Results_df/GUSUM_3.csv. exist_ok=True makes the
# directory creation idempotent and avoids the check-then-create race of
# a separate os.path.exists() test.
directory = 'Results_df'
os.makedirs(directory, exist_ok=True)
df.to_csv(os.path.join(directory, 'GUSUM_3.csv'))
#mean_scores = sum_metrices(df, 'rouge_scores', 'Results', 'GUMSUM_l1.csv')

In [None]:
# Display the first rows of the scored results frame.
df.head()

In [None]:
# NOTE(review): `mean_scores` is not assigned in any visible cell (its only
# assignment is commented out in the metrics cell), so on a fresh kernel
# this cell raises NameError — TODO restore the computation or remove it.
mean_scores