In [1]:
import pandas as pd
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
import numpy as np
from scipy.spatial.distance import cosine
import networkx as nx
import community.community_louvain as louvain 
from rouge_score import rouge_scorer
from datasets import load_dataset
from utils import calculate_scores, sum_metrices

  from tqdm.autonotebook import tqdm, trange


# Load data

In [2]:
dataset = load_dataset("cnn_dailymail", "3.0.0", split="train").select(range(10))
train_data = pd.DataFrame(dataset)

# Summarizer

In [3]:
def preprocess_text(df):
    df['sentences'] = df['article'].apply(sent_tokenize)
    return df

def embed_sentences(row, model):
    embeddings = model.encode(row['sentences'], convert_to_tensor=True)
    return embeddings.cpu().numpy()

def build_and_add_graph(row, threshold=0.5):
    """
    Builds a sentence similarity graph from embeddings. 
    """
    embeddings = row['embeddings']
    num_sentences = len(embeddings)
    graph = np.zeros((num_sentences, num_sentences))

    for i in range(num_sentences):
        for j in range(i + 1, num_sentences):
            similarity = 1 - cosine(embeddings[i], embeddings[j])
            if similarity > threshold:
                graph[i, j] = similarity
                graph[j, i] = similarity

    return graph

def compute_and_add_pagerank_scores(row):
    """
    Computes PageRank centrality scores for the graph.
    """
    graph = row['graph']
    nx_graph = nx.from_numpy_array(graph)  # Convert adjacency matrix to a NetworkX graph
    pagerank_scores = nx.pagerank(nx_graph)  # Compute PageRank centrality scores
    return pagerank_scores

def add_communities(row):
    """
    Detects communities within the 'graph' field using louvain and returns the partitioning (community assignment).
    """
    graph = row['graph']
    nx_graph = nx.from_numpy_array(graph)  # Convert adjacency matrix to NetworkX graph
    partition = louvain.best_partition(nx_graph)  # Detect communities
    return partition

def rank_sentences_and_extract_summary(row, max_length=3):
    """
    Ranks sentences and extracts a summary based on ranked sentences.
    """
    sentences = row['sentences']
    pagerank_scores = row['pagerank_scores']
    communities = row['communities']

    # Rank sentences based on PageRank and community information
    ranked_sentences = sorted(
        enumerate(sentences),
        key=lambda x: pagerank_scores[x[0]] * (1 + communities[x[0]]),
        reverse=True
    )
    
    # Extract the top-ranked sentences
    ranked_sentences = [sentences[i] for i, _ in ranked_sentences]
    
    # Generate summary
    summary = " ".join(ranked_sentences[:max_length])
    
    return summary

In [4]:
# 1. Preprocess the dataset
train_data = preprocess_text(train_data)

# 2. Generate embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')
train_data['embeddings'] = train_data.apply(lambda row: embed_sentences(row, model), axis=1)

# 3. Build the sentence similarity graph
train_data['graph'] = train_data.apply(build_and_add_graph, axis=1)

# 4. Compute centrality scores
train_data['pagerank_scores'] = train_data.apply(compute_and_add_pagerank_scores, axis=1)

# 5. Detect communities
train_data['communities'] = train_data.apply(lambda row: add_communities(row), axis=1)

# 6. Rank sentences and generate summaries
train_data['generated_summary'] = train_data.apply(rank_sentences_and_extract_summary, axis=1)

In [5]:
train_data.head()

Unnamed: 0,article,highlights,id,sentences,embeddings,graph,pagerank_scores,communities,generated_summary
0,"LONDON, England (Reuters) -- Harry Potter star...",Harry Potter star Daniel Radcliffe gets £20M f...,42c027e4ff9730fbb3de84c1af0d2c506e41c3e4,"[LONDON, England (Reuters) -- Harry Potter sta...","[[0.011218078, 0.05443934, 0.008227166, -0.032...","[[0.0, 0.6444466710090637, 0.0, 0.0, 0.0, 0.60...","{0: 0.10783999182005151, 1: 0.1347704486060076...","{0: 10, 1: 10, 2: 2, 3: 3, 4: 3, 5: 10, 6: 6, ...","Daniel Radcliffe as Harry Potter in ""Harry Pot..."
1,Editor's note: In our Behind the Scenes series...,Mentally ill inmates in Miami are housed on th...,ee8871b15c50d0db17b0179a6d2beab35065f1e9,[Editor's note: In our Behind the Scenes serie...,"[[-0.047513135, -0.004193974, 0.014303041, 0.1...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","{0: 0.005524862865438011, 1: 0.061891081805473...","{0: 0, 1: 2, 2: 2, 3: 2, 4: 3, 5: 3, 6: 3, 7: ...","Leifman says it's not the complete answer, but..."
2,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...","NEW: ""I thought I was going to die,"" driver sa...",06352019a19ae31e527f37f7571c6dd7f0c5da37,"[MINNEAPOLIS, Minnesota (CNN) -- Drivers who w...","[[0.027638346, 0.05944389, 0.06226873, 0.07558...","[[0.0, 0.5616572499275208, 0.0, 0.0, 0.0, 0.0,...","{0: 0.0249378067624798, 1: 0.03380655803980895...","{0: 1, 1: 1, 2: 16, 3: 3, 4: 1, 5: 4, 6: 5, 7:...","I saw a couple cars fall,"" he said. Babineau t..."
3,WASHINGTON (CNN) -- Doctors removed five small...,"Five small polyps found during procedure; ""non...",24521a2abb2e1f5e34e6824e0f9e56904a2b0e88,[WASHINGTON (CNN) -- Doctors removed five smal...,"[[0.02162505, 0.066371895, 0.10687812, -0.0043...","[[0.0, 0.6739037036895752, 0.0, 0.0, 0.0, 0.0,...","{0: 0.09161459792510898, 1: 0.0670161723691332...","{0: 1, 1: 1, 2: 2, 3: 3, 4: 4, 5: 12, 6: 12, 7...",Nothing occurred that required him to take off...
4,(CNN) -- The National Football League has ind...,"NEW: NFL chief, Atlanta Falcons owner critical...",7fe70cc8b12fab2d0a258fababf7d9c6b5e1262a,[(CNN) -- The National Football League has in...,"[[-0.0826094, 0.04766336, -0.02958666, 0.04218...","[[0.0, 0.553372323513031, 0.0, 0.0, 0.0, 0.517...","{0: 0.015348741742570171, 1: 0.021561453733294...","{0: 8, 1: 11, 2: 12, 3: 10, 4: 4, 5: 8, 6: 9, ...",""" In the plea deal, Vick agreed to cooperate w..."


# Calculate matrices

In [8]:
df = calculate_scores(train_data, 'generated_summary', 'highlights')
mean_scores = sum_metrices(df, 'rouge_scores', 'Results', 'custom_method.csv')

In [7]:
mean_scores

{'rouge1': {'precision': 0.37731143996011834,
  'recall': 0.23053178651530729,
  'fmeasure': 0.2676014575258827},
 'rouge2': {'precision': 0.14336609425944974,
  'recall': 0.07718278925285425,
  'fmeasure': 0.0932077964657546},
 'rougeL': {'precision': 0.25370327018392536,
  'recall': 0.15479890564909976,
  'fmeasure': 0.17856202154427428},
 'rougeLsum': {'precision': 0.3319769301552046,
  'recall': 0.1979441198119291,
  'fmeasure': 0.23192511758332368}}