In [11]:
import pandas as pd
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
import numpy as np
from scipy.spatial.distance import cosine
import networkx as nx
import community.community_louvain as louvain 
from datasets import load_dataset
from utils import calculate_scores, sum_metrices
import time
import os

# Load data

In [12]:
# Fetch the CNN/DailyMail (config "3.0.0") test split, then keep only its first 1000 rows.
# NOTE: the frame is named `train_data` although it is built from the *test* split.
dataset = load_dataset("cnn_dailymail", "3.0.0", split="test")
dataset = dataset.select(range(1000))
train_data = pd.DataFrame(dataset)

In [13]:
# Sanity check on the loaded frame: (rows, columns).
train_data.shape

(1000, 3)

# Summarizer

In [22]:
def preprocess_text(df):
    """
    Splits every article into sentences.

    Reads the 'article' column, runs ``sent_tokenize`` on each value and stores
    the result in a new 'sentences' column. The frame is modified in place and
    the same object is returned.
    """
    article_column = df['article']
    df['sentences'] = article_column.apply(sent_tokenize)
    return df

def embed_sentences(row, model):
    """
    Encodes the row's sentences with the given model.

    Calls ``model.encode`` on ``row['sentences']`` asking for a tensor, then
    converts that tensor to a NumPy array via ``.cpu().numpy()``.
    """
    encoded = model.encode(row['sentences'], convert_to_tensor=True)
    # .cpu() is called before .numpy() so the conversion happens on a CPU tensor
    return encoded.cpu().numpy()

def build_and_add_graph(row, threshold=0.5):
    """
    Builds a sentence similarity graph from embeddings.

    Args:
        row: Mapping/Series whose 'embeddings' entry holds one embedding
            vector per sentence (a 2-D array-like, one row per sentence).
        threshold: Two sentences are linked only when their cosine similarity
            is strictly greater than this value.

    Returns:
        A symmetric (num_sentences, num_sentences) float array. Entry [i, j]
        is the cosine similarity of sentences i and j when it exceeds
        ``threshold`` and 0 otherwise; the diagonal is always 0.
    """
    embeddings = np.asarray(row['embeddings'], dtype=float)
    num_sentences = len(embeddings)
    if num_sentences == 0:
        # Nothing to compare; keep the "square matrix" contract.
        return np.zeros((0, 0))

    # Scale every vector to unit length so one matrix product yields all
    # pairwise cosine similarities. Zero-length vectors stay all-zero, which
    # gives them similarity 0 instead of a NaN from a division by zero.
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    unit = np.divide(embeddings, norms, out=np.zeros_like(embeddings), where=norms > 0)
    similarity = unit @ unit.T

    # Work on the strict upper triangle only and mirror it afterwards: this
    # keeps the diagonal at 0 and guarantees an exactly symmetric result.
    upper = np.triu(similarity, k=1)
    # np.where (rather than masking with <=) also maps NaN entries to "no edge".
    upper = np.where(upper > threshold, upper, 0.0)
    return upper + upper.T

def compute_and_add_pagerank_scores(row):
    """
    Scores every sentence node with PageRank.

    Turns the adjacency matrix stored in ``row['graph']`` into a networkx
    graph and returns the result of ``nx.pagerank`` on it.
    """
    similarity_graph = nx.from_numpy_array(row['graph'])
    return nx.pagerank(similarity_graph)

def add_communities(row, random_state=42):
    """
    Detects communities in the sentence graph using Louvain.

    Args:
        row: Mapping/Series whose 'graph' entry holds the adjacency matrix of
            the sentence similarity graph.
        random_state: Seed forwarded to ``louvain.best_partition``. The Louvain
            heuristic is randomized, so without a seed the partition can
            change between runs. A fixed default keeps results reproducible;
            pass ``None`` to get the previous unseeded behaviour.

    Returns:
        The partition returned by ``louvain.best_partition`` for that graph.
    """
    nx_graph = nx.from_numpy_array(row['graph'])
    return louvain.best_partition(nx_graph, random_state=random_state)

def rank_sentences_and_extract_summary(row, max_length=1):
    """
    Builds an extractive summary from the highest-ranked sentences.

    Each sentence index ``i`` gets the weight
    ``pagerank_scores[i] * (1 + communities[i])``. Sentences are ordered by
    descending weight (ties keep their original order) and the first
    ``max_length`` of them are joined with single spaces.

    NOTE(review): the community id itself is used as a multiplier, so the
    ranking depends on how communities happen to be numbered - confirm this
    is intended.
    """
    sentences = row['sentences']
    scores = row['pagerank_scores']
    labels = row['communities']

    def weight(index):
        # PageRank score scaled by (1 + community id) of the same sentence
        return scores[index] * (1 + labels[index])

    # Sort positions rather than (position, sentence) pairs; sorted() is
    # stable, so equal weights stay in document order.
    order = sorted(range(len(sentences)), key=weight, reverse=True)

    return " ".join(sentences[index] for index in order[:max_length])

In [23]:
# Time the whole pipeline; `end_time` (set at the bottom) holds the elapsed seconds.
start_time = time.time()

# 1. Preprocess the dataset
train_data = preprocess_text(train_data)

# 2. Generate embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')
train_data['embeddings'] = train_data.apply(lambda row: embed_sentences(row, model), axis=1)

# 3. Build the sentence similarity graph
train_data['graph'] = train_data.apply(build_and_add_graph, axis=1)

# 4. Compute centrality scores
train_data['pagerank_scores'] = train_data.apply(compute_and_add_pagerank_scores, axis=1)

# 5. Detect communities
train_data['communities'] = train_data.apply(add_communities, axis=1)

# 6. Rank sentences and generate summaries
train_data['generated_summary'] = train_data.apply(rank_sentences_and_extract_summary, axis=1)

# Store the elapsed wall-clock seconds; the reporting cell divides `end_time` by 60.
# Without this assignment that cell fails with NameError on a fresh kernel.
end_time = time.time() - start_time

In [None]:
# Report the pipeline duration: `end_time` is read as elapsed seconds and converted to minutes.
print(f'The flow takes {round(end_time/60, 3)} minutes')

The flow takes 38.567 minutes


In [14]:
# Preview the first rows of the processed frame.
train_data.head()

Unnamed: 0,article,highlights,id,sentences,embeddings,graph,pagerank_scores,communities,generated_summary,rouge_scores
0,(CNN)The Palestinian Authority officially beca...,Membership gives the ICC jurisdiction over all...,f001ec5c4704938247d27a44948eebb37ae98d01,[(CNN)The Palestinian Authority officially bec...,"[[-0.003295988, -0.022603707, -0.048481535, -0...","[[0.0, 0.0, 0.5431975722312927, 0.509574472904...","{0: 0.04637488543563417, 1: 0.0084986129629822...","{0: 10, 1: 1, 2: 7, 3: 10, 4: 7, 5: 0, 6: 6, 7...","Later that month, the ICC opened a preliminary...","{'rouge1': (0.7058823529411765, 0.3, 0.4210526..."
1,(CNN)Never mind cats having nine lives. A stra...,"Theia, a bully breed mix, was apparently hit b...",230c522854991d053fe98a718b1defa077a8efef,"[(CNN)Never mind cats having nine lives., A st...","[[0.05223946, -0.0404404, 0.02681056, 0.028128...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","{0: 0.01079136690647482, 1: 0.0719424460431654...","{0: 0, 1: 1, 2: 2, 3: 1, 4: 3, 5: 4, 6: 5, 7: ...",The veterinary hospital's Good Samaritan Fund ...,"{'rouge1': (0.3488372093023256, 0.178571428571..."
2,"(CNN)If you've been following the news lately,...",Mohammad Javad Zarif has spent more time with ...,4495ba8f3a340d97a9df1476f8a35502bcce1f69,[(CNN)If you've been following the news lately...,"[[-0.00025642707, 0.0105848, -0.0546053, 0.043...","[[0.0, 0.5106818079948425, 0.0, 0.0, 0.0, 0.0,...","{0: 0.04212770077738561, 1: 0.0627789035735010...","{0: 0, 1: 1, 2: 2, 3: 1, 4: 3, 5: 4, 6: 0, 7: ...","That prompted Christine Pelosi, the daughter o...","{'rouge1': (0.2857142857142857, 0.263157894736..."
3,(CNN)Five Americans who were monitored for thr...,17 Americans were exposed to the Ebola virus w...,a38e72fed88684ec8d60dd5856282e999dc8c0ca,[(CNN)Five Americans who were monitored for th...,"[[0.05983665, 0.016785, -0.045969874, -0.01222...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.6230060...","{0: 0.11834319526582975, 1: 0.0177514792911205...","{0: 0, 1: 1, 2: 2, 3: 3, 4: 5, 5: 5, 6: 4, 7: ...","They are clinicians for Partners in Health, a ...","{'rouge1': (0.5, 0.3333333333333333, 0.4), 'ro..."
4,(CNN)A Duke student has admitted to hanging a ...,Student is no longer on Duke University campus...,c27cf1b136cc270023de959e7ab24638021bc43f,[(CNN)A Duke student has admitted to hanging a...,"[[0.02250091, 0.07249225, 0.047614112, 0.04678...","[[0.0, 0.0, 0.0, 0.57384192943573, 0.0, 0.0, 0...","{0: 0.09132344832221988, 1: 0.0136991511615401...","{0: 3, 1: 1, 2: 2, 3: 3, 4: 4, 5: 4, 6: 6, 7: ...",This is not the Duke we're here to experience....,"{'rouge1': (0.21428571428571427, 0.25, 0.23076..."


# Calculate metrics

In [4]:
# (rows, columns) of the frame that is passed to the scoring step below.
train_data.shape

(1000, 3)

In [24]:
# Compare each generated summary with its reference in 'highlights'.
# NOTE(review): calculate_scores comes from utils; presumably it adds the
# 'rouge_scores' column seen in the saved results - confirm.
df = calculate_scores(train_data, 'generated_summary', 'highlights')

directory = 'Results_df'
# exist_ok=True creates the folder only when it is missing, replacing the
# separate "check, then create" steps.
os.makedirs(directory, exist_ok=True)
df.to_csv(os.path.join(directory, 'custom_method_l1.csv'))
# NOTE(review): the line below is the only visible assignment of `mean_scores`,
# and it is commented out - a later cell that displays `mean_scores` depends on it.
#mean_scores = sum_metrices(df, 'rouge_scores', 'Results', 'custom_method_l1.csv')

In [17]:
# Load a saved results table from the output directory.
# NOTE(review): no visible cell writes 'custom_method_l3.csv' (only
# 'custom_method_l1.csv' is written), so this file must already exist on disk.
results_path = os.path.join(directory, 'custom_method_l3.csv')
df_tmp = pd.read_csv(results_path)

In [18]:
# Display the results table loaded from CSV.
df_tmp

Unnamed: 0.1,Unnamed: 0,article,highlights,id,sentences,embeddings,graph,pagerank_scores,communities,generated_summary,rouge_scores
0,0,(CNN)The Palestinian Authority officially beca...,Membership gives the ICC jurisdiction over all...,f001ec5c4704938247d27a44948eebb37ae98d01,['(CNN)The Palestinian Authority officially be...,[[-0.00329599 -0.02260371 -0.04848154 ... -0.0...,[[0. 0. 0.54319757 0.50957447 ...,"{0: 0.04637488543563417, 1: 0.0084986129629822...","{0: 9, 1: 1, 2: 10, 3: 9, 4: 10, 5: 3, 6: 6, 7...",The Palestinians signed the ICC's founding Rom...,"{'rouge1': Score(precision=0.7058823529411765,..."
1,1,(CNN)Never mind cats having nine lives. A stra...,"Theia, a bully breed mix, was apparently hit b...",230c522854991d053fe98a718b1defa077a8efef,"['(CNN)Never mind cats having nine lives.', 'A...",[[ 0.05223946 -0.0404404 0.02681056 ... 0.0...,[[0. 0. 0. 0. ...,"{0: 0.01079136690647482, 1: 0.0719424460431654...","{0: 0, 1: 1, 2: 2, 3: 1, 4: 3, 5: 4, 6: 5, 7: ...",The veterinary hospital's Good Samaritan Fund ...,"{'rouge1': Score(precision=0.3488372093023256,..."
2,2,"(CNN)If you've been following the news lately,...",Mohammad Javad Zarif has spent more time with ...,4495ba8f3a340d97a9df1476f8a35502bcce1f69,"[""(CNN)If you've been following the news latel...",[[-0.00025643 0.0105848 -0.0546053 ... -0.0...,[[0. 0.51068181 0. ... 0. ...,"{0: 0.04212770077738561, 1: 0.0627789035735010...","{0: 0, 1: 1, 2: 2, 3: 1, 4: 3, 5: 4, 6: 0, 7: ...","That prompted Christine Pelosi, the daughter o...","{'rouge1': Score(precision=0.2857142857142857,..."
3,3,(CNN)Five Americans who were monitored for thr...,17 Americans were exposed to the Ebola virus w...,a38e72fed88684ec8d60dd5856282e999dc8c0ca,['(CNN)Five Americans who were monitored for t...,[[ 0.05983665 0.016785 -0.04596987 ... -0.0...,[[0. 0. 0. 0. ...,"{0: 0.11834319526582975, 1: 0.0177514792911205...","{0: 4, 1: 1, 2: 2, 3: 0, 4: 5, 5: 5, 6: 3, 7: ...","They are clinicians for Partners in Health, a ...","{'rouge1': Score(precision=0.5952380952380952,..."
4,4,(CNN)A Duke student has admitted to hanging a ...,Student is no longer on Duke University campus...,c27cf1b136cc270023de959e7ab24638021bc43f,['(CNN)A Duke student has admitted to hanging ...,[[ 0.02250091 0.07249225 0.04761411 ... -0.0...,[[0. 0. 0. 0.57384193 ...,"{0: 0.09132344832221988, 1: 0.0136991511615401...","{0: 0, 1: 1, 2: 2, 3: 0, 4: 5, 5: 5, 6: 6, 7: ...",This is not the Duke we're here to experience....,{'rouge1': Score(precision=0.21428571428571427...
...,...,...,...,...,...,...,...,...,...,...,...
995,995,Washington (CNN)The flight voice recorder abo...,Autopilot could have taken control of Germanwi...,b0798163d7ec92301e26c240b42b46e39cc83752,['Washington (CNN)The flight voice recorder a...,[[-0.02787112 0.04069512 -0.05251175 ... 0.0...,[[0. 0. 0.55715317 ... 0. ...,"{0: 0.02218075067928703, 1: 0.0135632252994173...","{0: 12, 1: 12, 2: 2, 3: 16, 4: 4, 5: 16, 6: 5,...",Not only would it have saved this flight and t...,"{'rouge1': Score(precision=0.2222222222222222,..."
996,996,(CNN)At least 54 people have died and 15 other...,Fishing vessels are searching for 15 people st...,41cc7008f92d080b541dec2633ef5eb02674c573,"[""(CNN)At least 54 people have died and 15 oth...",[[ 0.0383624 -0.01905133 -0.04371632 ... -0.0...,[[0. 0.52291012 0. 0. ...,"{0: 0.1999280206766779, 1: 0.1009022597777621,...","{0: 0, 1: 0, 2: 1, 3: 2, 4: 3, 5: 0, 6: 1, 7: ...","The shipwreck was swift, with the trawler goin...","{'rouge1': Score(precision=0.4146341463414634,..."
997,997,"(The Hollywood Reporter)Stan Freberg, whose fr...","Stan Freberg was famed comedian, song parodist...",5588d9eecfe630bbe8ac41600cd377c0bc810670,"['(The Hollywood Reporter)Stan Freberg, whose ...",[[ 0.02801352 -0.09566732 -0.0494417 ... -0.0...,[[0. 0. 0.65951979 ... 0. ...,"{0: 0.08193730566590483, 1: 0.0066964287670381...","{0: 2, 1: 1, 2: 2, 3: 6, 4: 16, 5: 5, 6: 6, 7:...",Freberg also was known for his musical parodie...,"{'rouge1': Score(precision=0.2962962962962963,..."
998,998,(CNN)Indiana's controversial religious freedom...,Gov. Mike Pence is making the right call to fi...,3ad0320476e8dda3a00604efdcf766e91766c815,"[""(CNN)Indiana's controversial religious freed...",[[-0.00568578 0.01028095 -0.04418644 ... 0.0...,[[0. 0. 0. 0. ...,"{0: 0.0351588611173316, 1: 0.00821918080857677...","{0: 2, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: ...","In those situations, if there is no compelling...","{'rouge1': Score(precision=0.3157894736842105,..."


In [7]:
# NOTE(review): `mean_scores` is not assigned by any active code in the visible
# cells (its only assignment is commented out) - confirm where it is defined.
mean_scores

{'rouge1': {'precision': {'mean': 0.3407263029473328,
   'std': 0.1714761329906591},
  'recall': {'mean': 0.17562435152529407, 'std': 0.08615377905280887},
  'fmeasure': {'mean': 0.22056597603764896, 'std': 0.09913533148068941}},
 'rouge2': {'precision': {'mean': 0.09040968761747424,
   'std': 0.12386408072434218},
  'recall': {'mean': 0.043062800056240066, 'std': 0.06065724511887369},
  'fmeasure': {'mean': 0.055619626019775464, 'std': 0.07481716436706824}},
 'rougeL': {'precision': {'mean': 0.2259092986517515,
   'std': 0.12659368825605039},
  'recall': {'mean': 0.1158661914017097, 'std': 0.06252613143398407},
  'fmeasure': {'mean': 0.14531493683158805, 'std': 0.07182457207648496}},
 'rougeLsum': {'precision': {'mean': 0.2627320682610475,
   'std': 0.14440327079279003},
  'recall': {'mean': 0.1350625081417477, 'std': 0.0724144163050376},
  'fmeasure': {'mean': 0.16961583497134278, 'std': 0.08390192897874428}}}