In [11]:
import pandas as pd
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
import numpy as np
from scipy.spatial.distance import cosine
import networkx as nx
import community.community_louvain as louvain 
from datasets import load_dataset
from utils import calculate_scores, sum_metrices
import time
import os

# Load data

In [12]:
# Pull the CNN/DailyMail (config "3.0.0") test split and keep its first 1000 rows.
dataset = load_dataset("cnn_dailymail", "3.0.0", split="test")
dataset = dataset.select(range(1000))
# NOTE: despite its name, `train_data` is built from the *test* split selected above.
train_data = pd.DataFrame(dataset)

In [None]:
# Sanity check: (rows, columns) of the frame built from the selected dataset rows.
train_data.shape

# Summarizer

In [22]:
def preprocess_text(df):
    """
    Split every article into a list of sentences.

    Args:
        df: DataFrame with an 'article' column.

    Returns:
        The same DataFrame object. It is modified in place: a 'sentences'
        column is added (or overwritten) with the result of applying
        sent_tokenize to each article.
    """
    sentence_lists = df['article'].apply(sent_tokenize)
    df['sentences'] = sentence_lists
    return df

def embed_sentences(row, model):
    """
    Encode the sentences of one article.

    Args:
        row: Record whose 'sentences' entry is passed to the encoder.
        model: Object exposing encode(sentences, convert_to_tensor=True);
            the pipeline passes a SentenceTransformer.

    Returns:
        The encoder output after `.cpu().numpy()`, i.e. moved to host memory
        and converted to a NumPy array.
    """
    sentence_tensor = model.encode(row['sentences'], convert_to_tensor=True)
    # Move to host memory first; .numpy() is then called on the CPU copy.
    return sentence_tensor.cpu().numpy()

def build_and_add_graph(row, threshold=0.5):
    """
    Build the sentence-similarity adjacency matrix for one article.

    Args:
        row: Record whose 'embeddings' entry holds one vector per sentence.
        threshold: Two sentences are linked only when their cosine
            similarity is strictly greater than this value.

    Returns:
        Square, symmetric NumPy array. Entry (i, j) is the cosine similarity
        of sentences i and j when it exceeds `threshold`, otherwise 0.
        The diagonal is always 0.
    """
    vectors = row['embeddings']
    n = len(vectors)
    adjacency = np.zeros((n, n))

    # Visit each unordered pair once (strict upper triangle) and mirror it.
    upper_rows, upper_cols = np.triu_indices(n, k=1)
    for a, b in zip(upper_rows, upper_cols):
        # scipy's `cosine` is a distance, so 1 - distance is the similarity.
        score = 1 - cosine(vectors[a], vectors[b])
        if score > threshold:
            adjacency[a, b] = adjacency[b, a] = score

    return adjacency

def compute_and_add_pagerank_scores(row):
    """
    Run PageRank on one article's sentence-similarity graph.

    Args:
        row: Record whose 'graph' entry is the square adjacency matrix.

    Returns:
        The result of nx.pagerank (called with its default parameters) on
        the graph built from that matrix.
    """
    similarity_graph = nx.from_numpy_array(row['graph'])
    return nx.pagerank(similarity_graph)

def add_communities(row, random_state=42):
    """
    Detect sentence communities in one article's similarity graph (Louvain).

    Args:
        row: Record whose 'graph' entry is the square adjacency matrix.
        random_state: Seed forwarded to louvain.best_partition. A fixed
            default makes the partition, and therefore the summaries built
            from it, reproducible across runs. Pass None to restore the
            unseeded behaviour.

    Returns:
        The partition returned by louvain.best_partition for the graph
        built from that matrix.
    """
    nx_graph = nx.from_numpy_array(row['graph'])
    # Louvain's result depends on random draws; seed it explicitly.
    return louvain.best_partition(nx_graph, random_state=random_state)

def rank_sentences_and_extract_summary(row, max_length=1):
    """
    Pick the highest-scoring sentences of one article and join them.

    Each sentence i is scored as pagerank_scores[i] * (1 + communities[i]);
    sentences are ordered by descending score (ties keep document order).

    Args:
        row: Record with 'sentences' (list of str), 'pagerank_scores' and
            'communities' (both indexable by sentence position).
        max_length: Number of top-ranked sentences to keep.

    Returns:
        The selected sentences, in rank order, joined with single spaces.
    """
    sentences = row['sentences']
    pagerank_scores = row['pagerank_scores']
    communities = row['communities']

    # NOTE(review): `communities` presumably holds Louvain community ids; if so the
    # ids are labels without ordinal meaning — confirm that weighting by id is intended.
    def _weight(idx):
        return pagerank_scores[idx] * (1 + communities[idx])

    # sorted() is stable, so equal scores stay in their original order.
    order = sorted(range(len(sentences)), key=_weight, reverse=True)
    chosen = [sentences[idx] for idx in order[:max_length]]
    return " ".join(chosen)

In [23]:
# Time the whole pipeline; the reporting cell that follows reads `end_time`.
start_time = time.time()

# 1. Preprocess the dataset (adds the 'sentences' column)
train_data = preprocess_text(train_data)

# 2. Generate embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')
train_data['embeddings'] = train_data.apply(lambda row: embed_sentences(row, model), axis=1)

# 3. Build the sentence similarity graph
train_data['graph'] = train_data.apply(build_and_add_graph, axis=1)

# 4. Compute centrality scores
train_data['pagerank_scores'] = train_data.apply(compute_and_add_pagerank_scores, axis=1)

# 5. Detect communities
train_data['communities'] = train_data.apply(add_communities, axis=1)

# 6. Rank sentences and generate summaries
train_data['generated_summary'] = train_data.apply(rank_sentences_and_extract_summary, axis=1)

# Elapsed wall-clock seconds for steps 1-6. Previously `end_time` was never
# assigned, so the reporting cell raised NameError on a fresh kernel.
end_time = time.time() - start_time

In [None]:
# Report the pipeline duration in minutes; `end_time` is expected to hold the
# elapsed time in seconds (it is divided by 60 here).
print(f'The flow takes {round(end_time/60, 3)} minutes')

In [None]:
# Preview the first rows of train_data.
train_data.head()

# Calculate metrics

In [None]:
# Row/column count of the frame that is passed to calculate_scores below.
train_data.shape

In [24]:
# calculate_scores comes from utils; presumably it scores the 'generated_summary'
# column against the 'highlights' column — confirm against utils.
df = calculate_scores(train_data, 'generated_summary', 'highlights')

# Persist the per-row scores. exist_ok=True replaces the separate
# exists()/makedirs() pair and is safe if the directory already exists.
directory = 'Results_df'
results_file = 'custom_method_l1.csv'
os.makedirs(directory, exist_ok=True)
df.to_csv(os.path.join(directory, results_file))

# NOTE(review): 'Results' differs from `directory` ('Results_df') — confirm this
# is the intended argument for sum_metrices.
mean_scores = sum_metrices(df, 'rouge_scores', 'Results', results_file)

In [17]:
# NOTE(review): the visible cells write 'custom_method_l1.csv', while this reads
# 'custom_method_l3.csv' — the file must already exist from another run; confirm
# which file is meant to be inspected.
saved_results_path = os.path.join(directory, 'custom_method_l3.csv')
df_tmp = pd.read_csv(saved_results_path)

In [None]:
# Display the DataFrame loaded into df_tmp.
df_tmp

In [None]:
# Display the value returned by sum_metrices.
mean_scores