# Libraries

In [1]:
import pandas as pd

# Data
##
https://huggingface.co/datasets/fancyzhx/ag_news

In [2]:
from datasets import load_dataset

ds = load_dataset("antash420/text-summarization-alpaca-format")

  from .autonotebook import tqdm as notebook_tqdm
  table = cls._concat_blocks(blocks, axis=0)


In [3]:
inputs = ds['train']['input']
references = ds['train']['output']

In [4]:
print(len(inputs))

287113


In [5]:
print(inputs[0])

LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don't think I'll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. Details of how

In [6]:
print(references[0])

Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .
Young actor says he has no plans to fritter his cash away .
Radcliffe's earnings from first five Potter films have been held in trust fund .


# Data Preprocessing

## Tokenizing Input

In [7]:
from nltk.tokenize import sent_tokenize

tokenized_inputs = [sent_tokenize(text) for text in inputs]

In [8]:
print(len(tokenized_inputs))

287113


In [23]:
print(tokenized_inputs[0])

["LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him.", 'Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties.', '"I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month.', '"I don\'t think I\'ll be particularly extravagant.', '"The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs."', 'At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office c

## Sentence Embedding

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
# Load pre-trained BERT model for sentence embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight, suitable for sentence embeddings

In [None]:
def get_sentence_embeddings(sentences, batch_size=32):
    """
    Generate sentence embeddings using a pre-trained BERT model.
    
    Args:
        sentences (list of str): List of sentences.
        batch_size (int): Number of sentences to process in each batch.
        
    Returns:
        np.ndarray: Sentence embeddings.
    """
    embeddings = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i + batch_size]
        batch_embeddings = model.encode(batch, convert_to_numpy=True)
        embeddings.append(batch_embeddings)
    return np.vstack(embeddings)

In [28]:
sentence_embeddings = []

for sentences in tokenized_inputs:
    embeddings = get_sentence_embeddings(sentences)
    sentence_embeddings.append(embeddings)

In [29]:
print(len(sentence_embeddings))

287113


In [10]:
import numpy as np

In [None]:
# Convert list of embeddings to a single numpy array
sentence_embeddings = np.vstack(sentence_embeddings)

# Save embeddings to a file
np.save('../data/sentence_embeddings.npy', sentence_embeddings)

print("Embeddings saved to 'sentence_embeddings.npy'")

Embeddings saved to 'sentence_embeddings.npy'


In [None]:
# # Save embeddings to a .txt file
# # WARNING THIS FILE WILL BE 100gb in size, run on your own risk :D
# np.savetxt('sentence_embeddings.txt', sentence_embeddings)

In [None]:
sentence_embeddings = np.load('../data/sentence_embeddings.npy')

In [12]:
sentence_embeddings.shape

(11224982, 384)

# Experiment Parts

In [25]:
from sklearn.metrics.pairwise import cosine_similarity
from rouge_score import rouge_scorer
import random

In [21]:
def calculate_fitness(chromosome, sentences, embeddings, reference_summary):
    """
    Calculate fitness of a chromosome.

    Args:
        chromosome (np.ndarray): Binary array representing selected sentences.
        sentences (list of str): List of sentences in the document.
        embeddings (np.ndarray): Sentence embeddings.
        reference_summary (str): Reference summary.

    Returns:
        float: Fitness score.
    """
    # Selected sentences
    selected_sentences = [sentences[i] for i, bit in enumerate(chromosome) if bit == 1]
    if not selected_sentences:
        return 0  # Avoid empty summaries

    summary_text = " ".join(selected_sentences)

    # ROUGE Score
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = scorer.score(reference_summary, summary_text)
    content_score = rouge_scores['rouge1'].fmeasure  # F1-score of ROUGE-1

    # Diversity (Cosine Similarity)
    selected_embeddings = np.array([embeddings[i] for i, bit in enumerate(chromosome) if bit == 1])
    if len(selected_embeddings) > 1:
        similarity_matrix = cosine_similarity(selected_embeddings)
        redundancy_penalty = np.mean(similarity_matrix[np.triu_indices_from(similarity_matrix, k=1)])
        diversity_score = 1 - redundancy_penalty  # Higher is better
    else:
        diversity_score = 1  # Maximum diversity for a single sentence

    # Final Fitness
    fitness = content_score * 0.7 + diversity_score * 0.3  # Weighted fitness
    return fitness


In [15]:

def initialize_population(num_sentences, population_size):
    """
    Initialize a population of binary chromosomes.
    
    Args:
        num_sentences (int): Number of sentences in the input.
        population_size (int): Number of chromosomes in the population.
        
    Returns:
        np.ndarray: Binary matrix representing the population.
    """
    return np.random.randint(2, size=(population_size, num_sentences))

In [16]:
def select_parents(population, fitness_scores):
    """
    Select two parents using roulette wheel selection.
    
    Args:
        population (np.ndarray): Current population.
        fitness_scores (list of float): Fitness scores for each chromosome.
        
    Returns:
        tuple: Two selected parent chromosomes.
    """
    probabilities = fitness_scores / np.sum(fitness_scores)
    indices = np.random.choice(len(population), size=2, p=probabilities)
    return population[indices[0]], population[indices[1]]

In [26]:
def selection(population, fitness_scores, num_parents):
    parents = np.empty((num_parents, population.shape[1]))
    for parent_num in range(num_parents):
        max_fitness_idx = np.where(fitness_scores == np.max(fitness_scores))
        max_fitness_idx = max_fitness_idx[0][0]
        parents[parent_num, :] = population[max_fitness_idx, :]
        fitness_scores[max_fitness_idx] = -99999999
    return parents

In [27]:
def crossover(parent1, parent2, crossover_rate=0.8):
    """
    Perform single-point crossover on two parents.
    
    Args:
        parent1, parent2 (np.ndarray): Parent chromosomes.
        crossover_rate (float): Probability of crossover.
        
    Returns:
        np.ndarray: Two child chromosomes.
    """
    if np.random.rand() < crossover_rate:
        point = np.random.randint(1, len(parent1) - 1)
        child1 = np.concatenate((parent1[:point], parent2[point:]))
        child2 = np.concatenate((parent2[:point], parent1[point:]))
        return child1, child2
    return parent1, parent2


In [29]:
def mutate(offspring_crossover, mutation_rate=0.01):
    for idx in range(offspring_crossover.shape[0]):
        for _ in range(mutation_rate):
            i = random.randint(0, offspring_crossover.shape[1] - 1)
            offspring_crossover[idx, i] = 1 - offspring_crossover[idx, i]
    return offspring_crossover

In [30]:
def evolve_population(population, sentences, embeddings, reference_summary, mutation_rate=0.01):
    """
    Evolve the population using genetic operations.
    
    Args:
        population (np.ndarray): Current population.
        sentences (list of str): List of sentences in the document.
        embeddings (np.ndarray): Sentence embeddings.
        reference_summary (str): Reference summary.
        mutation_rate (float): Probability of mutation per bit.
        
    Returns:
        np.ndarray: New population.
    """
    fitness_scores = [calculate_fitness(ind, sentences, embeddings, reference_summary) for ind in population]
    new_population = []
    
    for _ in range(len(population) // 2):  # Generate pairs of children
        parent1, parent2 = select_parents(population, fitness_scores)
        child1, child2 = crossover(parent1, parent2)
        child1 = mutate(child1, mutation_rate)
        child2 = mutate(child2, mutation_rate)
        new_population.extend([child1, child2])
    
    return np.array(new_population)


In [34]:
print(references[0])

Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .
Young actor says he has no plans to fritter his cash away .
Radcliffe's earnings from first five Potter films have been held in trust fund .


In [36]:
pop_size = 10
num_sentences = len(sentences)
num_generations = 50
num_parents_mating = 4
mutation_rate = 2


example_text = tokenized_inputs[0]
embeddings = sentence_embeddings[0]
reference_summary = references[0]
population = initialize_population(pop_size, num_sentences)

for generation in range(num_generations):
    fitness_scores = np.array([calculate_fitness(chromosome, sentences, embeddings, example_text) for chromosome in population])
    parents = selection(population, fitness_scores, num_parents_mating)
    offspring_crossover = crossover(parents, (pop_size - parents.shape[0], num_sentences))
    offspring_mutation = mutate(offspring_crossover, mutation_rate)
    population[0:parents.shape[0], :] = parents
    population[parents.shape[0]:, :] = offspring_mutation

# Best solution
best_chromosome = population[np.argmax(fitness_scores)]
best_summary = " ".join([sentences[i] for i, bit in enumerate(best_chromosome) if bit == 1])
print("Best Summary:", best_summary)

AttributeError: 'list' object has no attribute 'lower'