In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/gemma-2/transformers/gemma-2-9b/2/model.safetensors.index.json
/kaggle/input/gemma-2/transformers/gemma-2-9b/2/model-00001-of-00008.safetensors
/kaggle/input/gemma-2/transformers/gemma-2-9b/2/config.json
/kaggle/input/gemma-2/transformers/gemma-2-9b/2/model-00003-of-00008.safetensors
/kaggle/input/gemma-2/transformers/gemma-2-9b/2/model-00002-of-00008.safetensors
/kaggle/input/gemma-2/transformers/gemma-2-9b/2/model-00007-of-00008.safetensors
/kaggle/input/gemma-2/transformers/gemma-2-9b/2/README.md
/kaggle/input/gemma-2/transformers/gemma-2-9b/2/model-00008-of-00008.safetensors
/kaggle/input/gemma-2/transformers/gemma-2-9b/2/tokenizer.json
/kaggle/input/gemma-2/transformers/gemma-2-9b/2/tokenizer_config.json
/kaggle/input/gemma-2/transformers/gemma-2-9b/2/model-00005-of-00008.safetensors
/kaggle/input/gemma-2/transformers/gemma-2-9b/2/model-00006-of-00008.safetensors
/kaggle/input/gemma-2/transformers/gemma-2-9b/2/special_tokens_map.json
/kaggle/input/gemma-2/transformer

In [2]:
import kagglehub

# Download latest version
path = kagglehub.model_download("google/gemma-2/transformers/gemma-2-9b")

print("Path to model files:", path)

Path to model files: /kaggle/input/gemma-2/transformers/gemma-2-9b/2


# Perplexity Calculator Using Pretrained Language Models :

This Python class, PerplexityCalculator, leverages the Transformers library by Hugging Face to calculate the perplexity of given text(s) using a pretrained causal language model (e.g., GPT). The implementation supports both single-text and batch processing, and it allows loading models in standard precision or 8-bit quantized mode to optimize memory usage.

In [3]:
import random
import gc
import torch
import pandas as pd
from collections import Counter
from typing import List, Union
import numpy as np
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

class PerplexityCalculator:
    def __init__(self, model_path: str, load_in_8bit: bool = False, device_map: str = 'auto'):
        """
        Initialize the Perplexity Calculator with a pretrained model and tokenizer.

        Args:
            model_path (str): Path to the pretrained model or model name from Hugging Face's model hub.
            load_in_8bit (bool): Whether to load the model in 8-bit precision for memory efficiency.
            device_map (str): Device configuration for loading the model (e.g., 'auto' for automatic allocation).
        """
        # Load the tokenizer with trust_remote_code=True for custom models
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            trust_remote_code=True
        )
        
        # Load the model with the appropriate precision settings
        if load_in_8bit:
            # Use 8-bit quantization for memory-efficient model loading
            quantization_config = transformers.BitsAndBytesConfig(load_in_8bit=True)
            self.model = AutoModelForCausalLM.from_pretrained(
                model_path,
                quantization_config=quantization_config,
                device_map=device_map,
                trust_remote_code=True
            )
        else:
            # Load the model in full precision (float16 for GPUs, float32 otherwise)
            self.model = AutoModelForCausalLM.from_pretrained(
                model_path,
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                device_map=device_map,
                trust_remote_code=True
            )

        # Initialize the loss function for calculating perplexity
        self.loss_fct = torch.nn.CrossEntropyLoss(reduction='none')
        
        # Set the model to evaluation mode
        self.model.eval()

    def get_perplexity(self, input_texts: Union[str, List[str]], batch_size: int = 8) -> Union[float, List[float]]:
        """
        Compute the perplexity for a single text or a batch of texts.

        Args:
            input_texts (Union[str, List[str]]): A single string or a list of strings for perplexity computation.
            batch_size (int): Number of texts to process in a single batch.

        Returns:
            Union[float, List[float]]: The perplexity for the input text(s).
        """
        # Check if the input is a single text or a list of texts
        single_input = isinstance(input_texts, str)
        input_texts = [input_texts] if single_input else input_texts
        
        # List to store the loss values for each text
        loss_list = []

        # Process the input texts in batches
        for i in range(0, len(input_texts), batch_size):
            batch_texts = input_texts[i:i + batch_size]
            with torch.no_grad():
                # Tokenize the batch of texts
                model_inputs = self.tokenizer(
                    batch_texts,
                    return_tensors='pt',
                    padding=True,
                    truncation=True,
                    max_length=512,
                    add_special_tokens=True,
                ).to(self.model.device)

                # Forward pass through the model
                output = self.model(**model_inputs)
                logits = output.logits

                # Shift the logits and labels to align predictions with the next token
                shift_logits = logits[..., :-1, :].contiguous()
                shift_labels = model_inputs['input_ids'][..., 1:].contiguous()

                # Calculate the loss for the batch
                loss = self.loss_fct(
                    shift_logits.view(-1, shift_logits.size(-1)),
                    shift_labels.view(-1)
                )

                # Normalize the loss per token and store the results
                batch_losses = loss.view(len(batch_texts), -1).sum(dim=1) / model_inputs['attention_mask'].sum(dim=1)
                loss_list.extend(batch_losses.cpu().tolist())

        # Convert loss to perplexity (PPL = exp(loss))
        ppl = [np.exp(i) for i in loss_list]
        
        # Return a single value if input was a single text, else return a list
        return ppl[0] if single_input else ppl

    def clear_memory(self) -> None:
        """
        Clear memory by deleting the model and tokenizer to free up resources.
        """
        del self.model
        del self.tokenizer
        gc.collect()


# GeneticAlgorithm class :

In [2]:
import random
import gc

# Define the GeneticAlgorithm class
class GeneticAlgorithm:
    def __init__(self, random_state, population_size, generations, mutation_rate, elite_rate):
        """
        Initialize the GeneticAlgorithm class.

        Parameters:
        - random_state (int): Seed for random number generation to ensure reproducibility.
        - population_size (int): Number of individuals in the population.
        - generations (int): Number of generations to run the algorithm.
        - mutation_rate (float): Probability of mutation for each individual.
        - elite_rate (float): Fraction of the population to retain as elite individuals.
        """
        random.seed(random_state)
        self.population_size = population_size
        self.generations = generations
        self.mutation_rate = mutation_rate
        self.elite_rate = elite_rate

    def order_crossover(self, parent1, parent2):
        """
        Perform order crossover between two parents to produce an offspring.

        Parameters:
        - parent1 (list): First parent individual.
        - parent2 (list): Second parent individual.

        Returns:
        - child (list): New individual created from parents.
        """
        # Select a random slice of the parent1
        start, end = sorted(random.sample(range(len(parent1)), 2))
        child = [None] * len(parent1)
        child[start:end+1] = parent1[start:end+1]

        # Fill remaining positions using elements from parent2
        remaining = parent2.copy()
        for used_word in child:
            if used_word is not None:
                remaining.remove(used_word)
        j = 0
        for i in range(len(child)):
            if child[i] is None:
                child[i] = remaining[j]
                j += 1
        return child

    def mutate(self, individual):
        """
        Apply mutation to an individual by swapping two positions with a given probability.

        Parameters:
        - individual (list): The individual to mutate.

        Returns:
        - individual (list): Mutated individual.
        """
        if random.random() < self.mutation_rate:
            i, j = random.sample(range(len(individual)), 2)
            individual[i], individual[j] = individual[j], individual[i]
        return individual

    def gradient_descent(self, sequence, scorer, learning_rate=0.01, max_iterations=20):
        """
        Apply a simple gradient descent-inspired optimization to refine an individual.

        Parameters:
        - sequence (list): Initial sequence to optimize.
        - scorer (object): Scorer object to evaluate perplexity.
        - learning_rate (float): Not used directly, kept for extension purposes.
        - max_iterations (int): Maximum number of iterations for optimization.

        Returns:
        - best_sequence (list): Optimized sequence with the lowest perplexity.
        """
        current_sequence = sequence[:]
        best_sequence = sequence[:]
        best_perplexity = scorer.get_perplexity(' '.join(best_sequence))

        # Iteratively improve the sequence by swapping elements
        for _ in range(max_iterations):
            neighbor = current_sequence[:]
            i, j = random.sample(range(len(neighbor)), 2)
            neighbor[i], neighbor[j] = neighbor[j], neighbor[i]
            neighbor_perplexity = scorer.get_perplexity(' '.join(neighbor))
            delta = neighbor_perplexity - best_perplexity
            if delta < 0:  # If the perplexity improves
                current_sequence = neighbor[:]
                if neighbor_perplexity < best_perplexity:
                    best_sequence = neighbor[:]
                    best_perplexity = neighbor_perplexity

        return best_sequence

    def solve(self, text, scorer):
        """
        Solve the optimization problem using the genetic algorithm.

        Parameters:
        - text (str): Input text to optimize.
        - scorer (object): Scorer object to calculate perplexity.

        Returns:
        - best_sequence (str): Optimized sequence as a single string.
        - best_energy (float): Perplexity of the best sequence.
        - log_energies (list): List of best perplexities per generation.
        """
        # Split the text into words and create the initial population
        words = text.split()
        word_N = len(words)
        population = [random.sample(words, word_N) for _ in range(self.population_size)]
        log_energies = []

        # Main loop for generations
        for generation in range(self.generations):
            # Calculate fitness (perplexity) for all individuals
            fitness_scores = [scorer.get_perplexity(' '.join(ind)) for ind in population]
            min_perplexity = min(fitness_scores)
            log_energies.append(min_perplexity)
            print(f"Generation {generation + 1}/{self.generations} - Best Perplexity: {min_perplexity}")

            # Select elite individuals based on perplexity
            elite_indices = sorted(range(len(fitness_scores)),
                                   key=lambda k: fitness_scores[k])[:int(self.population_size * self.elite_rate)]
            new_population = [population[i] for i in elite_indices]

            # Create new individuals via crossover until the population is restored
            while len(new_population) < self.population_size:
                parent1, parent2 = random.sample(population, 2)
                child = self.order_crossover(parent1, parent2)
                new_population.append(child)

            # Mutate new population
            for i in range(len(new_population)):
                new_population[i] = self.mutate(new_population[i])

            # Apply gradient descent to elite individuals for further improvement
            for i in range(int(len(new_population) * self.elite_rate)):
                new_population[i] = self.gradient_descent(new_population[i], scorer)

            population = new_population

        # Calculate final fitness scores and select the best individual
        fitness_scores = [scorer.get_perplexity(' '.join(ind)) for ind in population]
        best_individual = population[np.argmin(fitness_scores)]
        best_energy = min(fitness_scores)
        print("\nGenetic Algorithm Optimization Complete")
        print(f"Best Perplexity: {best_energy}")

        return ' '.join(best_individual), best_energy, log_energies


# Optimizing Text Sequences with Genetic Algorithm for Perplexity Minimization

We have split the data into two parts to optimize processing time and resource usage. The first batch contains sequences 1 to 5, while the second batch contains sequence 6. This division allows us to handle smaller chunks of data at a time, minimizing the computational load and ensuring more efficient use of resources during processing.

This script processes and optimizes text sequences using a Genetic Algorithm (GA) for the first batch of a given dataset.

In [4]:
# Function to optimize sequences using Genetic Algorithm (GA) for the first batch (10 generations)
def optimize_sequences_with_ga(batch_size=5):
    """
    Optimize text sequences using a Genetic Algorithm (GA) for the first batch of data.

    Parameters:
    - batch_size (int): Number of rows to process per batch.

    Returns:
    - submission (DataFrame): DataFrame containing the optimized text sequences.
    """
    # Load the sample submission file
    sample_submission = pd.read_csv("/kaggle/input/santa1/sample_submission.csv")
    results = []

    # Initialize the Genetic Algorithm with specific parameters
    ga = GeneticAlgorithm(
        random_state=42,      # Seed for reproducibility
        population_size=10,   # Number of individuals in the population
        generations=20,       # Number of generations for optimization
        mutation_rate=0.1,    # Mutation probability for individuals
        elite_rate=0.2        # Fraction of population considered as elite
    )

    # Determine the number of batches based on batch size
    num_batches = (len(sample_submission) + batch_size - 1) // batch_size
    batch_idx = 0  # Process only the first batch

    # Define the data range for the current batch
    batch_start = batch_idx * batch_size
    batch_end = min((batch_idx + 1) * batch_size, len(sample_submission))
    batch_data = sample_submission.iloc[batch_start:batch_end]

    print(f"\nProcessing batch {batch_idx + 1}/{num_batches}")

    try:
        # Initialize the scorer (PerplexityCalculator) for text evaluation
        scorer = PerplexityCalculator(
            model_path='/kaggle/input/gemma-2/transformers/gemma-2-9b/2',  # Path to the language model
            load_in_8bit=False  # Model loading option
        )

        # Iterate over rows in the batch
        for idx, row in batch_data.iterrows():
            # Create a specific DataFrame for the current row
            specific_solution = pd.DataFrame({'id': [row['id']], 'text': [row['text']]})
            text_sequence = row['text'].split()

            # Optimize the text sequence using the Genetic Algorithm
            optimized_text, final_score, log_energies = ga.solve(row['text'], scorer)

            # Log the result for the current sequence
            print(f"ID: {row['id']}, Final Perplexity: {final_score}")
            results.append({'id': row['id'], 'text': optimized_text})

        # Save intermediate results to a temporary CSV file
        temp_df = pd.DataFrame(results)
        temp_df.to_csv(f"submission_temp_batch_{batch_idx+1}.csv", index=False)

    except Exception as e:
        # Handle exceptions and log any issues encountered during processing
        print(f"Error processing batch {batch_idx + 1}: {str(e)}")
        for idx, row in batch_data.iterrows():
            # If an error occurs, retain the original text sequence
            results.append({'id': row['id'], 'text': row['text']})
            print("results >>>>>>>>>>>>>>>>")
            print({'id': row['id'], 'text': row['text']})
            print("------------------------------------------------")

    # Final submission for the first batch
    submission = pd.DataFrame(results)
    submission.to_csv("batch1_submission.csv", index=False)
    return submission

if __name__ == "__main__":
    # Entry point of the script
    print("Starting GA optimization...")
    final_submission = optimize_sequences_with_ga()
    print("Optimization with GA completed!")


Starting GA optimization...

Processing batch 1/2


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Generation 1/20 - Best Perplexity: 850.8240325202685
Generation 2/20 - Best Perplexity: 621.4654194025813
Generation 3/20 - Best Perplexity: 610.160500583438
Generation 4/20 - Best Perplexity: 374.05994496126846
Generation 5/20 - Best Perplexity: 374.05994496126846
Generation 6/20 - Best Perplexity: 350.0905961076616
Generation 7/20 - Best Perplexity: 336.633887513654
Generation 8/20 - Best Perplexity: 336.633887513654
Generation 9/20 - Best Perplexity: 317.99778895332923
Generation 10/20 - Best Perplexity: 317.99778895332923
Generation 11/20 - Best Perplexity: 317.99778895332923
Generation 12/20 - Best Perplexity: 317.99778895332923
Generation 13/20 - Best Perplexity: 317.99778895332923
Generation 14/20 - Best Perplexity: 317.99778895332923
Generation 15/20 - Best Perplexity: 317.99778895332923
Generation 16/20 - Best Perplexity: 317.99778895332923
Generation 17/20 - Best Perplexity: 317.99778895332923
Generation 18/20 - Best Perplexity: 317.99778895332923
Generation 19/20 - Best Perp

# Genetic Algorithm Optimization for Text Sequences: Second Batch Processing

In [5]:
# Function to optimize sequences with GA for second batch
def optimize_sequences_with_ga(batch_size=5):
    sample_submission = pd.read_csv("/kaggle/input/santa1/sample_submission.csv")
    results = []

    ga = GeneticAlgorithm(
        random_state=42,
        population_size=10,
        generations=20,
        mutation_rate=0.1,
        elite_rate=0.2
    )

    # Calculate the total number of batches
    num_batches = (len(sample_submission) + batch_size - 1) // batch_size

    # Set batch_idx to 1 for the second batch (index starts at 0)
    batch_idx = 1  # Only process the second batch
    
    # Calculate the start and end index for the second batch
    batch_start = batch_idx * batch_size
    batch_end = min((batch_idx + 1) * batch_size, len(sample_submission))
    batch_data = sample_submission.iloc[batch_start:batch_end]

    print(f"\nProcessing batch {batch_idx + 1}/{num_batches}")

    try:
        scorer = PerplexityCalculator(
            model_path='/kaggle/input/gemma-2/transformers/gemma-2-9b/2',
            load_in_8bit=False
        )

        for idx, row in batch_data.iterrows():
            specific_solution = pd.DataFrame({'id': [row['id']], 'text': [row['text']]})
            text_sequence = row['text'].split()

            optimized_text, final_score, log_energies = ga.solve(row['text'], scorer)

            print(f"ID: {row['id']}, Final Perplexity: {final_score}")
            results.append({'id': row['id'], 'text': optimized_text})

        temp_df = pd.DataFrame(results)
        temp_df.to_csv(f"submission_temp_batch_{batch_idx+1}.csv", index=False)

    except Exception as e:
        print(f"Error processing batch {batch_idx + 1}: {str(e)}")
        for idx, row in batch_data.iterrows():
            results.append({'id': row['id'], 'text': row['text']})
            print("results >>>>>>>>>>>>>>>>")
            print({'id': row['id'], 'text': row['text']})
            print("------------------------------------------------")

    # Final submission for the second batch
    submission = pd.DataFrame(results)
    submission.to_csv("submission_id-5.csv", index=False)
    return submission

if __name__ == "__main__":
    print("Starting GA optimization...")
    final_submission = optimize_sequences_with_ga()
    print("Optimization with GA completed!")


Starting GA optimization...

Processing batch 2/2


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Generation 1/20 - Best Perplexity: 786.1271512208434
Generation 2/20 - Best Perplexity: 699.2579777721731
Generation 3/20 - Best Perplexity: 608.8964823056507
Generation 4/20 - Best Perplexity: 571.1810975204792
Generation 5/20 - Best Perplexity: 529.6956665489097
Generation 6/20 - Best Perplexity: 520.519328864813
Generation 7/20 - Best Perplexity: 485.67502259252456
Generation 8/20 - Best Perplexity: 453.0076893306675
Generation 9/20 - Best Perplexity: 424.7089026586955
Generation 10/20 - Best Perplexity: 413.866201188035
Generation 11/20 - Best Perplexity: 404.86913074332483
Generation 12/20 - Best Perplexity: 402.5139595181153
Generation 13/20 - Best Perplexity: 399.896852844494
Generation 14/20 - Best Perplexity: 395.08999673949427
Generation 15/20 - Best Perplexity: 390.33161385620883
Generation 16/20 - Best Perplexity: 388.277753654066
Generation 17/20 - Best Perplexity: 382.49290715563285
Generation 18/20 - Best Perplexity: 380.9027081990725
Generation 19/20 - Best Perplexity: 