In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the mounted Kaggle input directory and print the full path of every
# file found, so the model checkpoint files can be located.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/gemma/transformers/2b/2/model.safetensors.index.json
/kaggle/input/gemma/transformers/2b/2/gemma-2b.gguf
/kaggle/input/gemma/transformers/2b/2/config.json
/kaggle/input/gemma/transformers/2b/2/model-00001-of-00002.safetensors
/kaggle/input/gemma/transformers/2b/2/model-00002-of-00002.safetensors
/kaggle/input/gemma/transformers/2b/2/tokenizer.json
/kaggle/input/gemma/transformers/2b/2/tokenizer_config.json
/kaggle/input/gemma/transformers/2b/2/special_tokens_map.json
/kaggle/input/gemma/transformers/2b/2/.gitattributes
/kaggle/input/gemma/transformers/2b/2/tokenizer.model
/kaggle/input/gemma/transformers/2b/2/generation_config.json


In [4]:
# !pip install bert-score
# !pip install rouge-score
# from nltk.translate.bleu_score import sentence_bleu
# from rouge_score import rouge_scorer
import nltk
# from transformers.modeling_utils import prune_linear_layer
from collections import defaultdict
from datasets import load_dataset
from tqdm import tqdm
from torch.utils.data.dataset import Dataset
import gc

In [None]:
class IndexDataset(Dataset):
    """Map-style dataset that serves items of a pre-built container by position.

    ``tensors`` is stored exactly as given; it only has to support ``len()``
    and integer indexing (a 2-D tensor yields one row per item).
    """

    def __init__(self, tensors):
        self.tensors = tensors

    def __len__(self):
        # The dataset is exactly as long as the wrapped container.
        return len(self.tensors)

    def __getitem__(self, index):
        # Delegate straight to the wrapped container's own indexing.
        return self.tensors[index]

def process_data(samples, tokenizer, seq_len, field_name):
    """Tokenize a text column and cut it into fixed-length evaluation windows.

    The strings in ``samples[field_name]`` are joined with blank lines and
    tokenized in a single call. The resulting id sequence is split into
    consecutive, non-overlapping windows of ``seq_len`` tokens; tokens left
    over after the last full window are dropped.

    Args:
        samples: Mapping whose ``field_name`` entry is an iterable of strings.
        tokenizer: Callable that, given the text and ``return_tensors='pt'``,
            returns an object whose ``input_ids`` can be indexed with ``[0]``
            to obtain a 1-D tensor of token ids.
        seq_len: Number of tokens per window; must be positive.
        field_name: Key of the text column inside ``samples``.

    Returns:
        IndexDataset wrapping a ``(num_windows, seq_len)`` tensor of token ids.

    Raises:
        ValueError: If ``seq_len`` is not positive, or if the tokenized text
            is shorter than a single window.
    """
    if seq_len <= 0:
        raise ValueError(f"seq_len must be positive, got {seq_len}")

    test_ids = tokenizer("\n\n".join(samples[field_name]), return_tensors='pt').input_ids[0]
    nsamples = test_ids.numel() // seq_len
    if nsamples == 0:
        # Stacking an empty list of windows would fail with an opaque torch
        # error; report the actual cause instead.
        raise ValueError(
            f"tokenized text has {test_ids.numel()} tokens, fewer than one window of seq_len={seq_len}"
        )

    # Drop the ragged tail, then view the rest as (nsamples, seq_len) windows.
    test_ids_batch = test_ids[:nsamples * seq_len].reshape(nsamples, seq_len)
    return IndexDataset(tensors=test_ids_batch)
       

def get_loaders(tokenizer, seq_len=2048, batch_size = 8, max_rows=100):
    """Build a DataLoader over tokenized windows of the WikiText-2 test split.

    Args:
        tokenizer: Tokenizer forwarded to ``process_data``.
        seq_len: Number of tokens per evaluation window.
        batch_size: Number of windows per batch.
        max_rows: How many leading rows of the test split to use. The default
            of 100 keeps the previous behaviour; pass ``None`` to evaluate on
            the whole split.

    Returns:
        A non-shuffled ``torch.utils.data.DataLoader`` yielding tensors of
        token ids, one row per window.
    """
    test_data = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')

    # Slicing the split returns a dict of columns; ``[:None]`` selects all rows.
    test_dataset = process_data(test_data[:max_rows], tokenizer, seq_len, 'text')

    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    return test_loader

In [30]:
def PPLMetric(model, tokenizer, seq_len=128, batch_size = 4, device="cuda"):
    """Evaluate ``model`` on the test loader, print the perplexity and return it.

    The loader comes from ``get_loaders`` using ``seq_len`` and ``batch_size``;
    the perplexity itself is computed by ``ppl_eval`` on ``device``.
    """
    loader = get_loaders(tokenizer, seq_len=seq_len, batch_size=batch_size)
    perplexity = ppl_eval(model, loader, device)
    print(perplexity)
    return perplexity


def ppl_eval(model, test_lodaer, device):
    """Compute the perplexity of ``model`` over the batches of ``test_lodaer``.

    For every batch of token ids the model's logits at position ``t`` are
    scored against the token at position ``t + 1``. Perplexity is ``exp`` of
    the mean per-token cross-entropy over all batches.

    Args:
        model: Callable returning an object with a ``logits`` attribute when
            given a batch of token ids.
        test_lodaer: Iterable of 2-D token-id tensors (rows are sequences).
        device: Device the batches are moved to before the forward pass.

    Returns:
        The perplexity as a Python float.

    Raises:
        ValueError: If ``test_lodaer`` yields no batches.
    """
    # The loss object is stateless, so build it once instead of per batch.
    loss_fct = torch.nn.CrossEntropyLoss(reduction="none")
    nlls = []
    with torch.no_grad():
        for batch in tqdm(test_lodaer):
            batch = batch.to(device)
            # Forward pass (the notebook author flagged this call with "CHANGE THIS").
            lm_logits = model(batch).logits

            shift_logits = lm_logits[:, :-1, :].contiguous()
            shift_labels = batch[:, 1:].contiguous()

            loss = loss_fct(shift_logits.reshape(-1, shift_logits.size(-1)), shift_labels.reshape(-1))
            # Keep only a CPU copy of the per-token losses so accelerator
            # memory does not grow with the number of batches.
            nlls.append(loss.cpu())
            # Drop the large intermediates before the next forward pass.
            del batch, lm_logits, shift_logits, shift_labels, loss

    if not nlls:
        raise ValueError("test_lodaer produced no batches; cannot compute perplexity")

    # One cleanup after the loop is enough: reference counting already frees
    # the per-batch tensors, so repeating this inside the loop only costs time.
    gc.collect()
    torch.cuda.empty_cache()

    ppl = np.exp(torch.cat(nlls, dim=-1).mean().item())
    return float(ppl)

In [48]:
def make_model():
    """Load a fresh Gemma causal LM from ``model_name`` and move it to ``device``.

    Both ``model_name`` and ``device`` are read from module-level globals.

    A function cannot delete a model object held by its caller: a
    ``del model`` in here only targets this function's own, not-yet-assigned
    local and always fails. Callers that want the previous model's memory
    released must drop their own reference (``del model``) before calling.
    What this function can do is reclaim memory of models that are already
    unreferenced, which it does before loading.

    Returns:
        The newly loaded model, placed on ``device``.
    """
    # Collect unreachable objects first so their cached accelerator blocks
    # can actually be handed back by empty_cache().
    gc.collect()
    torch.cuda.empty_cache()

    model = GemmaForCausalLM.from_pretrained(model_name)  # Use the appropriate model class
    model.to(device)
    return model

In [13]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModel, GemmaForCausalLM 
import torch
# Local directory of the Gemma 2B checkpoint (matches the file listing printed earlier).
model_name = '/kaggle/input/gemma/transformers/2b/2'  
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained(model_name)
# make_model() reads the `model_name` and `device` globals defined above.
model = make_model()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [14]:
import random
def create_random_binary_list(length, percentage_of_zeros):
    """Return a shuffled list of 0s and 1s with a fixed share of zeros.

    ``percentage_of_zeros`` is a fraction: the list holds
    ``int(length * percentage_of_zeros)`` zeros and ``length`` minus that
    many ones. The order is randomised with ``random.shuffle``.
    """
    zero_count = int(length * percentage_of_zeros)

    # Zeros first, then ones; the shuffle below mixes them.
    genes = [0] * zero_count
    genes.extend([1] * (length - zero_count))

    random.shuffle(genes)
    return genes

def initialize_chromosome(num_genes):
    """Create a random chromosome of ``num_genes`` genes.

    The share of zero genes is taken from the module-level ``SPARSITY_RATE``.
    """
    return create_random_binary_list(length=num_genes, percentage_of_zeros=SPARSITY_RATE)

In [15]:
def modify_model(model, chromosome):
    """Disable the attention heads whose gene is 0 by zeroing weights in place.

    Gene ``i`` addresses query head ``i % num_attention_heads`` of layer
    ``i // num_attention_heads``. For every pruned head:

    * its rows of ``q_proj`` and its columns of ``o_proj`` are zeroed, so the
      head contributes nothing to the layer output. Zeroing ``q_proj`` alone
      would leave a head that attends uniformly and still adds values.
    * the key/value head it reads from is zeroed only when *every* query head
      sharing that key/value head is pruned. With grouped or multi-query
      attention (``num_key_value_heads < num_attention_heads``) the K/V
      projections are shared, so zeroing them for a single pruned query head
      would silence the other heads as well.

    Args:
        model: Model exposing ``config`` (``num_attention_heads``,
            ``num_hidden_layers``, ``head_dim`` and optionally
            ``num_key_value_heads``) and ``model.layers[i].self_attn`` with
            ``q_proj``, ``k_proj``, ``v_proj`` and ``o_proj`` linear layers.
        chromosome: Sequence of 0/1 genes, one per (layer, head) pair.

    Returns:
        The same ``model`` object, modified in place.

    Raises:
        ValueError: If ``chromosome`` has more genes than the model has heads.
    """
    config = model.config
    num_heads = config.num_attention_heads
    num_blocks = config.num_hidden_layers
    head_dim = config.head_dim
    # Configs without the attribute are treated as plain multi-head attention.
    num_kv_heads = getattr(config, "num_key_value_heads", None) or num_heads
    # Number of query heads that share one key/value head.
    group_size = max(1, num_heads // num_kv_heads)

    if len(chromosome) > num_heads * num_blocks:
        raise ValueError(
            f"chromosome has {len(chromosome)} genes but the model only has "
            f"{num_heads * num_blocks} attention heads"
        )

    # Collect the heads to disable, grouped by layer index.
    heads_to_prune = defaultdict(list)
    for i, gene in enumerate(chromosome):
        if gene == 0:
            heads_to_prune[i // num_heads].append(i % num_heads)

    if not heads_to_prune:
        return model

    print("Pruning heads in model")
    with torch.no_grad():
        # Iterate over exactly the layers that have pruned heads; a
        # range() over a leaked loop variable would skip the last one.
        for block, heads in heads_to_prune.items():
            attn = model.model.layers[block].self_attn
            pruned = set(heads)

            for head in heads:
                start_index = head * head_dim
                end_index = (head + 1) * head_dim
                attn.q_proj.weight[start_index:end_index, :] = 0
                attn.o_proj.weight[:, start_index:end_index] = 0

            for kv_head in range(num_kv_heads):
                group = range(kv_head * group_size, (kv_head + 1) * group_size)
                if all(head in pruned for head in group):
                    kv_start = kv_head * head_dim
                    kv_end = (kv_head + 1) * head_dim
                    attn.k_proj.weight[kv_start:kv_end, :] = 0
                    attn.v_proj.weight[kv_start:kv_end, :] = 0

    return model

In [38]:
def evaluate_fitness(chromosome):
    """Score a pruning mask: higher return value means a better chromosome.

    A fresh model is loaded, the heads marked 0 in ``chromosome`` are
    disabled, and perplexity is measured with ``PPLMetric``. Lower perplexity
    is better, whereas the genetic algorithm maximises fitness (``argmax``,
    fitness-proportional selection), so the fitness returned is the inverse
    perplexity.

    Args:
        chromosome: Sequence of 0/1 genes, one per (layer, head) pair.

    Returns:
        ``1 / perplexity`` as a float, or ``0.0`` when the perplexity is not a
        finite positive number (e.g. it overflowed to ``inf``).
    """
    import math  # local import keeps this cell self-contained

    model = make_model()
    model = modify_model(model, chromosome)
    # Evaluate on the same device make_model() placed the model on.
    metric = PPLMetric(model, tokenizer, device=device)

    # Release this candidate before the next one is loaded.
    del model
    gc.collect()
    torch.cuda.empty_cache()

    if not math.isfinite(metric) or metric <= 0:
        return 0.0
    return 1.0 / metric

In [17]:
# Fraction of genes set to 0 when a chromosome is initialised (see initialize_chromosome).
SPARSITY_RATE = 0.3

In [18]:
def find_size(model):
    """Print the total size of ``model``'s parameters in megabytes.

    The size is the sum over all parameters of element count times bytes per
    element, converted with 1 MB = 1024 ** 2 bytes. Nothing is returned.
    """
    bytes_per_mb = 1024 ** 2
    size_bytes = 0
    for param in model.parameters():
        size_bytes += param.numel() * param.element_size()
    size_mb = size_bytes / bytes_per_mb
    print(f"Model size: {size_mb:.2f} MB")

In [21]:
# Baseline: perplexity and parameter size of the currently loaded model.
# model = make_model()
metric = PPLMetric(model, tokenizer)
find_size(model)

100%|██████████| 564/564 [03:24<00:00,  2.76it/s]


648.7878413946279
Model size: 9560.29 MB


In [22]:
# Sample one random chromosome with one gene per (layer, attention head) pair.
chrom = initialize_chromosome(model.config.num_attention_heads*model.config.num_hidden_layers)
# model = modify_model(model, chrom)
# find_size(model)

In [None]:
!nvidia-smi


In [23]:
# Number of chromosomes per population (read by initialize_population and genetic_algorithm).
POPN_SIZE = 8
# Probability that crossover() recombines a parent pair instead of passing it through.
crossover_rate = 0.7
# Per-gene probability threshold used by mutate().
mutation_rate = 0.08


In [24]:
def initialize_population(chromosome_length):
    """Return ``POPN_SIZE`` independently generated random chromosomes.

    Each chromosome has ``chromosome_length`` genes and comes from
    ``initialize_chromosome``; the result is a plain list of chromosomes.
    """
    return [initialize_chromosome(chromosome_length) for _ in range(POPN_SIZE)]

In [25]:
def select_parents(population, fitness_scores, num_parents):
    """Pick parents by fitness-proportional (roulette wheel) selection.

    Individuals are drawn with replacement, each with probability
    proportional to its fitness. When the scores cannot form a probability
    distribution (any score negative or non-finite, or a total of zero) the
    draw falls back to uniform selection instead of failing on NaN weights.

    Args:
        population: Sequence (list or 2-D array) of chromosomes.
        fitness_scores: One non-negative score per chromosome.
        num_parents: Number of parents to draw.

    Returns:
        ``np.ndarray`` holding the ``num_parents`` selected chromosomes.
    """
    scores = np.asarray(fitness_scores, dtype=float)
    total_fitness = scores.sum()

    usable = bool(np.all(np.isfinite(scores)) and np.all(scores >= 0) and total_fitness > 0)
    if usable:
        # Normalise to a probability distribution.
        weights = (scores / total_fitness).tolist()
    else:
        # weights=None makes random.choices draw uniformly.
        weights = None

    selected_parents = random.choices(population, weights=weights, k=num_parents)
    return np.array(selected_parents)

In [26]:
# Crossover (Single-point crossover)
def crossover(parent1, parent2, rate=None):
    """Single-point crossover of two equally long chromosomes.

    With probability ``rate`` a cut point is drawn uniformly from
    ``1 .. len(parent1) - 1`` and the tails of the parents are swapped.
    Otherwise, or when the chromosomes are too short to have an interior cut
    point, the children are copies of the parents.

    Args:
        parent1: First parent chromosome (list or 1-D array).
        parent2: Second parent chromosome, same length as ``parent1``.
        rate: Crossover probability; defaults to the module-level
            ``crossover_rate``.

    Returns:
        Tuple ``(child1, child2)`` of new ``np.ndarray`` objects. They never
        alias the parents, so modifying a child in place cannot corrupt a
        parent.
    """
    if rate is None:
        rate = crossover_rate

    length = len(parent1)
    # A cut needs at least one gene on each side.
    if length >= 2 and np.random.rand() < rate:
        # randint's upper bound is exclusive, so the cut lies in [1, length - 1].
        point = np.random.randint(1, length)
        child1 = np.concatenate([parent1[:point], parent2[point:]])
        child2 = np.concatenate([parent2[:point], parent1[point:]])
    else:
        child1, child2 = np.array(parent1), np.array(parent2)
    return child1, child2

In [27]:
# Mutation (Flip bit mutation)
def mutate(chromosome, heads_per_layer=None, rate=None, sparsity=None):
    """Flip-bit mutation followed by sparsity and per-layer repairs (in place).

    Steps, in order:

    1. Every gene is flipped with probability ``rate``.
    2. A second pass nudges the number of 1s back towards the target
       ``len(chromosome) - int(len(chromosome) * sparsity)``: with
       probability ``rate`` per gene, a surplus 1 is cleared or a missing 1
       is set. The repair is stochastic, so the target is approached but not
       guaranteed.
    3. Every layer (consecutive run of ``heads_per_layer`` genes) that ended
       up with no active head gets its first head re-enabled. This runs last
       so the guarantee holds for the returned chromosome.

    Args:
        chromosome: Mutable sequence of 0/1 genes; modified in place.
        heads_per_layer: Genes per layer; defaults to
            ``model.config.num_attention_heads`` of the module-level model.
        rate: Per-gene probability; defaults to the module-level
            ``mutation_rate``.
        sparsity: Target fraction of zeros; defaults to the module-level
            ``SPARSITY_RATE``.

    Returns:
        The same ``chromosome`` object.
    """
    if heads_per_layer is None:
        heads_per_layer = model.config.num_attention_heads
    if rate is None:
        rate = mutation_rate
    if sparsity is None:
        sparsity = SPARSITY_RATE

    length = len(chromosome)

    # 1) Flip-bit mutation.
    for i in range(length):
        if np.random.rand() < rate:
            chromosome[i] = 1 - chromosome[i]

    # 2) Steer the global count of active genes back towards the target.
    target_ones = length - int(length * sparsity)
    num_ones = int(np.sum(chromosome))
    for i in range(length):
        if np.random.rand() < rate:
            if chromosome[i] == 1 and num_ones > target_ones:
                chromosome[i] = 0  # Flip 1 to 0 only if there are too many 1s
                num_ones -= 1
            elif chromosome[i] == 0 and num_ones < target_ones:
                chromosome[i] = 1  # Flip 0 to 1 only if there are too few 1s
                num_ones += 1

    # 3) Ensure that each layer keeps at least one attention head.
    for start in range(0, length, heads_per_layer):
        if np.sum(chromosome[start:start + heads_per_layer]) == 0:
            chromosome[start] = 1

    return chromosome


In [28]:
def elitism_and_selection(population, fitness_scores, num_elites, num_parents):
    """Choose the parents of the next generation: elites plus roulette picks.

    The ``num_elites`` fittest individuals are kept unconditionally (in
    ascending fitness order). The remaining ``num_parents - num_elites`` slots
    are filled by ``select_parents`` from the non-elite individuals.

    Args:
        population: Sequence (list or 2-D array) of equally long chromosomes.
        fitness_scores: One score per chromosome; higher is better.
        num_elites: Number of top individuals to keep. Values outside
            ``0 .. min(len(population), num_parents)`` are clamped.
        num_parents: Total number of parents to return.

    Returns:
        2-D ``np.ndarray`` with one selected chromosome per row.
    """
    population = np.asarray(population)
    fitness_scores = np.asarray(fitness_scores, dtype=float)
    num_parents = int(num_parents)
    num_elites = max(0, min(int(num_elites), len(population), num_parents))

    # Elitism: indices of the top individuals. Slicing from an explicit start
    # avoids the ``[-0:]`` pitfall, which would select everyone for 0 elites.
    order = np.argsort(fitness_scores)
    elite_indices = order[len(order) - num_elites:]
    elites = population[elite_indices]

    num_to_select = num_parents - num_elites
    if num_to_select <= 0:
        # Elites fill every slot; nothing is left for the roulette wheel.
        return elites

    # Roulette wheel selection over the individuals that are not elites.
    remaining_population = np.delete(population, elite_indices, axis=0)
    remaining_fitness_scores = np.delete(fitness_scores, elite_indices)
    if len(remaining_population) == 0:
        # Every individual is an elite: sample the extra parents from the
        # whole population rather than from an empty pool.
        remaining_population, remaining_fitness_scores = population, fitness_scores

    selected_parents = select_parents(remaining_population, remaining_fitness_scores, num_to_select)

    if num_elites == 0:
        return selected_parents

    # Combine elites and selected parents.
    return np.vstack((elites, selected_parents))

In [34]:
# Load a fresh model via make_model() and rebind the global `model` to it.
model = make_model()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [35]:
# Cache the head and layer counts as globals; genetic_algorithm() multiplies
# them to get the chromosome length.
num_attention_heads = model.config.num_attention_heads
num_hidden_layers = model.config.num_hidden_layers

In [39]:
import matplotlib.pyplot as plt

def genetic_algorithm(num_generations, desired_sparsity, num_elites=4):
    """Search for a good head-pruning mask with a simple genetic algorithm.

    Each generation every chromosome is scored with ``evaluate_fitness``
    (higher is better), parents are chosen by ``elitism_and_selection`` and
    consecutive parent pairs are recombined with ``crossover``. Mutation is
    not applied.

    Args:
        num_generations: Number of generations to evaluate; must be >= 1.
        desired_sparsity: Currently unused; the initial sparsity comes from
            the module-level ``SPARSITY_RATE`` via ``initialize_population``.
            Kept for interface compatibility.
        num_elites: Number of top individuals carried into the parent pool
            each generation.

    Returns:
        The best chromosome seen in any evaluated generation, as an
        ``np.ndarray`` copy.

    Raises:
        ValueError: If ``num_generations`` is smaller than 1.
    """
    if num_generations < 1:
        raise ValueError(f"num_generations must be at least 1, got {num_generations}")

    population = initialize_population(num_attention_heads*num_hidden_layers)  # Initialize the population
    best_chromosome = None
    best_fitness = None

    for generation in range(num_generations):
        fitness_scores = np.array([evaluate_fitness(chrom) for chrom in population])
        top = int(np.argmax(fitness_scores))
        print("new fitness scores:", fitness_scores)
        print(f"best chromosome in generation {generation} is {population[top]} with accuracy {fitness_scores[top]}")

        # Remember the best individual across all generations, not just the
        # last one evaluated.
        if best_fitness is None or fitness_scores[top] > best_fitness:
            best_fitness = fitness_scores[top]
            best_chromosome = np.array(population[top])

        parents = elitism_and_selection(population, fitness_scores, num_elites, POPN_SIZE)

        new_population = []
        # Pair up consecutive parents; stop before a trailing unpaired one.
        for i in range(0, POPN_SIZE - 1, 2):
            parent1, parent2 = parents[i], parents[i + 1]
            child1, child2 = crossover(parent1, parent2)
            new_population.extend([child1, child2])
        if POPN_SIZE % 2 == 1:
            # Odd population size: carry the unpaired parent over unchanged.
            new_population.append(parents[POPN_SIZE - 1])
        population = np.array(new_population)

    return best_chromosome

In [None]:
# Run the search for 2 generations; SPARSITY_RATE is passed as desired_sparsity.
genetic_algorithm(2, SPARSITY_RATE)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Pruning heads in model


 10%|▉         | 54/564 [02:30<23:30,  2.77s/it]Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7b74bb513e50>>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 
 12%|█▏        | 68/564 [03:09<23:05,  2.79s/it]Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7b74bb513e50>>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 
 12%|█▏        | 69/564 [03:11<22:54,  2.78s/it]Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7b74bb513e50>>
Traceback (most rece