<a href="https://colab.research.google.com/github/anaghasid/GenPruning/blob/main/T5_Pruning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --quiet datasets
!pip install --quiet accelerate
!pip install --quiet bitsandbytes

In [None]:
import numpy as np
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig
from datasets import load_dataset, load_metric
import accelerate
from sklearn.metrics import accuracy_score
import time
from torch.utils.data import DataLoader
from collections import defaultdict

In [None]:
# Load the pretrained T5-small checkpoint with a sequence-classification head, plus its tokenizer.
model_name = "google-t5/t5-small"
# 4-bit quantization settings. NOTE: this config is built but NOT passed to
# `from_pretrained` below (the argument is commented out), so the model is
# loaded unquantized.
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True, # enable 4-bit quantization
    bnb_4bit_quant_type = 'nf4', # NormalFloat4: a 4-bit dtype designed for normally distributed weights
    bnb_4bit_use_double_quant = True, # also quantize the quantization constants
    bnb_4bit_compute_dtype = torch.bfloat16 # dtype used for computation
)
# Prefer the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): `trust_remote_code=True` allows code shipped with a checkpoint to run;
# it is presumably unnecessary for this checkpoint — confirm and drop if so.
model = AutoModelForSequenceClassification.from_pretrained(model_name, trust_remote_code=True,
                                                           device_map=device)
# Disabled option: pass `quantization_config=quantization_config` above to load in 4-bit.
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at google-t5/t5-small and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Load the validation split of the GLUE CoLA task (the config name passed is "cola", not SST-2).
dataset = load_dataset("glue", "cola", split='validation')
print("Sample from the dataset:", dataset[0])
# NOTE(review): `load_metric` is a legacy `datasets` API (newer releases point to
# `evaluate.load`) — confirm against the installed version. `metric` is only
# handed to `evaluate_fitness`, which does not read it.
metric = load_metric("glue", "cola")

Sample from the dataset: {'sentence': 'The sailors rode the breeze clear of the rocks.', 'label': 1, 'idx': 0}


In [None]:
# Number of examples in the loaded validation split.
print(len(dataset))

1043


In [None]:
# Preview the first ten rows (returned as a dict of column name -> list of values).
dataset[0:10]

{'sentence': ['The sailors rode the breeze clear of the rocks.',
  'The weights made the rope stretch over the pulley.',
  'The mechanical doll wriggled itself loose.',
  'If you had eaten more, you would want less.',
  'As you eat the most, you want the least.',
  'The more you would want, the less you would eat.',
  'I demand that the more John eat, the more he pays.',
  'Mary listens to the Grateful Dead, she gets depressed.',
  'The angrier Mary got, the more she looked at pictures.',
  'The higher the stakes, the lower his expectations are.'],
 'label': [1, 1, 1, 1, 0, 0, 0, 1, 1, 1],
 'idx': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}

In [None]:
# Tokenize the sentences so they can be fed to the model
def preprocess_function(examples):
    """Tokenize a batch of examples and carry the labels through.

    Reads ``examples['sentence']`` and ``examples['label']``. Every sentence is
    truncated / padded to exactly 128 tokens using the notebook-level
    ``tokenizer``. Returns the tokenizer output with a ``'label'`` entry added.
    """
    encoded = tokenizer(
        examples['sentence'],
        max_length=128,
        padding='max_length',
        truncation=True,
    )
    encoded['label'] = examples['label']
    return encoded

# Apply the tokenization batch-wise over the whole split
encoded_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/1043 [00:00<?, ? examples/s]

In [None]:
# Use DataLoader to create batches.
# `set_format` changes `encoded_dataset` in place so the listed columns come back as torch tensors.
# NOTE: the next cell repeats both statements; `evaluate_fitness` reads this
# notebook-level `dataloader`.
encoded_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
dataloader = DataLoader(encoded_dataset, batch_size=16)

In [None]:
# NOTE(review): this import would normally live in the import cell at the top of the notebook.
import pandas as pd
# Repeats the previous cell's formatting step (the columns are returned as torch tensors).
encoded_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
# Show full cell contents and keep the frame on one line when printing.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.expand_frame_repr', False)

# Rebuilds the same notebook-level dataloader as the previous cell.
dataloader = DataLoader(encoded_dataset, batch_size=16)


# Display a sample batch from the dataloader (only the first batch; see the `break`)
for batch in dataloader:
    # Convert the tensors to plain Python lists so they can go into a DataFrame.
    input_ids = batch['input_ids'].tolist()
    attention_masks = batch['attention_mask'].tolist()
    labels = batch['label'].tolist()

    truncated_input_ids = [ids[:10] for ids in input_ids]  # Show only the first 10 tokens
    truncated_attention_masks = [masks[:10] for masks in attention_masks]

    # One row per example in the batch.
    df = pd.DataFrame({
        'Input IDs': truncated_input_ids,
        'Attention Mask': truncated_attention_masks,
        'Label': labels
    })

    print(df)
    break


                                                Input IDs                  Attention Mask  Label
0      [37, 30899, 6102, 15, 8, 15825, 964, 13, 8, 12288]  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]      1
1        [37, 1293, 7, 263, 8, 13888, 6606, 147, 8, 3197]  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]      1
2   [37, 8168, 14295, 3, 210, 23983, 1361, 1402, 6044, 5]  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]      1
3         [156, 25, 141, 16929, 72, 6, 25, 133, 241, 705]  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]      1
4               [282, 25, 3, 1544, 8, 167, 6, 25, 241, 8]  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]      0
5              [37, 72, 25, 133, 241, 6, 8, 705, 25, 133]  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]      0
6              [27, 2173, 24, 8, 72, 1079, 3, 1544, 6, 8]  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]      0
7        [3790, 3011, 7, 12, 8, 350, 2206, 1329, 9651, 6]  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]      1
8           [37, 3, 1468, 6711, 3790, 530, 6, 8, 72, 255]  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]      1
9           [37, 1146, 8, 8474

In [None]:
# Training arguments.
# NOTE: within the visible notebook these are only referenced by the Trainer
# cell that is disabled (wrapped in a string), so they currently have no effect.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_eval_batch_size=8,
    logging_dir="./logs",
)



In [None]:

# Modify model based on chromosome
def modify_model(model, chromosome, verbose=True):
    """Prune self-attention heads of a T5-style model in place, as selected by a chromosome.

    Gene ``i`` addresses head ``i % num_heads`` of block ``i // num_heads``.
    A gene equal to 0 (``False`` also compares equal to 0) marks that head for
    pruning; any other value keeps it. For every affected block the same head
    indices are pruned from the encoder self-attention and the decoder
    self-attention. Decoder cross-attention is not touched.

    Args:
        model: Object exposing ``config.num_heads``, ``config.num_layers`` and
            ``transformer.encoder.block`` / ``transformer.decoder.block``, where
            ``block[b].layer[0].SelfAttention`` offers ``prune_heads(list)``.
        chromosome: Sequence of genes. It may be shorter than
            ``num_heads * num_layers`` (the unaddressed heads are kept); an empty
            sequence leaves the model untouched.
        verbose: When true (default) print the same progress information the
            notebook originally printed; pass ``False`` to silence it, e.g.
            when called many times from a search loop.

    Raises:
        ValueError: If the chromosome addresses more heads than
            ``num_heads * num_layers``. This is raised before anything is pruned,
            so the model is never left half-modified.

    Note:
        Pruning is irreversible and the model is modified in place. Calling
        this repeatedly on the same model presumably accumulates pruning —
        reload or copy the model to evaluate independent chromosomes.
    """
    num_heads = model.config.num_heads
    num_blocks = model.config.num_layers
    if verbose:
        print(num_blocks)

    # Validate up front: indexing a non-existent block would otherwise fail
    # only after earlier blocks had already been pruned.
    max_genes = num_heads * num_blocks
    if len(chromosome) > max_genes:
        raise ValueError(
            f"chromosome has {len(chromosome)} genes but the model only has "
            f"{max_genes} heads ({num_blocks} blocks x {num_heads} heads)"
        )

    # Collect, per block, the indices of the heads whose gene is 0.
    heads_to_prune = defaultdict(list)
    for i, gene in enumerate(chromosome):
        if gene == 0:
            heads_to_prune[i // num_heads].append(i % num_heads)

    if verbose:
        print(heads_to_prune)

    for block_num, heads in heads_to_prune.items():
        if verbose:
            print("Pruning")
        encoder_attention = model.transformer.encoder.block[block_num].layer[0].SelfAttention
        decoder_attention = model.transformer.decoder.block[block_num].layer[0].SelfAttention
        encoder_attention.prune_heads(heads)
        decoder_attention.prune_heads(heads)
        if verbose:
            # Number of heads the encoder attention reports after pruning.
            print(block_num, encoder_attention.n_heads)

In [None]:
def evaluate_fitness(chromosome, model, encoded_dataset=None, metric=None):
    """Prune ``model`` according to ``chromosome`` and return its classification accuracy.

    Args:
        chromosome: Gene sequence forwarded to ``modify_model``.
        model: Sequence-classification model. It is pruned IN PLACE by
            ``modify_model`` and switched to eval mode.
        encoded_dataset: Tokenized dataset yielding ``input_ids``,
            ``attention_mask`` and ``label`` tensors (presumably already put in
            torch format via ``set_format`` — confirm at the call site). It is
            batched with ``batch_size=16``. When ``None`` the notebook-level
            ``dataloader`` is used instead.
        metric: Unused; kept so existing call sites keep working.

    Returns:
        Accuracy (fraction of correct predictions) over all evaluated examples.
    """
    # Modify the model according to the chromosome
    modify_model(model, chromosome)

    # Evaluate the dataset that was actually passed in; fall back to the
    # notebook-level dataloader for two-argument calls.
    if encoded_dataset is None:
        batches = dataloader
    else:
        batches = DataLoader(encoded_dataset, batch_size=16)

    # The batches are created on the CPU, so they must follow the model to
    # whatever device it was loaded on (e.g. CUDA).
    model_device = next(model.parameters()).device
    model.eval()

    start_time = time.time()
    predictions = []
    labels = []

    with torch.no_grad():
        for batch in batches:
            input_ids = batch['input_ids'].to(model_device)
            attention_mask = batch['attention_mask'].to(model_device)
            outputs = model(input_ids, attention_mask=attention_mask)

            # Predicted class = index of the largest logit
            preds = torch.argmax(outputs.logits, dim=-1)
            predictions.extend(preds.cpu().numpy())
            labels.extend(batch['label'].cpu().numpy())

    # Print a few predictions and their corresponding labels (at most ten,
    # fewer when the dataset is smaller)
    for prediction, label in zip(predictions[:10], labels[:10]):
        print(f"Prediction: {prediction}, Label: {label}")

    # Evaluate the model
    fitness = accuracy_score(labels, predictions)
    print(f"Validation Accuracy: {fitness:.4f}")
    end_time = time.time()
    print("Execution time = ",end_time-start_time)

    return fitness

In [None]:
# Baseline without modifying the model: an empty chromosome marks no head for pruning.
# (Do not pass `[False]` here: `False == 0`, so it would prune head 0 of block 0.)
evaluate_fitness([], model, encoded_dataset, metric)

6
defaultdict(<class 'list'>, {0: [0]})
Prediction: 1, Label: 1
Prediction: 1, Label: 1
Prediction: 1, Label: 1
Prediction: 1, Label: 1
Prediction: 1, Label: 0
Prediction: 1, Label: 0
Prediction: 1, Label: 0
Prediction: 1, Label: 1
Prediction: 1, Label: 1
Prediction: 1, Label: 1
Validation Accuracy: 0.6913
Execution time =  264.4926497936249


0.6912751677852349

In [None]:
# Report the runtime's total RAM and whether it crosses the 20 GB "high-RAM" mark.
from psutil import virtual_memory

ram_gb = virtual_memory().total / 1e9  # bytes -> gigabytes (decimal)
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

print('Not using a high-RAM runtime' if ram_gb < 20 else 'You are using a high-RAM runtime!')

In [None]:
# Disabled alternative evaluation path via `Trainer`, kept as a string so it does not run.
# NOTE(review): the GLUE CoLA metric presumably reports Matthews correlation
# rather than accuracy, so `eval_result['eval_accuracy']` may not exist —
# confirm before re-enabling.
'''
trainer = Trainer(
      model=model,
      args=training_args,
      eval_dataset=encoded_dataset,
      compute_metrics=lambda p: metric.compute(predictions=np.argmax(p.predictions, axis=1), references=p.label_ids)
  )

# Evaluate the model
eval_result = trainer.evaluate()
fitness = eval_result['eval_accuracy']
print(fitness)
'''


In [None]:
# List the model's public attribute and method names (skipping anything that starts with "_").
for i in dir(model):
  if not i.startswith("_"):
    print(i)

T_destination
active_adapter
active_adapters
add_adapter
add_memory_hooks
add_model_tags
add_module
apply
base_model
base_model_prefix
bfloat16
buffers
call_super_init
can_generate
children
classification_head
compile
compute_transition_scores
config
config_class
contrastive_search
cpu
create_extended_attention_mask_for_decoder
cuda
dequantize
device
disable_adapters
disable_input_require_grads
double
dtype
dummy_inputs
dump_patches
enable_adapters
enable_input_require_grads
estimate_tokens
eval
extra_repr
float
floating_point_ops
forward
framework
from_pretrained
generate
generation_config
get_adapter_state_dict
get_buffer
get_extended_attention_mask
get_extra_state
get_head_mask
get_input_embeddings
get_memory_footprint
get_output_embeddings
get_parameter
get_position_embeddings
get_submodule
gradient_checkpointing_disable
gradient_checkpointing_enable
half
heal_tokens
hf_device_map
init_weights
invert_attention_mask
ipu
is_gradient_checkpointing
is_parallelizable
load_adapter
load_s

In [None]:
import random

def create_random_binary_list(length, percentage_of_zeros, rng=None):
    """Return a shuffled list of 0s and 1s containing a given percentage of 0s.

    Args:
        length: Total number of entries in the list.
        percentage_of_zeros: Share of entries that are 0, as a number in
            ``[0, 100]``. The resulting count is truncated towards zero.
        rng: Optional object with a ``shuffle(list)`` method (e.g.
            ``random.Random(seed)``) for reproducible results. Defaults to the
            module-level ``random`` generator.

    Returns:
        A list of ``length`` integers, each 0 or 1, in random order.

    Raises:
        ValueError: If ``percentage_of_zeros`` is outside ``[0, 100]`` (which
            would otherwise yield a list of the wrong length).
    """
    if not 0 <= percentage_of_zeros <= 100:
        raise ValueError(
            f"percentage_of_zeros must be between 0 and 100, got {percentage_of_zeros!r}"
        )

    # Multiply before dividing: `100 * (29 / 100)` evaluates to 28.999... and
    # would truncate to 28, whereas `100 * 29 / 100` is exactly 29.0.
    num_zeros = int(length * percentage_of_zeros / 100)
    num_ones = length - num_zeros

    # Create the list with the required number of 0s and 1s
    binary_list = [0] * num_zeros + [1] * num_ones

    # Shuffle the list to randomize the order
    shuffler = random if rng is None else rng
    shuffler.shuffle(binary_list)

    return binary_list

In [None]:
def initialize_chromosome(num_heads, num_layers, percentage_of_zeros=30):
    """Create a random chromosome with one gene per attention head.

    Args:
        num_heads: Number of attention heads per layer.
        num_layers: Number of layers.
        percentage_of_zeros: Percentage of genes set to 0 (heads marked for
            pruning). Defaults to 30, the value previously hard-coded here.

    Returns:
        A shuffled list of ``num_heads * num_layers`` zeros and ones.
    """
    return create_random_binary_list(num_heads * num_layers, percentage_of_zeros)

In [None]:
# Show the `is_decoder` flag stored in the loaded model's config.
print(model.config.is_decoder)

False


In [None]:
# Experiment: reload a fresh (unpruned) model, draw a random chromosome
# (one gene per head, 30% zeros) and measure accuracy after pruning.
# `evaluate_fitness` prunes `model` in place, so `model` stays pruned afterwards.
model  = AutoModelForSequenceClassification.from_pretrained(model_name, trust_remote_code=True, device_map=device)
chromosome = initialize_chromosome(model.config.num_heads, model.config.num_layers)
print("Chromosome:", chromosome)
fitness = evaluate_fitness(chromosome, model, encoded_dataset, metric)
print("Fitness score (accuracy):", fitness)


Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at google-t5/t5-small and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Chromosome: [1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
6
defaultdict(<class 'list'>, {0: [2, 5, 7], 1: [4, 5, 7], 2: [2, 3, 4, 5, 6], 3: [4, 5, 7]})
Pruning
0 5
Pruning
1 5
Pruning
2 3
Pruning
3 5
Prediction: 0, Label: 1
Prediction: 0, Label: 1
Prediction: 0, Label: 1
Prediction: 0, Label: 1
Prediction: 0, Label: 0
Prediction: 0, Label: 0
Prediction: 0, Label: 0
Prediction: 0, Label: 1
Prediction: 0, Label: 1
Prediction: 0, Label: 1
Validation Accuracy: 0.3087
Execution time =  236.12051367759705
Fitness score (accuracy): 0.3087248322147651


In [None]:
# Repeat of the previous experiment with a freshly loaded model and a new random chromosome.
# NOTE(review): this cell duplicates the one above; a small helper function would avoid the copy.
model  = AutoModelForSequenceClassification.from_pretrained(model_name, trust_remote_code=True, device_map=device)
chromosome = initialize_chromosome(model.config.num_heads, model.config.num_layers)    # only attn heads for now
print("Chromosome:", chromosome)
fitness = evaluate_fitness(chromosome, model, encoded_dataset, metric)
print("Fitness score (accuracy):", fitness)


Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at google-t5/t5-small and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Chromosome: [0 1 0 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 1 1 0 1 1 1 1 0 1
 0 0 1 1 0 1 0 1 0 1 0]
6
defaultdict(<class 'list'>, {0: [0, 2, 3, 6], 1: [0, 1, 2, 4, 5, 6, 7], 2: [0, 1, 2, 4, 5, 6, 7], 3: [0, 3, 6], 4: [3, 5, 6], 5: [1, 3, 5, 7]})
Pruning
0 4
Pruning
1 1
Pruning
2 1
Pruning
3 5
Pruning
4 5
Pruning
5 4
Prediction: 1, Label: 1
Prediction: 1, Label: 1
Prediction: 1, Label: 1
Prediction: 0, Label: 1
Prediction: 0, Label: 0
Prediction: 1, Label: 0
Prediction: 0, Label: 0
Prediction: 1, Label: 1
Prediction: 1, Label: 1
Prediction: 1, Label: 1
Validation Accuracy: 0.6769
Execution time =  243.05039262771606
Fitness score (accuracy): 0.6768935762224353


In [None]:
def find_size(model):
  """Print and return the total size of a model's parameters in megabytes.

  Sums ``numel() * element_size()`` over ``model.parameters()`` and converts
  bytes to megabytes with a factor of 1024 ** 2.

  Args:
      model: Object whose ``parameters()`` yields items offering ``numel()``
          and ``element_size()``.

  Returns:
      The size in megabytes as a number, so callers can compare models
      instead of only reading the printed line.
  """
  total_size_in_bytes = sum(p.numel() * p.element_size() for p in model.parameters())

  # Convert to megabytes (MB)
  total_size_in_megabytes = total_size_in_bytes / (1024 ** 2)

  print(f"Model size: {total_size_in_megabytes:.2f} MB")
  return total_size_in_megabytes

In [None]:
# Compare a freshly loaded (unpruned) model with the pruned `model`.
m  = AutoModelForSequenceClassification.from_pretrained(model_name, trust_remote_code=True, device_map=device)
# `find_size` prints each size itself; wrapping the calls in print() only
# appended a confusing "None None" line to the output.
for candidate in (m, model):
    find_size(candidate)

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at google-t5/t5-small and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model size: 231.82 MB
Model size: 217.82 MB
None None


In [None]:
# Evaluate another random chromosome.
# NOTE(review): unlike the earlier experiment cells this one does not reload
# `model`, so the new chromosome is applied to a model that previous cells
# already pruned in place — presumably pruning accumulates; confirm this is intended.
chromosome = initialize_chromosome(model.config.num_heads, model.config.num_layers)
fitness = evaluate_fitness(chromosome, model, encoded_dataset, metric)
print("Fitness score (accuracy):", fitness)

Prediction: 0, Label: 1
Prediction: 0, Label: 1
Prediction: 0, Label: 1
Prediction: 0, Label: 1
Prediction: 0, Label: 0
Prediction: 0, Label: 0
Prediction: 0, Label: 0
Prediction: 0, Label: 1
Prediction: 0, Label: 1
Prediction: 0, Label: 1
Validation Accuracy: 0.3087
Execution time =  236.12051367759705
Fitness score (accuracy): 0.3087248322147651


In [None]:
# Selection (Tournament Selection)
def select_parents(population, fitness_scores):
    """Pick parents by binary tournament selection.

    For each slot in the new parent pool two individuals are drawn at random
    (with replacement, so an individual may face itself) and the one with the
    higher fitness wins.

    Args:
        population: Sequence/array of chromosomes.
        fitness_scores: One score per chromosome, in the same order. Any
            sequence is accepted; it is converted to an array internally.

    Returns:
        ``np.ndarray`` with as many parents as ``population`` has individuals.

    Raises:
        ValueError: If the number of scores differs from the population size.
    """
    # Derive the size from the argument instead of relying on a notebook-level
    # `population_size` global.
    num_individuals = len(population)
    fitness_scores = np.asarray(fitness_scores)
    if len(fitness_scores) != num_individuals:
        raise ValueError(
            f"got {len(fitness_scores)} fitness scores for {num_individuals} individuals"
        )

    parents = []
    for _ in range(num_individuals):
        tournament = np.random.choice(num_individuals, 2)
        winner = tournament[np.argmax(fitness_scores[tournament])]
        parents.append(population[winner])
    return np.array(parents)

In [None]:
# Crossover (Single-point crossover)
def crossover(parent1, parent2, rate=None):
    """Combine two parents with single-point crossover.

    With probability ``rate`` a cut point is drawn and the tails of the two
    parents are swapped; otherwise the children are copies of the parents.

    Args:
        parent1: First parent chromosome (array-like).
        parent2: Second parent chromosome (array-like).
        rate: Crossover probability. When ``None`` the notebook-level
            ``crossover_rate`` is used.

    Returns:
        ``(child1, child2)`` as NumPy arrays. They never share memory with the
        parents, so mutating a child in place cannot alter a parent.
    """
    if rate is None:
        rate = crossover_rate

    parent1 = np.asarray(parent1)
    parent2 = np.asarray(parent2)

    # A chromosome with fewer than two genes has no interior cut point.
    if len(parent1) >= 2 and np.random.rand() < rate:
        # `randint` excludes the upper bound, so this yields 1 .. len-1:
        # every interior cut point, including the last one, and it also works
        # for two-gene chromosomes.
        point = np.random.randint(1, len(parent1))
        child1 = np.concatenate([parent1[:point], parent2[point:]])
        child2 = np.concatenate([parent2[:point], parent1[point:]])
    else:
        # Copy so that later in-place mutation leaves the parents untouched.
        child1, child2 = parent1.copy(), parent2.copy()
    return child1, child2

In [None]:
# Mutation (Flip bit mutation)
def mutate(chromosome, rate=None):
    """Flip each gene of ``chromosome`` independently with probability ``rate``.

    Args:
        chromosome: Mutable sequence of 0/1 genes. It is modified IN PLACE.
        rate: Per-gene flip probability. When ``None`` the notebook-level
            ``mutation_rate`` is used.

    Returns:
        The same ``chromosome`` object, after mutation.
    """
    if rate is None:
        rate = mutation_rate

    for i in range(len(chromosome)):
        if np.random.rand() < rate:
            chromosome[i] = 1 - chromosome[i]  # 0 -> 1, 1 -> 0
    return chromosome

In [None]:
def genetic_algorithm(model, num_generations, desired_sparsity):
  """Evolve head-pruning chromosomes until one reaches ``desired_sparsity``.

  Relies on notebook-level names: ``population_size``, ``encoded_dataset``,
  ``metric`` and the helper functions ``initialize_chromosome``,
  ``evaluate_fitness``, ``select_parents``, ``crossover`` and ``mutate``.

  Args:
      model: Model whose ``config.num_heads`` and ``config.num_layers``
          define the chromosome length. It is NOT modified: every fitness
          evaluation prunes a deep copy.
      num_generations: Maximum number of generations to run.
      desired_sparsity: Fraction of zero genes (0..1) at which to stop early.

  Returns:
      The chromosome with the highest fraction of zeros in the last
      population produced. If ``desired_sparsity`` is never reached this is a
      best-effort result rather than an error.

  NOTE(review): fitness only steers parent selection; the returned chromosome
  is chosen by sparsity alone — confirm that this is the intended criterion.
  """
  import copy  # local import: only this function needs it

  num_heads = model.config.num_heads
  num_layers = model.config.num_layers

  # One gene per attention head, matching the layout `modify_model` expects.
  population = np.array([
      initialize_chromosome(num_heads, num_layers) for _ in range(population_size)
  ])
  # Computed up front so a result exists even when `num_generations` is 0.
  sparsity_levels = np.mean(population == 0, axis=1)

  for generation in range(num_generations):
      # Pruning is irreversible, so each chromosome is scored on its own copy
      # of the model; otherwise the pruning of earlier chromosomes would
      # carry over into later evaluations.
      fitness_scores = np.array([
          evaluate_fitness(chrom, copy.deepcopy(model), encoded_dataset, metric)
          for chrom in population
      ])
      parents = select_parents(population, fitness_scores)

      new_population = []
      # Pair parents up; stop one short so `i + 1` is always valid.
      for i in range(0, population_size - 1, 2):
          child1, child2 = crossover(parents[i], parents[i + 1])
          new_population.extend([mutate(child1), mutate(child2)])
      if population_size % 2:
          # Odd population: the unpaired last parent is carried over with mutation only.
          new_population.append(mutate(parents[-1]))
      population = np.array(new_population)

      # Check for desired sparsity level
      sparsity_levels = np.mean(population == 0, axis=1)
      if np.any(sparsity_levels >= desired_sparsity):
          break

  return population[np.argmax(sparsity_levels)]