<a href="https://colab.research.google.com/github/bythyag/gpt2-pruning-kit/blob/main/scratchpad.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@title 1. Install and Import Libraries
# Install necessary libraries
!pip install transformers torch accelerate -q

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import random
import copy
from torch import nn
import os

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings("ignore")
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Suppress TensorFlow logging if backend is used

print("Libraries installed and imported.")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m66.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m52.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
#@title 2. Setup Model and Tokenizer
# Define the model name
model_name = "distilgpt2" # A smaller, faster version of GPT-2

# Set the device (use GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load tokenizer and model
print(f"Loading tokenizer for '{model_name}'...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Add padding token if it doesn't exist (GPT-2 typically uses EOS as PAD)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print("Set pad_token to eos_token")

print(f"Loading original model '{model_name}'...")
original_model = AutoModelForCausalLM.from_pretrained(model_name)
original_model.to(device)
original_model.eval() # Set to evaluation mode

print("Original model loaded successfully.")

# Print some model info
original_num_layers = original_model.config.n_layer
original_num_params = sum(p.numel() for p in original_model.parameters())
print(f"\nOriginal Model ('{model_name}'):")
print(f" - Number of layers: {original_num_layers}")
print(f" - Total parameters: {original_num_params / 1e6:.2f} M")

Using device: cuda
Loading tokenizer for 'distilgpt2'...


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Set pad_token to eos_token
Loading original model 'distilgpt2'...


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Original model loaded successfully.

Original Model ('distilgpt2'):
 - Number of layers: 6
 - Total parameters: 81.91 M


In [3]:
#@title 3. Define the Layer Pruning Function

def prune_transformer_layers_randomly(model, prune_ratio=0.5):
    """
    Prunes a specified ratio of transformer layers randomly from a Hugging Face model.

    Args:
        model (PreTrainedModel): The model to prune (e.g., GPT2LMHeadModel).
        prune_ratio (float): The fraction of layers to prune (e.g., 0.5 for 50%).

    Returns:
        PreTrainedModel: A deep copy of the original model with layers pruned.
    """
    # Create a deep copy to avoid modifying the original model
    pruned_model = copy.deepcopy(model)

    # --- Identify the transformer layers ---
    # This path might change for different model architectures.
    # For GPT-2 and DistilGPT2, layers are typically under model.transformer.h
    try:
        layers = pruned_model.transformer.h
        layer_container_attribute = 'h'
        layer_parent_module = pruned_model.transformer
    except AttributeError:
        print("Could not find layers at 'model.transformer.h'. Trying common alternatives...")
        # Add checks for other common structures if needed (e.g., model.encoder.layer)
        raise AttributeError("Cannot find the transformer layer list in the model. Check model architecture.")

    original_num_layers = len(layers)
    num_layers_to_prune = int(original_num_layers * prune_ratio)
    num_layers_to_keep = original_num_layers - num_layers_to_prune

    if num_layers_to_keep <= 0:
        raise ValueError(f"Pruning ratio {prune_ratio} is too high, results in 0 or fewer layers!")
    if num_layers_to_prune <= 0:
        print("Pruning ratio results in 0 layers being pruned. Returning copy of original model.")
        return pruned_model

    print(f"\n--- Pruning Information ---")
    print(f"Original number of layers: {original_num_layers}")
    print(f"Pruning ratio: {prune_ratio:.2f}")
    print(f"Number of layers to prune: {num_layers_to_prune}")
    print(f"Number of layers to keep: {num_layers_to_keep}")

    # Get indices of all layers
    layer_indices = list(range(original_num_layers))

    # Randomly choose indices to *keep*
    indices_to_keep = sorted(random.sample(layer_indices, num_layers_to_keep))
    indices_pruned = sorted(list(set(layer_indices) - set(indices_to_keep)))

    print(f"Indices of layers KEPT: {indices_to_keep}")
    print(f"Indices of layers PRUNED: {indices_pruned}")

    # Create a new ModuleList containing only the layers we want to keep
    # Ensure the order is maintained according to the kept indices
    pruned_layers_list = [layers[i] for i in indices_to_keep]
    new_module_list = nn.ModuleList(pruned_layers_list)

    # Replace the original layer list with the new pruned list
    setattr(layer_parent_module, layer_container_attribute, new_module_list)

    # --- Update Model Configuration (Important!) ---
    # Many models rely on config attributes like `n_layer` during the forward pass
    # or when initializing other components.
    if hasattr(pruned_model.config, 'n_layer'):
        pruned_model.config.n_layer = num_layers_to_keep
        print(f"Updated model config 'n_layer' to: {pruned_model.config.n_layer}")
    else:
        print("Warning: Model config does not have 'n_layer'. Ensure model forward pass doesn't depend on it.")

    # Ensure the pruned model is on the correct device and in eval mode
    pruned_model.to(model.device)
    pruned_model.eval()

    pruned_num_params = sum(p.numel() for p in pruned_model.parameters())
    print(f"\nPruned Model:")
    print(f" - Number of layers remaining: {len(pruned_model.transformer.h)}")
    print(f" - Total parameters: {pruned_num_params / 1e6:.2f} M")
    print(f" - Parameter reduction: {(original_num_params - pruned_num_params) / 1e6:.2f} M ({(original_num_params - pruned_num_params) / original_num_params * 100:.2f}%)")
    print("---------------------------\n")

    return pruned_model

In [17]:
#@title 4. Perform Pruning (50% Randomly)

# Define the pruning ratio
pruning_ratio = 0.3

# Prune the model
pruned_model = prune_transformer_layers_randomly(original_model, prune_ratio=pruning_ratio)

print("Model pruning complete.")


--- Pruning Information ---
Original number of layers: 6
Pruning ratio: 0.30
Number of layers to prune: 1
Number of layers to keep: 5
Updated model config 'n_layer' to: 5

Pruned Model:
 - Number of layers remaining: 5
 - Total parameters: 74.82 M
 - Parameter reduction: 7.09 M (8.65%)
---------------------------

Model pruning complete.


In [18]:
#@title 5. Compare Text Generation

# Define a test prompt
prompt = "The future of artificial intelligence is"
print(f"Using prompt: \"{prompt}\"")

# Tokenize the prompt
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# --- Generation Parameters (Keep them consistent for comparison) ---
# Using sampling for more 'creative' and diverse output
# Feel free to experiment with these parameters
generation_config = {
    "max_length": 75,          # Max length of the generated sequence
    "num_return_sequences": 1, # Number of sequences to generate
    "do_sample": True,         # Use sampling (True) or greedy decoding (False)
    "top_k": 50,               # Consider only top_k tokens for sampling
    "top_p": 0.95,             # Use nucleus sampling (cumulative probability cutoff)
    "temperature": 0.7,        # Controls randomness (lower = more deterministic)
    "pad_token_id": tokenizer.eos_token_id # Set pad token ID to EOS token ID
}

print("\n--- Generating with Original Model ---")
# Generate text using the original model
with torch.no_grad(): # Disable gradient calculations for inference
    outputs_original = original_model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask, # Pass attention mask
        **generation_config
    )

# Decode and print the generated text
generated_text_original = tokenizer.decode(outputs_original[0], skip_special_tokens=True)
print("Original Model Output:")
print(generated_text_original)


print("\n--- Generating with Pruned Model ---")
# Generate text using the pruned model
with torch.no_grad():
    outputs_pruned = pruned_model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask, # Pass attention mask
        **generation_config
    )

# Decode and print the generated text
generated_text_pruned = tokenizer.decode(outputs_pruned[0], skip_special_tokens=True)
print("\nPruned Model Output:")
print(generated_text_pruned)

print("\n--- Comparison Summary ---")
print(f"Original Layers: {original_num_layers}")
print(f"Pruned Layers:   {pruned_model.config.n_layer}")
print(f"Prompt:         \"{prompt}\"")
print("\nOriginal Output:\n", generated_text_original)
print("\nPruned Output:\n", generated_text_pruned)

Using prompt: "The future of artificial intelligence is"

--- Generating with Original Model ---
Original Model Output:
The future of artificial intelligence is a complex one, but it is becoming more and more complex.

























































--- Generating with Pruned Model ---

Pruned Model Output:
The future of artificial intelligence is not going to be a matter of debate, but it will be a matter of life and the world‹​where you are,” that says.

--- Comparison Summary ---
Original Layers: 6
Pruned Layers:   5
Prompt:         "The future of artificial intelligence is"

Original Output:
 The future of artificial intelligence is a complex one, but it is becoming more and more complex.

























































Pruned Output:
 The future of artificial intelligence is not going to be a matter of debate, but it will be a matter of life and the world‹​where you are,” that says.


In [19]:
#@title 6. Install `datasets` Library for Evaluation
# Install the datasets library from Hugging Face
!pip install datasets -q

import datasets
import time
from tqdm.notebook import tqdm # For progress bars

print("datasets library installed.")

datasets library installed.


In [20]:
#@title 7. Define Perplexity Calculation Function

def calculate_perplexity(model, tokenizer, dataset_name="wikitext", dataset_config="wikitext-2-raw-v1", split="test", stride=512, device="cuda"):
    """
    Calculates perplexity for a given model on a dataset.

    Args:
        model (PreTrainedModel): The language model.
        tokenizer (PreTrainedTokenizer): The tokenizer for the model.
        dataset_name (str): Name of the dataset in Hugging Face datasets library.
        dataset_config (str): Configuration of the dataset.
        split (str): Dataset split to use (e.g., 'test', 'validation').
        stride (int): The stride to use when tokenizing long texts.
        device (str): The device to run calculations on ('cuda' or 'cpu').

    Returns:
        float: The calculated perplexity. Returns float('inf') if calculation fails.
    """
    print(f"\nCalculating perplexity for model on '{dataset_name}/{dataset_config}' [{split}] split...")
    try:
        # Load the dataset
        print("Loading dataset...")
        data = datasets.load_dataset(dataset_name, dataset_config, split=split)
        print("Dataset loaded.")

        # Tokenize the dataset
        # Concatenate all texts and then chunk them
        print("Tokenizing dataset...")
        all_text = "\n\n".join(data['text']) # Join all examples into one large string
        encodings = tokenizer(all_text, return_tensors='pt')
        print("Tokenization complete.")

        max_length = model.config.n_positions # Max sequence length the model can handle
        seq_len = encodings.input_ids.size(1)

        nlls = [] # Negative log likelihoods
        prev_end_loc = 0

        print(f"Processing sequence of length {seq_len} with stride {stride} and max_length {max_length}...")
        # Iterate through the tokenized text with a stride
        for begin_loc in tqdm(range(0, seq_len, stride)):
            end_loc = min(begin_loc + max_length, seq_len)
            trg_len = end_loc - prev_end_loc # May be different from stride on last loop
            input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
            target_ids = input_ids.clone()
            target_ids[:, :-trg_len] = -100 # Ignore predictions for tokens before the stride window

            if input_ids.size(1) == 0: # Skip empty sequences if stride leads to it
                continue

            with torch.no_grad():
                outputs = model(input_ids, labels=target_ids)
                # The model returns the average NLL over the tokens when labels are provided
                neg_log_likelihood = outputs.loss

            nlls.append(neg_log_likelihood)

            prev_end_loc = end_loc
            if end_loc == seq_len:
                break

        # Calculate perplexity
        if not nlls:
             print("Warning: No valid negative log likelihoods calculated.")
             return float('inf')

        # Check for NaNs or Infs in nlls
        valid_nlls = [nll for nll in nlls if not torch.isnan(nll) and not torch.isinf(nll)]
        if not valid_nlls:
            print("Warning: All calculated NLLs were NaN or Inf.")
            return float('inf')
        elif len(valid_nlls) < len(nlls):
            print(f"Warning: Filtered out {len(nlls) - len(valid_nlls)} NaN/Inf NLL values.")

        mean_nll = torch.stack(valid_nlls).mean()
        perplexity = torch.exp(mean_nll)

        print(f"Calculation complete. Mean NLL: {mean_nll.item():.4f}")
        return perplexity.item()

    except Exception as e:
        print(f"Error calculating perplexity: {e}")
        import traceback
        traceback.print_exc()
        return float('inf') # Return infinity on error

In [21]:
#@title 8. Define Inference Speed Test Function

def measure_inference_speed(model, tokenizer, prompt, generation_config, n_runs=10, device="cuda"):
    """
    Measures the average inference speed for text generation.

    Args:
        model (PreTrainedModel): The language model.
        tokenizer (PreTrainedTokenizer): The tokenizer.
        prompt (str): The input prompt for generation.
        generation_config (dict): Dictionary of generation parameters.
        n_runs (int): Number of generation runs to average over.
        device (str): Device to run inference on.

    Returns:
        float: Average generation time in milliseconds.
    """
    print(f"\nMeasuring inference speed (average over {n_runs} runs)...")
    model.to(device)
    model.eval()
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    total_time = 0.0

    # Warm-up run (often the first run is slower due to setup/caching)
    print("Performing warm-up run...")
    with torch.no_grad():
        _ = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            **generation_config
        )
    torch.cuda.synchronize() # Ensure GPU work is done before starting timer

    print(f"Starting timed runs...")
    for i in tqdm(range(n_runs)):
        start_time = time.time()
        with torch.no_grad():
            _ = model.generate(
                inputs.input_ids,
                attention_mask=inputs.attention_mask,
                **generation_config
            )
        torch.cuda.synchronize() # Ensure GPU work is done before stopping timer
        end_time = time.time()
        total_time += (end_time - start_time)

    avg_time_ms = (total_time / n_runs) * 1000 # Convert to milliseconds
    print(f"Speed test complete. Average time: {avg_time_ms:.2f} ms")
    return avg_time_ms

In [22]:
#@title 9. Run Evaluation and Display Results

# --- Configuration ---
perplexity_dataset = "wikitext"
perplexity_config = "wikitext-2-raw-v1" # Use 'wikitext-103-raw-v1' for larger dataset (slower)
perplexity_split = "test"
inference_prompt = "The future of artificial intelligence is"
# Use the same generation config as before for speed test
speed_test_generation_config = {
    "max_length": 75,
    "num_return_sequences": 1,
    "do_sample": True,
    "top_k": 50,
    "top_p": 0.95,
    "temperature": 0.7,
    "pad_token_id": tokenizer.eos_token_id
}
speed_test_runs = 10 # Number of runs for averaging inference speed

# --- Run Evaluations ---

# 1. Perplexity Evaluation
print("\n=== Perplexity Evaluation ===")
ppl_original = calculate_perplexity(original_model, tokenizer, perplexity_dataset, perplexity_config, perplexity_split, device=device)
ppl_pruned = calculate_perplexity(pruned_model, tokenizer, perplexity_dataset, perplexity_config, perplexity_split, device=device)

# 2. Inference Speed Evaluation
print("\n=== Inference Speed Evaluation ===")
speed_original_ms = measure_inference_speed(original_model, tokenizer, inference_prompt, speed_test_generation_config, n_runs=speed_test_runs, device=device)
speed_pruned_ms = measure_inference_speed(pruned_model, tokenizer, inference_prompt, speed_test_generation_config, n_runs=speed_test_runs, device=device)

# 3. Model Size (Parameters)
params_original_M = sum(p.numel() for p in original_model.parameters() if p.requires_grad) / 1e6
params_pruned_M = sum(p.numel() for p in pruned_model.parameters() if p.requires_grad) / 1e6

# --- Display Results ---
print("\n\n--- Evaluation Summary ---")
print(f"Model Name: {model_name}")
print(f"Pruning Ratio: {pruning_ratio:.2f} (Random Layer Removal)")
print("-" * 30)
print("| Metric               | Original Model | Pruned Model   | Change (%)      |")
print("|----------------------|----------------|----------------|-----------------|")
print(f"| Perplexity (PPL) ↓   | {ppl_original:>14.2f} | {ppl_pruned:>14.2f} | {((ppl_pruned - ppl_original) / ppl_original * 100):>+14.2f}% |") # Lower is better
print(f"| Inference Speed (ms) ↓| {speed_original_ms:>14.2f} | {speed_pruned_ms:>14.2f} | {((speed_pruned_ms - speed_original_ms) / speed_original_ms * 100):>+14.2f}% |") # Lower is better
print(f"| Parameters (M) ↓    | {params_original_M:>14.2f} | {params_pruned_M:>14.2f} | {((params_pruned_M - params_original_M) / params_original_M * 100):>+14.2f}% |") # Lower is better
print(f"| Layers               | {original_model.config.n_layer:>14} | {pruned_model.config.n_layer:>14} | {((pruned_model.config.n_layer - original_model.config.n_layer) / original_model.config.n_layer * 100):>+14.2f}% |")
print("-" * 30)
print(f"Perplexity evaluated on: '{perplexity_dataset}/{perplexity_config}' [{perplexity_split}]")
print(f"Inference speed measured for max_length={speed_test_generation_config['max_length']} on prompt: \"{inference_prompt[:30]}...\"")

print("\n--- Qualitative Comparison (Revisiting Generation) ---")
# (You can re-run Cell 5 or just use the previous output)
print(f"Prompt:         \"{prompt}\"")
print("\nOriginal Output:\n", generated_text_original)
print("\nPruned Output:\n", generated_text_pruned)


=== Perplexity Evaluation ===

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 3.6710

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 4.6183

=== Inference Speed Evaluation ===

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 403.38 ms

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 340.77 ms


--- Evaluation Summary ---
Model Name: distilgpt2
Pruning Ratio: 0.30 (Random Layer Removal)
------------------------------
| Metric               | Original Model | Pruned Model   | Change (%)      |
|----------------------|----------------|----------------|-----------------|
| Perplexity (PPL) ↓   |          39.29 |         101.32 |        +157.89% |
| Inference Speed (ms) ↓|         403.38 |         340.77 |         -15.52% |
| Parameters (M) ↓    |          81.91 |          74.82 |          -8.65% |
| Layers               |              6 |              5 |         -16.67% |
------------------------------
Perplexity evaluated on: 'wikitext/wikitext-2-raw-v1' [test]
Inference speed measured for max_length=75 on prompt: "The future of artificial intel..."

--- Qualitative Comparison (Revisiting Generation) ---
Prompt:         "The future of artificial intelligence is"

Original Output:
 The future of artificial intelligence is a complex 

In [23]:
#@title 10. Define Function to Prune a SINGLE Layer

import copy
from torch import nn

def prune_single_transformer_layer(original_model, layer_index_to_remove):
    """
    Prunes a single specified transformer layer from a Hugging Face model.

    Args:
        original_model (PreTrainedModel): The baseline model (will NOT be modified).
        layer_index_to_remove (int): The index of the layer to remove (0-based).

    Returns:
        PreTrainedModel: A deep copy of the original model with the specified layer removed.
                         Returns None if the index is invalid.
    """
    # Create a deep copy to avoid modifying the original model
    pruned_model = copy.deepcopy(original_model)

    # Identify the transformer layers
    try:
        layers = pruned_model.transformer.h
        layer_container_attribute = 'h'
        layer_parent_module = pruned_model.transformer
    except AttributeError:
        print("Error: Could not find layers at 'model.transformer.h'.")
        return None # Indicate failure

    original_num_layers = len(layers)

    # Validate index
    if not 0 <= layer_index_to_remove < original_num_layers:
        print(f"Error: Invalid layer index {layer_index_to_remove}. Must be between 0 and {original_num_layers - 1}.")
        return None # Indicate failure

    num_layers_to_keep = original_num_layers - 1

    print(f"\n--- Pruning Layer {layer_index_to_remove} ---")
    print(f"Original number of layers: {original_num_layers}")
    print(f"Keeping {num_layers_to_keep} layers.")

    # Create a new list containing all layers *except* the one to remove
    pruned_layers_list = []
    for i in range(original_num_layers):
        if i != layer_index_to_remove:
            pruned_layers_list.append(layers[i])

    # Create a new ModuleList and replace the old one
    new_module_list = nn.ModuleList(pruned_layers_list)
    setattr(layer_parent_module, layer_container_attribute, new_module_list)

    # Update Model Configuration
    if hasattr(pruned_model.config, 'n_layer'):
        pruned_model.config.n_layer = num_layers_to_keep
        print(f"Updated model config 'n_layer' to: {pruned_model.config.n_layer}")
    else:
        print("Warning: Model config does not have 'n_layer'.")

    pruned_model.to(original_model.device)
    pruned_model.eval()

    pruned_num_params = sum(p.numel() for p in pruned_model.parameters())
    print(f"Pruned Model (Layer {layer_index_to_remove} removed):")
    print(f" - Layers remaining: {len(pruned_model.transformer.h)}")
    print(f" - Total parameters: {pruned_num_params / 1e6:.2f} M")
    print("---------------------------\n")

    return pruned_model

In [24]:
#@title 11. Run Sequential Pruning Evaluation

import pandas as pd
import gc # Garbage collector

# --- Configuration (reuse from previous steps) ---
# Make sure these variables are defined from running previous cells:
# original_model, tokenizer, device
# perplexity_dataset, perplexity_config, perplexity_split
# inference_prompt, speed_test_generation_config, speed_test_runs

# --- Store Original Model Results ---
original_num_layers = original_model.config.n_layer
all_results = []

print("--- Evaluating Original Model (Baseline) ---")
# Ensure original model PPL and Speed are calculated if not already done
if 'ppl_original' not in locals() or 'speed_original_ms' not in locals():
    print("Calculating baseline metrics for the original model...")
    ppl_original = calculate_perplexity(original_model, tokenizer, perplexity_dataset, perplexity_config, perplexity_split, device=device)
    speed_original_ms = measure_inference_speed(original_model, tokenizer, inference_prompt, speed_test_generation_config, n_runs=speed_test_runs, device=device)
    params_original_M = sum(p.numel() for p in original_model.parameters() if p.requires_grad) / 1e6

# Store baseline results
original_result = {
    "Layer Removed": "None (Original)",
    "Layers": original_num_layers,
    "Perplexity (PPL)": ppl_original,
    "Inference Speed (ms)": speed_original_ms,
    "Parameters (M)": params_original_M,
    "PPL Change (%)": 0.0,
    "Speed Change (%)": 0.0
}
all_results.append(original_result)
print("Baseline results stored.")


# --- Iterate Through Each Layer for Pruning ---
print(f"\n--- Starting Sequential Pruning Evaluation (Removing 1 Layer at a Time) ---")
for i in range(original_num_layers):
    print(f"\n>>> Processing: Removing Layer {i} <<<")

    # 1. Prune the specific layer (starting from original model each time)
    pruned_model_single = prune_single_transformer_layer(original_model, i)

    if pruned_model_single is None:
        print(f"Skipping evaluation for layer {i} due to pruning error.")
        continue # Skip to next iteration if pruning failed

    # 2. Evaluate the pruned model
    print(f"--- Evaluating Model with Layer {i} Removed ---")
    ppl_pruned_single = calculate_perplexity(pruned_model_single, tokenizer, perplexity_dataset, perplexity_config, perplexity_split, device=device)
    speed_pruned_single_ms = measure_inference_speed(pruned_model_single, tokenizer, inference_prompt, speed_test_generation_config, n_runs=speed_test_runs, device=device)
    params_pruned_single_M = sum(p.numel() for p in pruned_model_single.parameters() if p.requires_grad) / 1e6

    # 3. Calculate changes
    ppl_change = ((ppl_pruned_single - ppl_original) / ppl_original * 100) if ppl_original else float('inf')
    speed_change = ((speed_pruned_single_ms - speed_original_ms) / speed_original_ms * 100) if speed_original_ms else float('inf')

    # 4. Store results
    current_result = {
        "Layer Removed": i,
        "Layers": pruned_model_single.config.n_layer,
        "Perplexity (PPL)": ppl_pruned_single,
        "Inference Speed (ms)": speed_pruned_single_ms,
        "Parameters (M)": params_pruned_single_M,
        "PPL Change (%)": ppl_change,
        "Speed Change (%)": speed_change
    }
    all_results.append(current_result)
    print(f"Results stored for model with layer {i} removed.")

    # 5. Clean up memory (important in Colab)
    del pruned_model_single
    gc.collect()
    torch.cuda.empty_cache()
    print(f"Cleaned up memory after processing layer {i}.")


# --- Display Combined Results ---
print("\n\n--- Sequential Pruning Evaluation Summary ---")

# Create a pandas DataFrame for better display
results_df = pd.DataFrame(all_results)
results_df.set_index("Layer Removed", inplace=True)

# Format floats for readability
pd.options.display.float_format = '{:,.2f}'.format

print(results_df)

# --- Identify Most Impactful Layer ---
# Find the layer whose removal resulted in the highest PPL (worst performance degradation)
# Exclude the original model row for finding the max impact
pruned_rows = results_df.drop("None (Original)")
if not pruned_rows.empty:
    most_impactful_layer_ppl = pruned_rows["Perplexity (PPL)"].idxmax()
    max_ppl = pruned_rows["Perplexity (PPL)"].max()
    max_ppl_change = pruned_rows.loc[most_impactful_layer_ppl, "PPL Change (%)"]

    print("\n--- Analysis ---")
    print(f"Removing Layer {most_impactful_layer_ppl} had the most significant negative impact on performance.")
    print(f"  - Resulting PPL: {max_ppl:,.2f} ({max_ppl_change:+.2f}% change from original)")

    # You could also analyze speed changes if desired
    fastest_layer_removal = pruned_rows["Inference Speed (ms)"].idxmin()
    min_speed = pruned_rows["Inference Speed (ms)"].min()
    min_speed_change = pruned_rows.loc[fastest_layer_removal, "Speed Change (%)"]
    print(f"Removing Layer {fastest_layer_removal} resulted in the fastest inference speed.")
    print(f"  - Resulting Speed: {min_speed:,.2f} ms ({min_speed_change:+.2f}% change from original)")

else:
    print("\n--- Analysis ---")
    print("No results from pruned models available for analysis.")

--- Evaluating Original Model (Baseline) ---
Baseline results stored.

--- Starting Sequential Pruning Evaluation (Removing 1 Layer at a Time) ---

>>> Processing: Removing Layer 0 <<<

--- Pruning Layer 0 ---
Original number of layers: 6
Keeping 5 layers.
Updated model config 'n_layer' to: 5
Pruned Model (Layer 0 removed):
 - Layers remaining: 5
 - Total parameters: 74.82 M
---------------------------

--- Evaluating Model with Layer 0 Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 9.4006

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 460.28 ms
Results stored for model with layer 0 removed.
Cleaned up memory after processing layer 0.

>>> Processing: Removing Layer 1 <<<

--- Pruning Layer 1 ---
Original number of layers: 6
Keeping 5 layers.
Updated model config 'n_layer' to: 5
Pruned Model (Layer 1 removed):
 - Layers remaining: 5
 - Total parameters: 74.82 M
---------------------------

--- Evaluating Model with Layer 1 Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 4.4000

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 229.94 ms
Results stored for model with layer 1 removed.
Cleaned up memory after processing layer 1.

>>> Processing: Removing Layer 2 <<<

--- Pruning Layer 2 ---
Original number of layers: 6
Keeping 5 layers.
Updated model config 'n_layer' to: 5
Pruned Model (Layer 2 removed):
 - Layers remaining: 5
 - Total parameters: 74.82 M
---------------------------

--- Evaluating Model with Layer 2 Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 4.4217

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 248.60 ms
Results stored for model with layer 2 removed.
Cleaned up memory after processing layer 2.

>>> Processing: Removing Layer 3 <<<

--- Pruning Layer 3 ---
Original number of layers: 6
Keeping 5 layers.
Updated model config 'n_layer' to: 5
Pruned Model (Layer 3 removed):
 - Layers remaining: 5
 - Total parameters: 74.82 M
---------------------------

--- Evaluating Model with Layer 3 Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 4.2423

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 369.78 ms
Results stored for model with layer 3 removed.
Cleaned up memory after processing layer 3.

>>> Processing: Removing Layer 4 <<<

--- Pruning Layer 4 ---
Original number of layers: 6
Keeping 5 layers.
Updated model config 'n_layer' to: 5
Pruned Model (Layer 4 removed):
 - Layers remaining: 5
 - Total parameters: 74.82 M
---------------------------

--- Evaluating Model with Layer 4 Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 4.6183

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 297.44 ms
Results stored for model with layer 4 removed.
Cleaned up memory after processing layer 4.

>>> Processing: Removing Layer 5 <<<

--- Pruning Layer 5 ---
Original number of layers: 6
Keeping 5 layers.
Updated model config 'n_layer' to: 5
Pruned Model (Layer 5 removed):
 - Layers remaining: 5
 - Total parameters: 74.82 M
---------------------------

--- Evaluating Model with Layer 5 Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 6.0172

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 344.71 ms
Results stored for model with layer 5 removed.
Cleaned up memory after processing layer 5.


--- Sequential Pruning Evaluation Summary ---
                 Layers  Perplexity (PPL)  Inference Speed (ms)  \
Layer Removed                                                     
None (Original)       6             39.29                403.38   
0                     5         12,095.66                460.28   
1                     5             81.45                229.94   
2                     5             83.24                248.60   
3                     5             69.57                369.78   
4                     5            101.32                297.44   
5                     5            410.44                344.71   

                 Parameters (M)  PPL Change (%)  Speed Change (%)  
Layer Removed                                                      
None (Original)           81.91            0.00              0.00  
0       

In [25]:
#@title 12. Define Function to Prune MULTIPLE Layers by Index

import copy
from torch import nn
import itertools # For generating combinations

def prune_multiple_transformer_layers(original_model, layer_indices_to_remove):
    """
    Prunes specific transformer layers identified by their indices.

    Args:
        original_model (PreTrainedModel): The baseline model (will NOT be modified).
        layer_indices_to_remove (list or set or tuple): Indices of layers to remove (0-based).

    Returns:
        PreTrainedModel: A deep copy of the original model with specified layers removed.
                         Returns None if indices are invalid or result in zero layers.
    """
    # Create a deep copy
    pruned_model = copy.deepcopy(original_model)

    # Identify layers
    try:
        layers = pruned_model.transformer.h
        layer_container_attribute = 'h'
        layer_parent_module = pruned_model.transformer
    except AttributeError:
        print("Error: Could not find layers at 'model.transformer.h'.")
        return None

    original_num_layers = len(layers)
    indices_to_remove_set = set(layer_indices_to_remove) # Use set for efficient checking

    # Validate indices
    if not all(0 <= idx < original_num_layers for idx in indices_to_remove_set):
        invalid_indices = [idx for idx in indices_to_remove_set if not (0 <= idx < original_num_layers)]
        print(f"Error: Invalid layer indices detected: {invalid_indices}. Max index is {original_num_layers - 1}.")
        return None

    num_layers_to_remove = len(indices_to_remove_set)
    num_layers_to_keep = original_num_layers - num_layers_to_remove

    if num_layers_to_keep <= 0:
        print(f"Error: Pruning indices {layer_indices_to_remove} would result in 0 layers.")
        return None

    print(f"\n--- Pruning Layers {sorted(list(indices_to_remove_set))} ---")
    print(f"Original number of layers: {original_num_layers}")
    print(f"Number of layers to remove: {num_layers_to_remove}")
    print(f"Number of layers to keep: {num_layers_to_keep}")

    # Create the new list of layers to keep, preserving relative order
    pruned_layers_list = []
    kept_indices = []
    for i in range(original_num_layers):
        if i not in indices_to_remove_set:
            pruned_layers_list.append(layers[i])
            kept_indices.append(i)

    print(f"Indices of layers KEPT: {kept_indices}")

    # Create new ModuleList and replace
    new_module_list = nn.ModuleList(pruned_layers_list)
    setattr(layer_parent_module, layer_container_attribute, new_module_list)

    # Update configuration
    if hasattr(pruned_model.config, 'n_layer'):
        pruned_model.config.n_layer = num_layers_to_keep
        print(f"Updated model config 'n_layer' to: {pruned_model.config.n_layer}")

    pruned_model.to(original_model.device)
    pruned_model.eval()

    pruned_num_params = sum(p.numel() for p in pruned_model.parameters())
    print(f"Pruned Model (Layers {sorted(list(indices_to_remove_set))} removed):")
    print(f" - Layers remaining: {len(pruned_model.transformer.h)}")
    print(f" - Total parameters: {pruned_num_params / 1e6:.2f} M")
    print("---------------------------\n")

    return pruned_model

In [26]:
#@title 13. Run Combinatorial Pruning Evaluation (2 to 5 Layers Removed)

import pandas as pd
import gc
import time
from itertools import combinations

# --- Configuration (reuse from previous steps) ---
# Ensure these variables are defined:
# original_model, tokenizer, device
# ppl_original, speed_original_ms, params_original_M (baseline metrics)
# perplexity_dataset, perplexity_config, perplexity_split
# inference_prompt, speed_test_generation_config, speed_test_runs

# --- Storage for Results ---
# You can potentially add to the 'all_results' list from the single-layer removal
# or start fresh. Let's start fresh for clarity here, but include the baseline.
combinatorial_results = []

# Add baseline if not already added to a combined list
if 'original_result' in locals():
     combinatorial_results.append(original_result)
else: # Calculate if needed
     print("Calculating baseline metrics for the original model...")
     ppl_original = calculate_perplexity(original_model, tokenizer, perplexity_dataset, perplexity_config, perplexity_split, device=device)
     speed_original_ms = measure_inference_speed(original_model, tokenizer, inference_prompt, speed_test_generation_config, n_runs=speed_test_runs, device=device)
     params_original_M = sum(p.numel() for p in original_model.parameters() if p.requires_grad) / 1e6
     original_num_layers = original_model.config.n_layer
     original_result = {
         "Layers Removed": "None (Original)",
         "Num Removed": 0,
         "Layers": original_num_layers,
         "Perplexity (PPL)": ppl_original,
         "Inference Speed (ms)": speed_original_ms,
         "Parameters (M)": params_original_M,
         "PPL Change (%)": 0.0,
         "Speed Change (%)": 0.0
     }
     combinatorial_results.append(original_result)

print(f"\n--- Starting Combinatorial Pruning Evaluation ---")
print(f"WARNING: This will evaluate {sum(1 for k in range(2, original_model.config.n_layer) for _ in combinations(range(original_model.config.n_layer), k))} combinations and may take a long time.")
start_time_total = time.time()

layer_indices = list(range(original_model.config.n_layer))

# Iterate through the number of layers to remove (k=2, 3, 4, 5)
for k_layers_to_remove in range(2, original_model.config.n_layer): # Stop before removing all layers
    print(f"\n===== Evaluating Combinations with {k_layers_to_remove} Layers Removed =====")
    comb_count = 0
    total_combs_for_k = len(list(combinations(layer_indices, k_layers_to_remove))) # Calculate total for progress

    # Iterate through all combinations of k indices to remove
    for indices_to_remove in combinations(layer_indices, k_layers_to_remove):
        comb_count += 1
        indices_tuple = tuple(sorted(indices_to_remove)) # Use sorted tuple as identifier
        print(f"\n>>> Processing Combination {comb_count}/{total_combs_for_k} (Remove Layers: {indices_tuple}) <<<")
        start_time_comb = time.time()

        # 1. Prune the specific combination
        pruned_model_comb = prune_multiple_transformer_layers(original_model, indices_to_remove)

        if pruned_model_comb is None:
            print(f"Skipping evaluation for combination {indices_tuple} due to pruning error.")
            continue

        # 2. Evaluate the pruned model
        print(f"--- Evaluating Model with Layers {indices_tuple} Removed ---")
        try:
            ppl_pruned_comb = calculate_perplexity(pruned_model_comb, tokenizer, perplexity_dataset, perplexity_config, perplexity_split, device=device)
            speed_pruned_comb_ms = measure_inference_speed(pruned_model_comb, tokenizer, inference_prompt, speed_test_generation_config, n_runs=speed_test_runs, device=device)
            params_pruned_comb_M = sum(p.numel() for p in pruned_model_comb.parameters() if p.requires_grad) / 1e6
        except Exception as e:
            print(f"!!! Evaluation failed for combination {indices_tuple}: {e}")
            ppl_pruned_comb = float('inf') # Assign inf PPL on failure
            speed_pruned_comb_ms = float('inf')
            params_pruned_comb_M = sum(p.numel() for p in pruned_model_comb.parameters() if p.requires_grad) / 1e6 # Params might still be calculable

        # 3. Calculate changes
        ppl_change = ((ppl_pruned_comb - ppl_original) / ppl_original * 100) if ppl_original and ppl_pruned_comb != float('inf') else float('inf')
        speed_change = ((speed_pruned_comb_ms - speed_original_ms) / speed_original_ms * 100) if speed_original_ms and speed_pruned_comb_ms != float('inf') else float('inf')

        # 4. Store results
        current_result = {
            "Layers Removed": str(indices_tuple), # Store as string for DataFrame index
            "Num Removed": k_layers_to_remove,
            "Layers": pruned_model_comb.config.n_layer,
            "Perplexity (PPL)": ppl_pruned_comb,
            "Inference Speed (ms)": speed_pruned_comb_ms,
            "Parameters (M)": params_pruned_comb_M,
            "PPL Change (%)": ppl_change,
            "Speed Change (%)": speed_change
        }
        combinatorial_results.append(current_result)
        comb_time = time.time() - start_time_comb
        print(f"Results stored for combination {indices_tuple}. Time taken: {comb_time:.2f}s")

        # 5. Clean up memory
        del pruned_model_comb
        gc.collect()
        torch.cuda.empty_cache()
        print(f"Cleaned up memory after processing combination {indices_tuple}.")


# --- Display Combined Results ---
print("\n\n--- Combinatorial Pruning Evaluation Summary ---")
total_time = time.time() - start_time_total
print(f"Total evaluation time: {total_time / 60:.2f} minutes")

# Create DataFrame
results_comb_df = pd.DataFrame(combinatorial_results)
results_comb_df.set_index("Layers Removed", inplace=True)

# Sort for better readability (e.g., by number removed, then by PPL)
results_comb_df.sort_values(by=["Num Removed", "Perplexity (PPL)"], ascending=[True, True], inplace=True)

# Format floats
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.max_rows', 100) # Show more rows if needed

print(results_comb_df)

# --- Basic Analysis ---
print("\n--- Analysis ---")
# Find worst PPL overall (excluding original)
pruned_rows_comb = results_comb_df.drop("None (Original)")
if not pruned_rows_comb.empty:
     worst_ppl_comb_idx = pruned_rows_comb["Perplexity (PPL)"].idxmax()
     worst_ppl_comb = pruned_rows_comb.loc[worst_ppl_comb_idx, "Perplexity (PPL)"]
     worst_ppl_comb_change = pruned_rows_comb.loc[worst_ppl_comb_idx, "PPL Change (%)"]
     worst_ppl_num_removed = pruned_rows_comb.loc[worst_ppl_comb_idx, "Num Removed"]

     print(f"Worst overall PPL ({worst_ppl_comb:,.2f}) occurred when removing {worst_ppl_num_removed} layers: {worst_ppl_comb_idx} ({worst_ppl_comb_change:+.2f}% change)")

     # Find best PPL for each number of layers removed
     for k in range(2, original_model.config.n_layer):
         rows_for_k = pruned_rows_comb[pruned_rows_comb["Num Removed"] == k]
         if not rows_for_k.empty:
             best_ppl_for_k_idx = rows_for_k["Perplexity (PPL)"].idxmin() # Lowest PPL is best *within* this group
             best_ppl_for_k = rows_for_k.loc[best_ppl_for_k_idx, "Perplexity (PPL)"]
             print(f"  - Best PPL when removing {k} layers: {best_ppl_for_k:,.2f} (Layers removed: {best_ppl_for_k_idx})")


else:
     print("No results from pruned models available for analysis.")


--- Starting Combinatorial Pruning Evaluation ---

===== Evaluating Combinations with 2 Layers Removed =====

>>> Processing Combination 1/15 (Remove Layers: (0, 1)) <<<

--- Pruning Layers [0, 1] ---
Original number of layers: 6
Number of layers to remove: 2
Number of layers to keep: 4
Indices of layers KEPT: [2, 3, 4, 5]
Updated model config 'n_layer' to: 4
Pruned Model (Layers [0, 1] removed):
 - Layers remaining: 4
 - Total parameters: 67.74 M
---------------------------

--- Evaluating Model with Layers (0, 1) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 12.1824

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 326.38 ms
Results stored for combination (0, 1). Time taken: 33.77s
Cleaned up memory after processing combination (0, 1).

>>> Processing Combination 2/15 (Remove Layers: (0, 2)) <<<

--- Pruning Layers [0, 2] ---
Original number of layers: 6
Number of layers to remove: 2
Number of layers to keep: 4
Indices of layers KEPT: [1, 3, 4, 5]
Updated model config 'n_layer' to: 4
Pruned Model (Layers [0, 2] removed):
 - Layers remaining: 4
 - Total parameters: 67.74 M
---------------------------

--- Evaluating Model with Layers (0, 2) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 9.1132

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 325.50 ms
Results stored for combination (0, 2). Time taken: 35.59s
Cleaned up memory after processing combination (0, 2).

>>> Processing Combination 3/15 (Remove Layers: (0, 3)) <<<

--- Pruning Layers [0, 3] ---
Original number of layers: 6
Number of layers to remove: 2
Number of layers to keep: 4
Indices of layers KEPT: [1, 2, 4, 5]
Updated model config 'n_layer' to: 4
Pruned Model (Layers [0, 3] removed):
 - Layers remaining: 4
 - Total parameters: 67.74 M
---------------------------

--- Evaluating Model with Layers (0, 3) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 9.5904

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 355.51 ms
Results stored for combination (0, 3). Time taken: 33.90s
Cleaned up memory after processing combination (0, 3).

>>> Processing Combination 4/15 (Remove Layers: (0, 4)) <<<

--- Pruning Layers [0, 4] ---
Original number of layers: 6
Number of layers to remove: 2
Number of layers to keep: 4
Indices of layers KEPT: [1, 2, 3, 5]
Updated model config 'n_layer' to: 4
Pruned Model (Layers [0, 4] removed):
 - Layers remaining: 4
 - Total parameters: 67.74 M
---------------------------

--- Evaluating Model with Layers (0, 4) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 9.0386

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 416.20 ms
Results stored for combination (0, 4). Time taken: 35.08s
Cleaned up memory after processing combination (0, 4).

>>> Processing Combination 5/15 (Remove Layers: (0, 5)) <<<

--- Pruning Layers [0, 5] ---
Original number of layers: 6
Number of layers to remove: 2
Number of layers to keep: 4
Indices of layers KEPT: [1, 2, 3, 4]
Updated model config 'n_layer' to: 4
Pruned Model (Layers [0, 5] removed):
 - Layers remaining: 4
 - Total parameters: 67.74 M
---------------------------

--- Evaluating Model with Layers (0, 5) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 9.2324

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 374.67 ms
Results stored for combination (0, 5). Time taken: 34.05s
Cleaned up memory after processing combination (0, 5).

>>> Processing Combination 6/15 (Remove Layers: (1, 2)) <<<

--- Pruning Layers [1, 2] ---
Original number of layers: 6
Number of layers to remove: 2
Number of layers to keep: 4
Indices of layers KEPT: [0, 3, 4, 5]
Updated model config 'n_layer' to: 4
Pruned Model (Layers [1, 2] removed):
 - Layers remaining: 4
 - Total parameters: 67.74 M
---------------------------

--- Evaluating Model with Layers (1, 2) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 5.9470

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 308.65 ms
Results stored for combination (1, 2). Time taken: 34.30s
Cleaned up memory after processing combination (1, 2).

>>> Processing Combination 7/15 (Remove Layers: (1, 3)) <<<

--- Pruning Layers [1, 3] ---
Original number of layers: 6
Number of layers to remove: 2
Number of layers to keep: 4
Indices of layers KEPT: [0, 2, 4, 5]
Updated model config 'n_layer' to: 4
Pruned Model (Layers [1, 3] removed):
 - Layers remaining: 4
 - Total parameters: 67.74 M
---------------------------

--- Evaluating Model with Layers (1, 3) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 5.0525

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 274.90 ms
Results stored for combination (1, 3). Time taken: 34.37s
Cleaned up memory after processing combination (1, 3).

>>> Processing Combination 8/15 (Remove Layers: (1, 4)) <<<

--- Pruning Layers [1, 4] ---
Original number of layers: 6
Number of layers to remove: 2
Number of layers to keep: 4
Indices of layers KEPT: [0, 2, 3, 5]
Updated model config 'n_layer' to: 4
Pruned Model (Layers [1, 4] removed):
 - Layers remaining: 4
 - Total parameters: 67.74 M
---------------------------

--- Evaluating Model with Layers (1, 4) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 5.2323

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 270.36 ms
Results stored for combination (1, 4). Time taken: 33.95s
Cleaned up memory after processing combination (1, 4).

>>> Processing Combination 9/15 (Remove Layers: (1, 5)) <<<

--- Pruning Layers [1, 5] ---
Original number of layers: 6
Number of layers to remove: 2
Number of layers to keep: 4
Indices of layers KEPT: [0, 2, 3, 4]
Updated model config 'n_layer' to: 4
Pruned Model (Layers [1, 5] removed):
 - Layers remaining: 4
 - Total parameters: 67.74 M
---------------------------

--- Evaluating Model with Layers (1, 5) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 7.1622

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 274.68 ms
Results stored for combination (1, 5). Time taken: 34.51s
Cleaned up memory after processing combination (1, 5).

>>> Processing Combination 10/15 (Remove Layers: (2, 3)) <<<

--- Pruning Layers [2, 3] ---
Original number of layers: 6
Number of layers to remove: 2
Number of layers to keep: 4
Indices of layers KEPT: [0, 1, 4, 5]
Updated model config 'n_layer' to: 4
Pruned Model (Layers [2, 3] removed):
 - Layers remaining: 4
 - Total parameters: 67.74 M
---------------------------

--- Evaluating Model with Layers (2, 3) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 5.2826

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 285.33 ms
Results stored for combination (2, 3). Time taken: 33.94s
Cleaned up memory after processing combination (2, 3).

>>> Processing Combination 11/15 (Remove Layers: (2, 4)) <<<

--- Pruning Layers [2, 4] ---
Original number of layers: 6
Number of layers to remove: 2
Number of layers to keep: 4
Indices of layers KEPT: [0, 1, 3, 5]
Updated model config 'n_layer' to: 4
Pruned Model (Layers [2, 4] removed):
 - Layers remaining: 4
 - Total parameters: 67.74 M
---------------------------

--- Evaluating Model with Layers (2, 4) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 5.7645

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 285.67 ms
Results stored for combination (2, 4). Time taken: 39.95s
Cleaned up memory after processing combination (2, 4).

>>> Processing Combination 12/15 (Remove Layers: (2, 5)) <<<

--- Pruning Layers [2, 5] ---
Original number of layers: 6
Number of layers to remove: 2
Number of layers to keep: 4
Indices of layers KEPT: [0, 1, 3, 4]
Updated model config 'n_layer' to: 4
Pruned Model (Layers [2, 5] removed):
 - Layers remaining: 4
 - Total parameters: 67.74 M
---------------------------

--- Evaluating Model with Layers (2, 5) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 6.3204

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 286.56 ms
Results stored for combination (2, 5). Time taken: 34.56s
Cleaned up memory after processing combination (2, 5).

>>> Processing Combination 13/15 (Remove Layers: (3, 4)) <<<

--- Pruning Layers [3, 4] ---
Original number of layers: 6
Number of layers to remove: 2
Number of layers to keep: 4
Indices of layers KEPT: [0, 1, 2, 5]
Updated model config 'n_layer' to: 4
Pruned Model (Layers [3, 4] removed):
 - Layers remaining: 4
 - Total parameters: 67.74 M
---------------------------

--- Evaluating Model with Layers (3, 4) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 6.4847

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 235.82 ms
Results stored for combination (3, 4). Time taken: 32.89s
Cleaned up memory after processing combination (3, 4).

>>> Processing Combination 14/15 (Remove Layers: (3, 5)) <<<

--- Pruning Layers [3, 5] ---
Original number of layers: 6
Number of layers to remove: 2
Number of layers to keep: 4
Indices of layers KEPT: [0, 1, 2, 4]
Updated model config 'n_layer' to: 4
Pruned Model (Layers [3, 5] removed):
 - Layers remaining: 4
 - Total parameters: 67.74 M
---------------------------

--- Evaluating Model with Layers (3, 5) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 6.2863

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 326.13 ms
Results stored for combination (3, 5). Time taken: 35.06s
Cleaned up memory after processing combination (3, 5).

>>> Processing Combination 15/15 (Remove Layers: (4, 5)) <<<

--- Pruning Layers [4, 5] ---
Original number of layers: 6
Number of layers to remove: 2
Number of layers to keep: 4
Indices of layers KEPT: [0, 1, 2, 3]
Updated model config 'n_layer' to: 4
Pruned Model (Layers [4, 5] removed):
 - Layers remaining: 4
 - Total parameters: 67.74 M
---------------------------

--- Evaluating Model with Layers (4, 5) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 7.8638

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 311.97 ms
Results stored for combination (4, 5). Time taken: 43.64s
Cleaned up memory after processing combination (4, 5).

===== Evaluating Combinations with 3 Layers Removed =====

>>> Processing Combination 1/20 (Remove Layers: (0, 1, 2)) <<<

--- Pruning Layers [0, 1, 2] ---
Original number of layers: 6
Number of layers to remove: 3
Number of layers to keep: 3
Indices of layers KEPT: [3, 4, 5]
Updated model config 'n_layer' to: 3
Pruned Model (Layers [0, 1, 2] removed):
 - Layers remaining: 3
 - Total parameters: 60.65 M
---------------------------

--- Evaluating Model with Layers (0, 1, 2) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 14.1262

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 331.75 ms
Results stored for combination (0, 1, 2). Time taken: 32.51s
Cleaned up memory after processing combination (0, 1, 2).

>>> Processing Combination 2/20 (Remove Layers: (0, 1, 3)) <<<

--- Pruning Layers [0, 1, 3] ---
Original number of layers: 6
Number of layers to remove: 3
Number of layers to keep: 3
Indices of layers KEPT: [2, 4, 5]
Updated model config 'n_layer' to: 3
Pruned Model (Layers [0, 1, 3] removed):
 - Layers remaining: 3
 - Total parameters: 60.65 M
---------------------------

--- Evaluating Model with Layers (0, 1, 3) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 12.4748

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 279.32 ms
Results stored for combination (0, 1, 3). Time taken: 31.96s
Cleaned up memory after processing combination (0, 1, 3).

>>> Processing Combination 3/20 (Remove Layers: (0, 1, 4)) <<<

--- Pruning Layers [0, 1, 4] ---
Original number of layers: 6
Number of layers to remove: 3
Number of layers to keep: 3
Indices of layers KEPT: [2, 3, 5]
Updated model config 'n_layer' to: 3
Pruned Model (Layers [0, 1, 4] removed):
 - Layers remaining: 3
 - Total parameters: 60.65 M
---------------------------

--- Evaluating Model with Layers (0, 1, 4) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 10.4484

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 277.40 ms
Results stored for combination (0, 1, 4). Time taken: 31.21s
Cleaned up memory after processing combination (0, 1, 4).

>>> Processing Combination 4/20 (Remove Layers: (0, 1, 5)) <<<

--- Pruning Layers [0, 1, 5] ---
Original number of layers: 6
Number of layers to remove: 3
Number of layers to keep: 3
Indices of layers KEPT: [2, 3, 4]
Updated model config 'n_layer' to: 3
Pruned Model (Layers [0, 1, 5] removed):
 - Layers remaining: 3
 - Total parameters: 60.65 M
---------------------------

--- Evaluating Model with Layers (0, 1, 5) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 17.1097

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 322.30 ms
Results stored for combination (0, 1, 5). Time taken: 30.51s
Cleaned up memory after processing combination (0, 1, 5).

>>> Processing Combination 5/20 (Remove Layers: (0, 2, 3)) <<<

--- Pruning Layers [0, 2, 3] ---
Original number of layers: 6
Number of layers to remove: 3
Number of layers to keep: 3
Indices of layers KEPT: [1, 4, 5]
Updated model config 'n_layer' to: 3
Pruned Model (Layers [0, 2, 3] removed):
 - Layers remaining: 3
 - Total parameters: 60.65 M
---------------------------

--- Evaluating Model with Layers (0, 2, 3) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 9.3387

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 286.08 ms
Results stored for combination (0, 2, 3). Time taken: 30.16s
Cleaned up memory after processing combination (0, 2, 3).

>>> Processing Combination 6/20 (Remove Layers: (0, 2, 4)) <<<

--- Pruning Layers [0, 2, 4] ---
Original number of layers: 6
Number of layers to remove: 3
Number of layers to keep: 3
Indices of layers KEPT: [1, 3, 5]
Updated model config 'n_layer' to: 3
Pruned Model (Layers [0, 2, 4] removed):
 - Layers remaining: 3
 - Total parameters: 60.65 M
---------------------------

--- Evaluating Model with Layers (0, 2, 4) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 8.7167

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 279.52 ms
Results stored for combination (0, 2, 4). Time taken: 32.28s
Cleaned up memory after processing combination (0, 2, 4).

>>> Processing Combination 7/20 (Remove Layers: (0, 2, 5)) <<<

--- Pruning Layers [0, 2, 5] ---
Original number of layers: 6
Number of layers to remove: 3
Number of layers to keep: 3
Indices of layers KEPT: [1, 3, 4]
Updated model config 'n_layer' to: 3
Pruned Model (Layers [0, 2, 5] removed):
 - Layers remaining: 3
 - Total parameters: 60.65 M
---------------------------

--- Evaluating Model with Layers (0, 2, 5) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 9.2803

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 312.97 ms
Results stored for combination (0, 2, 5). Time taken: 31.58s
Cleaned up memory after processing combination (0, 2, 5).

>>> Processing Combination 8/20 (Remove Layers: (0, 3, 4)) <<<

--- Pruning Layers [0, 3, 4] ---
Original number of layers: 6
Number of layers to remove: 3
Number of layers to keep: 3
Indices of layers KEPT: [1, 2, 5]
Updated model config 'n_layer' to: 3
Pruned Model (Layers [0, 3, 4] removed):
 - Layers remaining: 3
 - Total parameters: 60.65 M
---------------------------

--- Evaluating Model with Layers (0, 3, 4) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 9.3313

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 282.95 ms
Results stored for combination (0, 3, 4). Time taken: 30.87s
Cleaned up memory after processing combination (0, 3, 4).

>>> Processing Combination 9/20 (Remove Layers: (0, 3, 5)) <<<

--- Pruning Layers [0, 3, 5] ---
Original number of layers: 6
Number of layers to remove: 3
Number of layers to keep: 3
Indices of layers KEPT: [1, 2, 4]
Updated model config 'n_layer' to: 3
Pruned Model (Layers [0, 3, 5] removed):
 - Layers remaining: 3
 - Total parameters: 60.65 M
---------------------------

--- Evaluating Model with Layers (0, 3, 5) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 9.4094

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 280.63 ms
Results stored for combination (0, 3, 5). Time taken: 39.25s
Cleaned up memory after processing combination (0, 3, 5).

>>> Processing Combination 10/20 (Remove Layers: (0, 4, 5)) <<<

--- Pruning Layers [0, 4, 5] ---
Original number of layers: 6
Number of layers to remove: 3
Number of layers to keep: 3
Indices of layers KEPT: [1, 2, 3]
Updated model config 'n_layer' to: 3
Pruned Model (Layers [0, 4, 5] removed):
 - Layers remaining: 3
 - Total parameters: 60.65 M
---------------------------

--- Evaluating Model with Layers (0, 4, 5) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 9.3101

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 286.50 ms
Results stored for combination (0, 4, 5). Time taken: 31.07s
Cleaned up memory after processing combination (0, 4, 5).

>>> Processing Combination 11/20 (Remove Layers: (1, 2, 3)) <<<

--- Pruning Layers [1, 2, 3] ---
Original number of layers: 6
Number of layers to remove: 3
Number of layers to keep: 3
Indices of layers KEPT: [0, 4, 5]
Updated model config 'n_layer' to: 3
Pruned Model (Layers [1, 2, 3] removed):
 - Layers remaining: 3
 - Total parameters: 60.65 M
---------------------------

--- Evaluating Model with Layers (1, 2, 3) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 6.6385

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 322.36 ms
Results stored for combination (1, 2, 3). Time taken: 33.03s
Cleaned up memory after processing combination (1, 2, 3).

>>> Processing Combination 12/20 (Remove Layers: (1, 2, 4)) <<<

--- Pruning Layers [1, 2, 4] ---
Original number of layers: 6
Number of layers to remove: 3
Number of layers to keep: 3
Indices of layers KEPT: [0, 3, 5]
Updated model config 'n_layer' to: 3
Pruned Model (Layers [1, 2, 4] removed):
 - Layers remaining: 3
 - Total parameters: 60.65 M
---------------------------

--- Evaluating Model with Layers (1, 2, 4) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 6.8284

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 254.52 ms
Results stored for combination (1, 2, 4). Time taken: 30.58s
Cleaned up memory after processing combination (1, 2, 4).

>>> Processing Combination 13/20 (Remove Layers: (1, 2, 5)) <<<

--- Pruning Layers [1, 2, 5] ---
Original number of layers: 6
Number of layers to remove: 3
Number of layers to keep: 3
Indices of layers KEPT: [0, 3, 4]
Updated model config 'n_layer' to: 3
Pruned Model (Layers [1, 2, 5] removed):
 - Layers remaining: 3
 - Total parameters: 60.65 M
---------------------------

--- Evaluating Model with Layers (1, 2, 5) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 8.0092

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 269.73 ms
Results stored for combination (1, 2, 5). Time taken: 30.18s
Cleaned up memory after processing combination (1, 2, 5).

>>> Processing Combination 14/20 (Remove Layers: (1, 3, 4)) <<<

--- Pruning Layers [1, 3, 4] ---
Original number of layers: 6
Number of layers to remove: 3
Number of layers to keep: 3
Indices of layers KEPT: [0, 2, 5]
Updated model config 'n_layer' to: 3
Pruned Model (Layers [1, 3, 4] removed):
 - Layers remaining: 3
 - Total parameters: 60.65 M
---------------------------

--- Evaluating Model with Layers (1, 3, 4) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 7.4482

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 284.88 ms
Results stored for combination (1, 3, 4). Time taken: 30.50s
Cleaned up memory after processing combination (1, 3, 4).

>>> Processing Combination 15/20 (Remove Layers: (1, 3, 5)) <<<

--- Pruning Layers [1, 3, 5] ---
Original number of layers: 6
Number of layers to remove: 3
Number of layers to keep: 3
Indices of layers KEPT: [0, 2, 4]
Updated model config 'n_layer' to: 3
Pruned Model (Layers [1, 3, 5] removed):
 - Layers remaining: 3
 - Total parameters: 60.65 M
---------------------------

--- Evaluating Model with Layers (1, 3, 5) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 7.0170

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 251.35 ms
Results stored for combination (1, 3, 5). Time taken: 30.39s
Cleaned up memory after processing combination (1, 3, 5).

>>> Processing Combination 16/20 (Remove Layers: (1, 4, 5)) <<<

--- Pruning Layers [1, 4, 5] ---
Original number of layers: 6
Number of layers to remove: 3
Number of layers to keep: 3
Indices of layers KEPT: [0, 2, 3]
Updated model config 'n_layer' to: 3
Pruned Model (Layers [1, 4, 5] removed):
 - Layers remaining: 3
 - Total parameters: 60.65 M
---------------------------

--- Evaluating Model with Layers (1, 4, 5) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 8.3521

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 335.26 ms
Results stored for combination (1, 4, 5). Time taken: 31.20s
Cleaned up memory after processing combination (1, 4, 5).

>>> Processing Combination 17/20 (Remove Layers: (2, 3, 4)) <<<

--- Pruning Layers [2, 3, 4] ---
Original number of layers: 6
Number of layers to remove: 3
Number of layers to keep: 3
Indices of layers KEPT: [0, 1, 5]
Updated model config 'n_layer' to: 3
Pruned Model (Layers [2, 3, 4] removed):
 - Layers remaining: 3
 - Total parameters: 60.65 M
---------------------------

--- Evaluating Model with Layers (2, 3, 4) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 7.3211

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 236.94 ms
Results stored for combination (2, 3, 4). Time taken: 31.01s
Cleaned up memory after processing combination (2, 3, 4).

>>> Processing Combination 18/20 (Remove Layers: (2, 3, 5)) <<<

--- Pruning Layers [2, 3, 5] ---
Original number of layers: 6
Number of layers to remove: 3
Number of layers to keep: 3
Indices of layers KEPT: [0, 1, 4]
Updated model config 'n_layer' to: 3
Pruned Model (Layers [2, 3, 5] removed):
 - Layers remaining: 3
 - Total parameters: 60.65 M
---------------------------

--- Evaluating Model with Layers (2, 3, 5) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 7.1129

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 258.99 ms
Results stored for combination (2, 3, 5). Time taken: 31.25s
Cleaned up memory after processing combination (2, 3, 5).

>>> Processing Combination 19/20 (Remove Layers: (2, 4, 5)) <<<

--- Pruning Layers [2, 4, 5] ---
Original number of layers: 6
Number of layers to remove: 3
Number of layers to keep: 3
Indices of layers KEPT: [0, 1, 3]
Updated model config 'n_layer' to: 3
Pruned Model (Layers [2, 4, 5] removed):
 - Layers remaining: 3
 - Total parameters: 60.65 M
---------------------------

--- Evaluating Model with Layers (2, 4, 5) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 9.7741

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 290.52 ms
Results stored for combination (2, 4, 5). Time taken: 30.43s
Cleaned up memory after processing combination (2, 4, 5).

>>> Processing Combination 20/20 (Remove Layers: (3, 4, 5)) <<<

--- Pruning Layers [3, 4, 5] ---
Original number of layers: 6
Number of layers to remove: 3
Number of layers to keep: 3
Indices of layers KEPT: [0, 1, 2]
Updated model config 'n_layer' to: 3
Pruned Model (Layers [3, 4, 5] removed):
 - Layers remaining: 3
 - Total parameters: 60.65 M
---------------------------

--- Evaluating Model with Layers (3, 4, 5) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 9.8430

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 283.74 ms
Results stored for combination (3, 4, 5). Time taken: 30.69s
Cleaned up memory after processing combination (3, 4, 5).

===== Evaluating Combinations with 4 Layers Removed =====

>>> Processing Combination 1/15 (Remove Layers: (0, 1, 2, 3)) <<<

--- Pruning Layers [0, 1, 2, 3] ---
Original number of layers: 6
Number of layers to remove: 4
Number of layers to keep: 2
Indices of layers KEPT: [4, 5]
Updated model config 'n_layer' to: 2
Pruned Model (Layers [0, 1, 2, 3] removed):
 - Layers remaining: 2
 - Total parameters: 53.56 M
---------------------------

--- Evaluating Model with Layers (0, 1, 2, 3) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 14.3528

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 235.66 ms
Results stored for combination (0, 1, 2, 3). Time taken: 26.87s
Cleaned up memory after processing combination (0, 1, 2, 3).

>>> Processing Combination 2/15 (Remove Layers: (0, 1, 2, 4)) <<<

--- Pruning Layers [0, 1, 2, 4] ---
Original number of layers: 6
Number of layers to remove: 4
Number of layers to keep: 2
Indices of layers KEPT: [3, 5]
Updated model config 'n_layer' to: 2
Pruned Model (Layers [0, 1, 2, 4] removed):
 - Layers remaining: 2
 - Total parameters: 53.56 M
---------------------------

--- Evaluating Model with Layers (0, 1, 2, 4) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 14.1869

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 292.53 ms
Results stored for combination (0, 1, 2, 4). Time taken: 27.61s
Cleaned up memory after processing combination (0, 1, 2, 4).

>>> Processing Combination 3/15 (Remove Layers: (0, 1, 2, 5)) <<<

--- Pruning Layers [0, 1, 2, 5] ---
Original number of layers: 6
Number of layers to remove: 4
Number of layers to keep: 2
Indices of layers KEPT: [3, 4]
Updated model config 'n_layer' to: 2
Pruned Model (Layers [0, 1, 2, 5] removed):
 - Layers remaining: 2
 - Total parameters: 53.56 M
---------------------------

--- Evaluating Model with Layers (0, 1, 2, 5) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 14.6090

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 237.60 ms
Results stored for combination (0, 1, 2, 5). Time taken: 28.38s
Cleaned up memory after processing combination (0, 1, 2, 5).

>>> Processing Combination 4/15 (Remove Layers: (0, 1, 3, 4)) <<<

--- Pruning Layers [0, 1, 3, 4] ---
Original number of layers: 6
Number of layers to remove: 4
Number of layers to keep: 2
Indices of layers KEPT: [2, 5]
Updated model config 'n_layer' to: 2
Pruned Model (Layers [0, 1, 3, 4] removed):
 - Layers remaining: 2
 - Total parameters: 53.56 M
---------------------------

--- Evaluating Model with Layers (0, 1, 3, 4) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 8.9866

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 236.52 ms
Results stored for combination (0, 1, 3, 4). Time taken: 27.58s
Cleaned up memory after processing combination (0, 1, 3, 4).

>>> Processing Combination 5/15 (Remove Layers: (0, 1, 3, 5)) <<<

--- Pruning Layers [0, 1, 3, 5] ---
Original number of layers: 6
Number of layers to remove: 4
Number of layers to keep: 2
Indices of layers KEPT: [2, 4]
Updated model config 'n_layer' to: 2
Pruned Model (Layers [0, 1, 3, 5] removed):
 - Layers remaining: 2
 - Total parameters: 53.56 M
---------------------------

--- Evaluating Model with Layers (0, 1, 3, 5) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 15.5091

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 285.75 ms
Results stored for combination (0, 1, 3, 5). Time taken: 27.22s
Cleaned up memory after processing combination (0, 1, 3, 5).

>>> Processing Combination 6/15 (Remove Layers: (0, 1, 4, 5)) <<<

--- Pruning Layers [0, 1, 4, 5] ---
Original number of layers: 6
Number of layers to remove: 4
Number of layers to keep: 2
Indices of layers KEPT: [2, 3]
Updated model config 'n_layer' to: 2
Pruned Model (Layers [0, 1, 4, 5] removed):
 - Layers remaining: 2
 - Total parameters: 53.56 M
---------------------------

--- Evaluating Model with Layers (0, 1, 4, 5) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 14.3489

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 237.80 ms
Results stored for combination (0, 1, 4, 5). Time taken: 26.91s
Cleaned up memory after processing combination (0, 1, 4, 5).

>>> Processing Combination 7/15 (Remove Layers: (0, 2, 3, 4)) <<<

--- Pruning Layers [0, 2, 3, 4] ---
Original number of layers: 6
Number of layers to remove: 4
Number of layers to keep: 2
Indices of layers KEPT: [1, 5]
Updated model config 'n_layer' to: 2
Pruned Model (Layers [0, 2, 3, 4] removed):
 - Layers remaining: 2
 - Total parameters: 53.56 M
---------------------------

--- Evaluating Model with Layers (0, 2, 3, 4) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 8.8172

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 235.60 ms
Results stored for combination (0, 2, 3, 4). Time taken: 28.28s
Cleaned up memory after processing combination (0, 2, 3, 4).

>>> Processing Combination 8/15 (Remove Layers: (0, 2, 3, 5)) <<<

--- Pruning Layers [0, 2, 3, 5] ---
Original number of layers: 6
Number of layers to remove: 4
Number of layers to keep: 2
Indices of layers KEPT: [1, 4]
Updated model config 'n_layer' to: 2
Pruned Model (Layers [0, 2, 3, 5] removed):
 - Layers remaining: 2
 - Total parameters: 53.56 M
---------------------------

--- Evaluating Model with Layers (0, 2, 3, 5) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 9.2290

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 235.37 ms
Results stored for combination (0, 2, 3, 5). Time taken: 32.98s
Cleaned up memory after processing combination (0, 2, 3, 5).

>>> Processing Combination 9/15 (Remove Layers: (0, 2, 4, 5)) <<<

--- Pruning Layers [0, 2, 4, 5] ---
Original number of layers: 6
Number of layers to remove: 4
Number of layers to keep: 2
Indices of layers KEPT: [1, 3]
Updated model config 'n_layer' to: 2
Pruned Model (Layers [0, 2, 4, 5] removed):
 - Layers remaining: 2
 - Total parameters: 53.56 M
---------------------------

--- Evaluating Model with Layers (0, 2, 4, 5) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 9.3003

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 251.29 ms
Results stored for combination (0, 2, 4, 5). Time taken: 28.46s
Cleaned up memory after processing combination (0, 2, 4, 5).

>>> Processing Combination 10/15 (Remove Layers: (0, 3, 4, 5)) <<<

--- Pruning Layers [0, 3, 4, 5] ---
Original number of layers: 6
Number of layers to remove: 4
Number of layers to keep: 2
Indices of layers KEPT: [1, 2]
Updated model config 'n_layer' to: 2
Pruned Model (Layers [0, 3, 4, 5] removed):
 - Layers remaining: 2
 - Total parameters: 53.56 M
---------------------------

--- Evaluating Model with Layers (0, 3, 4, 5) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 9.3235

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 236.33 ms
Results stored for combination (0, 3, 4, 5). Time taken: 27.48s
Cleaned up memory after processing combination (0, 3, 4, 5).

>>> Processing Combination 11/15 (Remove Layers: (1, 2, 3, 4)) <<<

--- Pruning Layers [1, 2, 3, 4] ---
Original number of layers: 6
Number of layers to remove: 4
Number of layers to keep: 2
Indices of layers KEPT: [0, 5]
Updated model config 'n_layer' to: 2
Pruned Model (Layers [1, 2, 3, 4] removed):
 - Layers remaining: 2
 - Total parameters: 53.56 M
---------------------------

--- Evaluating Model with Layers (1, 2, 3, 4) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 7.4719

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 237.69 ms
Results stored for combination (1, 2, 3, 4). Time taken: 27.09s
Cleaned up memory after processing combination (1, 2, 3, 4).

>>> Processing Combination 12/15 (Remove Layers: (1, 2, 3, 5)) <<<

--- Pruning Layers [1, 2, 3, 5] ---
Original number of layers: 6
Number of layers to remove: 4
Number of layers to keep: 2
Indices of layers KEPT: [0, 4]
Updated model config 'n_layer' to: 2
Pruned Model (Layers [1, 2, 3, 5] removed):
 - Layers remaining: 2
 - Total parameters: 53.56 M
---------------------------

--- Evaluating Model with Layers (1, 2, 3, 5) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 9.0947

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 222.81 ms
Results stored for combination (1, 2, 3, 5). Time taken: 27.78s
Cleaned up memory after processing combination (1, 2, 3, 5).

>>> Processing Combination 13/15 (Remove Layers: (1, 2, 4, 5)) <<<

--- Pruning Layers [1, 2, 4, 5] ---
Original number of layers: 6
Number of layers to remove: 4
Number of layers to keep: 2
Indices of layers KEPT: [0, 3]
Updated model config 'n_layer' to: 2
Pruned Model (Layers [1, 2, 4, 5] removed):
 - Layers remaining: 2
 - Total parameters: 53.56 M
---------------------------

--- Evaluating Model with Layers (1, 2, 4, 5) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 10.9134

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 232.16 ms
Results stored for combination (1, 2, 4, 5). Time taken: 27.52s
Cleaned up memory after processing combination (1, 2, 4, 5).

>>> Processing Combination 14/15 (Remove Layers: (1, 3, 4, 5)) <<<

--- Pruning Layers [1, 3, 4, 5] ---
Original number of layers: 6
Number of layers to remove: 4
Number of layers to keep: 2
Indices of layers KEPT: [0, 2]
Updated model config 'n_layer' to: 2
Pruned Model (Layers [1, 3, 4, 5] removed):
 - Layers remaining: 2
 - Total parameters: 53.56 M
---------------------------

--- Evaluating Model with Layers (1, 3, 4, 5) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 10.6969

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 234.92 ms
Results stored for combination (1, 3, 4, 5). Time taken: 26.93s
Cleaned up memory after processing combination (1, 3, 4, 5).

>>> Processing Combination 15/15 (Remove Layers: (2, 3, 4, 5)) <<<

--- Pruning Layers [2, 3, 4, 5] ---
Original number of layers: 6
Number of layers to remove: 4
Number of layers to keep: 2
Indices of layers KEPT: [0, 1]
Updated model config 'n_layer' to: 2
Pruned Model (Layers [2, 3, 4, 5] removed):
 - Layers remaining: 2
 - Total parameters: 53.56 M
---------------------------

--- Evaluating Model with Layers (2, 3, 4, 5) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 10.1196

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 237.16 ms
Results stored for combination (2, 3, 4, 5). Time taken: 27.05s
Cleaned up memory after processing combination (2, 3, 4, 5).

===== Evaluating Combinations with 5 Layers Removed =====

>>> Processing Combination 1/6 (Remove Layers: (0, 1, 2, 3, 4)) <<<

--- Pruning Layers [0, 1, 2, 3, 4] ---
Original number of layers: 6
Number of layers to remove: 5
Number of layers to keep: 1
Indices of layers KEPT: [5]
Updated model config 'n_layer' to: 1
Pruned Model (Layers [0, 1, 2, 3, 4] removed):
 - Layers remaining: 1
 - Total parameters: 46.47 M
---------------------------

--- Evaluating Model with Layers (0, 1, 2, 3, 4) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 11.0265

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 191.99 ms
Results stored for combination (0, 1, 2, 3, 4). Time taken: 23.07s
Cleaned up memory after processing combination (0, 1, 2, 3, 4).

>>> Processing Combination 2/6 (Remove Layers: (0, 1, 2, 3, 5)) <<<

--- Pruning Layers [0, 1, 2, 3, 5] ---
Original number of layers: 6
Number of layers to remove: 5
Number of layers to keep: 1
Indices of layers KEPT: [4]
Updated model config 'n_layer' to: 1
Pruned Model (Layers [0, 1, 2, 3, 5] removed):
 - Layers remaining: 1
 - Total parameters: 46.47 M
---------------------------

--- Evaluating Model with Layers (0, 1, 2, 3, 5) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 14.3624

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 191.57 ms
Results stored for combination (0, 1, 2, 3, 5). Time taken: 23.46s
Cleaned up memory after processing combination (0, 1, 2, 3, 5).

>>> Processing Combination 3/6 (Remove Layers: (0, 1, 2, 4, 5)) <<<

--- Pruning Layers [0, 1, 2, 4, 5] ---
Original number of layers: 6
Number of layers to remove: 5
Number of layers to keep: 1
Indices of layers KEPT: [3]
Updated model config 'n_layer' to: 1
Pruned Model (Layers [0, 1, 2, 4, 5] removed):
 - Layers remaining: 1
 - Total parameters: 46.47 M
---------------------------

--- Evaluating Model with Layers (0, 1, 2, 4, 5) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 13.4933

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 194.23 ms
Results stored for combination (0, 1, 2, 4, 5). Time taken: 23.51s
Cleaned up memory after processing combination (0, 1, 2, 4, 5).

>>> Processing Combination 4/6 (Remove Layers: (0, 1, 3, 4, 5)) <<<

--- Pruning Layers [0, 1, 3, 4, 5] ---
Original number of layers: 6
Number of layers to remove: 5
Number of layers to keep: 1
Indices of layers KEPT: [2]
Updated model config 'n_layer' to: 1
Pruned Model (Layers [0, 1, 3, 4, 5] removed):
 - Layers remaining: 1
 - Total parameters: 46.47 M
---------------------------

--- Evaluating Model with Layers (0, 1, 3, 4, 5) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 10.7707

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 193.76 ms
Results stored for combination (0, 1, 3, 4, 5). Time taken: 23.66s
Cleaned up memory after processing combination (0, 1, 3, 4, 5).

>>> Processing Combination 5/6 (Remove Layers: (0, 2, 3, 4, 5)) <<<

--- Pruning Layers [0, 2, 3, 4, 5] ---
Original number of layers: 6
Number of layers to remove: 5
Number of layers to keep: 1
Indices of layers KEPT: [1]
Updated model config 'n_layer' to: 1
Pruned Model (Layers [0, 2, 3, 4, 5] removed):
 - Layers remaining: 1
 - Total parameters: 46.47 M
---------------------------

--- Evaluating Model with Layers (0, 2, 3, 4, 5) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 9.1107

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 193.13 ms
Results stored for combination (0, 2, 3, 4, 5). Time taken: 23.73s
Cleaned up memory after processing combination (0, 2, 3, 4, 5).

>>> Processing Combination 6/6 (Remove Layers: (1, 2, 3, 4, 5)) <<<

--- Pruning Layers [1, 2, 3, 4, 5] ---
Original number of layers: 6
Number of layers to remove: 5
Number of layers to keep: 1
Indices of layers KEPT: [0]
Updated model config 'n_layer' to: 1
Pruned Model (Layers [1, 2, 3, 4, 5] removed):
 - Layers remaining: 1
 - Total parameters: 46.47 M
---------------------------

--- Evaluating Model with Layers (1, 2, 3, 4, 5) Removed ---

Calculating perplexity for model on 'wikitext/wikitext-2-raw-v1' [test] split...
Loading dataset...
Dataset loaded.
Tokenizing dataset...
Tokenization complete.
Processing sequence of length 287644 with stride 512 and max_length 1024...


  0%|          | 0/562 [00:00<?, ?it/s]

Calculation complete. Mean NLL: 10.8397

Measuring inference speed (average over 10 runs)...
Performing warm-up run...
Starting timed runs...


  0%|          | 0/10 [00:00<?, ?it/s]

Speed test complete. Average time: 201.54 ms
Results stored for combination (1, 2, 3, 4, 5). Time taken: 23.81s
Cleaned up memory after processing combination (1, 2, 3, 4, 5).


--- Combinatorial Pruning Evaluation Summary ---
Total evaluation time: 28.99 minutes
                   Layer Removed  Layers  Perplexity (PPL)  \
Layers Removed                                               
(1, 3)                       NaN       4            156.42   
(1, 4)                       NaN       4            187.22   
(2, 3)                       NaN       4            196.88   
(2, 4)                       NaN       4            318.77   
(1, 2)                       NaN       4            382.62   
(3, 5)                       NaN       4            537.18   
(2, 5)                       NaN       4            555.80   
(3, 4)                       NaN       4            655.05   
(1, 5)                       NaN       4          1,289.78   
(4, 5)                       NaN       4          2,60

KeyError: "['None (Original)'] not found in axis"