In [30]:
import os
import torch
import pandas as pd
from IPython.display import display
from transformers import RobertaModel, DistilBertModel, AutoModel

In [31]:
# Checking if GPU is available
print(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

cuda


In [32]:
def load_model(model_type, model_path):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)
    if model_path and os.path.exists(model_path):
        # Try loading as a generic PyTorch model
        try:
            model = torch.load(model_path, map_location=device)
        except Exception as e:
            raise IOError(f"Error loading model from {model_path}: {e}")
    elif model_type == 'distilroberta-base':
        model = DistilBertModel.from_pretrained('distilroberta-base')
    elif model_type == 'roberta-base':
        model = RobertaModel.from_pretrained('roberta-base')
    else:
        raise ValueError("Invalid model type or path")

    return model.to(device)

def check_sparsity(model):
    total_params = 0
    nonzero_params = 0
    layer_sparsity = {}
    for name, param in model.named_parameters():
        if not param.requires_grad:  # exclude non-trainable parameters
            continue
        layer_size = param.numel()
        layer_nonzero = torch.count_nonzero(param)
        layer_sparsity[name] = 1 - layer_nonzero.item() / layer_size
        total_params += layer_size
        nonzero_params += layer_nonzero.item()
    overall_sparsity = 1 - nonzero_params / total_params
    print(f"Overall Sparsity: {overall_sparsity:.4%}")
    layer_sparsity_df = pd.DataFrame(layer_sparsity.items(), columns=['Layer Name', 'Sparsity'])
    # layer_sparsity_df.sort_values(by='Sparsity', ascending=False, inplace=True)
    display(layer_sparsity_df)

### Layer-wise Pruning: Approach 1

This approach calculates a threshold for each layer individually. It views the weights of a single layer as a flattened tensor, splits them into batches, and computes the quantile for each batch based on the pruning rate. The mean of these batched quantiles is used as the threshold for pruning the layer. This method applies the computed threshold to the layer's parameters, pruning weights below the threshold.

In [33]:
def approach1_threshold(param, pruning_rate, batch_size, device):
    weights = param.view(-1)
    batched_quantiles = [torch.quantile(batch.abs(), pruning_rate) for batch in weights.split(batch_size)]
    return torch.tensor(batched_quantiles).mean().to(device)

def mpruner_approach1(model, pruning_rate, batch_size, device):
    for name, param in model.named_parameters():
        if param.requires_grad and param.dim() > 1:
            layer_threshold = approach1_threshold(param, pruning_rate, batch_size, device)
            # print(f"Layer: {name}, Threshold: {layer_threshold}")
            mask = param.abs() > layer_threshold
            param.data.mul_(mask.to(torch.float32))
    return model

### Layer-wise Pruning: Approach 2

This method groups parameters by layer names and concatenates all the parameters of a layer before computing the threshold. It calculates a single threshold for each layer by considering all its parameters together, and then applies this threshold across all parameters of that layer.

In [34]:
def approach2_threshold(layer_params, pruning_rate, batch_size, device):
    all_weights = [param.view(-1) for param in layer_params]
    all_weights = torch.cat(all_weights).to(device)
    batched_quantiles = [torch.quantile(batch.abs(), pruning_rate) for batch in all_weights.split(batch_size)]
    return torch.tensor(batched_quantiles).mean().to(device)

def mpruner_approach2(model, pruning_rate, batch_size, device):
    layer_params = {}
    for name, param in model.named_parameters():
        if param.requires_grad and param.dim() > 1:
            # Assuming layer name is the first part of the parameter name
            layer_name = name.split('.')[0]
            if layer_name not in layer_params:
                layer_params[layer_name] = []
            layer_params[layer_name].append(param)
    for layer_name, params in layer_params.items():
        layer_threshold = approach2_threshold(params, pruning_rate, batch_size, device)
        # print(f"Layer: {layer_name}, Threshold: {layer_threshold}")
        for param in params:
            mask = param.abs() > layer_threshold
            param.data.mul_(mask.to(torch.float32))
    return model

### Layer-wise Pruning: Approach 3

Approach 3 categorizes layers into types (e.g., embedding, attention, etc.) and computes a threshold for each category. It aggregates all weights of a given type across the model, computes the batched quantiles, and uses the maximum mean quantile as the threshold for that category. This method prunes parameters based on their layer type classification. This is a rather crude way of doing things, as categories could be ill-defined atm.

In [44]:
def approach3_categorize(name):
    if 'embeddings' in name:
        return 'Embedding Layers'
    elif 'attention.self.query' in name or 'attention.self.key' in name or 'attention.self.value' in name:
        return 'Attention Layers'
    elif 'attention.output' in name:
        return 'Attention Output Layers'
    elif 'intermediate' in name:
        return 'Intermediate Layers'
    elif 'output' in name:
        return 'Encoder Output Layers'
    elif 'pooler' in name:
        return 'Pooling Layer'
    elif 'LayerNorm' in name:
        return 'Layer Normalizations'
    elif 'dropout' in name:
        return 'Dropout Layers'
    else:
        return 'Other'

def approach3_threshold(model, pruning_rate, batch_size, device):
    thresholds = {}
    for name, param in model.named_parameters():
        if param.requires_grad and param.dim() > 1:
            layer_type = approach3_categorize(name)
            all_weights = param.view(-1).abs().to(device)
            batched_quantiles = [torch.quantile(batch, pruning_rate) for batch in all_weights.split(batch_size)]
            layer_threshold = torch.tensor(batched_quantiles).mean().item()
            if layer_type not in thresholds or thresholds[layer_type] < layer_threshold:
                thresholds[layer_type] = layer_threshold
    return thresholds

def mpruner_approach3(model, pruning_rate, batch_size, device):
    thresholds = approach3_threshold(model, pruning_rate, batch_size, device)
    for name, param in model.named_parameters():
        if param.requires_grad and param.dim() > 1:
            layer_type = approach3_categorize(name)
            threshold = thresholds.get(layer_type, 0)
            mask = param.abs() > threshold
            param.data.mul_(mask.to(torch.float32))
    return model

In [36]:
model_type = 'roberta-base'     # Can be 'roberta-base', 'distilroberta-base', or a custom model path
model_path = None               # Set this to None if you want to use pre-trained models

# Loading model
model = load_model(model_type, model_path)
print(type(model))

Using device: cuda


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<class 'transformers.models.roberta.modeling_roberta.RobertaModel'>


In [48]:
model

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropou

In [37]:
# Checking sparsity before pruning
layer_sparsity = check_sparsity(model)

Overall Sparsity: 0.0019%


In [38]:
# Pruning model
pruning_rate = 0.2              # Between 0 and 1
batch_size = 5000
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [39]:
pruned_model_1 = mpruner_approach1(model, pruning_rate, batch_size, device)

In [40]:
# Checking sparsity after pruning
check_sparsity(pruned_model_1)

Overall Sparsity: 20.4159%


Unnamed: 0,Layer Name,Sparsity
0,embeddings.word_embeddings.weight,0.202470
1,embeddings.position_embeddings.weight,0.200569
2,embeddings.token_type_embeddings.weight,1.000000
3,embeddings.LayerNorm.weight,0.000000
4,embeddings.LayerNorm.bias,0.000000
...,...,...
194,encoder.layer.11.output.dense.bias,0.000000
195,encoder.layer.11.output.LayerNorm.weight,0.000000
196,encoder.layer.11.output.LayerNorm.bias,0.000000
197,pooler.dense.weight,0.200075


In [41]:
pruned_model_2 = mpruner_approach2(model, pruning_rate, batch_size, device)

Layer: embeddings, Threshold: 0.0174578744918108
Layer: encoder, Threshold: 0.0063947513699531555
Layer: pooler, Threshold: 0.0022366500925272703


In [42]:
# Checking sparsity after pruning
check_sparsity(pruned_model_2)

Overall Sparsity: 20.4493%


Unnamed: 0,Layer Name,Sparsity
0,embeddings.word_embeddings.weight,0.202470
1,embeddings.position_embeddings.weight,0.306283
2,embeddings.token_type_embeddings.weight,1.000000
3,embeddings.LayerNorm.weight,0.000000
4,embeddings.LayerNorm.bias,0.000000
...,...,...
194,encoder.layer.11.output.dense.bias,0.000000
195,encoder.layer.11.output.LayerNorm.weight,0.000000
196,encoder.layer.11.output.LayerNorm.bias,0.000000
197,pooler.dense.weight,0.200075


In [45]:
pruned_model_3 = mpruner_approach3(model, pruning_rate, batch_size, device)

In [46]:
# Checking sparsity after pruning
check_sparsity(pruned_model_3)

Overall Sparsity: 20.8639%


Unnamed: 0,Layer Name,Sparsity
0,embeddings.word_embeddings.weight,0.202470
1,embeddings.position_embeddings.weight,0.308107
2,embeddings.token_type_embeddings.weight,1.000000
3,embeddings.LayerNorm.weight,0.000000
4,embeddings.LayerNorm.bias,0.000000
...,...,...
194,encoder.layer.11.output.dense.bias,0.000000
195,encoder.layer.11.output.LayerNorm.weight,0.000000
196,encoder.layer.11.output.LayerNorm.bias,0.000000
197,pooler.dense.weight,0.200075


In [8]:
# Saving the model
filename = f"{model_type}-mpruned-layerwise-base-{pruning_rate:.2f}.pt"
print(f"Saving model to {filename}")
torch.save(pruned_model_1, filename)
torch.cuda.empty_cache()

Saving model to roberta-base-mpruned-layerwise-base-0.20.pt
