# Quantize huggingface transformers

In [2]:
import os
import re
import pandas as pd
import numpy as np
from sklearn import preprocessing

from transformers import AutoTokenizer
from datasets import Dataset
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

import torch
from torch.utils.checkpoint import checkpoint
import torch.nn as nn
import torch.nn.functional as F

os.environ["WANDB_DISABLED"] = "true"
os.environ["TOKENIZERS_PARALLELISM"] = "true"
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'



## Define configuration

In [3]:
# You can change the model name here, look up model names from huggingface docs
model_name = "distilbert-base-uncased"

## Prepare Data

In [4]:
from datasets import load_dataset

dataset = load_dataset("tweet_eval", "emotion") # use tweet_eval dataset
dataset

Downloading builder script:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Downloading and preparing dataset tweet_eval/emotion (download: 472.47 KiB, generated: 511.52 KiB, post-processed: Unknown size, total: 984.00 KiB) to /root/.cache/huggingface/datasets/tweet_eval/emotion/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343...


Downloading data files:   0%|          | 0/6 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/134k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/60.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/183 [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/6 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/3257 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1421 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/374 [00:00<?, ? examples/s]

Dataset tweet_eval downloaded and prepared to /root/.cache/huggingface/datasets/tweet_eval/emotion/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 3257
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1421
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 374
    })
})

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.model_max_length = 512

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

## Create tokenized dataset

In [6]:
def tokenize_function(examples):
    return tokenizer(examples["text"])

tokenized_train_dataset = dataset["train"].shuffle(seed=42).map(tokenize_function, batched=True)
tokenized_test_dataset = dataset["test"].map(tokenize_function, batched=True)

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [7]:
tokenized_train_dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 3257
})

## Define Dynamic padding

In [8]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Define model

In [9]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
total_number_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_number_params

66956548

In [11]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

## Define Training Arguments

In [12]:
def f_train_args(runname):
    training_args = TrainingArguments(
        run_name=runname,
        output_dir="./results",
        num_train_epochs=2,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        warmup_ratio=0.1, 
        lr_scheduler_type='cosine',
        # Optimising
        auto_find_batch_size=True,
        # The num of workers may vary for different machines, if you are not sure, just comment this line out
        dataloader_num_workers=4,
        gradient_accumulation_steps=4,
        fp16=True,
    )
    return training_args

In [13]:
from transformers import EvalPrediction
from typing import Dict
from sklearn.metrics import precision_score, recall_score, f1_score
from torch.nn.functional import cross_entropy

def custom_compute_metrics(res) -> Dict:
    pred = res.predictions.argmax(axis=1)
    target = res.label_ids
    precision = precision_score(target, pred, average='macro')
    recall = recall_score(target, pred, average='macro')
    f1 = f1_score(target, pred, average='macro')
    return {
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs["labels"]
        outputs = model(**inputs)
        logits = outputs['logits']
        
        loss = cross_entropy(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

## Define Trainer

In [14]:
trainer = CustomTrainer(
    model=model,
    args=f_train_args(model_name),
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=custom_compute_metrics,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


## Train the model

In [15]:
trainer.train()
# save parameters
torch.save(model.state_dict(), 'model.pth')

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.564525,0.779583,0.742608,0.756929
2,No log,0.542564,0.773946,0.757207,0.764197


In [16]:
trainer.evaluate()

{'eval_loss': 0.5425638556480408,
 'eval_precision': 0.7739463369644577,
 'eval_recall': 0.7572065918258581,
 'eval_f1': 0.7641968815279021,
 'eval_runtime': 2.1053,
 'eval_samples_per_second': 674.968,
 'eval_steps_per_second': 84.549,
 'epoch': 2.0}

# Swap layers

In [18]:
class TorchQuantize(nn.Module):
    """ Quantize a tensor to a bitwidth larger than 1 """
    def __init__(self, bits=2):
        super(TorchQuantize, self).__init__()
        self.bits = bits
        self.epsilon = 1e-7

    def forward(self, x):
        """ extract the sign of each element """
        sign = torch.sign(x).detach()
        
        """ get the mantessa bits """
        x_pos = torch.abs(x)
        scaling = torch.max(x_pos).detach() + self.epsilon
        x_pos = torch.clamp(x_pos / scaling, 0, 1)
        
        """ round the mantessa bits to the required precision """
        x_pos = torch.round(x_pos * (2.0 ** self.bits - 1.0)) / (2.0 ** self.bits - 1.0)
        
        return x_pos * sign * scaling

In [19]:
class QuantizedLinear(nn.Linear):
    """ 
    A fully connected layer with its weight tensor and input tensor quantized. 
    """
    def __init__(self, in_features, out_features, bias=True, wbits=0, abits=0):
        super(QuantizedLinear, self).__init__(in_features, out_features, bias)
        self.quantize_w = TorchQuantize(wbits)
        self.quantize_a = TorchQuantize(abits)
        

    def forward(self, x):
        """ 
        1. Quantize the input tensor
        2. Quantize the weight tensor
        3. perform matrix multiplication 
        """
        
        return F.linear(
                self.quantize_a(x), 
                self.quantize_w(self.weight), 
                self.bias)

In [20]:
def replace_layer(module, name, k=8):
    '''
    Replace linear layer to quantized layer
    '''
    # go through all attributes of module nn.module (e.g. network or layer) and put batch norms if present
    for attr_str in dir(module):
        target_attr = getattr(module, attr_str)
        if type(target_attr) == torch.nn.Linear:
            new = QuantizedLinear(target_attr.in_features, target_attr.out_features, bias=True, 
                                     wbits=k, abits=k)
            setattr(module, attr_str, new)
            
    for name, immediate_child_module in module.named_children():
        replace_layer(immediate_child_module, name, k)


In [40]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)
replace_layer(model, 'model', k=4) # quantize
model.cuda()
model.load_state_dict(torch.load('model.pth'))

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [41]:
trainer = CustomTrainer(
    model=model,
    args=f_train_args(model_name),
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=custom_compute_metrics,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [42]:
trainer.evaluate()

{'eval_loss': 1.350499153137207,
 'eval_precision': 0.6069754265636477,
 'eval_recall': 0.35861143789820116,
 'eval_f1': 0.30798124802618415,
 'eval_runtime': 4.5235,
 'eval_samples_per_second': 314.135,
 'eval_steps_per_second': 39.35}

In [26]:
results = dict()

for k in [4, 5, 6, 7, 8]:
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)
    replace_layer(model, 'model', k=k) # quantize
    model.cuda()
    model.load_state_dict(torch.load('model.pth'))

    trainer = CustomTrainer(
        model=model,
        args=f_train_args(model_name),
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_test_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=custom_compute_metrics,
    )

    results[k] = trainer.evaluate()['eval_f1']

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


  _warn_prf(average, modifier, msg_start, len(result))
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [27]:
for k in [4, 5, 6, 7, 8]:
    print(k, results[k])

4 0.28936695146878477
5 0.36912546409110236
6 0.6921907190016314
7 0.7467967537331747
8 0.7648068924043765


# Train quantized network

In [28]:
for k in [4, 5, 6, 7, 8]:
    training_args = f_train_args("tweet_distilbert-base-k{}".format(k))

    # load parameters
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)
    replace_layer(model, 'model', k=k) # quantize
    model.cuda()
    model.load_state_dict(torch.load('model.pth'))

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_test_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=custom_compute_metrics,
    )

    # Train loop
    trainer.train()
    torch.save(model.state_dict(), 'model_quant_k{}.pth'.format(k))

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,1.339682,0.333203,0.34165,0.288238
2,No log,1.339414,0.336921,0.339665,0.288277


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,1.157594,0.464906,0.424846,0.370887
2,No log,1.157389,0.465373,0.42525,0.371894


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.772099,0.740728,0.677917,0.69176
2,No log,0.772107,0.740965,0.678365,0.692233


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.56717,0.786092,0.729275,0.746797
2,No log,0.567167,0.786092,0.729275,0.746797


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.544376,0.766521,0.764279,0.764807
2,No log,0.544354,0.766521,0.764279,0.764807


In [29]:
results = dict()

for k in [4, 5, 6, 7, 8]:
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)
    replace_layer(model, 'model', k=k) # quantize
    model.cuda()
    model.load_state_dict(torch.load('model_quant_k{}.pth'.format(k)))

    trainer = CustomTrainer(
        model=model,
        args=f_train_args(model_name),
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_test_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=custom_compute_metrics,
    )

    results[k] = trainer.evaluate()['eval_f1']

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


  _warn_prf(average, modifier, msg_start, len(result))
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [30]:
for k in [4, 5, 6, 7, 8]:
    print(k, results[k])

4 0.2882769322859439
5 0.3718944964689227
6 0.6922333341599276
7 0.7467967537331747
8 0.7648068924043765
