In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TextGenerationPipeline
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
import torch
import torch.nn as nn
import os
import time

In [2]:
# Source checkpoint identifier and the directory the quantized copy is written to
# (the quantization cell checks this path with os.path.exists before doing any work).
pretrained_model_dir = "facebook/opt-2.7b"
quantized_model_dir = "facebook/opt-2.7b-4bit-128g"

# Forwarded as `max_memory=` to the AutoGPTQ loaders used later in the notebook.
# presumably key 0 is the GPU index and the value caps its usage -- confirm against AutoGPTQ docs
max_memory = {0: "11GiB"}

# 4-bit quantization; a group size of 128 is the recommended setting.
quantize_config = BaseQuantizeConfig(bits=4, group_size=128)

In [3]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

!nvidia-smi

Mon May 15 03:53:17 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02              Driver Version: 530.30.02    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3060         On | 00000000:01:00.0 Off |                  N/A |
| 44%   51C    P8               20W / 170W|      3MiB / 12288MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [4]:
# Tokenizer of the un-quantized checkpoint; it is reused both to build the
# calibration examples and by the evaluation task further down.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)

In [5]:
import spacy


# spaCy pipeline used below to split the sample article into sentences (`doc.sents`).
# NOTE(review): the "en_core_web_md" package must already be installed -- confirm setup steps.
nlp = spacy.load("en_core_web_md")


text = """Yaccarino was earlier the advertising chief for Comcast Corp's NBCUniversal

Newly appointed Twitter CEO Linda Yaccarino said in a tweet on Saturday that she has been inspired by owner Elon Musk's vision to create a brighter future and is excited to help to transform the social media platform.

It was the first time Yaccarino has spoken publicly since the news broke Thursday that she was in talks to become the next CEO of Twitter. Musk, who has served as CEO since his $44 billion buyout of Twitter last October, announced her appointment on Friday.

"I've long been inspired by [Musk's] vision to create a brighter future. I'm excited to help bring this vision to Twitter and transform this business together!" Yaccarino tweeted.

    Thank you @elonmusk!

    I've long been inspired by your vision to create a brighter future. I'm excited to help bring this vision to Twitter and transform this business together! https://t.co/BcvySu7K76
    — Linda Yaccarino (@lindayacc) May 13, 2023

Yaccarino, who as advertising chief for Comcast Corp's NBCUniversal spent several years modernising its ad business, said she is committed to Twitter's future, and said user feedback is vital to build Twitter 2.0.

Yaccarino will take over a social media platform that has been trying to reverse a plunge in ad revenue and is beset with challenges, along with a heavy debt load.

Since Musk acquired Twitter, advertisers have fled the platform, worried that their ads could appear next to inappropriate content after the company lost nearly 80% of staff. Musk earlier this year acknowledged that Twitter had suffered a massive decline in ad revenue.

While Musk said Yaccarino would help build an "everything app," which he has previously said could offer a variety of services such as peer-to-peer payments, his selection of an advertising veteran signalled that digital ads would continue to be a core focus of the business.

Musk has long said he intended to find a new leader for Twitter.

Musk, who is also the CEO of electric-vehicle maker Tesla Inc , on Friday said that bringing Yaccarino on as Twitter's new chief will help him devote more time to Tesla.

(This story has not been edited by NDTV staff and is auto-generated from a syndicated feed.)"""

# Run the spaCy pipeline over the article, then tokenize each detected sentence.
# The resulting list is what `model.quantize` receives as its examples.
doc = nlp(text)

examples = list(map(tokenizer, (sentence.text for sentence in doc.sents)))


In [6]:
# Quantize only once: the whole step is skipped when the output directory already exists.
if not os.path.exists(quantized_model_dir):
    start = time.time()

    # Start tracking peak GPU memory. `device` falls back to CPU when CUDA is
    # unavailable, so the CUDA statistics are only touched for a CUDA device.
    # reset_peak_memory_stats replaces the deprecated reset_max_memory_allocated.
    if device.type == "cuda":
        torch.cuda.reset_peak_memory_stats(device)

    # load un-quantized model, by default, the model will always be loaded into CPU memory
    model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config, max_memory=max_memory)

    # quantize model, the examples should be list of dict whose keys can only be "input_ids" and "attention_mask"
    model.quantize(examples, use_triton=False)

    # save quantized model (default serialization)
    model.save_quantized(quantized_model_dir)

    # save quantized model a second time using safetensors, into the same directory
    model.save_quantized(quantized_model_dir, use_safetensors=True)

    # Peak GPU memory allocated since the reset above, in units of 1024**3 bytes;
    # reported as 0 when no CUDA device is in use.
    if device.type == "cuda":
        max_memory_used = torch.cuda.max_memory_allocated(device) / 1024**3
    else:
        max_memory_used = 0.0

    end = time.time()

    print(f"Maximum GPU memory used: {max_memory_used:.2f} GB; time taken: {end-start:.2f}s")

    # Release the model so the evaluation cell starts from a clean GPU.
    model.cpu()
    del model
    torch.cuda.empty_cache()



Maximum GPU memory used: 6.59 GB; time taken: 284.97s


In [7]:
import datasets
from auto_gptq.eval_tasks import SequenceClassificationTask
from functools import partial


In [8]:
# Dataset identifier handed to the evaluation task as `data_name_or_path`.
DATASET = "cardiffnlp/tweet_sentiment_multilingual"

# Prompt skeleton; `{labels}` and `{text}` are filled in per sample by ds_refactor_fn.
TEMPLATE = (
    "Question:What's the sentiment of the given text? Choices are {labels}.\n"
    "Text: {text}\n"
    "Answer:"
)

# Class names in label-id order, and the id -> name lookup derived from them.
LABELS = ["negative", "neutral", "positive"]
ID2LABEL = dict(enumerate(LABELS))


def ds_refactor_fn(samples):
    """Turn a batch of raw dataset rows into prompt/label pairs.

    Args:
        samples: mapping with a "text" sequence and a "label" sequence; each
            label must be a key of ``ID2LABEL``.

    Returns:
        Dict with two equally long lists: "prompt" (``TEMPLATE`` filled with
        ``LABELS`` and the row's text) and "label" (the label's name).

    Raises:
        KeyError: if a label id is not present in ``ID2LABEL``.
    """
    # Pair the columns up first so both output lists are built from the same rows.
    rows = list(zip(samples["text"], samples["label"]))

    return {
        "prompt": [TEMPLATE.format(labels=LABELS, text=tweet) for tweet, _ in rows],
        "label": [ID2LABEL[label_id] for _, label_id in rows],
    }


class IdentityModel(nn.Module):
    """Pass-through module used as a stand-in when the evaluation task is built.

    The task is constructed with an instance of this class; the evaluation loop
    later assigns the real model to ``task.model``.
    """

    def __init__(self):
        """Initialise the module and record the notebook-level ``device`` on it."""
        super().__init__()
        # presumably the task reads `model.device` -- confirm against SequenceClassificationTask
        self.device = device

    def forward(self, x):
        """Return the input unchanged."""
        return x
    

# Build the evaluation task once with a placeholder model; the evaluation loop
# swaps the real model in through `task.model` for each run.
task = SequenceClassificationTask(
    model=IdentityModel(),
    tokenizer=tokenizer,
    classes=LABELS,
    data_name_or_path=DATASET,
    prompt_col_name="prompt",
    label_col_name="label",
    # number of samples drawn for evaluation
    num_samples=1000,
    # token limit for a single sample
    sample_max_len=1024,
    # token limit for a data block
    block_max_len=2048,
    # dataset loader: must take only data_name_or_path and return a datasets.Dataset
    load_fn=partial(datasets.load_dataset, name="english"),
    # preprocessing function applied through datasets.Dataset.map; must return a
    # Dict[str, list] whose only keys are prompt_col_name and label_col_name
    preprocess_fn=ds_refactor_fn,
    # with False, the label is truncated when a sample exceeds sample_max_len
    truncate_prompt=False,
)

Found cached dataset tweet_sentiment_multilingual (/home/abhinav/.cache/huggingface/datasets/cardiffnlp___tweet_sentiment_multilingual/english/0.1.0/936afd3cde120393429606f681b3b48d526873c45114068973f71e296ce80605)


  0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/870 [00:00<?, ? examples/s]

In [9]:
# Three loaders, evaluated one after another on the same task:
#   1. the plain transformers model in float16,
#   2. the AutoGPTQ wrapper around the un-quantized checkpoint (default quantize config),
#   3. the quantized checkpoint stored in `quantized_model_dir`.
# Each entry is a lambda so a model is only loaded when its turn comes.
model_fns = [
    lambda: AutoModelForCausalLM.from_pretrained(pretrained_model_dir, torch_dtype=torch.float16),
    lambda: AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, BaseQuantizeConfig(), max_memory=max_memory),
    lambda: AutoGPTQForCausalLM.from_quantized(quantized_model_dir, use_triton=False, max_memory=max_memory)
]

# Human-readable names for the report line, in the same order as `model_fns`.
model_types = [
    f"AutoModelForCausalLM {pretrained_model_dir}",
    f"AutoGPTQForCausalLM {pretrained_model_dir}",
    f"AutoGPTQForCausalLM {quantized_model_dir}"
]

for model_fn, model_type in zip(model_fns, model_types):
    # Start tracking peak GPU memory for this model. `device` may be the CPU
    # fallback, so CUDA statistics are only touched for a CUDA device.
    # reset_peak_memory_stats replaces the deprecated reset_max_memory_allocated.
    if device.type == "cuda":
        torch.cuda.reset_peak_memory_stats(device)

    # Load the model
    model = model_fn()

    try:
        model.to(device)

        # Run the evaluation task, timing only the evaluation itself
        task.model = model
        start = time.time()
        result = task.run()
        end = time.time()

        # Peak GPU memory allocated since the reset above, in units of 1024**3 bytes;
        # reported as 0 when no CUDA device is in use.
        if device.type == "cuda":
            max_memory_used = torch.cuda.max_memory_allocated(device) / 1024**3
        else:
            max_memory_used = 0.0

        print(f"Maximum GPU memory used: {max_memory_used:.2f} GB; eval result for {model_type} model: {result}; time taken: {end-start:.2f}s")
    finally:
        # Always release the model, even when evaluation fails, so the next
        # iteration does not start with the previous model still on the GPU.
        task.model = None
        model.cpu()
        del model
        torch.cuda.empty_cache()

Maximum GPU memory used: 6.55 GB; eval result for AutoModelForCausalLM facebook/opt-2.7b model: {'acc': 0.3528735632183908}; time taken: 23.46s
Maximum GPU memory used: 6.79 GB; eval result for AutoGPTQForCausalLM facebook/opt-2.7b model: {'acc': 0.3563218390804598}; time taken: 37.36s
Maximum GPU memory used: 3.08 GB; eval result for AutoGPTQForCausalLM facebook/opt-2.7b-4bit-128g model: {'acc': 0.32068965517241377}; time taken: 76.30s
