In [2]:
from transformers import pipeline
import numpy as np

classifier = pipeline("sentiment-analysis")
classifier("I've been waiting for a HuggingFace course my whole life.")

  from .autonotebook import tqdm as notebook_tqdm
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'label': 'POSITIVE', 'score': 0.9598048329353333}]

In [3]:
classifier(
    ["I've been waiting for a HuggingFace course my whole life.", "I hate this so much!"]
)

[{'label': 'POSITIVE', 'score': 0.9598048329353333},
 {'label': 'NEGATIVE', 'score': 0.9994558691978455}]

In [6]:
from transformers import pipeline

classifier = pipeline("zero-shot-classification")
classifier(
    "This is a course about the Transformers library",
    candidate_labels=["education", "politics", "business"],
)

No model was supplied, defaulted to facebook/bart-large-mnli and revision c626438 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


{'sequence': 'This is a course about the Transformers library',
 'labels': ['education', 'business', 'politics'],
 'scores': [0.8445994853973389, 0.11197401583194733, 0.04342653974890709]}

In [7]:
from transformers import pipeline

generator = pipeline("text-generation")
generator("In this course, we will teach you how to")

No model was supplied, defaulted to openai-community/gpt2 and revision 6c0e608 (https://huggingface.co/openai-community/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'In this course, we will teach you how to install, configure and run our app at your local machine.\n\nWe will learn how to use custom frameworks in Android apps, with some tips on how to add frameworks to your.NET-based'}]

# Quantization

In [19]:
# Importing the pipeline
from accelerate import init_empty_weights
from mingpt.model import GPT

# Create a GPT model with the default configuration, no weights
model_config = GPT.get_default_config()
model_config.model_type = 'gpt2-xl'
model_config.vocab_size = 50257
model_config.block_size = 1024

with init_empty_weights():
    empty_model = GPT(model_config)
    
# Load the weights from the Hugging Face Hub
from huggingface_hub import snapshot_download
weights_location = snapshot_download(repo_id="marcsun13/gpt2-xl-linear-sharded")

# Define Quantization Config
from accelerate.utils import BnbQuantizationConfig
bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True, llm_int8_threshold = 6)

# Load and quantize the model
from accelerate.utils import load_and_quantize_model
quantized_model = load_and_quantize_model(empty_model, weights_location=weights_location, bnb_quantization_config=bnb_quantization_config, device_map = "auto")

# create tokenizer
from mingpt.bpe import BPETokenizer
prompt = "Hello my name is"
tokenizer = BPETokenizer()
x1 = tokenizer(prompt).to(0)

# Generate text with the quantized model
quantized_model.eval()
outputs = quantized_model.generate(x1, max_new_tokens=10, do_sample=False)[0]
print(tokenizer.decode(outputs.cpu().squeeze()))

number of parameters: 1557.61M


Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 10410.57it/s]


Hello my name is John Doe, and I am a member of the


In [24]:
import torch

# Importing the pipeline
from accelerate import init_empty_weights
from mingpt.model import GPT

# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create a GPT model with the default configuration, no weights
model_config = GPT.get_default_config()
model_config.model_type = 'gpt2-xl'
model_config.vocab_size = 50257
model_config.block_size = 1024

activations = {}
weights = {}

def get_activation(name):
    def hook(model, input, output):
        activations[name] = output.detach().to(device)
    return hook

def get_weight(name):
    def hook(model, input, output):
        if hasattr(model, 'weight'):
            weights[name] = model.weight.detach().to(device)
    return hook

with init_empty_weights():
    empty_model = GPT(model_config)

# Load the weights from the Hugging Face Hub
from huggingface_hub import snapshot_download
weights_location = snapshot_download(repo_id="marcsun13/gpt2-xl-linear-sharded")

# Define Quantization Config
from accelerate.utils import BnbQuantizationConfig
bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True, llm_int8_threshold=6)

# Load and quantize the model
from accelerate.utils import load_and_quantize_model
quantized_model = load_and_quantize_model(empty_model, weights_location=weights_location, bnb_quantization_config=bnb_quantization_config, device_map="auto")

# Move model to device
quantized_model.to(device)

# Register hooks for each layer of interest on the quantized model
for name, module in quantized_model.named_modules():
    module.register_forward_hook(get_activation(name))
    if hasattr(module, 'weight'):
        module.register_forward_hook(get_weight(name))

# Create tokenizer
from mingpt.bpe import BPETokenizer
prompt = "Hello my name is"
tokenizer = BPETokenizer()
x1 = tokenizer(prompt).to(device)

# Generate text with the quantized model
quantized_model.eval()
outputs = quantized_model.generate(x1, max_new_tokens=10, do_sample=False)[0]
print(tokenizer.decode(outputs.cpu().squeeze()))

# Print captured activations and weights
for name, activation in activations.items():
    print(f"Activation {name}: {activation.shape}")
    
for name, weight in weights.items():
    print(f"Weight {name}: {weight.shape}")


number of parameters: 1557.61M


Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 42130.29it/s]
Some parameters are on the meta device device because they were offloaded to the cpu.
You shouldn't move a model that is dispatched using accelerate hooks.


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)