In [None]:
# One-time environment setup: uncomment the lines below and run this cell to
# install the notebook's dependencies (a kernel restart may be needed afterwards).
# %pip install python-dotenv
# %pip install git+https://github.com/huggingface/transformers
# %pip install -U accelerate

In [1]:
import os

import dotenv
import transformers
from torch import cuda, bfloat16

# Load variables from a local .env file (if present) into the process
# environment; the Hugging Face token is read from there later on.
_ = dotenv.load_dotenv()

# Hub repository id used when fetching the tokenizer. It must name the same
# model family/size as the checkpoint loaded from ./models/Llama-2-7b-chat-hf/,
# i.e. the 7B chat model (the previous value pointed at the 70B repository).
model_id = "meta-llama/Llama-2-7b-chat-hf"

# Current CUDA device when a GPU is visible, otherwise fall back to the CPU.
device = f"cuda:{cuda.current_device()}" if cuda.is_available() else "cpu"

device

'cuda:0'

In [2]:
# Load the local Llama-2-7B chat checkpoint for inference.

# Optional 4-bit NF4 quantization settings for loading a large model with less
# GPU memory (requires the `bitsandbytes` library). They are NOT applied below;
# to use them, pass `quantization_config=bnb_config` to `from_pretrained`.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16,
)

# Hugging Face access token read from the environment (None when the
# HUGGINGFACE_TOKEN variable is not set).
hf_auth = os.getenv("HUGGINGFACE_TOKEN")

# `trust_remote_code` is intentionally not set: Llama is implemented inside
# transformers itself, so there is no need to allow execution of code shipped
# alongside the checkpoint.
model = transformers.AutoModelForCausalLM.from_pretrained(
    "./models/Llama-2-7b-chat-hf/",
    # Without an explicit dtype the weights are loaded in float32, which
    # doubles the memory footprint and can push layers off the GPU under
    # device_map="auto"; bfloat16 matches the compute dtype chosen above.
    torch_dtype=bfloat16,
    # quantization_config=bnb_config,
    device_map="auto",
    token=hf_auth,
)
model.eval()  # inference mode: disables dropout and similar training behaviour
# NOTE: `device` is the current CUDA device computed in the first cell; with
# device_map="auto" the actual layer placement is decided at load time.
print(f"Model loaded on {device}")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Model loaded on cuda:0


In [7]:
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Tokenizer for the loaded model. The repository is gated, so pass the access
# token explicitly (it is read from HUGGINGFACE_TOKEN in the model-load cell).
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_auth)

# Bind the result to its own name: rebinding `pipeline` would shadow the
# imported factory function and make this cell fail when it is re-run.
text_gen_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    # Cap the number of generated tokens so a single call cannot run unbounded.
    max_new_tokens=256,
)

# Wrap the transformers pipeline so it can be used as a LangChain LLM.
llm = HuggingFacePipeline(pipeline=text_gen_pipeline)
llm

HuggingFacePipeline(pipeline=<transformers.pipelines.text_generation.TextGenerationPipeline object at 0x7fdf2f7d0d10>)

In [8]:
# Quick smoke test: send one free-form prompt straight to the wrapped LLM and
# display whatever text it returns.
smoke_prompt = "What did foo say about bar?"
llm.invoke(smoke_prompt)

KeyboardInterrupt: 

In [None]:
from langchain.prompts import PromptTemplate

# Prompt with a single `{question}` placeholder; the answer is primed with a
# "think step by step" lead-in.
template = (
    "Question: {question}\n"
    "\n"
    "Answer: Let's think step by step."
)
prompt = PromptTemplate.from_template(template)

# Pipe the formatted prompt into the LLM.
chain = prompt | llm

question = "What is electroencephalography?"

# Run the chain once and print the generated text.
answer = chain.invoke({"question": question})
print(answer)

In [6]:
# Diagnostic: show the metadata (version, install location, dependencies) of
# the `accelerate` package installed in the kernel's environment.
%pip show accelerate

Name: accelerate
Version: 0.26.1
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: sylvain@huggingface.co
License: Apache
Location: /home/darklord/miniconda3/envs/torch/lib/python3.11/site-packages
Requires: huggingface-hub, numpy, packaging, psutil, pyyaml, safetensors, torch
Required-by: 
Note: you may need to restart the kernel to use updated packages.
