
# Install required libraries for LangChain, Hugging Face models, and related dependencies.
langchain: Core LangChain library for building LLM applications.

langchain-huggingface: Integration for Hugging Face models within LangChain.

transformers: Hugging Face's library for accessing pre-trained models.

accelerate: Library to help speed up training and inference with large models.

bitsandbytes: Library for quantization, reducing model size and potentially speeding up inference.

huggingface_hub: Library for interacting with the Hugging Face Hub.

In [None]:
!pip install langchain langchain-huggingface transformers accelerate bitsandbytes huggingface_hub

In [12]:
# Fetch the Hugging Face API key through Colab's `userdata` helper.
from google.colab import userdata

# Identifier under which the key is stored.
secret_name = 'HF_llm_rag'
key = userdata.get(secret_name)

In [14]:
# Set the Hugging Face API token as an environment variable for authentication.
import os

# `key` holds the token retrieved in the previous cell.
os.environ['HUGGINGFACEHUB_API_TOKEN'] = key

In [15]:
# Import the HuggingFaceEndpoint class from langchain_huggingface to use models via the Inference API.
from langchain_huggingface import HuggingFaceEndpoint

repo_id = "HuggingFaceH4/zephyr-7b-beta"

# Initialize the HuggingFaceEndpoint with specified parameters.
# repo_id: The ID of the model to use.
# max_new_tokens: The maximum number of new tokens to generate in the response.
# temperature: Controls the randomness of the output (higher means more random).
# huggingfacehub_api_token: The API token for authentication.
# task: The task the model is intended for (text generation in this case).
llm = HuggingFaceEndpoint(
    repo_id=repo_id,
    max_new_tokens=256,
    temperature=0.7,
    huggingfacehub_api_token=key,
    task="text-generation",  # fixed: was misspelled as 'text-genration'
)

In [None]:
# Send a single prompt to `llm`; as the last expression in the cell, the
# result is shown as the cell output.
question = 'What is machine learning?'
llm.invoke(question)

In [17]:
from langchain_huggingface import ChatHuggingFace, HuggingFacePipeline
from transformers import BitsAndBytesConfig

repo_id = "HuggingFaceH4/zephyr-7b-beta"

# BitsAndBytes quantization settings, gathered in one place:
#   load_in_4bit              - load the model weights in 4-bit precision
#   bnb_4bit_quant_type       - quantization type ("nf4")
#   bnb_4bit_compute_dtype    - data type used for computations ("float16")
#   bnb_4bit_use_double_quant - enable double quantization
bnb_settings = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_use_double_quant": True,
}
quantization_config = BitsAndBytesConfig(**bnb_settings)

In [None]:

# Build a HuggingFacePipeline from the model ID so the model is loaded locally
# with the quantization config defined earlier.

# Arguments forwarded to the text-generation pipeline:
#   max_new_tokens     - maximum number of tokens to generate
#   do_sample          - False selects greedy decoding instead of sampling
#   repetition_penalty - penalty applied to repeated tokens
#   return_full_text   - False returns only the completion, not the prompt
generation_kwargs = {
    "max_new_tokens": 256,
    "do_sample": False,
    "repetition_penalty": 1.03,
    "return_full_text": False,
}

# Extra arguments used when loading the model (carries the quantization config).
loading_kwargs = {"quantization_config": quantization_config}

llm2 = HuggingFacePipeline.from_model_id(
    model_id=repo_id,
    task="text-generation",
    model_kwargs=loading_kwargs,
    pipeline_kwargs=generation_kwargs,
)

In [None]:
from langchain_core.messages import HumanMessage, SystemMessage

# Conversation to send: a system instruction followed by the user's question.
system_prompt = "You're a helpful assistant"
user_question = "What happens when an unstoppable force meets an immovable object?"

messages = [
    SystemMessage(content=system_prompt),
    HumanMessage(content=user_question),
]

In [None]:
# Wrap `llm2` in a chat model, then run the prepared messages through it; the
# response is shown as the cell output.
chat_model = ChatHuggingFace(llm=llm2, model_id=repo_id)
chat_model.invoke(messages)