In [1]:
# Set up environment
from dotenv import load_dotenv
import os
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint, HuggingFacePipeline
from langchain_core.messages import (
    HumanMessage,
    SystemMessage,
)
"""
requirements: text-generation bitsandbytes
"""

# Ensure HF token is set in environment vars
load_dotenv('.env')
print("..." + os.environ["HUGGINGFACE_API_TOKEN"][-4:])

...qOCV


## Instantiate from Endpoint

In [2]:
# llm = HuggingFaceEndpoint(
#     repo_id="HuggingFaceH4/zephyr-7b-beta",
#     task="text-generation",
#     max_new_tokens=512,
#     do_sample=False,
#     repetition_penalty=1.03,
# )

# chat_model = ChatHuggingFace(llm=llm)

### Test run

In [3]:
messages = [
    SystemMessage(content="You're a helpful assistant"),
    HumanMessage(content="What happens when an unstoppable force meets an immovable object?"),
]

In [4]:
# ai_msg = chat_model.invoke(messages)
# print(ai_msg.content)

In [5]:
# type(ai_msg)

The HuggingFaceEndpoint only returns the `AIMessage`, not the chat history

## Instantiate from Pipeline

In [6]:


# llm = HuggingFacePipeline.from_model_id(
#     model_id="HuggingFaceH4/zephyr-7b-beta",
#     task="text-generation",
#     pipeline_kwargs=dict(
#         max_new_tokens=512,
#         do_sample=False,
#         # repetition_penalty=1.03,
#     ),
# )


In [7]:
# chat_model = ChatHuggingFace(llm=llm)
# ai_msg = chat_model.invoke(messages)
# print(ai_msg.content)

In [8]:
# # To get just the ai response, you must parse
# if "<|assistant|>" in ai_msg.content:
#     response = ai_msg.content.split("<|assistant|>")[-1]
#     print(response)

## Instantiate with Pipeline and Quantization: only supported on CUDA

In [9]:
# from transformers import BitsAndBytesConfig

# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype="float16",
#     bnb_4bit_use_double_quant=True,
# )

# llm = HuggingFacePipeline.from_model_id(
#     model_id="HuggingFaceH4/zephyr-7b-beta",
#     task="text-generation",
#     pipeline_kwargs=dict(
#         max_new_tokens=512,
#         do_sample=False,
#         repetition_penalty=1.03,
#         return_full_text=False,
#     ),
#     model_kwargs={"quantization_config": quantization_config},
# )

# chat_model = ChatHuggingFace(llm=llm)
# ai_msg = chat_model.invoke(messages)
# print(ai_msg.content)

In [10]:
# Create a pydantic model for a field "names", which will be a list of strings
from pydantic import BaseModel, Field

class NameList(BaseModel):
    names: list[str] = Field(
        default=[],
        description="A list of names",
        title="Names",
        example=["Smith", "Jones", "van der Merwe"],
    )
    # Optional: Add an example to the schema
    # This is not necessary for the output parser to work, but can be useful for documentation
    # and validation purposes.

    class Config:
        json_schema_extra = {
            "example": {
                "names": ["Smith", "Jones", "van der Merwe"],
            }
        }

from langchain_core.output_parsers import PydanticOutputParser
parser = PydanticOutputParser(pydantic_object=NameList)

prompt = """You're a helpful assistant that only returns JSON. 
Read the following text and write out a JSON object with a "names" key having a string array containing all the names of people in the text.

<examples>
Input: "Smith and Jones went to the store"
Output: {"names": ["Smith", "Jones"]}

Input: "Smith and Jones went to the store. van der Merwe was there too."
Output: {"names": ["Smith", "Jones", "van der Merwe"]}
</examples>

Input: "See Dickenson 2017 or Brekki 2018 for more information on the topic"
Output: 
"""

# Use the PydanticOutputParser to get the format instructions
pd_prompt = f"""You're a helpful assistant that only returns JSON. 
Read the following text and write out a JSON object with a "names" key having a string array containing all the names of people in the text.
"""



In [11]:
system_message = SystemMessage(content=pd_prompt)
human_message = HumanMessage(
    content="Write out the names from this text: See Dickenson 2017 or Brekki 2018 for more information on the topic"
)

In [12]:
# ai_msg = chat_model.invoke([system_message, human_message])
# print(ai_msg.content)

## Pipeline for Llama

In [13]:
# Use AutoTokenizer and AutoModelForCausalLM to load the model
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

model_id = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
)
print(f"Tokenizer pad token: {tokenizer.pad_token}")
tokenizer.pad_token = tokenizer.eos_token

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    # max_length=512,
    do_sample=False,
    top_p=1.0,
    temperature=0.001,
)

llm = HuggingFacePipeline(pipeline=pipe)
chat_model = ChatHuggingFace(llm=llm)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use mps


Tokenizer pad token: None


In [14]:
# Regular response
ai_msg = chat_model.invoke(messages)
ai_msg.pretty_print()




<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You're a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>

What happens when an unstoppable force meets an immovable object?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

The concept of an unstoppable force meeting an immovable object is a classic thought experiment in philosophy, particularly


In [15]:
# With expected formatted output
ai_msg = chat_model.invoke([system_message, human_message])
ai_msg.pretty_print()




<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You're a helpful assistant that only returns JSON. 
Read the following text and write out a JSON object with a "names" key having a string array containing all the names of people in the text.<|eot_id|><|start_header_id|>user<|end_header_id|>

Write out the names from this text: See Dickenson 2017 or Brekki 2018 for more information on the topic<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{
  "names": ["Dickenson", "Brekki"]
}


In [16]:
if "assistant<|end_header_id|>" in ai_msg.content:
    response = ai_msg.content.split("assistant<|end_header_id|>")[-1]
    print(response)





{
  "names": ["Dickenson", "Brekki"]
}


In [17]:
parser.parse(response)

NameList(names=['Dickenson', 'Brekki'])

### Pydantic prompt instructions

In [20]:
prompt = f"""
Extract the names from the text and return them as JSON.
{parser.get_format_instructions()}

DO NOT write any code or any text except the JSON object.

Now read the following text and write out the JSON object as described above:
"""

system_message = SystemMessage(content=prompt)
human_message = HumanMessage(content="Text: 'See Dickenson 2017 or Brekki 2018 for more information on the topic'")
ai_msg = chat_model.invoke([system_message, human_message])
ai_msg.pretty_print()




<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

Extract the names from the text and return them as JSON.
The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"example": {"names": ["Smith", "Jones", "van der Merwe"]}, "properties": {"names": {"default": [], "description": "A list of names", "example": ["Smith", "Jones", "van der Merwe"], "items": {"type": "string"}, "title": "Names", "type": "array"}}}
```

DO NOT write any code or any text except the JSON object.

Now read the following text and write out the JSON obje