In [36]:
!pip install torch langchain langchain-community langchain-core langchain-google-genai langchain-openai langchain-text-splitters huggingface_hub transformers



In [None]:
!pip install blobfile tiktoken SentencePiece accelerate>=0.26.0


# This program demonstrates a simple usage of creating a LCEL based chain 
# The chain comprises a prompt, the llm object and a Stringoutput parser

from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain import HuggingFaceHub
from dotenv import load_dotenv
import os


# Create the Hugging Face Hub LLM Object
# Hugging Face Hub LLM
llm = HuggingFaceHub(
    repo_id="meta-llama/Llama-2-7B",
    huggingfacehub_api_token="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",
    model_kwargs={"temperature": 0.7, "max_length": 50},
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are a world-class technical documentation writer."),
        ("user", "{input}"),
    ]
)

output_parser = StrOutputParser()

chain = prompt | llm | output_parser

output = chain.invoke({"input": "how can langsmith help with testing?"})

print(output)

# from transformers import AutoTokenizer, AutoModelForCausalLM
# import os

# # Define the model name
# model_name = "openlm-research/open_llama_3b"

# # Retrieve the Hugging Face token from an environment variable
# huggingface_token = "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"

# # Load the tokenizer and model with the token
# tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=huggingface_token)
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     device_map="auto",
#     torch_dtype="float16",
#     use_auth_token=huggingface_token
# )


from transformers import AutoTokenizer, AutoModelForCausalLM
model_name = "openlm-research/open_llama_3b"

# Explicitly use the SentencePiece tokenizer if required
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    device_map="auto", 
    torch_dtype="float16",
    trust_remote_code=True
)


In [None]:
import torch
from transformers import LlamaTokenizer, LlamaForCausalLM

model_path = 'openlm-research/open_llama_3b'
# model_path = 'openlm-research/open_llama_7b'

tokenizer = LlamaTokenizer.from_pretrained(model_path)
model = LlamaForCausalLM.from_pretrained(
    model_path, torch_dtype=torch.float16, device_map='auto',
)

prompt = 'Q: What is the largest animal?\nA:'
input_ids = tokenizer(prompt, return_tensors="pt").input_ids

generation_output = model.generate(
    input_ids=input_ids, max_new_tokens=32
)
print(tokenizer.decode(generation_output[0]))

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.base_language import BaseLanguageModel
from transformers import LlamaTokenizer, LlamaForCausalLM
import torch


# Load the local Hugging Face model
class LocalLLM(BaseLanguageModel):
    def __init__(self, model_path: str, **model_kwargs):
        """
        Initialize the LocalLLM with the model path and optional model parameters.
        """
        self.tokenizer = LlamaTokenizer.from_pretrained(model_path)
        self.model = LlamaForCausalLM.from_pretrained(
            model_path,
            torch_dtype=torch.float16,
            device_map="auto",
            **model_kwargs
        )

    def invoke(self, prompt: str) -> str:
        """
        Tokenize the input prompt, generate a response, and decode the output.
        """
        # Tokenize the input
        inputs = self.tokenizer(prompt, return_tensors="pt").to("cuda")
        # Generate response
        outputs = self.model.generate(inputs.input_ids, max_new_tokens=50, temperature=0.7)
        # Decode and return
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)




from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.base_language import BaseLanguageModel
from transformers import LlamaTokenizer, LlamaForCausalLM
import torch
from pydantic import Field


class LocalLLM(BaseLanguageModel):
    model_path: str  # Define model path as a Pydantic field
    tokenizer: LlamaTokenizer = Field(init=False)  # Declare tokenizer
    model: LlamaForCausalLM = Field(init=False)  # Declare model

    def __init__(self, model_path: str, **model_kwargs):
        """
        Initialize the LocalLLM with the model path and optional model parameters.
        """
        super().__init__(model_path=model_path)  # Initialize Pydantic BaseModel
        self.tokenizer = LlamaTokenizer.from_pretrained(model_path)
        self.model = LlamaForCausalLM.from_pretrained(
            model_path,
            torch_dtype=torch.float16,
            device_map="auto",
            **model_kwargs
        )

    def invoke(self, input: dict) -> str:
        """
        Main method required by BaseLanguageModel to process inputs and produce outputs.
        Expects a dictionary with a single key 'input'.
        """
        prompt = input.get("input", "")
        if not prompt:
            raise ValueError("Input dictionary must contain a key 'input' with a valid prompt string.")
        return self.generate_prompt(prompt)

    def generate_prompt(self, prompt: str) -> str:
        """
        Generate a response for a single prompt.
        """
        # Tokenize the input
        inputs = self.tokenizer(prompt, return_tensors="pt").to("cuda")
        # Generate response
        outputs = self.model.generate(inputs.input_ids, max_new_tokens=50, temperature=0.7)
        # Decode and return
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

    def predict(self, text: str) -> str:
        """
        Alias for generate_prompt to comply with LangChain's API.
        """
        return self.generate_prompt(text)

    def predict_messages(self, messages: list) -> str:
        """
        Process a list of messages (e.g., for chat-style prompts).
        """
        # Combine all messages into a single input prompt
        combined_prompt = "\n".join(message["content"] for message in messages)
        return self.generate_prompt(combined_prompt)

    async def agenerate_prompt(self, prompt: str) -> str:
        """
        Asynchronous method for generating a response to a prompt.
        """
        return self.generate_prompt(prompt)

    async def apredict(self, text: str) -> str:
        """
        Asynchronous alias for predict.
        """
        return await self.agenerate_prompt(text)

    async def apredict_messages(self, messages: list) -> str:
        """
        Asynchronous method for processing chat-style prompts.
        """
        combined_prompt = "\n".join(message["content"] for message in messages)
        return await self.agenerate_prompt(combined_prompt)



# Rebuild the Pydantic model to resolve the "not fully defined" error
LocalLLM.model_rebuild()
# Specify the model path
model_path = 'openlm-research/open_llama_3b'

# Initialize the LocalLLM
llm = LocalLLM(model_path=model_path)

# Test with a single prompt
prompt = "Q: What is the largest animal?\nA:"
response = llm.invoke({"input": prompt})
print("Response:", response)

# Test with chat-style messages
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is the largest animal on Earth?"}
]
response = llm.predict_messages(messages)
print("Chat Response:", response)        


import torch
from transformers import LlamaTokenizer, LlamaForCausalLM

class LanguageModel:
    def __init__(self, model_path: str, device: str = "auto", torch_dtype=torch.float16):
        """
        Initialize the tokenizer and model.

        :param model_path: Path to the pre-trained model.
        :param device: Device map for loading the model. Default is 'auto'.
        :param torch_dtype: Data type for model parameters. Default is torch.float16.
        """
        self.model_path = model_path
        self.device = device
        self.torch_dtype = torch_dtype
        self.tokenizer = None
        self.model = None

    def load_model(self):
        """Loads the tokenizer and model."""
        self.tokenizer = LlamaTokenizer.from_pretrained(self.model_path)
        self.model = LlamaForCausalLM.from_pretrained(
            self.model_path,
            torch_dtype=self.torch_dtype,
            device_map=self.device,
        )
        print(f"Model and tokenizer loaded from {self.model_path}")
        return self  # Ensure the object is returned for chaining

    def generate_response(self, prompt: str, max_new_tokens: int = 32) -> str:
        """
        Generate a response for a given prompt.

        :param prompt: Input prompt string.
        :param max_new_tokens: Maximum number of new tokens to generate.
        :return: Generated response as a string.
        """
        if self.tokenizer is None or self.model is None:
            raise ValueError("Model and tokenizer must be loaded first using load_model().")
        
        input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
        output_ids = self.model.generate(
            input_ids=input_ids,
            max_new_tokens=max_new_tokens
        )
        response = self.tokenizer.decode(output_ids[0], skip_special_tokens=True)
        return response


# Example usage:
if __name__ == "__main__":
    model_path = 'openlm-research/open_llama_3b'  # Change to your desired model
    lm = LanguageModel(model_path)
    lm.load_model()
    prompt = "Q: What is the largest animal?\nA:"
    response = lm.generate_response(prompt)
    print(response)


In [39]:
import torch
from transformers import LlamaTokenizer, LlamaForCausalLM
from langchain_core.runnables.base import Runnable

class LanguageModel(Runnable):
    def __init__(self, model_path: str, device: str = "auto", torch_dtype=torch.float16):
        self.model_path = model_path
        self.device = device
        self.torch_dtype = torch_dtype
        self.tokenizer = None
        self.model = None

    def load_model(self):
        """Loads the tokenizer and model."""
        self.tokenizer = LlamaTokenizer.from_pretrained(self.model_path)
        self.model = LlamaForCausalLM.from_pretrained(
            self.model_path,
            torch_dtype=self.torch_dtype,
            device_map=self.device,
        )
        print(f"Model and tokenizer loaded from {self.model_path}")
        return self  # Ensure the object is returned for chaining

    def invoke(self, input: dict, config=None) -> str:
        """
        Makes the model runnable by implementing the `invoke` method.
        Accepts a dictionary input from LangChain's pipeline.
        """
        if self.tokenizer is None or self.model is None:
            raise ValueError("Model and tokenizer must be loaded first using load_model().")

        # Extract the input text from the dictionary
        # if "input" not in input:
        #     raise ValueError("Expected a dictionary with an 'input' key.")
        
        prompt = input["input"]  # Extract the actual string input
    
    
        # Tokenize input
        input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
    
        # Generate output
        output_ids = self.model.generate(
        input_ids=input_ids,
        max_new_tokens=32
        )
        
        # Decode and return response
        response = self.tokenizer.decode(output_ids[0], skip_special_tokens=True)
        return response


In [40]:
model_path = 'openlm-research/open_llama_3b'  # Change to your desired model
llm = LanguageModel(model_path).load_model()



Model and tokenizer loaded from openlm-research/open_llama_3b


In [41]:
from langchain.prompts.chat import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Define the model path
model_path = 'openlm-research/open_llama_3b'

# Load the language model
llm = LanguageModel(model_path).load_model()

# Define the prompt
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a world-class technical documentation writer."),
    ("user", "{input}")
])

# Define the output parser
output_parser = StrOutputParser()

# Create the chain (prompt | llm | output_parser)
chain = prompt | llm | output_parser

# Run the chain
output = chain.invoke({"input": "how can langsmith help with testing?"})

print(output)


Model and tokenizer loaded from openlm-research/open_llama_3b


TypeError: 'ChatPromptValue' object is not subscriptable