In [None]:
from langchain_community.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_core.tools import tool
from langchain_core.messages import HumanMessage, AIMessage
from langgraph.graph import StateGraph, END
from langchain.agents.format_scratchpad import format_to_openai_function_messages
from langchain_core.runnables import Runnable
from langchain.agents import ToolExecutor, AgentExecutor, RunnableAgent
from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser
from langchain.agents.agent import create_openai_functions_agent
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.agents import Tool

In [None]:
# Step 1: Define the tool
def _safe_eval_arithmetic(expression: str):
    """Evaluate a purely arithmetic expression without calling ``eval``.

    The expression is parsed into an AST and only a small whitelist of node
    types is interpreted: int/float literals, the binary operators
    ``+ - * / // % **`` and unary ``+``/``-``. Names, attribute access, calls,
    subscripts, strings, etc. are rejected, which closes the sandbox-escape
    routes that remain open with ``eval(..., {"__builtins__": {}})``.

    Args:
        expression: Text of the arithmetic expression, e.g. ``"2+3*4"``.

    Returns:
        The numeric result (``int`` or ``float``).

    Raises:
        SyntaxError: If the text is not a valid Python expression.
        ValueError: If the expression uses anything outside the whitelist or
            an exponent larger than the allowed bound.
        ZeroDivisionError: On division or modulo by zero.
    """
    # Imported locally so this cell is self-contained.
    import ast
    import operator

    binary_ops = {
        ast.Add: operator.add,
        ast.Sub: operator.sub,
        ast.Mult: operator.mul,
        ast.Div: operator.truediv,
        ast.FloorDiv: operator.floordiv,
        ast.Mod: operator.mod,
        ast.Pow: operator.pow,
    }
    unary_ops = {ast.UAdd: operator.pos, ast.USub: operator.neg}
    # Crude guard against inputs such as 9**9**9 that would hang the kernel.
    max_exponent = 1000

    def _evaluate(node):
        if isinstance(node, ast.Expression):
            return _evaluate(node.body)
        if isinstance(node, ast.Constant):
            # bool is a subclass of int, so it has to be excluded explicitly.
            if isinstance(node.value, (int, float)) and not isinstance(node.value, bool):
                return node.value
            raise ValueError(f"unsupported literal: {node.value!r}")
        if isinstance(node, ast.BinOp) and type(node.op) in binary_ops:
            left = _evaluate(node.left)
            right = _evaluate(node.right)
            if isinstance(node.op, ast.Pow) and abs(right) > max_exponent:
                raise ValueError("exponent too large")
            return binary_ops[type(node.op)](left, right)
        if isinstance(node, ast.UnaryOp) and type(node.op) in unary_ops:
            return unary_ops[type(node.op)](_evaluate(node.operand))
        raise ValueError(f"unsupported expression element: {type(node).__name__}")

    # strip(): leading whitespace would otherwise be reported as an indent error.
    return _evaluate(ast.parse(expression.strip(), mode="eval"))


@tool
def calculator(expression: str) -> str:
    """Evaluates a basic mathematical expression like 2+3*4"""
    # The docstring above is kept verbatim on purpose: the @tool decorator
    # uses it as the tool description that is shown to the model.
    #
    # SECURITY: `expression` is produced by the LLM and must be treated as
    # untrusted, so it is interpreted by a whitelist-based evaluator instead
    # of eval().
    try:
        return str(_safe_eval_arithmetic(expression))
    except Exception as e:
        # Report the failure as text so the agent can read it and retry,
        # rather than aborting the whole run.
        return f"Error: {e}"

# `@tool` has already turned `calculator` into a LangChain tool object, so it is
# registered as-is. Wrapping it again with `Tool.from_function` is unnecessary
# and fails here because that constructor also requires explicit `name` and
# `description` arguments.
tools = [calculator]

# Step 2: Load a Hugging Face causal LM (Mistral 7B Instruct) and expose it to
# LangChain through a text-generation pipeline.
model_id = "mistralai/Mistral-7B-Instruct-v0.1"

# Weights and tokenizer are fetched independently from the same checkpoint id.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Each call to the pipeline may generate at most 512 new tokens.
hf_pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
)

# LangChain-facing wrapper used by the agent below.
llm = HuggingFacePipeline(pipeline=hf_pipe)

# Step 3: Build the agent.
#
# `llm` is a plain text-completion model (HuggingFacePipeline), not a chat model
# with OpenAI-style function calling. `create_openai_functions_agent` binds
# OpenAI `functions` to the model and parses the reply with an output parser
# that only accepts chat-model generations, so it fails on this model's raw
# text output. A text-based ReAct agent is used instead: the model writes
# "Action:/Action Input:" lines and the executor runs the named tool.
#
# The names needed here are imported explicitly so this step does not depend on
# what the notebook's first import cell managed to load.
from langchain.agents import AgentExecutor, create_react_agent
from langchain_core.messages import get_buffer_string
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import Runnable, RunnablePassthrough

# `create_react_agent` requires the variables {tools}, {tool_names} and
# {agent_scratchpad}; {input} and {chat_history} are supplied by the caller.
prompt = PromptTemplate.from_template(
    """You are a math assistant. Use the calculator tool when needed.

You have access to the following tools:

{tools}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

Previous conversation:
{chat_history}

Begin!

Question: {input}
Thought:{agent_scratchpad}"""
)

# Callers pass `chat_history` as a list of message objects; a string prompt
# needs it flattened to text first (an empty list becomes an empty string).
agent_runnable: Runnable = (
    RunnablePassthrough.assign(
        chat_history=lambda inputs: get_buffer_string(inputs["chat_history"])
    )
    | create_react_agent(llm, tools, prompt)
)

# handle_parsing_errors=True feeds malformed model output back to the model as
# an observation instead of raising, which small local models need in practice.
agent_executor = AgentExecutor(
    agent=agent_runnable,
    tools=tools,
    verbose=True,
    handle_parsing_errors=True,
)

# Step 4: LangGraph definition

# Define state
from typing import TypedDict, List

# State dictionary that flows through the graph, declared with TypedDict's
# functional syntax (equivalent to the class-based form).
AgentState = TypedDict(
    "AgentState",
    {
        "input": str,              # question forwarded to the agent executor
        "chat_history": List,      # agent_node appends a Human/AI message pair per run
        "agent_scratchpad": List,  # forwarded to the executor; reset to [] by agent_node
        "output": str,             # the executor's "output" value from the latest run
    },
)

# Agent node
def agent_node(state: AgentState) -> AgentState:
    """Run the agent executor once and fold its answer into the graph state.

    Args:
        state: Current graph state; its "input", "chat_history" and
            "agent_scratchpad" entries are forwarded to ``agent_executor``.

    Returns:
        A new state dict with the same "input", the chat history extended by
        one HumanMessage/AIMessage pair for this turn, an empty
        "agent_scratchpad", and "output" set to the executor's answer.
    """
    question = state["input"]
    history = state["chat_history"]

    executor_result = agent_executor.invoke(
        dict(
            input=question,
            chat_history=history,
            agent_scratchpad=state["agent_scratchpad"],
        )
    )
    answer = executor_result["output"]

    # Record this exchange; `+` builds a new list, leaving the incoming one untouched.
    this_turn = [HumanMessage(content=question), AIMessage(content=answer)]

    return dict(
        input=question,
        chat_history=history + this_turn,
        agent_scratchpad=[],
        output=answer,
    )

# Assemble a one-node graph: "agent" is both where execution starts and where
# it stops.
graph = StateGraph(AgentState)
graph.add_node("agent", agent_node)
graph.set_entry_point("agent")
# An edge to END marks "agent" as the finish point, so the graph stops after a
# single run (conditional edges could loop instead).
graph.add_edge("agent", END)

# Turn the graph definition into something invokable.
runnable_graph = graph.compile()

# Execute once, starting from empty history, scratchpad and output.
input_question = "If I invest 3000 at 8% and 5000 at 12%, what is my total interest?"
initial_state = {
    "input": input_question,
    "chat_history": [],
    "agent_scratchpad": [],
    "output": "",
}
result = runnable_graph.invoke(initial_state)

print("\nFinal Answer:", result["output"])


---

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

model_id = "meta-llama/Llama-2-13b-chat-hf"  # Or another ~13B chat model

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Quantization is requested through an explicit config object. Passing
# `load_in_4bit=` / `load_in_8bit=` straight to `from_pretrained` is deprecated
# in transformers in favour of `quantization_config`.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,  # use load_in_8bit=True here instead for 8-bit
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map="auto",
    torch_dtype=torch.float16,
)

# With device_map="auto" the model's placement is decided at load time, so the
# inputs follow the model rather than assuming a hard-coded "cuda" device.
inputs = tokenizer("Hello, how are you?", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


---

In [None]:
# Explicit %pip magic: does not rely on IPython automagic being enabled.
%pip install unsloth

In [None]:
from unsloth import FastLanguageModel
from transformers import AutoTokenizer

# NOTE(review): the previous id "unsloth/qwen2-14b" does not appear to match a
# published checkpoint (the Qwen2 family has no 14B size) and its comment
# referred to Qwen3, so the Qwen3 14B checkpoint is used here. Confirm this is
# the intended model (e.g. "unsloth/Qwen2.5-14B" is the Qwen2.5 alternative).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-14B",
    max_seq_length = 4096,
    dtype = None,                          # None = auto-detect; or torch.float16 / torch.bfloat16
    load_in_4bit = True,                   # Loads in 4-bit with bitsandbytes
)

# Optional: Speed up
FastLanguageModel.for_inference(model)

# Test generation
inputs = tokenizer("Write a short poem about AI:", return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0]))


---

In [None]:
# Bare `pip install ...` is only rewritten to the %pip magic in single-line
# cells; with two lines in one cell it is a SyntaxError, so the magic is
# spelled out on each line.
%pip install git+https://github.com/huggingface/transformers.git
%pip install accelerate

In [None]:
# Explicit %pip magic: does not rely on IPython automagic being enabled.
%pip install bitsandbytes

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# NOTE(review): the previous id "Qwen/Qwen2-14B" does not appear to match a
# published checkpoint (the Qwen2 family has no 14B size). Qwen3 14B is used
# here; confirm the intended model ("Qwen/Qwen2.5-14B" is the Qwen2.5 option).
model_id = "Qwen/Qwen3-14B"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# transformers has no `QuantoQuantizer.from_pretrained(...).get_model()` API.
# Quantized loading goes through `from_pretrained(quantization_config=...)`;
# `load_in_4bit` is a bitsandbytes option, so BitsAndBytesConfig is used.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map="auto",
    torch_dtype="auto",
)