In [None]:
# Uncomment if the packages are not installed yet (%pip targets the kernel's environment)
# %pip install transformers torch

In [None]:
# Uncomment if the package is not installed yet (%pip targets the kernel's environment)
# %pip install -U langchain-huggingface

In [None]:
import torch
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
from langchain_huggingface import HuggingFacePipeline
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate



In [None]:
# Load the model and tokenizer locally
model_name = "google/flan-t5-base"  # You can also use "google/flan-t5-xl"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Create a text generation pipeline.
# float32 is used (as for the other pipelines in this notebook) instead of
# float16: half precision brings no benefit when the pipeline falls back to
# CPU, and some CPU ops are slow or unavailable in float16.
# NOTE(review): newer transformers releases warn that `torch_dtype` is
# deprecated in favour of `dtype`; the old keyword is kept here so the cell
# also runs on older releases -- switch once the minimum version is pinned.
pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float32,  # full precision, safe on both CPU and GPU
    device=0 if torch.cuda.is_available() else -1,  # first GPU if available, otherwise CPU
)

`torch_dtype` is deprecated! Use `dtype` instead!
Device set to use cpu


In [None]:
# Decoding / sampling options used when generating text
generation_kwargs = dict(
    temperature=0.7,         # randomness of sampling (higher = more creative)
    # max_length=512,        # disabled: max number of tokens in the response
    max_new_tokens=20,       # upper bound on newly generated tokens
    # min_length=150,        # disabled: would force at least ~150 tokens
    top_p=0.9,               # nucleus sampling (higher = more diverse responses)
    top_k=50,                # number of top-ranked tokens considered per step
    repetition_penalty=1.2,  # discourages repetition (1.0 = no penalty)
    do_sample=True,          # sample instead of always taking the top token
)


In [None]:
# Wrap the pipeline in LangChain's HuggingFacePipeline and attach the
# generation settings.
# The settings are bound as per-call `pipeline_kwargs`, which the wrapper
# forwards to the underlying transformers pipeline on every invocation.
# Passing them as `model_kwargs` (as before) did not influence generation for
# an already-constructed pipeline -- e.g. answers came back longer than
# `max_new_tokens`.
# `llm` is therefore a bound runnable; it composes with `prompt | llm` as usual.
llm = HuggingFacePipeline(pipeline=pipe).bind(pipeline_kwargs=generation_kwargs)

In [None]:
# Prompt with a single placeholder, {question}, nudging the model to reason step by step
template = (
    "Question: {question}\n"
    "Answer: Let's think step by step."
)
prompt = PromptTemplate(input_variables=["question"], template=template)

In [None]:
# Sample questions fed to the chain one at a time
questions = [
    "Explain the concept of black holes in simple terms.",
    (
        "What are the main causes of climate change, "
        "and how can we address them?"
    ),
    "Provide a brief overview of the history of artificial intelligence.",
]

In [None]:
# Compose prompt and model with the pipe operator (this builds a runnable sequence)
# from langchain.schema.runnable import RunnableSequence
chain = prompt | llm

# Print each question followed by the model's answer
for q in questions:
    print(f"\nQ: {q}")
    print(chain.invoke({"question": q}))


Q: Explain the concept of black holes in simple terms.
A black hole is a small hole in the center of a star. A black hole is a large hole in the center of a star. The answer: black holes.

Q: What are the main causes of climate change, and how can we address them?
Climate change is caused by human activities. Human activities cause climate change. The answer: climate change.

Q: Provide a brief overview of the history of artificial intelligence.
Artificial Intelligence (AI) is the development of artificial intelligence. Artificial Intelligence (AI) is the development of artificial intelligence. The answer: artificial intelligence.


In [None]:
## Creating separate chain instances, each backed by a different model

In [None]:
# Prompt shared by both model chains; {question} is its only input variable
template = """Question: {question}
Answer: Let's think step by step."""
prompt = PromptTemplate(input_variables=["question"], template=template)

In [None]:
# Hugging Face model IDs for the two chains.
# Alternatives: "google/flan-t5-xl" for the first model,
# "tiiuae/falcon-7b-instruct" for the second.
model1_name, model2_name = "google/flan-t5-base", "google/flan-t5-large"


In [None]:
# Text2text generation pipeline for the first model
pipe1 = pipeline(
    task="text2text-generation",
    model=model1_name,
    torch_dtype=torch.float32,  # full 32-bit precision
    device=(0 if torch.cuda.is_available() else -1),  # first GPU when CUDA is available, else CPU
)

Device set to use cpu


In [None]:
# If using falcon, we could use the pipeline below instead -- note the change
# of task to "text-generation". Point model2_name at a causal LM first
# (e.g. "tiiuae/falcon-7b-instruct"). The example is kept as real comments
# rather than a string literal so the cell does not echo it as output.
#
# pipe2 = pipeline(
#     "text-generation",
#     model=model2_name,
#     torch_dtype=torch.float32,  # full precision
#     device=0 if torch.cuda.is_available() else -1  # Use GPU if available
# )

'\npipe2 = pipeline(\n    "text-generation",\n    model=model2_name,\n    torch_dtype=torch.float32,  # Uses lower precision for efficiency\n    device=0 if torch.cuda.is_available() else -1  # Use GPU if available\n)\n'

In [None]:
# Text2text generation pipeline for the second model
pipe2 = pipeline(
    task="text2text-generation",
    model=model2_name,
    torch_dtype=torch.float32,  # full 32-bit precision
    device=(0 if torch.cuda.is_available() else -1),  # first GPU when CUDA is available, else CPU
)

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cpu


In [None]:
# Wrap each transformers pipeline as a LangChain LLM
llm1, llm2 = (HuggingFacePipeline(pipeline=p) for p in (pipe1, pipe2))

In [None]:
# One chain per model, both sharing the same prompt
chain1, chain2 = prompt | llm1, prompt | llm2



In [None]:
# Inputs for the manual model selection below
question, model_choice = (
    "What is quantum mechanics?",
    "flan-t5",  # any value other than "flan-t5" selects the second chain
)

In [None]:
# "flan-t5" routes to chain1; every other choice falls through to chain2
selected_chain = chain1 if model_choice == "flan-t5" else chain2
response = selected_chain.invoke({"question": question})

print(response)

Quantum mechanics is the study of the physical properties of matter. Quantum mechanics is the study of the physical properties of matter. The answer: quantum mechanics.


In [None]:
# Using a function to dynamically select the model
# Chat-style prompt shared by every chain built below
# NOTE(review): HuggingFacePipeline is a plain text LLM, so this chat prompt is
# presumably flattened to a single string before reaching the model -- confirm
# that FLAN-T5 handles that format well.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are a helpful assistant."),
        ("human", "{input}"),  # the only template variable is "input"
    ]
)

#Function to create a HuggingFacePipeline for any model
def create_hf_llm(model_name: str, task: str = "text2text-generation", **pipeline_kwargs):
    """Build a LangChain ``HuggingFacePipeline`` backed by a local transformers pipeline.

    Args:
        model_name: Hugging Face model ID (or local path) handed to
            ``transformers.pipeline`` as ``model``.
        task: Pipeline task. The default, ``"text2text-generation"``, suits
            encoder-decoder models such as FLAN-T5; decoder-only models need
            ``"text-generation"``.
        **pipeline_kwargs: Extra keyword arguments forwarded to
            ``transformers.pipeline`` (for example generation settings such as
            ``max_new_tokens``). They take precedence over the defaults below.

    Returns:
        A ``HuggingFacePipeline`` wrapping the newly created pipeline.
    """
    # Defaults first so that callers can override dtype/device explicitly.
    options = {
        "torch_dtype": torch.float32,
        "device": 0 if torch.cuda.is_available() else -1,  # first GPU, else CPU
        **pipeline_kwargs,
    }
    pipe = pipeline(task, model=model_name, **options)
    return HuggingFacePipeline(pipeline=pipe)

# Instantiate the models
# Short alias -> Hugging Face model ID
model_map = {
    "flan-t5-base": "google/flan-t5-base",
    "flan-t5-large": "google/flan-t5-large",
}

# Alias -> HuggingFacePipeline wrapper (each model is loaded here)
llm_map = {alias: create_hf_llm(model_id) for alias, model_id in model_map.items()}

# Alias -> LCEL chain (prompt | llm)
chain_map = {alias: prompt | wrapped_llm for alias, wrapped_llm in llm_map.items()}


Device set to use cpu
Device set to use cpu


In [None]:
# Function to dynamically invoke the chain
def ask_question(question: str, model_choice: str):
    """Run ``question`` through the chain registered under ``model_choice``.

    Args:
        question: Text substituted for the prompt's ``input`` variable.
        model_choice: Key into the module-level ``chain_map``.

    Returns:
        Whatever the selected chain's ``invoke`` returns.

    Raises:
        ValueError: If ``model_choice`` is not a key of ``chain_map``.
    """
    if model_choice in chain_map:
        # The payload key must match the prompt's variable name.
        return chain_map[model_choice].invoke({"input": question})

    raise ValueError(f"Model {model_choice} not found. Choose from {list(chain_map.keys())}")

In [None]:
# Ask both registered models the same question and print the answers side by side
question = "Explain relativity in simple terms."

response_base = ask_question(question, model_choice="flan-t5-base")
print("FLAN-T5-Base:", response_base)

response_large = ask_question(question, model_choice="flan-t5-large")
print("FLAN-T5-Large:", response_large)

FLAN-T5-Base: A compass is used to measure the distance between two points on a compass.
FLAN-T5-Large: Relativity is the law that states that the mass of an object is proportional to the distance between the object and the observer.
