In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModelForSeq2SeqLM
import transformers
from torch import bfloat16
import time
from langchain import PromptTemplate
from langchain.chains.question_answering import load_qa_chain

import os

from langchain.chains import LLMChain
from langchain.llms import HuggingFacePipeline

In [None]:
# 4-bit NF4 quantization via bitsandbytes: weights are stored in 4 bits,
# matmuls are computed in bfloat16, and double quantization is left off.
bnb_config_4 = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=bfloat16
)

# Path of a local checkpoint directory. To pull the weights from the Hugging
# Face Hub instead, use "meta-llama/Llama-2-7b-chat-hf".
model_id = "Llama-2-7b-chat-hf"

# Download/cache location shared by the tokenizer and the model so both end up
# in the same place when `model_id` is a Hub id.
MODEL_CACHE_DIR = 'models/'

tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=MODEL_CACHE_DIR)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    cache_dir=MODEL_CACHE_DIR,
    quantization_config=bnb_config_4,
    device_map={"": 0},  # place the whole model on GPU 0
)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    # NOTE: max_length bounds prompt + generated tokens together, so long
    # prompts leave less room for the answer.
    max_length=512,
    # Pad with the tokenizer's own EOS token. The previous hard-coded 50256 is
    # GPT-2's EOS id and is not a valid id in the Llama-2 vocabulary.
    pad_token_id=tokenizer.eos_token_id,
    repetition_penalty=1.1
)

# LangChain wrapper around the generation pipeline.
local_llm = HuggingFacePipeline(pipeline=pipe)

In [None]:
# Instruction preamble followed by a Question/Answer scaffold; `{question}` is
# the only placeholder and is filled in when the prompt is formatted.
prompt_template = (
    "Please provide honest and straightforward answers to the following questions. Avoid providing misleading or inaccurate information. Your responses are expected to be truthful and directly address the questions asked.\n"
    "\n"
    "Question: {question}\n"
    "Answer:"
)

# LangChain prompt object built from the raw template string above.
PROMPT = PromptTemplate(
    input_variables=["question"],
    template=prompt_template,
)



In [None]:
# Chain that formats PROMPT with the caller's question and sends it to local_llm.
chain = LLMChain(
    prompt=PROMPT,
    llm=local_llm,
)

In [None]:
%%time

# full example
query = "Write a short peom about rain"
llm_response = chain.run(query)

print(prompt_template.format(question=query) + llm_response)
print("\n\n")