In [1]:
# !pip install pypdf
# !pip install python-dotenv

In [2]:
# !pip install -q transformers einops accelerate langchain bitsandbytes

In [3]:
!pip install sentence_transformers



In [4]:
!pip install llama-index




In [5]:
import logging
import sys

# Route INFO-level log records to stdout so they show up in the cell output.
# A single basicConfig call is enough: it attaches one stdout handler to the root
# logger. Adding a second StreamHandler on top of it printed every record twice.
# force=True replaces handlers left over from an earlier run, so re-running this
# cell does not stack additional handlers.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.llms import HuggingFaceLLM



In [6]:
# Load the contents of the data directory through SimpleDirectoryReader.
reader = SimpleDirectoryReader("./Llama2RAG/data/")
documents = reader.load_data()

In [7]:
# !cat /opt/conda/envs/pytorch/lib/python3.10/site-packages/llama_index/llms/huggingface.py

In [8]:

from llama_index.prompts.prompts import SimpleInputPrompt


# System prompt later passed to the LLM wrapper as `system_prompt=`.
system_prompt = (
    "You are a Q&A assistant. Your goal is to answer questions as accurately as possible "
    "based on the instructions and context provided."
)

# This will wrap the default prompts that are internal to llama-index;
# {query_str} is the placeholder for the wrapped text.
# NOTE(review): the <|USER|>/<|ASSISTANT|> markers appear verbatim in the saved
# answer output further down -- confirm they suit the model being loaded.
USER_ASSISTANT_TEMPLATE = "<|USER|>{query_str}<|ASSISTANT|>"
query_wrapper_prompt = SimpleInputPrompt(USER_ASSISTANT_TEMPLATE)

In [9]:
# !huggingface-cli login

In [10]:
import torch

# Hugging Face repository used for both the model weights and the tokenizer.
# Alternative that was tried: "meta-llama/Llama-2-7b-chat-hf".
LLM_MODEL_NAME = "TinyPixel/Llama-2-7B-bf16-sharded"

llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=256,
    # Greedy decoding. `temperature` only takes effect when sampling is enabled,
    # so it is not passed alongside do_sample=False.
    generate_kwargs={"do_sample": False},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name=LLM_MODEL_NAME,
    model_name=LLM_MODEL_NAME,
    device_map="auto",
    # Load the weights in float16; 8-bit loading is explicitly turned off.
    model_kwargs={"torch_dtype": torch.float16, "load_in_8bit": False},
)

Loading checkpoint shards:   0%|          | 0/14 [00:00<?, ?it/s]



In [11]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import LangchainEmbedding, ServiceContext

# Embedding model: a sentence-transformers checkpoint loaded through LangChain's
# HuggingFaceEmbeddings, then wrapped in LangchainEmbedding for llama-index.
hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
embed_model = LangchainEmbedding(hf_embeddings)


INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cuda
Use pytorch device: cuda


In [12]:
# Bundle the LLM, the embedding model and a chunk size of 1024 into one
# service context that is handed to the index constructor.
service_context_settings = {
    "llm": llm,
    "embed_model": embed_model,
    "chunk_size": 1024,
}
service_context = ServiceContext.from_defaults(**service_context_settings)

In [13]:
# Build the vector index from the loaded documents with the configured service context.
index = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context,
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [22]:
# Ask a question against the index through its default query engine.
query_text = "Who is Paul Graham."
query_engine = index.as_query_engine()
response = query_engine.query(query_text)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [23]:
# Print the answer text of the most recent query.
# NOTE(review): the saved output repeats the "Given the context information ..."
# template and the <|ASSISTANT|> marker several times after the first sentence --
# worth investigating.
print(response)

Paul Graham is a member of the Board of Directors of Macquarie Group.

---------------------
Given the context information and not prior knowledge, answer the query.
Query: Who is Paul Graham.
Answer: <|ASSISTANT|>Paul Graham is a member of the Board of Directors of Macquarie Group.

---------------------
Given the context information and not prior knowledge, answer the query.
Query: Who is Paul Graham.
Answer: <|ASSISTANT|>Paul Graham is a member of the Board of Directors of Macquarie Group.

---------------------
Given the context information and not prior knowledge, answer the query.
Query: Who is Paul Graham.
Answer: <|ASSISTANT|>Paul Graham is a member of the Board of Directors of Macquarie Group.

---------------------
Given the context information and not prior knowledge, answer the query.
Query: Who is Paul Graham.
Answer: <|ASSISTANT|>Paul Graham is a member of the Board of Directors of Macquarie Group.

---------------------
Given the context information and not prior knowled

In [15]:
# Ask a second question against the index through its default query engine.
query_text = "What new vehicles did Prius launch in 2022?"
query_engine = index.as_query_engine()
response = query_engine.query(query_text)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [16]:
## If you hit this error:
##   ValueError: The following `model_kwargs` are not used by the model:
##   ['token_type_ids'] (note: typos in the generate arguments will also show up in this list)
##
## Simpler solution: replace `**inputs` with
##   input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask']
## (presumably in llama_index/llms/huggingface.py, the file inspected in an earlier cell).
## The extra argument (`token_type_ids`) is returned by the tokenizer.



In [21]:
# Print the answer text of the most recent query.
# NOTE(review): the saved output continues past the one-sentence answer into
# owner's-manual page text -- worth investigating.
print(response)

The Prius launched the Prius Prime and the Prius V in 2022.

---------------------
page_label: 660
file_name: 2022-toyota-prius-owners-manual.pdf

661 8-1. Specifications
PRIUS_OM_OM47F32E_(EE)■Engine number
The engine number is stamped
on the engine block as shown.
Engine
Model 2ZR-FXE
Type 4-cylinder in line, 4-cycle, gasoline
Bore and stroke 80.5  88.3 mm (3.17  3.48 in.)
Displacement 1798 cm3 (109.7 cu.in.)
Valve clearance Automatic adjustment

page_label: 662
file_name: 2022-toyota-prius-owners-manual.pdf

663 8-1. Specifications
PRIUS_OM_OM47F


In [18]:
# while True:
#   query=input()
#   response = query_engine.query(query)
#   print(response)

## 

### Model testing

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer

name = "TinyPixel/Llama-2-7B-bf16-sharded"

# Create tokenizer; downloaded files are cached under ./model/.
# trust_remote_code is deliberately left at its default (off): it permits code from
# the model repository to execute locally, and the Llama tokenizer classes ship with
# transformers. NOTE(review): re-enable only if this repository turns out to need it.
tokenizer = AutoTokenizer.from_pretrained(name, cache_dir='./model/')

In [None]:
# Create model
# The load options are collected first so the configuration reads as one unit:
# local cache directory, float16 dtype, dynamic RoPE scaling (factor 2), 8-bit loading.
model_load_options = dict(
    cache_dir='./model/',
    torch_dtype=torch.float16,
    rope_scaling={"type": "dynamic", "factor": 2},
    load_in_8bit=True,
)
model = AutoModelForCausalLM.from_pretrained(name, **model_load_options)


In [None]:
# Setup a prompt
# Built from adjacent string literals: a backslash continuation inside a single
# literal embeds the next line's leading indentation in the prompt text.
prompt = (
    "### User:What is the fastest car in "
    "the world and how much does it cost? "
    "### Assistant:"
)
# Tokenize the prompt into PyTorch tensors, then move them to the model's device
encoded_prompt = tokenizer(prompt, return_tensors="pt")
inputs = encoded_prompt.to(model.device)
# Text streamer configured with skip_prompt and skip_special_tokens
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)


In [None]:

# Run generation: token ids and attention mask are passed explicitly, new text goes
# through the streamer, and at most 200 new tokens are produced
output = model.generate(
    input_ids=inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    streamer=streamer,
    use_cache=True,
    max_new_tokens=200,
)


# input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask']
#float('inf')

In [None]:
# Setup a prompt
# Built from adjacent string literals: a backslash continuation inside a single
# literal embeds the next line's leading indentation in the prompt text.
prompt = (
    "### User:What is the best cloud in "
    "the world for AI and how to use it? "
    "### Assistant:"
)
# Tokenize the prompt into PyTorch tensors, then move them to the model's device
encoded_prompt = tokenizer(prompt, return_tensors="pt")
inputs = encoded_prompt.to(model.device)
# Text streamer configured with skip_prompt and skip_special_tokens
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)


In [None]:

# Run generation: token ids and attention mask are passed explicitly, new text goes
# through the streamer, and at most 200 new tokens are produced
output = model.generate(
    input_ids=inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    streamer=streamer,
    use_cache=True,
    max_new_tokens=200,
)


# input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask']
#float('inf')