### Installing the dependencies

In [1]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117 --upgrade

Looking in indexes: https://download.pytorch.org/whl/cu117


In [2]:
!pip install langchain einops accelerate transformers bitsandbytes scipy



In [3]:
!pip install xformers sentencepiece 



In [4]:
!pip install llama-index==0.7.21 llama_hub==0.0.19





### Import transformer classes for generation

In [5]:
!pip install transformers --upgrade
!pip show transformers
# Import transformer classes for generaiton
from transformers import AutoTokenizer, AutoModelForCausalLM
# Import torch for datatype attributes 
import torch

Name: transformers
Version: 4.39.1
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /Users/anishsoni/anaconda3/lib/python3.10/site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: sentence-transformers


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
import os

# Gated checkpoint to load; access has to be granted on the Hugging Face Hub.
name = "meta-llama/Llama-2-7b-chat-hf"
# Read the Hub access token from the HF_TOKEN environment variable instead of
# storing it in the notebook. Any token that was ever saved in a notebook
# should be treated as leaked and revoked on the Hub.
# The value is None when HF_TOKEN is unset.
auth_token = os.environ.get("HF_TOKEN")

In [7]:
# Load the tokenizer and the model weights from the Hugging Face Hub.
from transformers import AutoTokenizer, AutoModelForCausalLM

# The repository is gated: an unauthenticated request is rejected with a
# 401 / OSError, so the access token is forwarded to both downloads. The
# checkpoint id comes from `name` rather than a repeated string literal.
tokenizer = AutoTokenizer.from_pretrained(name, token=auth_token)
model = AutoModelForCausalLM.from_pretrained(name, token=auth_token)

OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Llama-2-7b-chat-hf.
401 Client Error. (Request ID: Root=1-6603330d-7eec2f6416be35db42469c30;fccb01d3-1cc8-4b29-95d2-d61c4cd5e3e5)

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/resolve/main/config.json.
Repo model meta-llama/Llama-2-7b-chat-hf is gated. You must be authenticated to access it.

In [None]:
class MyStreamer:
    """Minimal custom streamer that prints every value it receives.

    An instance is handed to `model.generate` through its `streamer`
    argument; the generation loop then calls `put` and, at the end, `end`.
    """

    def __init__(self):
        pass

    def put(self, token):
        """Print the value pushed by the generation loop."""
        print(f"The next token is {token}")

    def end(self):
        """Hook invoked when generation finishes; nothing to clean up."""
        pass


# Tokenize a short prompt inside this cell so it does not depend on variables
# that are only created in later cells, and pass an actual MyStreamer instance.
demo_inputs = tokenizer("An increasing sequence: one,", return_tensors="pt").to(model.device)
_ = model.generate(**demo_inputs, streamer=MyStreamer(), max_new_tokens=20)

In [None]:
# Setup a prompt
# Adjacent string literals are concatenated, so the indentation used to wrap
# the source lines stays out of the prompt. A backslash continuation inside
# the quotes would embed those leading spaces in the text.
prompt = (
    "### User:What is the fastest car in "
    "the world and how much does it cost? "
    "### Assistant:"
)

In [None]:
# TextStreamer is not covered by the earlier transformers import, so bring it
# into scope here before it is used.
from transformers import TextStreamer

# Pass the prompt to the tokenizer and move the tensors to the model's device
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# Setup the text streamer (prompt and special tokens are left out of the stream)
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

In [None]:
# Run generation, streaming the text as it is produced.
# max_new_tokens must be an integer, so float('inf') is replaced by the room
# left in the model's position range once the prompt has been accounted for.
prompt_length = inputs["input_ids"].shape[-1]
remaining_tokens = max(1, model.config.max_position_embeddings - prompt_length)
output = model.generate(**inputs, streamer=streamer,
                        use_cache=True, max_new_tokens=remaining_tokens)

In [None]:
# Decode the first generated sequence into plain text, leaving out special tokens
output_text = tokenizer.decode(
    output[0],
    skip_special_tokens=True,
)

In [None]:
# Import the prompt wrapper...but for llama index
from llama_index.prompts.prompts import SimpleInputPrompt
# Create a system prompt
# Llama 2 chat models expect the system message wrapped in <<SYS>> ... <</SYS>>
# inside the opening [INST] block; the bare "<>" markers were not valid tags.
system_prompt = """[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Always answer as 
helpfully as possible, while being safe. Your answers should not include
any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.
Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain 
why instead of answering something not correct. If you don't know the answer 
to a question, please don't share false information.

Your goal is to provide answers relating to the financial performance of 
the company.<</SYS>>
"""
# Template applied to each query: the user text followed by the closing [/INST] tag
query_wrapper_prompt = SimpleInputPrompt(
    "{query_str} [/INST]"
)

In [None]:
# Preview the wrapper filled in with a sample query
query_wrapper_prompt.format(
    query_str="hello",
)

In [None]:
# Importing the HuggingFace wrapper for Llama Index
from llama_index.llms import HuggingFaceLLM

# Wrap the already-loaded model and tokenizer in the LlamaIndex Hugging Face LLM
llm = HuggingFaceLLM(
    model=model,
    tokenizer=tokenizer,
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    context_window=4096,
    max_new_tokens=256,
)

In [None]:
# Bring in embeddings wrapper
from llama_index.embeddings import LangchainEmbedding
# Bring in HF embeddings - need these to represent document chunks
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
     

In [None]:
# Instantiate the "all-MiniLM-L6-v2" Hugging Face embeddings, then adapt them
# for LlamaIndex through the LangChain wrapper
hf_embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
embeddings = LangchainEmbedding(hf_embeddings)

In [None]:
# Bring in stuff to change service context
from llama_index import set_global_service_context
from llama_index import ServiceContext
     

In [None]:
# Bundle the LLM, the embedding model and the chunk size into a service context
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embeddings,
    chunk_size=1024,
)
# Install it as the global service context
set_global_service_context(service_context)

In [None]:
# Import deps to load documents 
from llama_index import VectorStoreIndex, download_loader
from pathlib import Path

In [None]:
# Fetch the PyMuPDFReader loader class
PyMuPDFReader = download_loader("PyMuPDFReader")
# Location of the annual report, relative to the notebook
report_path = Path("./data/annualreport.pdf")
# Instantiate the loader and read the PDF, requesting metadata as well
loader = PyMuPDFReader()
documents = loader.load(file_path=report_path, metadata=True)

In [None]:
# Build a vector store index over the loaded documents; it is queried below
index = VectorStoreIndex.from_documents(
    documents,
)

In [None]:
# Obtain a query engine from the index
query_engine = (
    index.as_query_engine()
)

In [None]:
# Ask the query engine a natural-language question about the report
response = query_engine.query(
    "what was the FY2021 ROE?"
)