# Getting started
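We're running this code from Google Colab. Note that the model and PDF paths used further down are absolute Windows paths, so adjust them to match the machine or runtime you are actually using.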

In [None]:
# One-time environment setup: uncomment and run these lines on a fresh runtime.
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117 --upgrade
# !pip install langchain einops accelerate transformers bitsandbytes scipy
# !pip install xformers sentencepiece
# !pip install llama-index==0.7.21 llama_hub==0.0.19

In [None]:
import os
from langchain.llms import LlamaCpp
from langchain import PromptTemplate,LLMChain
from transformers import LlamaForCausalLM, LlamaTokenizer
from llama_index import LLMPredictor, PromptHelper, GPTVectorStoreIndex
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# Bring in stuff to change service context
from llama_index import set_global_service_context
from llama_index import ServiceContext

# Bring in embeddings wrapper
from llama_index.embeddings import LangchainEmbedding

# Bring in HF embeddings - need these to represent document chunks
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

# Import deps to load documents
from llama_index import VectorStoreIndex, download_loader
from pathlib import Path

In [None]:
# Prompt-budget settings, handed to PromptHelper positionally in this order.
# NOTE(review): what each positional slot means depends on the installed
# llama_index version -- confirm the names below against its PromptHelper
# signature before tuning these numbers.
max_input_size, num_output, max_chunk_overlap = 300, 120, 0
prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)

In [None]:
# Location of the quantized Llama 2 chat model file.
# Set the LLAMA_MODEL_PATH environment variable to use a different copy;
# when it is unset (or empty) the original hard-coded location is used.
_DEFAULT_MODEL_PATH = r"D:/llama2_quantized_models/7B_chat/llama-2-7b-chat.ggmlv3.q5_K_M.bin"
MODEL_PATH = os.environ.get("LLAMA_MODEL_PATH") or _DEFAULT_MODEL_PATH

# Fail early with a readable message rather than an opaque loader error.
if not os.path.isfile(MODEL_PATH):
    raise FileNotFoundError(
        f"Model file not found: {MODEL_PATH!r}. "
        "Set the LLAMA_MODEL_PATH environment variable or fix the default path."
    )

# Callback manager with a single stdout streaming handler, passed to the LLM.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# Use CUDA GPU
llm = LlamaCpp(
    model_path=MODEL_PATH,
    max_tokens=256,
    n_gpu_layers=35,
    n_batch=512,  # alternative noted by the author: 256
    callback_manager=callback_manager,
    n_ctx=1024,
    verbose=False,
    temperature=0,
)

In [None]:
# Method 1
# Alias the LlamaCpp instance created above; this name is what gets passed as
# `llm=` when the service context is built.
# Quick smoke test (uncomment to try): llm.predict("What is flatpv ?")
llm_predictor = llm

In [None]:
# Instantiate the Hugging Face embedding model, then wrap it in the
# llama_index adapter so it can be plugged into the service context.
hf_embedder = HuggingFaceEmbeddings(model_name="all-mpnet-base-v2")
embeddings = LangchainEmbedding(hf_embedder)

In [None]:
# Bundle the LLM, the embedding model and the prompt helper into one service
# context. chunk_size is not passed (a value of 500 was tried and left
# commented out), so from_defaults() decides it.
service_context_kwargs = dict(
    llm=llm_predictor,
    embed_model=embeddings,
    prompt_helper=prompt_helper,
)
service_context = ServiceContext.from_defaults(**service_context_kwargs)

# Register it as the global service context.
set_global_service_context(service_context)

In [None]:
# Download PDF Loader
# download_loader returns the reader class registered under the given name.
PyMuPDFReader = download_loader("PyMuPDFReader")

# Create PDF Loader
# A single reader instance, used below to load the PDF.
loader = PyMuPDFReader()

In [None]:
# Load documents
# The PDF location can be overridden with the PDF_PATH environment variable;
# when it is unset (or empty) the original hard-coded location is used.
file = os.environ.get("PDF_PATH") or r"C:/Users/Lukas/Desktop/My_Projects/To_Upload/Llama2/llama2_projects/llama2_pdf_chatbot_faiss_windows/data/V3/Hotline_Wiki_v3.pdf"
pdf_path = Path(file)

# Fail early with a readable message rather than an error from inside the reader.
if not pdf_path.is_file():
    raise FileNotFoundError(
        f"PDF not found: {str(pdf_path)!r}. "
        "Set the PDF_PATH environment variable or fix the default path."
    )

documents = loader.load(file_path=pdf_path, metadata=True)

In [None]:
# Build the vector index over the loaded documents; it is queried further down.
# (GPTVectorStoreIndex.from_documents was an earlier variant of this call.)
index = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context,
)

In [None]:
# Set up a query engine on top of the index, using as_query_engine()'s
# default arguments.
query_engine = index.as_query_engine()

In [None]:
# Test out a query in natural language
question = "why Q1SU rule flag ?"
response = query_engine.query(question)

# Streaming variant (disabled): build the engine with
# index.as_query_engine(streaming=True) and call
# response.print_response_stream() instead of print().
print(response)