# Getting Started
This section shows how to use `llama_cpp` directly.

In [None]:
# Import transformer classes for generation
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
# Import torch for datatype attributes
import torch
from llama_cpp import Llama

from langchain.llms import LlamaCpp


In [None]:
# File name and full location of the quantized GGML checkpoint to load.
model_name = "llama-2-7b-chat.ggmlv3.q5_K_M.bin"
model_path = r"D:/llama2_quantized_models/7B_q5/llama-2-7b-chat.ggmlv3.q5_K_M.bin"

# Load the model through llama_cpp with an 8192-token context window,
# a batch size of 512 and 32 layers requested on the GPU.
# NOTE(review): 8192 is larger than the 4096-token context usually cited for
# Llama 2 -- confirm this value is intended.
llm = Llama(
    model_path=model_path,
    n_ctx=8192,
    n_batch=512,
    n_gpu_layers=32,
)

In [None]:
# Run a single completion for the prompt with low-randomness sampling
# settings; the prompt itself is not echoed back in the result.
prompt = "write me a resignation letter"
output = llm(
    prompt,
    temperature=0.2,
    top_p=0.1,
    max_tokens=-1,
    echo=False,
)

print(output['choices'][0]['text'])

# Method to use RAG
In this method we use `RAG` with `llama_cpp`.

In [None]:
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain import PromptTemplate
from langchain import LLMChain

In [None]:
# Location of the quantized GGML checkpoint used for the RAG pipeline.
MODEL_PATH = r"D:/llama2_quantized_models/7B_chat/llama-2-7b-chat.ggmlv3.q4_K_M.bin"

# Callback manager wired with the streaming-stdout handler.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# LangChain wrapper around the llama.cpp model.
# NOTE(review): 8192 is larger than the 4096-token context usually cited for
# Llama 2 -- confirm this value is intended.
llm = LlamaCpp(
    model_path=MODEL_PATH,
    n_ctx=8192,  # alternatives noted by the author: 2048, 1024
    n_batch=512,  # alternative noted by the author: 256
    n_gpu_layers=32,
    max_tokens=-1,
    temperature=0.8,
    callback_manager=callback_manager,
    verbose=False,
)

In [None]:
# Import transformer classes for generation
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
# Import torch for datatype attributes
import torch

import os
from langchain.llms import LlamaCpp
from langchain import PromptTemplate,LLMChain
from transformers import LlamaForCausalLM, LlamaTokenizer
from llama_index import LLMPredictor, PromptHelper, GPTVectorStoreIndex
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# Bring in stuff to change service context
from llama_index import set_global_service_context
from llama_index import ServiceContext

# Bring in embeddings wrapper
from llama_index.embeddings import LangchainEmbedding

# Bring in HF embeddings - need these to represent document chunks
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

# Import deps to load documents
from llama_index import VectorStoreIndex, download_loader
from pathlib import Path

# Bring in stuff to change service context
from llama_index import set_global_service_context
from llama_index import ServiceContext

In [None]:
# Method 1: bind the existing `llm` object to a second name, `llm_predictor`.
# NOTE(review): `llm_predictor` is not referenced in the cells visible here;
# the service context below is built from `llm` directly -- confirm it is needed.
llm_predictor = llm

In [None]:
# Build the embedding model: a HuggingFace "all-mpnet-base-v2" embedder wrapped
# in LangchainEmbedding so it can be handed to the service context.
embeddings = LangchainEmbedding(HuggingFaceEmbeddings(model_name="all-mpnet-base-v2"))

In [None]:
# Bundle the LLM, the embedding model and a chunk size of 500 into one
# service context.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embeddings,
    chunk_size=500,
)

# Register that context as the global one.
set_global_service_context(service_context)

In [None]:
# Obtain the PDF loader class by name via download_loader.
# NOTE(review): presumably this fetches loader code at runtime -- confirm that
# network access is available and the source is trusted.
PyMuPDFReader = download_loader("PyMuPDFReader")

# Instantiate the PDF loader; `loader` is used below to read the document.
loader = PyMuPDFReader()

In [None]:
# Read the source PDF from its absolute path, asking the loader to include
# metadata with the returned documents.
file = r"C:/Users/Lukas/Desktop/My_Projects/To_Upload/Llama2/llama2_projects/llama2_pdf_chatbot_faiss_windows/data/V3/Hotline_Wiki_v3.pdf"
documents = loader.load(
    metadata=True,
    file_path=Path(file),
)

In [None]:
# Create a vector index from the loaded documents; it is queried further below.
# No service context is passed here -- presumably the one registered with
# set_global_service_context is picked up; confirm against llama_index docs.
index = VectorStoreIndex.from_documents(documents)

In [None]:
# Set up a query engine on top of the index, using default settings.
query_engine = index.as_query_engine()

In [None]:
# Test out a query in natural language; the result is stored in `response`.
response = query_engine.query("how to setup alpha pdk using json file ?")