In [1]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.llms.llama_cpp.llama_utils import (
    messages_to_prompt,
    completion_to_prompt,
)

In [2]:
model_url="../model/ggml-model-q4_0.bin"

In [3]:

# Make sure the model path is correct for your system!
llm = LlamaCPP(
    temperature=0.80,
    model_path=model_url,
    verbose=True, 
    max_new_tokens=256,
    generate_kwargs={},
    # kwargs to pass to __init__()
    # set to at least 1 to use GPU
    # model_kwargs={"n_gpu_layers": -1},
    # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room
    context_window=3900,
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
)

llama_model_loader: loaded meta data with 16 key-value pairs and 291 tensors from ../model/ggml-model-q4_0.bin (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = llama-main
llama_model_loader: - kv   2:                       llama.context_length u32              = 2048
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_count u32    

In [4]:
response = llm.complete("Hello! Can you tell me a poem about cats and dogs?")
print(response.text)


llama_print_timings:        load time =    6579.64 ms
llama_print_timings:      sample time =      20.61 ms /   171 runs   (    0.12 ms per token,  8296.14 tokens per second)
llama_print_timings: prompt eval time =    6579.42 ms /    79 tokens (   83.28 ms per token,    12.01 tokens per second)
llama_print_timings:        eval time =    8915.36 ms /   170 runs   (   52.44 ms per token,    19.07 tokens per second)
llama_print_timings:       total time =   15802.79 ms /   249 tokens


  Of course! Here is a poem about cats and dogs:
Cats and dogs, so furry and sweet,
Bringing joy to our hearts, they can't be beat.
Their playful pounces and wagging tails,
Bring smiles to our faces, without fail.

Cats with their sassy attitudes, so fine,
And dogs with their loyalty, so divine.
They brighten our days with their playful ways,
And bring us happiness in their own special ways.

So here's to our feline and canine friends,
Bringing joy to the end, until it bends.
Thank you for being such loyal companions,
And for filling our lives with your love and devotion.


In [5]:
response_iter = llm.stream_complete("Can you write me a poem about fast cars?")
for response in response_iter:
    print(response.delta, end="", flush=True)

Llama.generate: prefix-match hit


  Of course! Here is a poem about fast cars:
Racing down the highway, wind in my hair
Fast car at my command, thrill of the chase
The engine purrs beneath me, as I speed through the air
A feeling of freedom, with no time to spare
The road stretches out before me, a blank canvas to explore
I push the pedal to the floor, feeling alive once more
The rush of adrenaline, as I hit the accelerator hard
A roar of power, as I speed past the yard
The world blurs around me, but I feel alive and free
In this moment, I am one with the car and the road
Fast cars, they take me where I want to go
A feeling of exhilaration, as I drive with ease
The thrill of the ride, the rush of the chase
Fast cars, they take me to places I've never been before


llama_print_timings:        load time =    6579.64 ms
llama_print_timings:      sample time =      25.44 ms /   209 runs   (    0.12 ms per token,  8214.76 tokens per second)
llama_print_timings: prompt eval time =     650.00 ms /    14 tokens (   46.43 ms per token,    21.54 tokens per second)
llama_print_timings:        eval time =   11479.72 ms /   208 runs   (   55.19 ms per token,    18.12 tokens per second)
llama_print_timings:       total time =   12593.75 ms /   222 tokens


In [6]:
from llama_index.core import set_global_tokenizer
from transformers import AutoTokenizer

set_global_tokenizer(
    AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-chat-hf").encode
)

In [7]:
# use Huggingface embeddings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

In [8]:
# load documents
documents = SimpleDirectoryReader(
    "../paul_graham"
).load_data()

documents

[Document(id_='6f0ca182-5b11-44d3-9731-e28c034c2535', embedding=None, metadata={'page_label': '1', 'file_name': 'COI...pdf', 'file_path': '/Users/abhaychoudhary/Documents/Abhay/LLAMA2/LLAMA2-RAG/RAG_lamaindex/../paul_graham/COI...pdf', 'file_type': 'application/pdf', 'file_size': 3922267, 'creation_date': '2024-03-21', 'last_modified_date': '2024-03-21'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text=' \n \n \n THE CONSTITUTION OF INDIA  \n [As on 26th November , 2021 ] \n \n \n  ', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'),
 Document(id_='b3db4d91-b085-45f5-b390-8fc4714585d9', embedding=None, metadata={'page_label': '2', 'file_name': 'COI...pdf', 'fi

In [9]:
 # create vector store index
index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)

In [10]:
# set up query engine
query_engine = index.as_query_engine(llm=llm)

In [11]:
response = query_engine.query("What did the author do growing up?")
print(response)

Llama.generate: prefix-match hit

llama_print_timings:        load time =    6579.64 ms
llama_print_timings:      sample time =      33.59 ms /   256 runs   (    0.13 ms per token,  7622.45 tokens per second)
llama_print_timings: prompt eval time =   54743.72 ms /  1953 tokens (   28.03 ms per token,    35.68 tokens per second)
llama_print_timings:        eval time =   25273.92 ms /   255 runs   (   99.11 ms per token,    10.09 tokens per second)
llama_print_timings:       total time =   80632.27 ms /  2208 tokens


  Based on the context provided in the essay, the author did the following growing up:
1. Wrote short stories outside of school, focusing on characters with strong feelings instead of developing any actual plots.
2. Used an IBM 1401 computer at school to learn programming in an early version of Fortran, but found it difficult to create programs that did anything meaningful due to the limited input options.
3. Became fascinated with microcomputers when a friend built one himself, leading the author to convince their father to purchase a TRS-80 computer several years later.
4. Developed simple games, predicted the flight of model rockets, and created a word processor for their father to use on their TRS-80.
5. Initially intended to study philosophy in college but found it boring, leading them to switch to artificial intelligence (AI) instead.
6. Learned Lisp and reverse-engineered SHRDLU for their undergraduate thesis, which they believed had the potential to climb the lower slopes of in

In [None]:
# who wrote software to generate web sites
while True:
    input_query=input("To exit type 'exit' : ")
    if input_query=="exit":
        break
    else:
        response = query_engine.query(input_query)
        print(response)
    
    

