In [1]:
"""
# FOR GOOGLE COLAB

!git clone https://github.com/carlosdeabreu87/demo-oss-llm-haystack.git
"""

'\n# FOR GOOGLE COLAB\n\n!git clone https://github.com/carlosdeabreu87/demo-oss-llm-haystack.git\n'

In [2]:
"""
# FOR GOOGLE COLAB

import os

# Change the current working directory to 'project'
os.chdir('/content/demo-oss-llm-haystack')

# Verify the current working directory
print("Current Working Directory: ", os.getcwd())
"""

'\n# FOR GOOGLE COLAB\n\nimport os\n\n# Change the current working directory to \'project\'\nos.chdir(\'/content/demo-oss-llm-haystack\')\n\n# Verify the current working directory\nprint("Current Working Directory: ", os.getcwd())\n'

In [3]:
"""
# FOR GOOGLE COLAB

!pip install -r ./requirements.txt
"""

'\n# FOR GOOGLE COLAB\n\n!pip install -r ./requirements.txt\n'

In [4]:
"""
# FOR GOOGLE COLAB

!huggingface-cli download TheBloke/Mistral-7B-Instruct-v0.1-GGUF mistral-7b-instruct-v0.1.Q4_K_M.gguf --local-dir ./models --local-dir-use-symlinks False
"""

'\n# FOR GOOGLE COLAB\n\n!huggingface-cli download TheBloke/Mistral-7B-Instruct-v0.1-GGUF mistral-7b-instruct-v0.1.Q4_K_M.gguf --local-dir ./models --local-dir-use-symlinks False\n'

In [5]:
import os
import gradio as gr
from haystack.document_stores import InMemoryDocumentStore, WeaviateDocumentStore
from haystack.nodes import (
    EmbeddingRetriever
    ,PreProcessor
    ,TextConverter
    ,PromptNode
    ,PromptTemplate
    ,TopPSampler
    ,PromptModel
    )
from haystack.nodes.ranker import LostInTheMiddleRanker
from haystack.pipelines import Pipeline
from pathlib import Path
from llm_model_config import LlamaCPPInvocationLayer
from pdf_to_txt_converter import pdf_to_txt_converter
import warnings
# Blanket-suppress every warning raised from this point on, to keep cell output readable.
# NOTE(review): this also hides deprecation notices that may matter when upgrading dependencies.
warnings.filterwarnings("ignore")
# NOTE(review): the narrower filters below are disabled. `message` is compiled as a regular
# expression, so a leading `*` would raise re.error if these were re-enabled as written;
# filtering by `category=` (e.g. DeprecationWarning) is the usual way to target a warning class.
#warnings.filterwarnings("ignore", message="*DeprecationWarning*")
#warnings.filterwarnings("ignore", message="*UserWarning*")
#warnings.filterwarnings("ignore", message="*Tqdm*")

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  import pkg_resources
  _torch_pytree._register_pytree_node(


In [6]:
## Auxiliary function for converting the contents of PDF files to plain text

# pdf_to_txt_converter(pdf_file_path = 'data/se_best_practices.pdf', txt_file_path = 'data/se_best_practices.txt')

In [7]:
## Definition of the nodes for the pipeline

# Vector store held in process memory. embedding_dim has to match the vector size
# produced by the retriever's embedding model (presumably 384 for the MiniLM model
# configured below -- TODO confirm against the model card).
document_store = InMemoryDocumentStore(embedding_dim=384)

# Used by the indexing step to turn text files into documents.
converter = TextConverter()

# Cleans the converted text and splits it into 250-word chunks without cutting
# sentences in half.
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=False,
    clean_header_footer=True,
    split_by="word",
    split_length=250,
    split_respect_sentence_boundary=True,
)

# Embeds documents and queries with the same sentence-transformers model and
# returns the 10 best matches from document_store.
embedding_retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="sentence-transformers/All-MiniLM-L6-V2",
    model_format="sentence_transformers",
    top_k=10,
)

# Local GGUF model, driven through the project's llama.cpp invocation layer.
# The file is expected under ./models relative to the working directory.
llm_model = PromptModel(
    model_name_or_path="models/mistral-7b-instruct-v0.1.Q4_K_M.gguf",
    invocation_layer_class=LlamaCPPInvocationLayer,
    use_gpu=True,
    max_length=512,
)


llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from models/mistral-7b-instruct-v0.1.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.1
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 lla

In [8]:
## Function that builds the indexing pipeline, which stores the embedded documents in the vector store

def indexing_pipeline(local_dir):
    """Convert, preprocess and embed every ``.txt`` file found in ``local_dir``.

    Each text file is converted with the module-level ``converter``. The
    documents from all files are then preprocessed together, written to
    ``document_store`` and embedded with ``embedding_retriever``.

    Args:
        local_dir: Directory scanned (non-recursively) for ``.txt`` files.

    Raises:
        FileNotFoundError: If no documents could be produced from ``.txt``
            files in ``local_dir``.
    """
    documents = []

    # sorted() gives a stable ingestion order; os.listdir() promises none.
    for filename in sorted(os.listdir(local_dir)):
        file_path = Path(local_dir) / filename

        # Only plain-text files are indexed.
        if file_path.suffix != '.txt':
            continue

        # Accumulate instead of overwriting, so every file is indexed and not
        # just the last one returned by the directory listing.
        documents.extend(converter.convert(file_path = str(file_path), meta=None))

    if not documents:
        # Fail with an explicit message rather than an UnboundLocalError further down.
        raise FileNotFoundError(
            f"No documents were produced from .txt files in {str(local_dir)!r}"
        )

    preprocessed_docs = preprocessor.process(documents)

    document_store.write_documents(preprocessed_docs)

    # Compute embeddings for the stored documents so the retriever can search them.
    document_store.update_embeddings(embedding_retriever)


# Populate the document store from the local "data" directory
# (a relative path, so it is resolved against the current working directory).
indexing_pipeline(local_dir = "data")

Preprocessing: 100%|██████████| 1/1 [00:00<00:00, 37.04docs/s]
Batches: 100%|██████████| 1/1 [00:10<00:00, 10.21s/it]ocs/s]
Documents Processed: 10000 docs [00:10, 977.51 docs/s]       


In [9]:
## Function that builds the query pipeline: it takes the user prompt, enriches it with passages retrieved from the vector store, and passes the result to the LLM for the final response

def query_pipeline(query):
    """Answer ``query`` from the indexed documents using the local LLM.

    A new pipeline is assembled on every call, chaining four nodes:
    Retriever -> Sampler (top_p 0.90) -> LostInTheMiddleRanker -> Prompt.
    The prompt node reuses the module-level ``llm_model``.

    Args:
        query: The user's question, forwarded to ``Pipeline.run``.

    Returns:
        The ``"results"`` entry of the pipeline output; callers in this
        notebook read element ``[0]`` as the answer text.
    """
    # {join(documents)} and {query} are placeholders left for PromptTemplate to fill in.
    prompt_text = """
Synthesize a comprehensive answer from the provided paragraphs of a 
paper and the given question.\n
Focus on the question and avoid unnecessary information in your answer.\n
\n\n Paragraphs: {join(documents)} \n\n Question: {query} \n\n Answer:
"""

    # NOTE(review): recorded runs in this notebook stop after 16 sampled tokens
    # despite max_length = 384 -- confirm how LlamaCPPInvocationLayer applies it.
    answer_node = PromptNode(
        model_name_or_path=llm_model,
        default_prompt_template=PromptTemplate(prompt_text),
        max_length=384,
        model_kwargs={"stream": False},
    )

    # (component, node name, upstream node), listed in execution order.
    stages = (
        (embedding_retriever, "Retriever", "Query"),
        (TopPSampler(top_p=0.90), "Sampler", "Retriever"),
        (LostInTheMiddleRanker(1024), "LostInTheMiddleRanker", "Sampler"),
        (answer_node, "Prompt", "LostInTheMiddleRanker"),
    )

    rag_pipeline = Pipeline()
    for component, node_name, upstream in stages:
        rag_pipeline.add_node(component=component, name=node_name, inputs=[upstream])

    return rag_pipeline.run(query=query)["results"]

In [10]:
## Example 1

# Broad summary question. query_pipeline returns the pipeline's results and
# response[0], the cell's displayed value, is the first of them.
# NOTE(review): the recorded answer stops mid-sentence after 16 sampled tokens
# (see llama_print_timings) -- check the generation-length setting of the invocation layer.
response = query_pipeline(query = "What is this document about?")
response[0]

Batches: 100%|██████████| 1/1 [00:00<00:00, 39.08it/s]

llama_print_timings:        load time =  131951.40 ms
llama_print_timings:      sample time =      11.16 ms /    16 runs   (    0.70 ms per token,  1433.18 tokens per second)
llama_print_timings: prompt eval time =  224847.32 ms /   459 tokens (  489.86 ms per token,     2.04 tokens per second)
llama_print_timings:        eval time =   11933.76 ms /    15 runs   (  795.58 ms per token,     1.26 tokens per second)
llama_print_timings:       total time =  236971.76 ms /   474 tokens


'This document is about software development methodologies and how they can be improved to be'

In [11]:
## Example 2

# Yes/no question about authorship; response[0] is the first generated answer.
response = query_pipeline(query = "Is Carlos De Abreu the author of this draft?")
response[0]

Batches: 100%|██████████| 1/1 [00:00<00:00, 47.62it/s]
Llama.generate: prefix-match hit

llama_print_timings:        load time =  131951.40 ms
llama_print_timings:      sample time =      10.63 ms /    16 runs   (    0.66 ms per token,  1505.60 tokens per second)
llama_print_timings: prompt eval time =   49779.97 ms /   105 tokens (  474.09 ms per token,     2.11 tokens per second)
llama_print_timings:        eval time =   12066.40 ms /    15 runs   (  804.43 ms per token,     1.24 tokens per second)
llama_print_timings:       total time =   62027.62 ms /   120 tokens


'\n Carlos De Abreu is not the author of this draft. The paragraph'

In [12]:
## Example 3

# Open question about authorship; response[0] is the first generated answer.
# NOTE(review): the query text is missing "are". It is runtime input and the
# recorded output depends on it, so it is left as written.
response = query_pipeline(query = "who the authors of this draft?")
response[0]

Batches: 100%|██████████| 1/1 [00:00<00:00, 28.58it/s]
Llama.generate: prefix-match hit

llama_print_timings:        load time =  131951.40 ms
llama_print_timings:      sample time =      12.57 ms /    16 runs   (    0.79 ms per token,  1272.77 tokens per second)
llama_print_timings: prompt eval time =  211004.42 ms /   413 tokens (  510.91 ms per token,     1.96 tokens per second)
llama_print_timings:        eval time =   13397.31 ms /    15 runs   (  893.15 ms per token,     1.12 tokens per second)
llama_print_timings:       total time =  224597.85 ms /   428 tokens


'The authors of this draft are Daniel Huttenlocher and Daniel Spoon'

In [13]:
## Interface to submit user prompts to the LLM using the query pipeline

def ask_llm(prompt):
    """Gradio callback: run ``prompt`` through ``query_pipeline`` and return its first result."""
    return query_pipeline(query=prompt)[0]

# Minimal UI: one text box in, one text box out; each submission is passed to ask_llm.
demo = gr.Interface(fn=ask_llm, inputs="textbox", outputs="textbox")
    
# Start the local Gradio server (the recorded run served on http://127.0.0.1:7860).
# inbrowser=True asks Gradio to open the UI in the default browser.
if __name__ == "__main__":
    demo.launch(inbrowser=True)   

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.
