<a href="https://colab.research.google.com/github/cbadenes/notebooks/blob/main/nlp/local_rag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set Parameters and Constants:

In [None]:
# Notebook-wide settings read by the cells below.
output_dir = 'my-docs' # Path to output directory
input_dir = 'input_files' # Path to input directory
weaviate_url = "http://localhost:8080"  # URL handed to weaviate.Client
embedding_model_name = 'all-MiniLM-L6-v2'  # Model name handed to SentenceTransformer
# Device passed to SentenceTransformer and to encode().
# NOTE(review): 'mps' presumably targets Apple Silicon; confirm and switch to
# 'cuda' or 'cpu' when running elsewhere (e.g. on Colab).
device = 'mps'

# Helper Functions to Ingest Documents and Pre-Process Them:

In [None]:
import subprocess
import os
from typing import List, Dict
from userpaths import get_my_documents


def process_local(output_dir: str, num_processes: int, input_path: str):
    """Run the ``unstructured-ingest local`` CLI and report the outcome.

    The command is invoked recursively and verbosely on ``input_path`` and
    asked to write its results to ``output_dir``. The result is reported
    with ``print``; nothing is returned.

    Args:
        output_dir: Value passed to ``--output-dir``.
        num_processes: Value passed to ``--num-processes``.
        input_path: Value passed to ``--input-path``.

    Raises:
        FileNotFoundError: If the ``unstructured-ingest`` executable cannot
            be found (raised by ``subprocess``).
    """
    command = [
        "unstructured-ingest",
        "local",
        "--input-path", input_path,
        "--output-dir", output_dir,
        "--num-processes", str(num_processes),
        "--recursive",
        "--verbose",
    ]

    # Capture stderr as well as stdout. Previously stderr was not piped, so
    # the failure branch tried to call .decode() on None and crashed instead
    # of showing the error.
    completed = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    # errors="replace" keeps reporting alive even if the tool emits bytes
    # that are not valid UTF-8.
    stdout_text = completed.stdout.decode(errors="replace")
    stderr_text = completed.stderr.decode(errors="replace")

    if completed.returncode == 0:
        print('Command executed successfully. Output:')
        print(stdout_text)
        # stderr used to stream straight through; echo it so diagnostics the
        # tool writes there are still visible on success.
        if stderr_text:
            print(stderr_text)
    else:
        print('Command failed. Error:')
        print(stderr_text)

def get_result_files(folder_path) -> List[str]:
    """Collect the paths of all ``.json`` files below ``folder_path``.

    The directory tree is walked recursively with ``os.walk``.

    Args:
        folder_path: Root directory to search.

    Returns:
        A list of file paths (strings), each built by joining the directory
        reported by ``os.walk`` with the file name. The list is empty when
        nothing matches or the directory does not exist.
    """
    json_paths: List[str] = []
    for root, _dirs, names in os.walk(folder_path):
        json_paths.extend(
            os.path.join(root, name) for name in names if name.endswith('.json')
        )
    return json_paths

In [None]:
# Run the ingest CLI over input_dir with two worker processes, then gather
# every .json file found under output_dir for the indexing step.
process_local(output_dir=output_dir, num_processes=2, input_path=input_dir)
files = get_result_files(output_dir)

2023-10-26 15:46:32,913 MainProcess DEBUG    options: {'input_path': 'input_files', 'output_dir': 'my-docs', 'num_processes': 2, 'recursive': True, 'verbose': True, 'file_glob': None, 'download_dir': None, 're_download': False, 'preserve_downloads': False, 'download_only': False, 'max_docs': None, 'pdf_infer_table_structure': False, 'strategy': 'auto', 'reprocess': False, 'ocr_languages': 'eng', 'encoding': None, 'fields_include': ['element_id', 'text', 'type', 'metadata'], 'flatten_metadata': False, 'metadata_include': [], 'metadata_exclude': [], 'partition_by_api': False, 'partition_endpoint': 'https://api.unstructured.io/general/v0/general', 'api_key': None}
2023-10-26 15:46:33,043 MainProcess INFO     Processing 9 docs
2023-10-26 15:46:35,757 SpawnPoolWorker-2 INFO     Processing input_files/BOE-A-2023-2629.pdf
2023-10-26 15:46:35,757 SpawnPoolWorker-2 DEBUG    Using local partition
The ocr_languages kwarg will be deprecated in a future version of unstructured. Please use languages

Command executed successfully. Output:



# Helper Functions to Set Up Weaviate Schema and Client

In [None]:
import uuid
import weaviate
from weaviate.util import get_valid_uuid

def create_local_weaviate_client(db_url: str):
    """Build a ``weaviate.Client`` pointed at ``db_url`` and return it."""
    client = weaviate.Client(url=db_url)
    return client

def get_schema(vectorizer: str = "none"):
    """Build the schema dictionary describing the single ``Doc`` class.

    Args:
        vectorizer: Value stored under the class's ``"vectorizer"`` key.

    Returns:
        A dict with one entry in ``"classes"``: the ``Doc`` class with two
        ``text``-typed properties, ``last_modified`` and ``text``.
    """
    # (property name, description) pairs; both properties use dataType text.
    text_properties = (
        ("last_modified", "Last modified date for the document"),
        ("text", "Text content for the document"),
    )
    doc_class = {
        "class": "Doc",
        "description": "A generic document class",
        "vectorizer": vectorizer,
        "properties": [
            {"name": name, "dataType": ["text"], "description": description}
            for name, description in text_properties
        ],
    }
    return {"classes": [doc_class]}

def upload_schema(my_schema, weaviate):
    """Replace the schema on a Weaviate client with ``my_schema``.

    Calls ``schema.delete_all()`` on the client first, then
    ``schema.create(my_schema)``.

    Args:
        my_schema: Schema passed to ``schema.create``.
        weaviate: The client object (note: this parameter name shadows the
            ``weaviate`` module inside the function).
    """
    schema_api = weaviate.schema
    schema_api.delete_all()
    schema_api.create(my_schema)
def count_documents(client: weaviate.Client) -> Dict:
    """Run an aggregate meta-count query on class ``Doc``.

    Args:
        client: Weaviate client used to issue the query.

    Returns:
        The query response exactly as returned by ``.do()``; callers index
        into it themselves.
    """
    aggregate_query = client.query.aggregate("Doc")
    return aggregate_query.with_meta_count().do()

# Set Up Weaviate Client and Schema

In [None]:
# Connect to the Weaviate instance and install the "Doc" schema.
# NOTE: upload_schema() calls schema.delete_all() before creating the schema,
# presumably wiping whatever is already defined on this instance — confirm
# before pointing this at a shared database.
client = create_local_weaviate_client(db_url=weaviate_url)
my_schema = get_schema()
upload_schema(my_schema, weaviate=client)

# Helper Functions to Stage Unstructured Documents for Indexing

In [None]:
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import DataSourceMetadata
from unstructured.partition.json import partition_json
from sentence_transformers import SentenceTransformer

# Instantiate the embedding model once at module level; compute_embedding()
# below reuses this single instance for every call.
embedding_model = SentenceTransformer(embedding_model_name, device=device)

def compute_embedding(chunk_text: List[str]):
    """Encode ``chunk_text`` with the module-level ``embedding_model``.

    Args:
        chunk_text: Text passed straight to ``embedding_model.encode``.

    Returns:
        Whatever ``encode`` returns for the given input, computed on the
        module-level ``device``.
    """
    return embedding_model.encode(chunk_text, device=device)

def get_chunks(elements, chunk_under_n_chars=500, chunk_new_after_n_chars=1500):
    """Chunk partitioned elements by title and embed each chunk's text.

    Before chunking, each element's metadata is cleaned in place: the
    ``data_source`` attribute is deleted unless it is exactly a
    ``DataSourceMetadata`` instance, and ``coordinates`` is deleted when
    present.

    Args:
        elements: Elements to chunk; their metadata is mutated.
        chunk_under_n_chars: Forwarded as ``combine_under_n_chars``.
        chunk_new_after_n_chars: Forwarded as ``new_after_n_chars``.

    Returns:
        A ``(chunks, embeddings)`` pair. ``chunks`` is a list of dicts with
        keys ``"last_modified"`` and ``"text"``; ``embeddings`` is the
        result of ``compute_embedding`` on the chunk texts, in the same
        order.
    """
    for el in elements:
        meta = el.metadata
        if type(meta.data_source) is not DataSourceMetadata:
            delattr(meta, "data_source")

        if hasattr(meta, "coordinates"):
            delattr(meta, "coordinates")

    title_chunks = chunk_by_title(
        elements,
        combine_under_n_chars=chunk_under_n_chars,
        new_after_n_chars=chunk_new_after_n_chars,
    )

    # Keep only the two fields that the "Doc" class stores.
    chunks = [
        {"last_modified": piece.metadata.last_modified, "text": piece.text}
        for piece in title_chunks
    ]

    embeddings = compute_embedding([record['text'] for record in chunks])
    return chunks, embeddings


def add_data_to_weaviate(files, client, chunk_under_n_chars=500, chunk_new_after_n_chars=1500):
    """Chunk, embed and batch-upload each JSON file to Weaviate.

    For every file the elements are loaded with ``partition_json``, turned
    into chunks and embeddings by ``get_chunks``, and queued on
    ``client.batch`` as objects of class ``Doc`` with a random UUID and the
    matching embedding as the vector. The batch is flushed once at the end.

    Args:
        files: Iterable of paths to JSON files to load.
        client: Weaviate client whose ``batch`` API receives the objects.
        chunk_under_n_chars: Forwarded to ``get_chunks``.
        chunk_new_after_n_chars: Forwarded to ``get_chunks``.
    """
    for filename in files:
        try:
            elements = partition_json(filename=filename)
            chunks, embeddings = get_chunks(elements, chunk_under_n_chars, chunk_new_after_n_chars)
        except IndexError as e:
            # Best effort: a file that fails this way is skipped, and the
            # message names it so the skip can be traced.
            print(f"Skipping {filename}: {e}")
            continue

        print(f"Uploading {len(chunks)} chunks for {filename}.")
        for chunk, vector in zip(chunks, embeddings):
            client.batch.add_data_object(
                data_object=chunk,
                # Use the exact class name declared in the schema ("Doc")
                # rather than a lowercase variant.
                class_name="Doc",
                uuid=get_valid_uuid(uuid.uuid4()),
                vector=vector,
            )

    client.batch.flush()

# Add Chunks to Weaviate

In [None]:
# Chunk, embed and upload every ingested JSON file. The chunk thresholds here
# (250 / 500) are smaller than the function defaults (500 / 1500).
add_data_to_weaviate(
    files=files,
    client=client,
    chunk_under_n_chars=250,
    chunk_new_after_n_chars=500
)

# Print the "Doc" entry of the aggregate meta-count response.
print(count_documents(client=client)['data']['Aggregate']['Doc'])

Uploading 128 chunks for my-docs/BOE-A-2023-2628.pdf.json.
Uploading 15 chunks for my-docs/BOE-A-2023-2629.pdf.json.
Uploading 3 chunks for my-docs/BOE-A-2023-2625.pdf.json.
Uploading 2 chunks for my-docs/BOE-A-2023-2624.pdf.json.
Uploading 61 chunks for my-docs/BOE-A-2023-2632.pdf.json.
Uploading 33 chunks for my-docs/BOE-A-2023-2631.pdf.json.
Uploading 11 chunks for my-docs/BOE-A-2023-2630.pdf.json.
Uploading 2 chunks for my-docs/BOE-A-2023-2626.pdf.json.
Uploading 2 chunks for my-docs/BOE-A-2023-2627.pdf.json.
[{'meta': {'count': 257}}]


# LangChain RAG Application

In [None]:
from langchain.llms import LlamaCpp
from langchain.vectorstores.weaviate import Weaviate
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.prompts import PromptTemplate

# Instantiate Local Llama 2 LLM

In [None]:
# Build the local LLM. The stdout streaming handler is attached through the
# callback manager so generated text is echoed while the model runs.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
n_gpu_layers = 1  # Metal set to 1 is enough.
n_batch = 100  # Should be between 1 and n_ctx, consider the amount of RAM of your Apple Silicon Chip.
# Make sure the model path is correct for your system!
llm = LlamaCpp(
    model_path="model_files/llama-2-7b-chat.Q4_K_S.gguf",
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    n_ctx=2048, # context window. By default 512
    f16_kv=True,  # MUST set to True, otherwise you will run into problem after a couple of calls
    callback_manager=callback_manager,
    verbose=True, # Verbose is required to pass to the callback manager
)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from model_files/llama-2-7b-chat.Q4_K_S.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q5_K     [ 11008,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q4_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q4_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q4_K     [  4096,  4096,   

# Helper Function to Run RAG Process

In [None]:
def question_answer(question: str, vectorstore: Weaviate):
    """Answer ``question`` with the local LLM using retrieved context.

    The question is embedded with ``compute_embedding``, similar documents
    are fetched with the vector store's max-marginal-relevance search, and
    their text is inserted into a Spanish prompt that is sent to the
    module-level ``llm``.

    Args:
        question: The user's question.
        vectorstore: LangChain Weaviate vector store to search.

    Returns:
        A ``(answer, similar_docs)`` pair: the LLM's reply and the documents
        that were used as context.
    """
    embedding = compute_embedding(question)
    similar_docs = vectorstore.max_marginal_relevance_search_by_vector(embedding)
    # Join the passages into plain text. Formatting the list itself would put
    # its Python repr (brackets, quotes, escaped newlines) into the prompt.
    context = "\n\n".join(doc.page_content for doc in similar_docs)
    prompt_template = PromptTemplate.from_template(
    """\
    Dado el contexto sobre el tema, responde a la pregunta basándote en el contexto proporcionado de la mejor manera posible.
    Contexto: {context}
    Pregunta:
    {question}
    Respuesta:
    """
    )
    prompt = prompt_template.format(context=context, question=question)
    answer = llm(prompt)
    return answer, similar_docs

# Run RAG on a Question

In [None]:
# Open a client and wrap class "Doc" (with "text" as the third argument,
# presumably the text key) in a LangChain vector store.
client = weaviate.Client(weaviate_url)
vectorstore = Weaviate(client, "Doc", "text")

question = "¿En que BOE se propone la creación de la escuela El Cornetí?"

# Retrieve context and generate the answer.
answer, similar_docs = question_answer(question, vectorstore)

# Report the query, the answer, and each retrieved document in turn.
print("\n\n\n-------------------------")
print(f"QUERY: {question}")
print("\n\n\n-------------------------")
print(f"Answer: {answer}")
print("\n\n\n-------------------------")
for index, result in enumerate(similar_docs):
    print(f"\n\n-- RESULT {index+1}:\n")
    print(result)

Llama.generate: prefix-match hit


 La pregunta es incorrecta, ya que en el contexto proporcionado no se menciona la creación de una escuela llamada "El Cornetín". En realidad, se trata de un documento oficial del Estado español que se refiere a la transferencia de la Dependencia de Sanidad y Política Social de Córdoba a la localidad de Motril, haciéndola depender de la Subdelegación del Gobierno en Granada. Por lo tanto, no hay relación alguna con la creación de una escuela llamada "El Cornetín".


-------------------------
QUERY: ¿En que BOE se propone la creación de la escuela El Cornetí?



-------------------------
Answer:  La pregunta es incorrecta, ya que en el contexto proporcionado no se menciona la creación de una escuela llamada "El Cornetín". En realidad, se trata de un documento oficial del Estado español que se refiere a la transferencia de la Dependencia de Sanidad y Política Social de Córdoba a la localidad de Motril, haciéndola depender de la Subdelegación del Gobierno en Granada. Por lo tanto, no hay r


llama_print_timings:        load time =  4920.52 ms
llama_print_timings:      sample time =    91.52 ms /   129 runs   (    0.71 ms per token,  1409.53 tokens per second)
llama_print_timings: prompt eval time =  2616.66 ms /   765 tokens (    3.42 ms per token,   292.36 tokens per second)
llama_print_timings:        eval time =  2632.12 ms /   128 runs   (   20.56 ms per token,    48.63 tokens per second)
llama_print_timings:       total time =  5511.17 ms
