In [2]:
# ingestion_app.py

# Import required modules
#from config import *
from llama_index.embeddings.text_embeddings_inference import TextEmbeddingsInference
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, StorageContext, Settings
from llama_index.core.node_parser import SimpleNodeParser, MarkdownNodeParser
from llama_index.vector_stores.weaviate import WeaviateVectorStore
import weaviate
import os
import re
from tqdm import tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# ---------------------------------------------------------------------------
# Notebook configuration: every tunable constant lives in this cell.
# ---------------------------------------------------------------------------
# Disabled workaround kept for reference: EMBEDDING_DIM = 384
# (author's note: Weaviate bug, the embedding dimension had to be set for the vector store)

# LLM (Ollama) settings. None of the ingestion cells shown in this notebook read them.
LLM_BASE_URL = '***REMOVED***'
LLM_MODEL_NAME = 'llama3'
LLM_KEEP_ALIVE = 0
LLM_REQUEST_TIMEOUT = 60
LLM_TEMPERATURE = 0 # alternative noted by the author: 0.75

# Text-embeddings-inference settings, consumed by setup_embedding_model().
EMBED_BASE_URL = "***REMOVED***"
EMBED_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
EMBED_TIMEOUT = 60  # presumably seconds -- TODO confirm against the client
EMBED_BATCH_SIZE = 10

# Chunking settings; only used when the non-markdown SimpleNodeParser is selected.
CHUNK_SIZE = 1024
CHUNK_OVERLAP = 20

# Base URL of the Weaviate instance.
WEAVIATE_URL = 'http://localhost:8080/'

# Directory walked for input files, and the Weaviate collection/index the nodes are written to.
INPUT_DIR = "expert_docs/context_docs_Assembly_1/oficial_docs/transformations01"
INDEX_NAME = "Documents_acc01"

In [4]:
def setup_embedding_model():
    """Build the embeddings client from the EMBED_* configuration constants.

    Returns:
        TextEmbeddingsInference: client created with the configured model name,
        base URL, timeout and embedding batch size.
    """
    embedding_options = {
        "model_name": EMBED_MODEL_NAME,
        "base_url": EMBED_BASE_URL,
        "timeout": EMBED_TIMEOUT,
        "embed_batch_size": EMBED_BATCH_SIZE,
    }
    return TextEmbeddingsInference(**embedding_options)

def get_input_directory(base_dir, input_dir):
    """Resolve ``input_dir`` against ``base_dir`` and make sure it is a directory.

    Args:
        base_dir: Directory that ``input_dir`` is relative to.
        input_dir: Relative path of the input directory.

    Returns:
        str: ``os.path.join(base_dir, input_dir)``.

    Raises:
        FileNotFoundError: If the joined path does not exist or is not a directory.
    """
    input_dir_path = os.path.join(base_dir, input_dir)
    # isdir (rather than exists) so that a regular file at this path is rejected too.
    if not os.path.isdir(input_dir_path):
        # Report the argument actually checked, not the module-level INPUT_DIR constant.
        raise FileNotFoundError(
            f"Error: El directorio {input_dir} no existe. Verifica la configuración."
        )
    print(f"El directorio {input_dir} está listo para ser utilizado.")
    return input_dir_path

def connect_to_weaviate():
    """Open a Weaviate client connection to the configured instance.

    The target URL is read from the ``WEAVIATE_URL`` environment variable and
    falls back to the ``WEAVIATE_URL`` constant from the configuration cell.
    Its host and port are passed on to the client so that the connection goes
    to the address that is printed (previously the URL was only printed and
    the client always used its built-in defaults).

    Returns:
        The client object returned by ``weaviate.connect_to_local``.
    """
    from urllib.parse import urlparse

    weaviate_url = os.getenv('WEAVIATE_URL', WEAVIATE_URL)
    parsed_url = urlparse(weaviate_url)
    # Fall back to the previous implicit target when the URL omits host or port.
    host = parsed_url.hostname or "localhost"
    port = parsed_url.port or 8080
    print(f"Conectando a Weaviate en {weaviate_url}")
    # NOTE(review): the gRPC port is left at the client default -- confirm it matches the deployment.
    return weaviate.connect_to_local(host=host, port=port)

def get_file_paths(directory):
    """List every file found under ``directory``, descending into subdirectories.

    Args:
        directory: Root directory to walk.

    Returns:
        list[str]: One path per file, each built by joining the directory being
        visited with the file name, in ``os.walk`` order.
    """
    return [
        os.path.join(current_dir, filename)
        for current_dir, _subdirs, filenames in os.walk(directory)
        for filename in filenames
    ]

def read_documents(input, type='dir'):
    """Load documents from a directory or from an explicit list of files.

    Args:
        input: A directory path when ``type == 'dir'`` (read recursively), or a
            list of file paths when ``type == 'files'``.
        type: Either ``'dir'`` or ``'files'``.

    Returns:
        The documents returned by ``SimpleDirectoryReader.load_data()``, or
        ``None`` when loading fails or ``type`` is not supported. The error is
        printed rather than raised, so callers must check for ``None``.
    """
    try:
        if type == 'dir':
            reader = SimpleDirectoryReader(input_dir=input, recursive=True)
        elif type == 'files':
            reader = SimpleDirectoryReader(input_files=input)
        else:
            # Name the real problem instead of failing later on `None.load_data()`.
            raise ValueError(f"tipo de entrada no soportado: {type!r} (use 'dir' o 'files')")
        return reader.load_data()
    except Exception as e:
        # Best-effort by design: report the problem and signal failure with None.
        print(f"Error al leer los documentos: {e}")
        return None


def _extract_field(text, field_name):
    """Return the trimmed value of the first ``field_name: value`` line, or None."""
    # `[ \t]*` (not `\s*`) keeps the match on one line, so an empty field does not
    # swallow the following line as its value.
    match = re.search(rf'{re.escape(field_name)}:[ \t]*(.*)', text)
    if match is None:
        return None
    # `.` matches '\r', so files with CRLF line endings would otherwise leak a
    # trailing carriage return into the stored metadata.
    value = match.group(1).strip()
    return value or None


def extract_variables(text):
    """Extract the title, author and keywords from a document's metadata header.

    The header is expected to look like::

        ---
        title: ...
        author: ...; ...; ...
        keywords: ...; ...; ...
        ---

    Args:
        text: Text containing the metadata header.

    Returns:
        tuple: ``(title, author, keywords)``. Each element is the field's value
        with surrounding whitespace removed, or ``None`` when the field is
        missing or empty.
    """
    title = _extract_field(text, 'title')
    author = _extract_field(text, 'author')
    keywords = _extract_field(text, 'keywords')

    return title, author, keywords

In [5]:

def parse_documents(documents, parser_type='markdown'):
    """Split documents into nodes with the selected node parser.

    Args:
        documents: Documents to split.
        parser_type: ``'markdown'`` selects ``MarkdownNodeParser``; any other
            value selects ``SimpleNodeParser`` configured with ``CHUNK_SIZE``
            and ``CHUNK_OVERLAP``.

    Returns:
        The nodes produced by the parser's ``get_nodes_from_documents``.
    """
    use_markdown = parser_type == 'markdown'
    parser = (
        MarkdownNodeParser(include_metadata=True, include_prev_next_rel=True)
        if use_markdown
        else SimpleNodeParser.from_defaults(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
    )

    nodes = parser.get_nodes_from_documents(documents)
    print(f"Parsing process completed. Documents processed: {len(documents)}. Nodes generated: {len(nodes)}.")
    return nodes

def setup_vector_store_and_context(weaviate_client, index_name, embed_model):
    """Recreate the target collection and return a storage context bound to it.

    If a collection named ``index_name`` already exists it is deleted first,
    so every call starts from an empty collection.

    Args:
        weaviate_client: Connected Weaviate client.
        index_name: Name of the collection/index to (re)build.
        embed_model: Embedding model handed to the vector store.

    Returns:
        StorageContext: context created from the new ``WeaviateVectorStore``.
    """
    collections = weaviate_client.collections
    if collections.exists(index_name):
        print(f"Eliminando la colección existente: {index_name}")
        collections.delete(index_name)

    print(f"Configurando y construyendo el índice: {index_name}")
    store = WeaviateVectorStore(
        weaviate_client=weaviate_client,
        index_name=index_name,
        embed_model=embed_model,
        text_key="text",
    )
    return StorageContext.from_defaults(vector_store=store)



In [17]:
# Ingestion setup, run as top-level cell code. It was originally meant to be the
# body of an `execute_ingestion()` function, which is currently disabled:
#def execute_ingestion():

# 1. Create the embedding model and register it as the global LlamaIndex default.
embed_model = setup_embedding_model()
Settings.embed_model = embed_model

# 2. Input directory resolution is disabled here; INPUT_DIR is used as-is below.
#    (The lines rely on `__file__` -- presumably unavailable in a notebook kernel.)
#BASE_DIR = os.path.dirname(os.path.abspath(__file__))
#input_dir = get_input_directory(BASE_DIR, INPUT_DIR)

# 3. Open the Weaviate connection (closed again in a later cell).
weaviate_client = connect_to_weaviate()

# 4. Collect the input files one by one instead of loading the whole directory,
#    so that each file's metadata header can be handled separately later on.
# documents = read_documents(INPUT_DIR, type='dir')
file_paths = get_file_paths(INPUT_DIR)


Conectando a Weaviate en http://localhost:8080/


In [18]:
# Drop the target collection so the index is rebuilt from scratch.
# Uses INDEX_NAME rather than a hard-coded literal so this cell cannot drift
# from the collection the rest of the notebook writes to.
weaviate_client.collections.delete(INDEX_NAME)

In [36]:
#documents[:5]

In [37]:
# Sanity check: list the files that get_file_paths() collected from INPUT_DIR.
print(file_paths)

['expert_docs/context_docs_Assembly_1/oficial_docs/transformations01/accc_kit-informativo_agroalimentacion_v2_esp.md', 'expert_docs/context_docs_Assembly_1/oficial_docs/transformations01/accc_kit-informativo_energia_esp_custom.md']


In [9]:
# Parse every input file into nodes and tag each node with the title, author and
# keywords taken from that file's metadata header.
parser_type = 'markdown'
if parser_type == 'markdown':
    parser = MarkdownNodeParser()
else:
    parser = SimpleNodeParser.from_defaults(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

nodes = []
for file in file_paths:
    print(f"Procesando el documento: {file}")
    docs = read_documents([file], type='files')
    if not docs:
        # read_documents() returns None on failure; an empty result has no header
        # either. Skip the file instead of crashing on docs[0].
        print(f"No se pudo leer el documento: {file}. Se omite.")
        continue

    # The first document returned for a file carries the metadata header.
    header_text = docs[0].text
    title, author, keywords = extract_variables(header_text)

    # Show what was extracted so a malformed header is easy to spot.
    print("Title:", title)
    print("Author:", author)
    print("Keywords:", keywords)

    # Everything after the header document is actual content.
    body_docs = docs[1:]
    print(f"Documentos leídos: {len(body_docs)}")

    base_nodes = parser.get_nodes_from_documents(body_docs)
    for node in base_nodes:
        node.metadata["title"] = title
        node.metadata["author"] = author
        node.metadata["keywords"] = keywords

    # The former `i = idx` bookkeeping was unused and raised NameError whenever a
    # file produced no nodes, so it has been removed.
    nodes.extend(base_nodes)

print(f"Parsing process completed. Nodes generated: {len(nodes)}.")

Procesando el documento: expert_docs/context_docs_Assembly_1/oficial_docs/transformations01/accc_kit-informativo_agroalimentacion_v2_esp.md
Title: EL SISTEMA AGROALIMENTARIO EN CATALUÑA - ASAMBLEA CIUDADANA POR EL CLIMA DE CATALUÑA
Author: Departamento de Acción Climática, Alimentación y Agenda Rural; Reguant Fosas, Francesc; Rivera Ferré, Marta G.; Riu, Manel
Keywords: agroecología; sistema agroalimentario intensivo; sostenibilidad, emisiones
Documentos leídos: 75
Procesando el documento: expert_docs/context_docs_Assembly_1/oficial_docs/transformations01/accc_kit-informativo_energia_esp_custom.md
Title: LA TRANSICIÓN ENERGÉTICA EN CATALUNA - ASAMBLEA CIUDADANA POR EL CLIMA DE CATALUÑA
Author: Departamento de Acción Climática, Alimentación y Agenda Rural; Riba, Carles; Cugat, Xavier; Riu, Manel
Keywords: transición energética; descentralización; energías renovables; autoconsumo
Documentos leídos: 75
Parsing process completed. Nodes generated: 150.


In [10]:

# Preview only the first few nodes; displaying the full list floods the notebook output.
nodes[:3]

[TextNode(id_='8a17d50b-0ad2-4adf-a0b7-34331e9905a2', embedding=None, metadata={'file_path': 'expert_docs/context_docs_Assembly_1/oficial_docs/transformations01/accc_kit-informativo_agroalimentacion_v2_esp.md', 'file_name': 'accc_kit-informativo_agroalimentacion_v2_esp.md', 'file_type': 'text/markdown', 'file_size': 89766, 'creation_date': '2024-05-18', 'last_modified_date': '2024-05-18', 'title': 'EL SISTEMA AGROALIMENTARIO EN CATALUÑA - ASAMBLEA CIUDADANA POR EL CLIMA DE CATALUÑA', 'author': 'Departamento de Acción Climática, Alimentación y Agenda Rural; Reguant Fosas, Francesc; Rivera Ferré, Marta G.; Riu, Manel', 'keywords': 'agroecología; sistema agroalimentario intensivo; sostenibilidad, emisiones'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRela

In [14]:
# Create the vector store and storage context for INDEX_NAME.
# Note: setup_vector_store_and_context() deletes an existing collection with
# that name before creating the store, so this cell always rebuilds from scratch.
storage_context = setup_vector_store_and_context(weaviate_client, INDEX_NAME, embed_model)
# Build the index from the parsed nodes; show_progress=True displays a progress
# bar while embeddings are generated.
index = VectorStoreIndex(nodes, storage_context=storage_context, show_progress=True)
print(f"Índice {INDEX_NAME} construido con éxito.")


Configurando y construyendo el índice: Documents_md3


Generating embeddings:   0%|          | 0/150 [00:00<?, ?it/s]

Generating embeddings: 100%|██████████| 150/150 [00:11<00:00, 12.91it/s]


Índice Documents_md3 construido con éxito.


In [15]:
# Close the Weaviate client connection opened by connect_to_weaviate() now that ingestion is done.
weaviate_client.close()

---

In [12]:
# collections = weaviate_client.collections.get(INDEX_NAME)
# result = collections.query.fetch_object_by_id('1fac1a8e-4d45-4b80-b402-d233aa431ad2')
# 
# result

ObjectSingleReturn(uuid=_WeaviateUUIDInt('1fac1a8e-4d45-4b80-b402-d233aa431ad2'), metadata=MetadataSingleObjectReturn(creation_time=datetime.datetime(2024, 5, 18, 5, 58, 39, 129000, tzinfo=datetime.timezone.utc), last_update_time=datetime.datetime(2024, 5, 18, 5, 58, 39, 129000, tzinfo=datetime.timezone.utc), is_consistent=None), properties={'_node_type': 'TextNode', 'last_modified_date': '2024-05-18', 'file_size': 108650.0, 'document_id': UUID('e297396d-44c5-4369-adf4-fd8e790d451e'), 'relationships': None, 'ref_doc_id': 'e297396d-44c5-4369-adf4-fd8e790d451e', 'keywords': 'transición energética; descentralización; energías renovables; autoconsumo\r', 'doc_id': UUID('e297396d-44c5-4369-adf4-fd8e790d451e'), 'text': 'PROPUESTAS SOBRE APODERAMIENTO CIUDADANO', 'file_type': 'text/markdown', 'file_path': 'expert_docs/context_docs_Assembly_1/oficial_docs/transformations01/accc_kit-informativo_energia_esp_custom.md', 'creation_date': '2024-05-18', 'title': 'LA TRANSICIÓN ENERGÉTICA EN CATALUNA