In [1]:
from llama_index.embeddings.text_embeddings_inference import TextEmbeddingsInference
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, StorageContext, Settings
from llama_index.core.node_parser import SimpleNodeParser, MarkdownNodeParser
from llama_index.vector_stores.weaviate import WeaviateVectorStore
import weaviate
from weaviate.connect import ConnectionParams
import os
import re
from tqdm import tqdm
import yaml
import time

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from dotenv import load_dotenv
dotenv_path = '../config.env'
load_dotenv(dotenv_path=dotenv_path)

True

In [3]:
EMBED_MODEL_NAME = os.getenv('EMBED_MODEL_NAME')
EMBED_BASE_URL = os.getenv('EMBED_BASE_URL')
EMBED_TIMEOUT = int(os.getenv('EMBED_TIMEOUT'))
EMBED_BATCH_SIZE = int(os.getenv('EMBED_BATCH_SIZE'))

LLM_MODEL_NAME = os.getenv('LLM_MODEL_NAME')
LLM_KEEP_ALIVE = int(os.getenv('LLM_KEEP_ALIVE'))
LLM_REQUEST_TIMEOUT = int(os.getenv('LLM_REQUEST_TIMEOUT'))
LLM_TEMPERATURE = float(os.getenv('LLM_TEMPERATURE'))
LLM_BASE_URL = os.getenv('LLM_BASE_URL')

WEAVIATE_HOST = os.getenv('WEAVIATE_HOST')
WEAVIATE_PORT = '8090' #os.getenv('WEAVIATE_PORT')
WEAVIATE_GRPC_PORT = '50061' #os.getenv('WEAVIATE_GRPC_PORT')
WEAVIATE_URL = os.getenv('WEAVIATE_URL')

INDEX_NAME = os.getenv('INDEX_NAME')

CHUNK_SIZE = int(os.getenv('CHUNK_SIZE'))
CHUNK_OVERLAP = int(os.getenv('CHUNK_OVERLAP'))



In [4]:
CHUNK_SIZE

512

In [5]:
# Configuration for directories and index names
INPUT_DIR = "expert_docs/context_docs_Assembly_1/oficial_docs/transformations01"
INDEX_NAME = "Documents_acc01"


In [6]:
print("Weaviate URL: ", WEAVIATE_URL)

Weaviate URL:  http://weviate:8080/


In [7]:
# Function to set up the embeddings model
def setup_embedding_model():
    return TextEmbeddingsInference(
        model_name=EMBED_MODEL_NAME,
        base_url=EMBED_BASE_URL,
        timeout=EMBED_TIMEOUT, 
        embed_batch_size=EMBED_BATCH_SIZE
    )

# Function to check and return the input directory
def get_input_directory(base_dir, input_dir):
    input_dir_path = os.path.join(base_dir, input_dir)
    if os.path.exists(input_dir_path):
        print(f"El directorio {INPUT_DIR} está listo para ser utilizado.")
    else:
        raise FileNotFoundError(f"Error: El directorio {INPUT_DIR} no existe. Verifica la configuración.")
    return input_dir_path

# Function to connect to Weaviate
def connect_to_weaviate():
    print(f"Conectando a Weaviate en {WEAVIATE_URL}")
    #return weaviate.connect_to_local(host=WEAVIATE_HOST, port=WEAVIATE_PORT)
    return weaviate.connect_to_custom(
                http_host=WEAVIATE_HOST,
                http_port=WEAVIATE_PORT,
                http_secure=False,
                grpc_host="localhost", # WEAVIATE_HOST
                grpc_port=WEAVIATE_GRPC_PORT,
                grpc_secure=False,
            )

# Function to get file paths from a directory
def get_file_paths(directory):
    file_paths = []
    with os.scandir(directory) as entries:
        for entry in entries:
            # Verificar si el elemento es un fichero
            if entry.is_file():
                file_paths.append(entry.path)
    return file_paths

# Function to read documents from directory, or from a list of files
def read_documents(input, type='dir'):
    try:
        reader = None
        if type == 'dir':
            reader = SimpleDirectoryReader(input_dir=input, recursive=True)
        elif type == 'files':
            reader = SimpleDirectoryReader(input_files=input)
        return reader.load_data()
    except Exception as e:
        print(f"Error al leer los documentos: {e}")
        return None

def load_custom(code):
    file_path = f'../orchestator/custom_{code}.yml'
    with open(file_path, 'r', encoding='utf-8') as file:
        templates = yaml.safe_load(file)
    return templates

# Función para extraer los metadatos de un documento
def extract_variables(text, pathfile):
    # La estructura del encabezado con los metadatos debe ser:
    # ---
    # title: ...
    # author: ...; ...; ...
    # keywords: ...; ...; ...
    # --- Experimental para dos ficheros, todos tienen title y url en custom_ES_01.yml

    # Documents metadata
    custom_templates = load_custom('ES_01')
    dict_files = custom_templates['DICT_FILES']

    file = os.path.basename(pathfile)
    title_yml = dict_files.get(file, "").get('title')
    url_yml = dict_files.get(file, "").get('url')
    
    title_match = re.search(r'title:\s*(.*)', text)
    author_match = re.search(r'author:\s*(.*)', text)
    keywords_match = re.search(r'keywords:\s*(.*)', text)

    title = title_match.group(1) if title_match else title_yml
    author = author_match.group(1) if author_match else None
    keywords = keywords_match.group(1) if keywords_match else None
    url = url_yml
    
    return title, author, keywords, url

In [8]:

# Function to parse documents into nodes
def parse_documents(documents, parser_type='markdown'):
    if parser_type == 'markdown':
        parser = MarkdownNodeParser(include_metadata=True, include_prev_next_rel=True)
    else:
        parser = SimpleNodeParser.from_defaults(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
    
    nodes = parser.get_nodes_from_documents(documents)
    print(f"Parsing process completed. Documents processed: {len(documents)}. Nodes generated: {len(nodes)}.")
    return nodes

# Function to set up vector store and storage context
def setup_vector_store_and_context(weaviate_client, index_name, embed_model):
    #if weaviate_client.collections.exists(index_name):
    #    print(f"Eliminando la colección existente: {index_name}")
    #    weaviate_client.collections.delete(index_name)
    print(f"Configurando y construyendo el índice: {index_name}")
    vector_store = WeaviateVectorStore(weaviate_client=weaviate_client, 
                                       index_name=index_name, 
                                       embed_model=embed_model,
                                       text_key="text")
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    return storage_context



In [9]:
# Main function to execute the ingestion process
#def execute_ingestion():
# def execute_ingestion - Initialize models and settings
embed_model = setup_embedding_model()
Settings.embed_model = embed_model
# def execute_ingestion - Get input directory
#BASE_DIR = os.path.dirname(os.path.abspath(__file__))
#input_dir = get_input_directory(BASE_DIR, INPUT_DIR)
# def execute_ingestion - Connect to Weaviate
weaviate_client = connect_to_weaviate()
# def execute_ingestion - Read and parse documents
# documents = read_documents(INPUT_DIR, type='dir')
file_paths = get_file_paths(INPUT_DIR)


Conectando a Weaviate en http://weviate:8080/


In [10]:
print(file_paths)

['expert_docs/context_docs_Assembly_1/oficial_docs/transformations01/Ley del cambio climático 16.2017- 1 agosto - consolidado.md', 'expert_docs/context_docs_Assembly_1/oficial_docs/transformations01/Impactos y Riesgos Derivados del Cambio Climático en España.md', 'expert_docs/context_docs_Assembly_1/oficial_docs/transformations01/pnacc-2021-2030 Plan Nacional de Adaptación al Cambio Climático.md', 'expert_docs/context_docs_Assembly_1/oficial_docs/transformations01/Observatorio de Políticas Ambientales 2022-OPAM.md']


In [76]:
#weaviate_client.collections.delete("Documents_acc01")

In [11]:
CHUNK_SIZE

512

In [12]:
# Now, we will read the documents and parse them into nodes

#parser_type='markdown'
parser_type='simple'

if parser_type == 'markdown':
    parser = MarkdownNodeParser( )
else:
    parser = SimpleNodeParser.from_defaults(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

# Set up vector store and storage context
storage_context = setup_vector_store_and_context(weaviate_client, INDEX_NAME, embed_model)

nodes = []
i = 0
for file in file_paths:
    print(f"Procesando el documento: {file}")
    docs = read_documents([file], type='files')
    doc_metadata = docs[0].text
    # Obtain the metadata from the document
    title, author, keywords, url = extract_variables(doc_metadata, file)

    print("Title:", title)
    print("Author:", author)
    print("Keywords:", keywords)
    print("URL:", url)

    docs.pop(0) # Eliminar el primer elemento que contiene los metadatos
    print(f"Documentos leídos: {len(docs)}")

    # Parse the documents into nodes
    base_nodes = parser.get_nodes_from_documents(docs)

    # Add metadata to the nodes
    for idx, node in enumerate(base_nodes):
        #node.id_ = f"node-{idx+i}"
        #node.metadata[f"info_type"] = itchoice=random.choice(info_types)
        node.metadata[f"title"] = title
        node.metadata[f"author"] = author
        node.metadata[f"keywords"] = keywords
        node.metadata[f"url"] = url

    i = idx
    nodes.extend(base_nodes)

    # Build the index with parsed nodes
    try:
        index = VectorStoreIndex(nodes, storage_context=storage_context, show_progress=True)
        print(f"Documento {file} insertado en el índice {INDEX_NAME}.")
        # Move files to another directory
        os.rename(file, f"{INPUT_DIR}/ingested/{os.path.basename(file)}")
    except Exception as e:
        print(f"Error al insertar el documento {file} en el índice {INDEX_NAME}: {e}")


print(f"Parsing process completed. Nodes generated: {len(nodes)}.")

Configurando y construyendo el índice: Documents_acc01
Procesando el documento: expert_docs/context_docs_Assembly_1/oficial_docs/transformations01/Ley del cambio climático 16.2017- 1 agosto - consolidado.md
Title: Ley del cambio climático de Cataluña 16.2017, de 1 de agosto, consolidado
Author: None
Keywords: None
URL: https://www.boe.es/buscar/pdf/2017/BOE-A-2017-11001-consolidado.pdf
Documentos leídos: 3


Generating embeddings: 100%|██████████| 121/121 [00:07<00:00, 15.33it/s]


Documento expert_docs/context_docs_Assembly_1/oficial_docs/transformations01/Ley del cambio climático 16.2017- 1 agosto - consolidado.md insertado en el índice Documents_acc01.
Procesando el documento: expert_docs/context_docs_Assembly_1/oficial_docs/transformations01/Impactos y Riesgos Derivados del Cambio Climático en España.md
Title: Impactos y riesgos derivados del cambio climático en España 2021
Author: None
Keywords: None
URL: https://www.miteco.gob.es/es/ceneam/recursos/pag-web/impactos-cambio-climatico-espana.html
Documentos leídos: 115


Generating embeddings: 100%|██████████| 748/748 [00:48<00:00, 15.51it/s]


Documento expert_docs/context_docs_Assembly_1/oficial_docs/transformations01/Impactos y Riesgos Derivados del Cambio Climático en España.md insertado en el índice Documents_acc01.
Procesando el documento: expert_docs/context_docs_Assembly_1/oficial_docs/transformations01/pnacc-2021-2030 Plan Nacional de Adaptación al Cambio Climático.md
Title: Plan Nacional de Adaptación al Cambio Climático 2021-2030
Author: None
Keywords: None
URL: https://www.miteco.gob.es/content/dam/miteco/es/cambio-climatico/temas/impactos-vulnerabilidad-y-adaptacion/pnacc-2021-2030_tcm30-512163.pdf
Documentos leídos: 18


Generating embeddings: 100%|██████████| 1179/1179 [01:14<00:00, 15.73it/s]


Documento expert_docs/context_docs_Assembly_1/oficial_docs/transformations01/pnacc-2021-2030 Plan Nacional de Adaptación al Cambio Climático.md insertado en el índice Documents_acc01.
Procesando el documento: expert_docs/context_docs_Assembly_1/oficial_docs/transformations01/Observatorio de Políticas Ambientales 2022-OPAM.md
Title: Observatorio de Políticas Ambientales 2022
Author: None
Keywords: None
URL: https://transparencia.gob.es/transparencia/transparencia_Home/index/MasInformacion/Informes-de-interes/Medio_ambiente/ObservatorioPoliticasAmbientales2022.html
Documentos leídos: 12


Generating embeddings: 100%|██████████| 2048/2048 [02:11<00:00, 15.58it/s]
Generating embeddings: 100%|██████████| 1569/1569 [01:41<00:00, 15.45it/s]


Documento expert_docs/context_docs_Assembly_1/oficial_docs/transformations01/Observatorio de Políticas Ambientales 2022-OPAM.md insertado en el índice Documents_acc01.
Parsing process completed. Nodes generated: 3617.


In [31]:

nodes

[TextNode(id_='8158b163-4618-42cb-913c-88e5a795bcb9', embedding=None, metadata={'file_path': 'expert_docs/context_docs_Assembly_1/oficial_docs/transformations01/Observatorio de Políticas Ambientales 2022-OPAM.md', 'file_name': 'Observatorio de Políticas Ambientales 2022-OPAM.md', 'file_type': 'text/markdown', 'file_size': 3645184, 'creation_date': '2024-06-18', 'last_modified_date': '2024-06-18', 'title': 'Observatorio de Políticas Ambientales 2022', 'author': None, 'keywords': None, 'url': 'https://transparencia.gob.es/transparencia/transparencia_Home/index/MasInformacion/Informes-de-interes/Medio_ambiente/ObservatorioPoliticasAmbientales2022.html'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='c6ef5f96

In [None]:
# # Set up vector store and storage context
# storage_context = setup_vector_store_and_context(weaviate_client, INDEX_NAME, embed_model)
# # Build the index with parsed nodes
# index = VectorStoreIndex(nodes, storage_context=storage_context, show_progress=True)
# print(f"Índice {INDEX_NAME} construido con éxito.")


In [66]:
weaviate_client.close()  # Ensure the connection is closed

---