# INGESTION
Este proceso se engarga de la ingesta de un sistema RAG o generación aumentada por recuperación.




In [1]:
!pip install PyPDF2 langchain faiss-cpu -q > /dev/null 2>&1

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import os
import pickle
import json
import numpy as np
import PyPDF2
import faiss
import re

In [3]:
def extract_text_from_pdf(pdf_path):
    """
    Extrae todo el texto de un archivo PDF y lo devuelve como una cadena concatenada.
    Parameters:
        pdf_path (str): Ruta completa al archivo PDF del cual se desea extraer el texto.
    Returns:
        str: Cadena de texto que contiene todo el contenido textual del PDF, con cada
             página separada por un salto de línea.
    """
    text = ""
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            text += page.extract_text() + "\n"
    return text


In [4]:
text = extract_text_from_pdf("ley_educacion.pdf")

In [5]:
def clean_legal_text(text):
    """
    Limpia texto eliminando pies de página e información de paginación específicos.
    Parameters:
        text (str): Texto legal crudo extraído de un documento PDF
    Returns:
        str: Texto limpio sin la información de pies de página y paginación
    """
    footer_pattern = r'LEY ORGÁNICA DE EDUCACIÓN SUPERIOR, LOES\s*-\s*Página\s*\d+'
    finder_pattern = r'FINDER LOYAL\s*-\s*www\.lexis\.com\.ec?'
    combined_pattern = r'(?:' + footer_pattern + r'[\s\r\n]*' + finder_pattern + r'|' + finder_pattern + r'[\s\r\n]*' + footer_pattern + r')'
    cleaned_text = re.sub(combined_pattern, '', text, flags=re.IGNORECASE)

    return cleaned_text

In [6]:
cleanText = clean_legal_text(text)

In [7]:
def separate_articles(text):
  """
  Separa un texto legal en artículos individuales, excluyendo disposiciones generales,
  transitorias y finales.
  Parametro:
      text (str): Texto legal completo que contiene artículos numerados
  Return:
      list: Lista de strings donde cada elemento es un artículo completo con su contenido
  """
  secciones = re.split(r'(?=DISPOSICIONES GENERALES\n|DISPOSICIONES TRANSITORIAS\n|DISPOSICIONES FINALES\n)',
                  text, flags=re.IGNORECASE)
  partes = re.split(r'(Art\. \d+(?:\.\d+)*\.-)', secciones[0])
  articulos = []
  for i in range(1, len(partes), 2):
      if i + 1 < len(partes):
          articulo_completo = partes[i] + partes[i+1]
          articulos.append(articulo_completo.strip())
  return articulos

In [8]:
def chunk_legal_text(text, chunk_size=1024, chunk_overlap=150):
    """
    Crea chunks, preservando la estructura de artículos.
    Parametros:
        text (str): Texto legal completo a dividir
        chunk_size (int): Tamaño máximo de cada fragmento en caracteres
        chunk_overlap (int): Solapamiento entre fragmentos consecutivos en caracteres
    Returns:
        list: Lista de fragmentos de texto, donde cada artículo se mantiene intacto si es pequeño
              o se divide en chunks superpuestos si es demasiado grande
    """
    articulos = separate_articles(text)

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )

    chunks_finales = []
    for articulo in articulos:
        if len(articulo) > chunk_size:
            # Divide el artículo en chunks pequeños
            chunks = splitter.split_text(articulo)
            chunks_finales.extend(chunks)
        else:
            chunks_finales.append(articulo)
    return chunks_finales

In [21]:
print("Dividiendo en chunks...")
chunk_size = 1024
chunk_overlap=150
chunks = chunk_legal_text(cleanText, chunk_size, chunk_overlap)

Dividiendo en chunks...


In [10]:
# Mostrar resultados
for i, articulo in enumerate(chunks[:5], 1):
    print(f"=== CHUNK {i} ===")
    print(articulo)
    print("\n" + "="*60 + "\n")

=== CHUNK 1 ===
Art. 1.-Ámbito.-Esta Ley regula el sistema de educación superior en el país, a los organismos e
instituciones que lo integran; determina derechos, deberes y obligaciones de las personas naturales
y jurídicas, y establece las respectivas sanciones por el incumplimiento de las disposiciones
contenidas en la Constitución y la presente Ley.


=== CHUNK 2 ===
Art. 2.-Objeto.-Esta Ley tiene como objeto definir sus principios, garantizar el derecho a la educación
superior de calidad que propenda a la excelencia interculturalidad, al acceso universal, permanencia,
movilidad y egreso sin discriminación alguna y con gratuidad en el ámbito público hasta el tercer
nivel.
Nota: Artículo reformado por artículo 2 de Ley No. 0, publicada en Registro Oficial Suplemento 297
de 2 de Agosto del 2018 .
Concordancias:
CONSTITUCIÓN DE LA REPÚBLICA DEL ECUADOR, Arts. 11, 346
CAPÍTULO 2
FINES DE LA EDUCACIÓN SUPERIOR


=== CHUNK 3 ===
Art. 3.-Fines de la Educación Superior.-La educación superio

In [11]:

#Función para generar embeddings con Jina V3
def generate_embeddings(chunks):
    """
    Genera embeddings vectoriales usando el modelo Jina Embeddings V3.
    Parámetros:
        chunks (list): Lista de chunks a convertir en embeddings
    Retorna:
        tuple: Tupla que contiene:
            - embeddings: Array con los vectores de embeddings generados
            - model: Instancia del modelo SentenceTransformer utilizado
    """
    model = SentenceTransformer(
        "jinaai/jina-embeddings-v3",
        trust_remote_code=True
    )

    embeddings = model.encode(
        chunks,
        task="retrieval.passage",  # Para chunks de documentos
        show_progress_bar=True,
        batch_size=32
    )
    return embeddings, model

In [12]:
# Generar embeddings
print("Generando embeddings...")
embeddings, model = generate_embeddings(chunks)

Generando embeddings...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/378 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/464 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

custom_st.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-embeddings-v3:
- custom_st.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


config.json: 0.00B [00:00, ?B/s]

configuration_xlm_roberta.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- configuration_xlm_roberta.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_lora.py: 0.00B [00:00, ?B/s]

modeling_xlm_roberta.py: 0.00B [00:00, ?B/s]

mlp.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- mlp.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


embedding.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- embedding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


xlm_padding.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- xlm_padding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


rotary.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- rotary.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


block.py: 0.00B [00:00, ?B/s]

stochastic_depth.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- stochastic_depth.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


mha.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- mha.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- block.py
- stochastic_depth.py
- mha.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- modeling_xlm_roberta.py
- mlp.py
- embedding.py
- xlm_padding.py
- rotary.py
- block.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downl

model.safetensors:   0%|          | 0.00/1.14G [00:00<?, ?B/s]



tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/192 [00:00<?, ?B/s]

Batches:   0%|          | 0/12 [00:00<?, ?it/s]

In [13]:
# 5. Creación del índice FAISS
def create_faiss_index(embeddings):
    """
    Crea un índice FAISS para almacenar y buscar embeddings de manera eficiente.
    Parámetros:
        embeddings: Array numpy con los vectores de embeddings a indexar
    Retorna:
        index: Índice FAISS configurado con los embeddings proporcionados
    """
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings.astype('float32'))
    return index

In [14]:
print("Creando índice FAISS...")
index = create_faiss_index(embeddings)

Creando índice FAISS...


In [43]:

# Modifica la función de guardado para almacenamiento local
def save_locally(chunks, embeddings, index, model, base_path="."):
    """Guarda la base de datos RAG localmente"""
    os.makedirs(base_path, exist_ok=True)
    # Guardar chunks
    with open(f"{base_path}/chunks.pkl", "wb") as f:
        pickle.dump(chunks, f)

    # Guardar embeddings
    np.save(f"{base_path}/embeddings.npy", embeddings)

    faiss.write_index(index, f"{base_path}/faiss_index.bin")

    metadata = {
        "total_chunks": len(chunks),
        "embedding_dimension": embeddings.shape[1],
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "model_name": "jinaai/jina-embeddings-v3"
    }

    with open(f"{base_path}/metadata.json", "w", encoding="utf-8") as f:
        json.dump(metadata, f, ensure_ascii=False)
    print(f"Base de datos guardada en: {base_path}/")

# Función para cargar localmente
def load_local(base_path="rag_database"):
    """Carga la base de datos RAG desde almacenamiento local"""

    with open(f"{base_path}/chunks.pkl", "rb") as f:
        chunks = pickle.load(f)

    embeddings = np.load(f"{base_path}/embeddings.npy")

    index = faiss.read_index(f"{base_path}/faiss_index.bin")

    with open(f"{base_path}/metadata.json", "r", encoding="utf-8") as f:
        metadata = json.load(f)
    model = SentenceTransformer("jinaai/jina-embeddings-v3", trust_remote_code=True)

    return chunks, embeddings, index, model, metadata


In [44]:
file_path = "/content"

In [45]:
print("Guardando localmente...")
save_locally(chunks, embeddings, index, file_path)

Guardando localmente...
Base de datos guardada en: ./


In [46]:
#CARGAR DATOS
chunks, embeddings, index, model, metadata = load_local(file_path)

