In [1]:
import json
import os

import boto3
from pgvector.sqlalchemy import Vector
from sqlalchemy import create_engine, Column, Integer, Text
from sqlalchemy.orm import sessionmaker, declarative_base


In [2]:
import boto3
# Shared Bedrock runtime client; embed_call() uses it for every invoke_model request.
bedrock_runtime = boto3.client('bedrock-runtime', region_name='us-east-1')

In [3]:
def embed_body(chunk_message: str):
    """Serialize the request payload for the embedding model.

    Args:
        chunk_message: Text to be embedded.

    Returns:
        A JSON string holding a single ``inputText`` field with the text.
    """
    payload = {'inputText': chunk_message}
    return json.dumps(payload)

def embed_call(chunk_message: str):
    """Request an embedding for ``chunk_message`` from Bedrock.

    Sends the payload built by ``embed_body`` to the
    ``amazon.titan-embed-text-v2:0`` model through the module-level
    ``bedrock_runtime`` client.

    Args:
        chunk_message: Text to embed.

    Returns:
        The model response decoded from JSON. Callers in this notebook read
        its ``'embedding'`` entry.
    """
    raw_response = bedrock_runtime.invoke_model(
        modelId="amazon.titan-embed-text-v2:0",
        body=embed_body(chunk_message),
        contentType='application/json',
        accept='application/json',
    )
    # The response body is a stream of UTF-8 encoded JSON.
    payload_text = raw_response['body'].read().decode('utf-8')
    return json.loads(payload_text)


In [4]:
# The connection URL embeds the database user and password, so it must never
# live in the notebook itself. It is read from the DATABASE_URL environment
# variable instead, e.g.
#   export DATABASE_URL="postgresql://user:password@host:5432/dbname"
# NOTE(review): a password was previously committed in this cell; treat it as
# compromised and rotate it.
DATABASE_URL = os.environ.get("DATABASE_URL")
if not DATABASE_URL:
    raise RuntimeError(
        "DATABASE_URL is not set; export it as "
        "postgresql://user:password@host:5432/dbname before running this notebook"
    )

# connect_args is forwarded to the DBAPI driver's connect() call.
# NOTE(review): connect_timeout is presumably in seconds (1200 = 20 minutes);
# confirm such a long wait is intended.
engine = create_engine(DATABASE_URL, connect_args={"connect_timeout": 1200})

# Session factory bound to the engine, and the declarative base for ORM models.
Session = sessionmaker(bind=engine)
Base = declarative_base()

In [5]:
class Fragmented(Base):
    """ORM mapping for the ``fragmented`` table: one text fragment plus its embedding."""

    __tablename__ = 'fragmented'
    # Integer primary key.
    id = Column(Integer, primary_key=True)
    # The fragment's text; required.
    text_content = Column(Text, nullable=False)
    # pgvector column with 1024 dimensions; required.
    # NOTE(review): presumably matches the length of the 'embedding' list
    # returned by embed_call — confirm against the model's output size.
    embedding = Column(Vector(1024), nullable=False)  

# Create every table registered on Base (here: `fragmented`) using the engine.
Base.metadata.create_all(engine)

In [6]:
def insert_fragment(text):
    """Embed ``text`` and store it as a new row of the ``fragmented`` table.

    Args:
        text: The fragment to embed and persist.

    Raises:
        Any exception raised by ``embed_call`` or by the database layer is
        propagated. In either case no session is left open.
    """
    # Embed before opening a session so a failing Bedrock call cannot leak a
    # database connection.
    embedding = embed_call(text)['embedding']

    # The context manager closes the session (discarding any uncommitted
    # work) even when add() or commit() raises.
    with Session() as session:
        session.add(Fragmented(text_content=text, embedding=embedding))
        session.commit()

In [7]:
from sqlalchemy.sql import text

def search_similar_fragments(query_text, top_k=3):
    """Return the stored fragments most similar to ``query_text``.

    The query text is embedded with ``embed_call`` and compared against the
    ``embedding`` column of ``fragmented`` using pgvector's cosine-distance
    operator ``<=>``.

    Args:
        query_text: Text to search for.
        top_k: Maximum number of rows to return (default 3).

    Returns:
        A list of rows ``(id, text_content, similarity)``, most similar first,
        where ``similarity`` is ``1 - cosine distance``.

    Raises:
        Any exception raised by ``embed_call`` or by the database layer is
        propagated; the session is closed either way.
    """
    query_embedding = embed_call(query_text)['embedding']

    # pgvector's text form is '[v1,v2,...]'. It is sent as a bound parameter
    # and cast server-side rather than being spliced into the SQL string.
    embedding_literal = "[" + ",".join(map(str, query_embedding)) + "]"

    # pgvector has no cosine_similarity() function; `<=>` yields cosine
    # distance, so similarity is 1 - distance and ascending distance order is
    # the same as descending similarity.
    # CAST(... AS vector) is used instead of `::vector` so the bind-parameter
    # colon is not confused with the cast operator.
    query = text("""
        SELECT id,
               text_content,
               1 - (embedding <=> CAST(:query_embedding AS vector)) AS similarity
        FROM fragmented
        ORDER BY embedding <=> CAST(:query_embedding AS vector)
        LIMIT :top_k
    """)

    # The context manager closes the session even if the query fails.
    with Session() as session:
        return session.execute(
            query,
            {"query_embedding": embedding_literal, "top_k": top_k},
        ).fetchall()

In [8]:
import fitz

def extraer_texto_pdf(pdf_path):
    """Extract the text of every page of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The text of all pages concatenated in page order, with no separator
        added between pages.
    """
    # The context manager closes the document even if a page fails to parse;
    # join() avoids rebuilding the string once per page.
    with fitz.open(pdf_path) as doc:
        return "".join(pagina.get_text() for pagina in doc)


In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
def dividir_texto_con_logica(texto, chunk_size=500, chunk_overlap=20):
    """Split ``texto`` into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        texto: Text to split.
        chunk_size: Passed to the splitter as ``chunk_size`` (default 500).
        chunk_overlap: Passed to the splitter as ``chunk_overlap`` (default 20).

    Returns:
        The list of chunks produced by the splitter's ``split_text``.
    """
    opciones = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**opciones).split_text(texto)


In [10]:
import re

def limpiar_y_unir_lineas(texto):
    """Join the lines of ``texto`` into one whitespace-normalized line.

    A hyphen immediately followed by a newline is treated as a word broken
    across lines and both characters are removed. Every remaining run of
    whitespace, newlines included, becomes a single space, and the ends are
    trimmed.

    Args:
        texto: Raw text, typically extracted from a PDF.

    Returns:
        The cleaned text, containing no newlines.
    """
    # Re-join words hyphenated across a line break: "pala-\nbra" -> "palabra".
    texto = re.sub(r'-\n', '', texto)
    # Collapsing all whitespace already removes every newline, so no separate
    # newline-normalization pass is needed.
    texto = re.sub(r'\s+', ' ', texto)
    return texto.strip()


In [11]:
# Ingestion pipeline: read the PDF, split it into overlapping chunks, tidy
# each chunk, drop unwanted ones, and store the rest with their embeddings.
pdf_path = "tdr_v4.pdf"
texto = extraer_texto_pdf(pdf_path)
fragmentos = dividir_texto_con_logica(texto, chunk_size=1000, chunk_overlap=200)

# Replace line breaks inside each chunk with spaces and trim the edges.
cleaned_fragments = [trozo.replace("\n", " ").strip() for trozo in fragmentos]

# Skip chunks whose first 100 characters contain a run of nine dots
# (presumably table-of-contents leader lines — confirm).
filtered_fragments = [
    trozo for trozo in cleaned_fragments if trozo[:100].find(".........") == -1
]

# One embedding request and one insert per remaining chunk.
for chunk in filtered_fragments:
    insert_fragment(chunk)