In [1]:
import json
import os

import boto3
from pgvector.sqlalchemy import Vector
from sqlalchemy import create_engine, Column, Integer, Text
from sqlalchemy.orm import sessionmaker, declarative_base


In [2]:
import boto3
# Shared Bedrock runtime client; embed_call() sends its invoke_model requests
# through this object.
bedrock_runtime = boto3.client('bedrock-runtime', region_name='us-east-1')

In [3]:
def embed_body(chunk_message: str):
    """Serialize the request payload for an embedding call.

    Args:
        chunk_message: Text to be embedded.

    Returns:
        A JSON string with a single ``inputText`` field holding the text.
    """
    payload = dict(inputText=chunk_message)
    return json.dumps(payload)

def embed_call(chunk_message: str, model_id: str = "amazon.titan-embed-text-v2:0"):
    """Request an embedding for ``chunk_message`` from Bedrock.

    Args:
        chunk_message: Text to embed.
        model_id: Bedrock model identifier to invoke. Defaults to the model
            this notebook was written against, so existing callers are
            unaffected.

    Returns:
        The decoded JSON response as a Python object. Callers in this
        notebook read the vector from its ``'embedding'`` key.

    Raises:
        Whatever ``bedrock_runtime.invoke_model`` raises (not handled here),
        and ``json.JSONDecodeError`` if the response body is not valid JSON.
    """
    response = bedrock_runtime.invoke_model(
        body=embed_body(chunk_message),
        modelId=model_id,
        contentType='application/json',
        accept='application/json'
    )
    # response['body'] is a stream-like object: read it fully, then decode.
    raw_payload = response['body'].read().decode('utf-8')
    return json.loads(raw_payload)


In [4]:
# The connection string is read from the environment so that credentials never
# live in the notebook source or its version history.
# Example (shell):
#   export DATABASE_URL="postgresql://<user>:<password>@<host>:5432/<dbname>"
# NOTE(review): a password was previously committed in this cell; it should be
# rotated.
DATABASE_URL = os.environ.get("DATABASE_URL")
if not DATABASE_URL:
    raise RuntimeError(
        "DATABASE_URL environment variable is not set; "
        "export it before running this notebook."
    )

# connect_args is forwarded to the DBAPI driver's connect() call.
# NOTE(review): for libpq-based drivers connect_timeout is presumably in
# seconds, i.e. 20 minutes here - confirm that this is intended.
engine = create_engine(DATABASE_URL, connect_args={"connect_timeout": 1200})
# Session factory bound to the engine; call Session() to open a new session.
Session = sessionmaker(bind=engine)
# Declarative base class that the ORM models in this notebook inherit from.
Base = declarative_base()

In [5]:
class Fragmented(Base):
    """ORM model for one stored text fragment and its embedding vector.

    Rows are created by ``insert_fragment`` and ranked by vector distance in
    ``search_similar_fragments``.
    """
    __tablename__ = 'fragmented'
    # Integer primary key.
    id = Column(Integer, primary_key=True)
    # The fragment's text; required.
    text_content = Column(Text, nullable=False)
    # pgvector column with 1024 dimensions; required.
    # NOTE(review): 1024 presumably matches the output size of the embedding
    # model used by embed_call - confirm if the model is ever changed.
    embedding = Column(Vector(1024), nullable=False)  

# Create the tables registered on Base (here: 'fragmented') using the engine.
Base.metadata.create_all(engine)

In [6]:
def insert_fragment(text):
    """Embed ``text`` and persist it as a new ``Fragmented`` row.

    Args:
        text: Fragment text to embed and store.

    Raises:
        Any exception raised by the embedding call or by the database; on a
        database error the transaction is rolled back before re-raising.
    """
    # Embed before opening a session so a failed embedding call leaves
    # nothing to clean up.
    embedding = embed_call(text)['embedding']
    fragment = Fragmented(text_content=text, embedding=embedding)

    session = Session()
    try:
        session.add(fragment)
        session.commit()
    except Exception:
        # Leave no half-finished transaction behind.
        session.rollback()
        raise
    finally:
        # Always release the session, even when add/commit fails.
        session.close()

In [7]:
def search_similar_fragments(query_text, top_k=3):
    """Return the stored fragments closest to ``query_text``.

    The query text is embedded with ``embed_call`` and rows are ordered by
    L2 distance between their ``embedding`` column and the query embedding.

    Args:
        query_text: Text to search for.
        top_k: Maximum number of fragments to return (default 3).

    Returns:
        A list of ``Fragmented`` instances, nearest first. The session is
        closed before returning.
    """
    query_embedding = embed_call(query_text)['embedding']

    session = Session()
    try:
        return (
            session.query(Fragmented)
            .order_by(Fragmented.embedding.l2_distance(query_embedding))
            .limit(top_k)
            .all()
        )
    finally:
        # Release the session even if the query raises.
        session.close()

In [8]:
import fitz

def extraer_texto_pdf(pdf_path):
    """Extract the text of every page of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The concatenation of ``get_text()`` for each page, in page order,
        with no separator added between pages.
    """
    # The context manager closes the document even if a page fails to parse;
    # previously the document handle was never closed.
    with fitz.open(pdf_path) as doc:
        # join() avoids the quadratic cost of repeated string concatenation.
        return "".join(pagina.get_text() for pagina in doc)


In [9]:
def chunkear_texto(texto, tamano=300):
    """Split ``texto`` into chunks of at most ``tamano`` words.

    Words are obtained with ``str.split()`` (any run of whitespace is a
    separator) and re-joined with single spaces inside each chunk.

    Args:
        texto: Text to split.
        tamano: Maximum number of words per chunk; must be positive.

    Returns:
        A list of chunk strings in original order. Every chunk except
        possibly the last holds exactly ``tamano`` words. Empty or
        whitespace-only input yields an empty list.

    Raises:
        ValueError: If ``tamano`` is not a positive number.
    """
    # Fail early with a clear message instead of an obscure
    # ZeroDivisionError (tamano == 0) or a silently odd result (tamano < 0).
    if tamano <= 0:
        raise ValueError(f"tamano must be a positive integer, got {tamano!r}")

    palabras = texto.split()
    return [
        " ".join(palabras[inicio:inicio + tamano])
        for inicio in range(0, len(palabras), tamano)
    ]


In [20]:
def insertar_chunks_pdf(pdf_path, tamano=300):
    """Extract a PDF's text, split it into word chunks and store each chunk.

    Args:
        pdf_path: Path of the PDF file to ingest.
        tamano: Words per chunk, passed to ``chunkear_texto``. Defaults to
            300, the value that was previously hard-coded.

    Prints the number of chunks produced, then calls ``insert_fragment``
    once per chunk, in order.
    """
    texto = extraer_texto_pdf(pdf_path)
    chunks = chunkear_texto(texto, tamano=tamano)
    print(len(chunks))
    for chunk in chunks:
        insert_fragment(chunk)


In [21]:
# Entry point: ingest the PDF at the given path (extract text, chunk, embed
# and store each chunk via insertar_chunks_pdf).
if __name__ == "__main__":
    # Path is relative to the current working directory.
    pdf_path = "tdr_v4.pdf"
    insertar_chunks_pdf(pdf_path)



66
