In [None]:
import re
import os
import base64
from loguru import logger
from core import loguru_logger
from langchain_text_splitters import MarkdownHeaderTextSplitter

loguru_logger.setup()
with open("./arxiv/a-survey-to-transformers.md", "r") as file:
    md = file.read()

[32m2025-09-22 15:36:10.256[0m | [34m[1mDEBUG   [0m | [35mPID:771781[0m | [36mcore.logging:setup:44[0m - [34m[1mLoguru logger intialized[0m


[32m2025-09-22 15:36:10.392[0m | [34m[1mDEBUG   [0m | [35mPID:771781[0m | [36m__main__:<module>:21[0m - [34m[1mProcessing chunk 2 of 27[0m
[32m2025-09-22 15:36:10.392[0m | [34m[1mDEBUG   [0m | [35mPID:771781[0m | [36m__main__:<module>:24[0m - [34m[1mFound 0 images in chunk 1, split 1[0m
[32m2025-09-22 15:36:10.393[0m | [34m[1mDEBUG   [0m | [35mPID:771781[0m | [36m__main__:<module>:27[0m - [34m[1mAssigning doc_id e8c626c8-60d4-4217-a3f3-dd8b7a891276 to split 1[0m
[32m2025-09-22 15:36:17.767[0m | [34m[1mDEBUG   [0m | [35mPID:771781[0m | [36mutils.metadata:generate_text_metadata:87[0m - [34m[1mResponse: {
    "summary": "This survey paper, authored by Tianyang Lin, Yuxin Wang, Xiangyang Liu, and Xipeng Qiu, originates from the School of Computer Science at Fudan University in China.",
    "keywords": ["Transformer", "survey", "deep learning", "artificial intelligence"],
    "entities": ["Fudan University", "China"],
    "key_objects": ["autho

### Locate Images and Save Locally

In [None]:
image_pattern = r"!\[Image\]\((data:image/[a-zA-Z]+;base64,[^)]*?)\)"
image_matches = re.findall(image_pattern, md)

image_folder = "./images/a-survey-to-transformers"

os.makedirs(image_folder, exist_ok=True)

if image_matches:
    for idx, image in enumerate(image_matches, start=1):
        file_name = f"{image_folder}/image_{idx}.png"
        with open(file_name, "wb") as img_file:
            base64_code = base64.b64decode(image.split(",")[1])
            img_file.write(base64_code)
            md = md.replace(image, f"{file_name}")

    with open(f"./updated_markdown.md", "w") as updated_md_file:
        updated_md_file.write(md)

### Split Text by Headers

In [None]:
headers_to_split_on = [
    ("#", "h1"),
    ("##", "h2"),
    ("###", "h3"),
    ("####", "h4"),
]

markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on,
    strip_headers=False
)

with open("./updated_markdown.md", "r") as file:
    updated_markdown_file = file.read()

md_header_splits = markdown_splitter.split_text(updated_markdown_file)

In [7]:
from uuid import uuid4
from langchain_core.documents import Document
from schemas import ImageMetadata, TextMetadata
from utils import generate_image_metadata, generate_text_metadata
from langchain_text_splitters import RecursiveCharacterTextSplitter

chunk_size = 1024
chunk_overlap = 200

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separators=["\n\n", "\n", " ", ""]
)

docs = []
failed_text_docs = []
img_pattern = r"!\[[^\]]*\]\(([^)]+\.(?:png|jpg|jpeg|gif))\)"
for chunk_idx, chunk in enumerate(md_header_splits, start=1):
    splits = text_splitter.split_text(chunk.page_content)
    logger.debug(f"Processing chunk {chunk_idx + 1} of {len(md_header_splits)}")
    for split_idx, split in enumerate(splits, start=1):
        matches = re.findall(img_pattern, split)
        logger.debug(f"Found {len(matches)} images in chunk {chunk_idx}, split {split_idx}")
        split_no_images = re.sub(img_pattern, "", split)
        doc_id = str(uuid4())
        
        try:
            text_metadata: TextMetadata = generate_text_metadata(
                chunk_text=split_no_images,
                section_context=chunk
            )
            docs.append(
                Document(
                    metadata={
                        "doc_type": "text",
                        "doc_id": doc_id,
                        "section_hierarchy": {**chunk.metadata},
                        "mentioned_images": matches,
                        **text_metadata.model_dump()
                    },
                    page_content = (
                        f"Keywords: {', '.join(text_metadata.keywords)}\n"
                        f"Key Objects: {', '.join(text_metadata.key_objects)}\n"
                        f"Refers to Images: {', '.join(matches) if matches else 'None'}\n"
                        "Hypothetical Questions:\n"
                        f"- {'\n- '.join(text_metadata.hypothetical_questions)}\n"
                        "---\n"
                        f"Summary:\n{text_metadata.summary}\n"
                        f"Original Text:\n{split_no_images}\n"
                        f"Contextualized Text:\n{text_metadata.contextual_text}"
                    )
                )
            )
        except Exception as e:
            logger.error(f"Error processing document {doc_id}: {e}")
            failed_text_docs.append(
                {
                    "doc_id": doc_id,
                    "section_context": chunk,
                    "chunk_text": split_no_images
                }
            )
            continue
        
        for match in matches:
            image_metadata: ImageMetadata = generate_image_metadata(image_path=match)
            docs.append(
                Document(
                    metadata={
                        "doc_id": str(uuid4()),
                        "source": match,
                        "parent_doc_id": doc_id,
                        "doc_type": "image",
                        **image_metadata.model_dump()
                    },
                    page_content = (
                        f"Image title: {image_metadata.title}\n"
                        f"Tags: {', '.join(image_metadata.tags)}\n"
                        f"Key objects: {', '.join(image_metadata.key_objects)}\n"
                        "---\n"
                        f"Summary:\n{image_metadata.summary}\n"
                        f"Full description:\n{image_metadata.contextual_description}\n"
                        f"Text found in image:\n- {'\n- '.join(image_metadata.text_in_image)}"
                    )
                )
            )

print(f"Total chunks created: {len(docs)}")

Total chunks created: 202


In [10]:
failed_text_docs

[{'doc_id': 'ecd4163c-ab35-4825-ac88-31d5a2a0c5ea',
  'section_context': Document(metadata={'h1': 'A Survey of Transformers', 'h2': 'REFERENCES'}, page_content="## REFERENCES  \n- [1] Joshua Ainslie, Santiago Ontanon, Chris Albert, Vaclav Cvicek, Zachary Fisher, Philip Pham, Anirudh Ravula, Sumit Sanghai, Qifan Wang, and Li Yang, 2020. ETC: Encoding Long and Structured Inputs in Transformers. In Proceedings of EMNLP. Online, 268-284. https://doi.org/10.16853/v1.2020.empIm-main.19\n- [2] Rami Al-Rfou, Dokook Choe, Noah Constant, Mandy Guo, and Llion Jones. 2019. Character-Level Language Modeling with Deeper Self-Attention. In Proceedings of AAAI: 3159-3166. https://doi.org/10.1609/aaai.v33i1.30313159\n- [3] Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lu'ci'e, and Cordelia Schmid. 2021. ViViT: A Video Vision Transformer. arXiv:2103.15691 [cs.CV]\n- [4] Lei Jimmy Ba, Jamie Ryan Kiros, and Geoffrey E. Hinton. 2016. Layer Normalization. CoRR abs/1607.06450 (2016). arXiv:1

In [8]:
from langchain_postgres import PGVector
from langchain_ollama import OllamaEmbeddings

embeddings = OllamaEmbeddings(model="embeddinggemma:latest")
connection = "postgresql+psycopg://langchain:langchain@localhost:6024/rag"
collection_name = "arxiv"

vector_store = PGVector(
    embeddings,
    collection_name=collection_name,
    connection=connection,
    use_jsonb=True
)

vector_store.add_documents(docs, ids=[doc.metadata['doc_id'] for doc in docs])

['f8fc7437-d4a2-4451-af76-3e85a1f518ff',
 '4f73b69a-54f8-4e7b-891a-26a7efcec584',
 '19bdc731-d305-40f4-af5d-5a346a51b569',
 '56da03aa-4b2f-41e0-86c7-1c6b14897f61',
 'a395f039-ef52-4e79-a966-dd6d32b27e53',
 'cd915ba5-f84e-4344-b4a9-679dbfe07d50',
 'c1b691c5-b7d2-475c-bb8f-9d50cf8800e9',
 'e82d6260-748b-419c-82c7-2fb4bbaa73fc',
 '0f6724ab-5273-4fe5-9ec9-b996cfa79f37',
 'f57867ae-0b3c-4381-b38b-d1641e9db715',
 '121fda60-1994-4388-a82d-526129b3bde4',
 '94a2f5d5-d379-4ab5-aa00-9d89bda364e3',
 '7a97f798-8aa4-452a-8aaa-862c64853369',
 '647680ad-93e2-4763-95a6-fbdaed1ece29',
 '3f6e312c-fe14-4573-800b-49a22d6e2cf6',
 '02e43c44-3cfb-4ac3-afdd-fbc4a8d146a0',
 '52385e10-625c-41a6-bdd9-285d4ab72a95',
 'a0215f88-a7d0-41d9-b764-a72263c40428',
 '2a23e58f-c063-4e5d-9ea4-50e9ca4f2875',
 '2d2a6127-47b2-4284-835d-fe49ff42317b',
 '584f9d5d-3b55-42f5-bfc7-39ac2c43e164',
 'e34e6ac8-b609-4905-b98c-397adfe988b7',
 'aa31128e-af9e-4666-947c-f92296fa4119',
 'a15ddce5-2631-44db-8950-9cc957003396',
 '58eeb957-3ec1-

In [None]:
results = vector_store.similarity_search(
    "gradient descent",
    k=10,
    filter={"doc_type": {"$eq": "text"}},
)

for index, doc in enumerate(results):
    print(f"* {index}. {doc.page_content} [{doc.metadata}]")
    print("----------------------------------------------------")