In [10]:
from IPython.display import IFrame
from docling.document_converter import DocumentConverter
import boto3
import os
from sdg_hub.core.flow import FlowRegistry
from sdg_hub.core.blocks import BlockRegistry
import pypdfium2 as pdfium
from langchain_openai import ChatOpenAI
from langchain_community.vectorstores import LanceDB
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain_community.graph_vectorstores import GraphVectorStoreRetriever
from langchain_core.documents import Document
from lancedb.rerankers import LinearCombinationReranker
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter, MarkdownHeaderTextSplitter
from langchain.docstore.document import Document
from langchain_core.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
import lancedb
from huggingface_hub import snapshot_download
from langchain_community.embeddings import HuggingFaceBgeEmbeddings, SentenceTransformerEmbeddings
from transformers import AutoTokenizer
from enum import Enum
import traceback
import re

In [None]:
# --- Object storage (S3 / MinIO) settings, all read from the environment ---
endpoint_url = os.getenv('AWS_S3_ENDPOINT')
access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')
# Signature V4 client configuration, reused by the boto3 client created below.
config = boto3.session.Config(signature_version='s3v4')
bucket = os.getenv("AWS_S3_BUCKET")

# Key prefix of the source PDFs inside the bucket, and the local working directories.
source_path = 'pdf/'
target_path = 'pdf'
target_path_chapters = 'pdf_chunked'
target_path_markdown = 'markdown'

# Language name interpolated into the LLM prompts.
CODE_LANGUAGE='ColdFusion'

# Embedding model handed to the vector store.
# NOTE(review): trust_remote_code=True allows code shipped with the model repo to run;
# confirm this model actually needs it.
embedding_model = SentenceTransformerEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    model_kwargs={"trust_remote_code": True},
)

# Chat model used for every text-generation request in this notebook.
llm = ChatOpenAI(
    model="openai/gpt-oss-20b", # os.getenv('QWEN25CODER_MODEL_ID'),
    api_key=os.getenv('OPENROUTER_TOKEN'),
    base_url=os.getenv('OPENROUTER_API_BASE'),
    temperature=0.1,
)

# LanceDB connection backed by the same S3 endpoint and credentials.
# NOTE(review): the bucket in this URI ("data") is hard-coded and independent of `bucket` - confirm.
vectorstore_connection = lancedb.connect(
    "s3://data/lancedb-graphrag",
    storage_options={
        "endpoint_url": endpoint_url,
        "aws_access_key_id": access_key_id,
        "aws_secret_access_key": secret_access_key,
        "s3_force_path_style": "true",
        "allow_http": "true",
    },
)

# Vector store wrapper; documents are appended to the existing table.
vectorstore = LanceDB(
    mode="append",
    embedding=embedding_model,
    connection=vectorstore_connection,
)

# S3 client used to download the source PDFs.
minio = boto3.client(
    's3',
    endpoint_url=endpoint_url,
    aws_access_key_id=access_key_id,
    aws_secret_access_key=secret_access_key,
    config=config,
)

In [None]:
# Download every object under `source_path` from the bucket into the local `target_path` directory.
try:
    # Create all local working directories up front, including the markdown output directory.
    for directory in (target_path, target_path_chapters, target_path_markdown):
        os.makedirs(directory, exist_ok=True)

    # A single list_objects_v2 call returns a limited page of keys; the paginator walks all pages.
    paginator = minio.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket, Prefix=source_path):
        for obj in page.get('Contents', []):
            key = obj['Key']
            filename = key.split('/')[-1]
            if not filename:
                # Keys ending in "/" are folder placeholders and have no file name to save under.
                continue
            local_path = f"{target_path}/{filename}"
            try:
                minio.download_file(bucket, key, local_path)
                print(f"File '{key}' downloaded successfully to {local_path}")
            except Exception as e:
                # One failed object must not prevent the remaining downloads.
                print(f"Error downloading file '{key}': {e}")
except Exception as e:
    print(f"Error downloading file: {e}")

In [None]:
def get_chapter_ranges(sourcefilename, do_print=True):
    """
    Returns a list of [beginPage, endPage] ranges for chunks that represent chapters in the given pdf.

    A bookmark opens a new chapter when it has a target page and is either a leaf
    bookmark at a level below 2 or any bookmark at level 2. Each chapter runs up to the
    page before the next chapter start; the final chapter runs to the last page of the
    document. Values are the bookmarks' `page_index` values (the printed output adds 1).

    Args:
        sourcefilename: Path of the pdf to inspect.
        do_print: When true, prints the table of contents together with the page
            boundaries that were derived from it.

    Returns:
        List of two-element lists `[begin, end]`, both inclusive.
    """
    print("Getting chapter ranges...\n")

    pdf = pdfium.PdfDocument(sourcefilename)
    ranges = []
    begin = None

    try:
        for item in pdf.get_toc():
            boundary = None
            starts_chapter = (item.n_kids == 0 and item.level < 2) or item.level == 2

            # Compare against None explicitly: page index 0 is a valid chapter start.
            if item.page_index is not None and starts_chapter:
                if begin is not None:
                    # max() keeps the range valid when two chapters start on the same page.
                    boundary = [begin, max(begin, item.page_index - 1)]
                    ranges.append(boundary)
                begin = item.page_index

            if do_print:
                state = "*" if item.n_kids == 0 else "-" if item.is_closed else "+"
                target = "?" if item.page_index is None else item.page_index+1
                if boundary:
                    print("    " * 2 +  f"(Pages {(boundary[0]+1)} - {(boundary[1]+1)})" + "\n")
                print(("    " * item.level) + f"[{state}] {item.title} -> {target}  # {item.view_mode} {item.view_pos}")

        # The last chapter has no following bookmark to close it; it ends with the document.
        if begin is not None:
            boundary = [begin, max(begin, len(pdf) - 1)]
            ranges.append(boundary)
            if do_print:
                print("    " * 2 +  f"(Pages {(boundary[0]+1)} - {(boundary[1]+1)})" + "\n")
    finally:
        pdf.close()

    return ranges

In [None]:
def split_chapters(sourcefilename, targetfilename, pagerange):
    """
    Copies one page range of the source pdf into a new pdf file.

    Args:
        sourcefilename: Path of the pdf to read pages from.
        targetfilename: Path the new pdf chunk is saved to.
        pagerange: Two-element sequence `(first, last)` of page indices, both inclusive.

    Returns:
        `targetfilename` when the chunk was written, otherwise None. Errors are
        printed rather than raised.
    """
    try:
        source_pdf = pdfium.PdfDocument(sourcefilename)
        try:
            new_pdf = pdfium.PdfDocument.new()
            try:
                print(f"Retrieving chapter...{targetfilename}, Pages {pagerange[0]} to {pagerange[1]}")
                new_pdf.import_pages(source_pdf, pages=list(range(pagerange[0], pagerange[1]+1)))
                new_pdf.save(targetfilename)
            finally:
                # Release both documents even when importing or saving fails.
                new_pdf.close()
        finally:
            source_pdf.close()
        return targetfilename

    except Exception as e:
        print(f"Error saving {targetfilename}: {e}")
        return None

In [None]:
def convert_to_markdown(pdffile, markdownfile):
    """
    Converts the pdf into a markdown file.

    Args:
        pdffile: Path of the pdf handed to the document converter.
        markdownfile: Path the exported markdown is written to. Its parent
            directory is created when it does not exist yet.

    Returns:
        None. Errors are printed rather than raised.
    """
    try:
        print(f"Converting {pdffile} to markdown...")

        converter = DocumentConverter()
        result = converter.convert(pdffile)
        markdown_output = result.document.export_to_markdown()

        # open() does not create missing directories, so make sure the target one exists.
        output_dir = os.path.dirname(markdownfile)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        with open(markdownfile, "w") as file:
            file.write(markdown_output)

        print(f"{markdownfile} generated.")

    except Exception as e:
        print(f"Error saving {markdownfile}: {e}")
    

In [None]:
class CodeType(Enum):
    """
    Labels for the kinds of entity relationships produced by this notebook.

    Each member's value is a short string label. The enum is not referenced
    elsewhere in the visible part of the notebook.
    """

    # Label for a markdown section.
    MARKDOWN_SECTION = "markdown-section"

    # Labels for the code-to-text relationships.
    CODE_TO_MARKDOWN = "markdown"
    CODE_TO_TOPICS = "topics"
    CODE_TO_KEYWORDS = "keywords"
    CODE_TO_TAGS = "tags"
    CODE_TO_FUNCTIONS = "functions"
    CODE_TO_STRUCTURE = "structure"
    CODE_TO_SUMMARY = "summary"

In [None]:
def strip_code_section(content):
    """
    Extracts the fenced code sections (text between ``` fences) of the content.

    Despite the name, nothing is removed from `content`; the matched sections are
    returned. Anything that follows the opening fence on the same line (for example a
    language tag) is part of the returned text.

    Args:
        content: Markdown text to search.

    Returns:
        List with the text of every fenced section, in order of appearance; empty
        when there is none.
    """
    # Non-greedy match up to the next fence, so a section may contain single backticks.
    # The lookahead keeps the section from starting with a backtick of a longer fence.
    return re.findall(r'```(?!`)(.+?)```', content, re.DOTALL)

def extract_keywords(content):
    """
    Extract keywords from the provided content.

    Sends the content to the LLM with a request for a comma-delimited keyword list and
    splits the answer.

    Args:
        content: Text to analyze. An object exposing a `content` attribute (such as a
            chat response) is accepted as well; its `content` is analyzed.

    Returns:
        List of non-empty, whitespace-trimmed keywords; an empty list when the
        request fails.
    """
    try:
        # Unwrap message-like objects so their text, not their repr, reaches the prompt.
        text = getattr(content, "content", content)
        code_task = """
        analyze this text and provide 10 one-word keywords that are connected to the text
        """
        code_instructions = """
            1.  **Format your response clearly and concisely** as a comma-delimited list.
        """
        responses = text_generation_tool([text], CODE_LANGUAGE, code_task, code_instructions)
        raw_keywords = responses[0].content.strip()
        # Drop empty entries produced by trailing or doubled commas.
        return [k.strip() for k in raw_keywords.split(',') if k.strip()]
    except Exception as e:
        print(f"Error while extracting keywords: {e}")
        traceback.print_exc()
        return []

def text_generation_tool(code_sections, code_language, code_task, code_instructions):
    """
    Sends one chat request per entry of `code_sections` and returns the batch of responses.

    The system message is built from `code_language`, `code_task` and
    `code_instructions`; the entry itself is sent as the human message.

    Args:
        code_sections: Iterable of texts to analyze, one request each.
        code_language: Language name inserted into the system message.
        code_task: Task description inserted into the system message.
        code_instructions: Instructions inserted into the system message.

    Returns:
        Whatever the chain's `batch` call returns for the prepared inputs.
    """
    system_template = """
    You are an expert software engineer with extensive experience in {code_language}.
    Your task is to {code_task}.
    **Instructions:**
    {code_instructions}
    **Text to analyze:**
    """
    messages = [
        SystemMessagePromptTemplate.from_template(system_template),
        HumanMessagePromptTemplate.from_template("{input}"),
    ]

    # Values shared by every request; only "input" differs per section.
    shared_variables = {
        "code_language": code_language,
        "code_task": code_task,
        "code_instructions": code_instructions,
    }
    batch_inputs = []
    for section in code_sections:
        batch_inputs.append({"input": section, **shared_variables})

    pipeline = ChatPromptTemplate.from_messages(messages) | llm
    return pipeline.batch(batch_inputs)
    
"""
1. Markdown sections
"""
def build_markdown_section(file):
    """
    Generates markdown section chunks from the file and stores them in the vector database.

    The file is split on level 1-3 markdown headers. When the file contains at least
    one fenced code section, every split is stored as a document, and for each split
    that contains code the code-to-text mappings are generated and stored as well.
    Files without any fenced code are not stored at all.

    Args:
        file: Path of the markdown file to process.

    Returns:
        None. Errors are printed (with traceback) rather than raised.
    """
    try:
        print(f"Parsing markdown {file}...")
        with open(file, mode="r") as f:
            filecontent = f.read()

        headers_to_split = [("#", "Header 1"), ("##", "Header 2"),("###", "Header 3")]
        text_splitter = MarkdownHeaderTextSplitter(headers_to_split, strip_headers=False)
        splits = text_splitter.split_text(filecontent)

        if not strip_code_section(filecontent):
            return

        # Each section gets a random "parent" value; the child documents built below
        # merge this metadata, so they carry the same value.
        docs = [Document(page_content=s.page_content,
                         metadata= {"source": file,
                                   "parent": os.urandom(36).hex()} | s.metadata | {"keywords": extract_keywords(s.page_content)} )
                for s in splits]
        print(f"Saving {len(docs)} docs...")
        vectorstore.add_documents(documents=docs)
        print(f"{file} markdown saved to db, starting code-to-text mappings...")

        # Resolved at call time, so the builders may be defined further down the notebook.
        builders = (
            build_code_to_markdown,
            build_code_to_topics,
            build_code_to_tags,
            build_code_to_functions,
            build_code_to_structure,
            build_code_to_summary,
        )

        total_mappings = 0
        for doc in docs:
            sections = strip_code_section(doc.page_content)
            if not sections:
                continue

            # Collect only this section's mappings, so mappings of earlier sections
            # are not written to the store a second time.
            new_docs = []
            for build in builders:
                new_docs += build(sections, doc) or []

            total_mappings += len(new_docs)
            print(f"Number of code-to-text mappings generated: {total_mappings}....")
            if new_docs:
                vectorstore.add_documents(documents=new_docs)
                print("Code-to-text mappings saved.")
        print(f"Code to text mappings completed for {file}.")
    except Exception as e:
        print(f"Error saving {file}: {e}")
        traceback.print_exc()

"""
2. Code-to-markdown sections
"""
def build_code_to_markdown(sections, doc):
    """
    Generates code-to-markdown mappings for the code sections of a markdown document.

    One document is produced per code section; each holds the full markdown of `doc`
    under a "***Markdown***" heading. The documents are returned, not stored.

    Args:
        sections: Code sections found in `doc`; only their count is used.
        doc: The parent markdown document.

    Returns:
        List of documents, one per entry of `sections`. Their metadata is
        `doc.metadata` plus "keywords"; "parent" is `doc.id` unless `doc.metadata`
        already has a "parent" key, which then takes precedence.
    """
    if not sections:
        return []

    content = f"***Markdown***:\n{doc.page_content}"
    # The text is the same for every section, so keywords are requested only once.
    keywords = extract_keywords(content)
    base_metadata = {"parent": doc.id} | doc.metadata
    return [Document(page_content=content,
                     metadata=base_metadata | {"keywords": list(keywords)})
            for _ in sections]

"""
3. Code-to-topics
"""
def build_code_to_topics(sections, doc):
    """
    Generates code-to-topic mappings for the code sections of a markdown document.

    Asks the LLM for an outline of topics per code section and wraps every answer in a
    document under a "***Topics***" heading. The documents are returned, not stored.

    Args:
        sections: Code sections to analyze, one LLM request each.
        doc: The parent markdown document; its metadata is copied to every result.

    Returns:
        List of documents, one per section; an empty list when generation fails.
    """
    try:
        code_task = f"""
        analyze this code and generate an outline of general {CODE_LANGUAGE} topics that are connected to the code
        """
        code_instructions = """
            1.  **Provide a list of the topics that you find.**
            2.  **Format your response clearly and concisely** using a numbered list.
        """
        responses = text_generation_tool(sections, CODE_LANGUAGE, code_task, code_instructions)
        # `doc.metadata` wins over the "parent" entry when it already has that key.
        base_metadata = {"parent": doc.id} | doc.metadata
        # Keywords are extracted from the response text, not from the response object.
        return [Document(page_content=f"***Topics***:\n{r.content}",
                         metadata=base_metadata | {"keywords": extract_keywords(r.content)})
                for r in responses]
    except Exception as e:
        print(f"Error in CODE_TO_TOPICS: {e}")
        traceback.print_exc()
        return []
        

"""
4. Code-to-tags
"""
def build_code_to_tags(sections, doc):
    """
    Generates code-to-component (tag) mappings for the code sections of a markdown document.

    Asks the LLM for an outline of the components (ColdFusion tags, HTML / CSS elements
    and similar) per code section and wraps every answer in a document under a
    "***Components***" heading. The documents are returned, not stored.

    Args:
        sections: Code sections to analyze, one LLM request each.
        doc: The parent markdown document; its metadata is copied to every result.

    Returns:
        List of documents, one per section; an empty list when generation fails.
    """
    try:
        code_task = """
        analyze this code and generate an outline of components that you can find in the code
        """
        code_instructions = """
            1.  **Analyze the code for:** Various ColdFusion tags, HTML / CSS elements, and other similar code elements.
            2.  **Provide a detailed explanation of your findings.**
            3.  **Format your response clearly and concisely** using a numbered list.
        """
        responses = text_generation_tool(sections, CODE_LANGUAGE, code_task, code_instructions)
        # `doc.metadata` wins over the "parent" entry when it already has that key.
        base_metadata = {"parent": doc.id} | doc.metadata
        # Keywords are extracted from the response text, not from the response object.
        return [Document(page_content=f"***Components***:\n{r.content}",
                         metadata=base_metadata | {"keywords": extract_keywords(r.content)})
                for r in responses]
    except Exception as e:
        print(f"Error in code_to_tags: {e}")
        traceback.print_exc()
        return []

"""
5. Code-to-functions
"""
def build_code_to_functions(sections, doc):
    """
    Generates code-to-function mappings for the code sections of a markdown document.

    Asks the LLM for an outline of the functions found in each code section and wraps
    every answer in a document under a "***Functions***" heading. The documents are
    returned, not stored.

    Args:
        sections: Code sections to analyze, one LLM request each.
        doc: The parent markdown document; its metadata is copied to every result.

    Returns:
        List of documents, one per section; an empty list when generation fails.
    """
    try:
        code_task = """
        analyze this code and generate an outline of functions that you can find in the code
        """
        code_instructions = """
            1.  **Analyze the code for:** Any functions that you can locate in the code.
            2.  **Provide a detailed explanation of your findings.**
            3.  **Format your response clearly and concisely** using a numbered list.
        """
        responses = text_generation_tool(sections, CODE_LANGUAGE, code_task, code_instructions)
        # `doc.metadata` wins over the "parent" entry when it already has that key.
        base_metadata = {"parent": doc.id} | doc.metadata
        # Keywords are extracted from the response text, not from the response object.
        return [Document(page_content=f"***Functions***:\n{r.content}",
                         metadata=base_metadata | {"keywords": extract_keywords(r.content)})
                for r in responses]
    except Exception as e:
        print(f"Error in code_to_functions: {e}")
        traceback.print_exc()
        return []

"""
6. Code-to-structure
"""
def build_code_to_structure(sections, doc):
    """
    Generates code-to-structure mappings for the code sections of a markdown document.

    Asks the LLM to describe the general structure of each code section and wraps every
    answer in a document under a "***Structure***" heading. The documents are returned,
    not stored.

    Args:
        sections: Code sections to analyze, one LLM request each.
        doc: The parent markdown document; its metadata is copied to every result.

    Returns:
        List of documents, one per section; an empty list when generation fails.
    """
    try:
        code_task = """
        describe the general structure of this code
        """
        code_instructions = """
            1.  **Provide a detailed explanation of your findings.**
            2.  **Format your response clearly and concisely** using bullet points.
        """
        responses = text_generation_tool(sections, CODE_LANGUAGE, code_task, code_instructions)
        # `doc.metadata` wins over the "parent" entry when it already has that key.
        base_metadata = {"parent": doc.id} | doc.metadata
        # The heading carries the same "***...***" markers as the other mapping kinds.
        # Keywords are extracted from the response text, not from the response object.
        return [Document(page_content=f"***Structure***:\n{r.content}",
                         metadata=base_metadata | {"keywords": extract_keywords(r.content)})
                for r in responses]
    except Exception as e:
        print(f"Error in code_to_structure: {e}")
        traceback.print_exc()
        return []
        
"""
7. Code-to-summary
"""
def build_code_to_summary(sections, doc):
    """
    Generates code-to-summary mappings for the code sections of a markdown document.

    Asks the LLM for a summary of each code section and wraps every answer in a
    document under a "***Summary***" heading. The documents are returned, not stored.

    Args:
        sections: Code sections to analyze, one LLM request each.
        doc: The parent markdown document; its metadata is copied to every result.

    Returns:
        List of documents, one per section; an empty list when generation fails.
    """
    try:
        code_task = """
        provide a summary of this code
        """
        code_instructions = """
            1.  **Provide a concise summary, including the potential business purpose and use cases for the code.**
            2.  **Format your response clearly and concisely** using bullet points.
        """
        responses = text_generation_tool(sections, CODE_LANGUAGE, code_task, code_instructions)
        # `doc.metadata` wins over the "parent" entry when it already has that key.
        base_metadata = {"parent": doc.id} | doc.metadata
        # Keywords are extracted from the response text, not from the response object.
        return [Document(page_content=f"***Summary***:\n{r.content}",
                         metadata=base_metadata | {"keywords": extract_keywords(r.content)})
                for r in responses]
    except Exception as e:
        print(f"Error in code_to_summary: {e}")
        traceback.print_exc()
        return []
        

In [None]:
# Split every downloaded pdf into chapter chunks and index the markdown of each chunk.
# Match on the extension only: a substring test would also pick up names like "x.pdf.tmp".
files = [f for f in os.listdir(target_path) if f.endswith(".pdf")]
for file in files:
    source_pdf_path = f"{target_path}/{file}"
    # Base name without the extension, used to name the markdown counterpart of each chunk.
    stem = os.path.splitext(file)[0]
    ranges = get_chapter_ranges(source_pdf_path, do_print=False)
    for idx, _range in enumerate(ranges):
        pdf = f"{target_path_chapters}/{idx}_{file}"
        md = f"{target_path_markdown}/{idx}_{stem}.md"
        split_chapters(source_pdf_path, pdf, _range)
        # NOTE(review): conversion is disabled here, so the markdown files are presumably
        # expected to exist already from an earlier run - confirm.
        # convert_to_markdown(pdf, md)
        build_markdown_section(md)

In [None]:
# table = vectorstore_connection.open_table('vectorstore')
# table_schema = table.schema
# print(f"Schema for table '{table.name}':")
# print("-" * 30)
# for field in table_schema:
#     print(f" - Column: '{field.name}'")
#     print(f"   Type: {field.type}")
#     print(f"   Nullable: {field.nullable}")

# print(f"\nFull PyArrow Schema:\n{table_schema}")
