In [1]:
##############################################################################
# Handle imports
##############################################################################
from docling.document_converter import DocumentConverter
import traceback
from collections.abc import Iterable
import os
import pypdfium2 as pdfium
import re
import json
import uuid
from langchain_openai import ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter, MarkdownHeaderTextSplitter
from langchain_core.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from deepeval.models.base_model import DeepEvalBaseLLM
from deepeval import assert_test
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval.metrics import GEval, AnswerRelevancyMetric
from deepeval import evaluate
import xml.etree.ElementTree as etree
from longdocfactscore.ldfacts import LongDocFACTScore
from datetime import datetime
import nltk
nltk.download('punkt_tab')
from dotenv import load_dotenv
load_dotenv()

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
##############################################################################
# Set up variables
##############################################################################
# Directory that is scanned for the input PDF files.
SOURCE_DIR="source_docs"
# Directory used for the per-chapter PDF chunks.
SOURCE_DIR_CHUNKED="source_docs_chunked"
# Directory holding the per-chapter markdown files.
MARKDOWN_DIR="markdown"
# URI prefix prepended to the markdown path to build each dataset entry's "source" metadata.
MARKDOWN_URI_PREFIX="https://raw.githubusercontent.com/agapebondservant/code-generation-capstone/refs/heads/main/eda/resources"
# Directory for generated reports (e.g. chapters.txt).
REPORT_DIR="reports"
# Directory for the generated .jsonl dataset files.
OUTPUT_DIR="output"
# Directory for entries that failed validation.
INVALID_DIR="invalid"
# Directory for snippets whose processing raised an exception.
ERROR_DIR="error" 

In [3]:
##############################################################################
# Set up object instances
##############################################################################

# Chat model used to generate the dataset text. The model id, API key and base URL
# are all read from environment variables (populated by load_dotenv() in the import cell).
data_generator_llm = ChatOpenAI(
    model=os.getenv("DATA_GENERATOR_MODEL_ID"), # previously: os.getenv('QWEN25CODER_MODEL_ID')
    api_key=os.getenv('OPENROUTER_TOKEN'),
    base_url=os.getenv('OPENROUTER_API_BASE'),
    temperature=0.1,
)

class DataGeneratorLLM(DeepEvalBaseLLM):
    """
    Adapter that exposes an already-constructed chat model through the
    DeepEvalBaseLLM interface so it can be passed to deepeval metrics.
    """

    def __init__(self, model):
        # Keep a reference to the chat client; every other method goes through load_model().
        self.model = model

    def load_model(self):
        """Returns the wrapped chat model."""
        return self.model

    def generate(self, prompt: str) -> str:
        """Synchronously invokes the model and returns the text content of its reply."""
        reply = self.load_model().invoke(prompt)
        return reply.content

    async def a_generate(self, prompt: str) -> str:
        """Asynchronously invokes the model and returns the text content of its reply."""
        reply = await self.load_model().ainvoke(prompt)
        return reply.content

    def get_model_name(self):
        """Returns the display name reported for this model."""
        return "Custom Data Generator LLM (GPT-OSS)"

# Judge model handed to the deepeval metrics; it wraps the same chat client used for generation.
evaluator_llm = DataGeneratorLLM(data_generator_llm)

### Data Conversion
The following pipeline is executed:
- A set of PDF files is collected from the source directory.
- Each PDF file is split into individual chapters.
- Each chapter is converted into Markdown using a smart OCR tool (<a href="https://github.com/docling-project/docling" target="_blank">Docling</a>).

In [4]:
##############################################################################
# DATA EXTRACTION
##############################################################################
def get_chapter_ranges(sourcefilename, do_print=True):
    """
    Returns a list of [title, [beginPage, endPage]] entries for the chunks that
    represent chapters in the given pdf.

    A table-of-contents entry counts as a chapter start when it has a target page and
    is either a childless entry at level 0/1 or any entry at level 2. Each chapter runs
    from its own start page up to the page before the next chapter start (never less
    than its start page); the last chapter runs to the final page of the document.
    Page numbers are the page indices reported by the table of contents.

    Args:
        sourcefilename: Path of the pdf to inspect.
        do_print: When true, prints a progress message before reading the pdf.

    Returns:
        A list of [title, [begin, end]] lists, in table-of-contents order.
    """
    if do_print:
        print("Getting chapter ranges...\n")

    pdf = pdfium.PdfDocument(sourcefilename)

    chapters = []

    # Title and start page of the chapter that is currently "open".
    title, begin = None, None

    try:
        for item in pdf.get_toc():
            # Compare against None explicitly: page index 0 is a valid chapter start.
            if item.page_index is None:
                continue

            is_chapter_start = (item.n_kids == 0 and item.level < 2) or item.level == 2
            if not is_chapter_start:
                continue

            if begin is not None:
                # Close the previous chapter under ITS OWN title, not the title of
                # the entry that terminates it.
                end = item.page_index - 1
                chapters.append([title, [begin, max(begin, end)]])

            title, begin = item.title, item.page_index

        if begin is not None:
            # The final chapter has no successor; it extends to the last page.
            last_page = len(pdf) - 1
            chapters.append([title, [begin, max(begin, last_page)]])
    finally:
        # Release the document handle even if reading the table of contents fails.
        pdf.close()

    return chapters

In [5]:
def split_chapters(sourcefilename, targetfilename, pagerange):
    """
    Splits the pdf into chapters using the provided page ranges.

    Copies the pages pagerange[0]..pagerange[1] (inclusive) of the source pdf into a
    new pdf and saves it as targetfilename.

    Args:
        sourcefilename: Path of the pdf to copy pages from.
        targetfilename: Path the new pdf chunk is saved to.
        pagerange: Two-element sequence [beginPage, endPage], both inclusive.

    Returns:
        The name of the new pdf chunk (targetfilename), or None if an error occurred.
        Errors are printed, not raised.
    """
    source_pdf = None
    new_pdf = None

    try:
        source_pdf = pdfium.PdfDocument(sourcefilename)

        new_pdf = pdfium.PdfDocument.new()

        print(f"Saving chapter...{targetfilename}, Pages {pagerange[0]} to {pagerange[1]}")

        new_pdf.import_pages(source_pdf, pages=list(range(pagerange[0], pagerange[1]+1)))

        new_pdf.save(targetfilename)

        return targetfilename

    except Exception as e:
        print(f"Error saving {targetfilename}: {e}")

        return None

    finally:
        # Always release both documents, including when import/save failed part-way.
        for document in (source_pdf, new_pdf):
            if document is None:
                continue
            try:
                document.close()
            except Exception as e:
                print(f"Error saving {targetfilename}: {e}")

In [6]:
def convert_to_markdown(pdffile, markdownfile):
    """
    Converts the given PDF file into a Markdown file.

    Args:
        pdffile: Path of the pdf to convert.
        markdownfile: Path the exported markdown is written to (overwritten if present).

    Returns:
        The exported markdown text, or None if conversion or writing failed.
        Errors are printed, not raised.
    """
    try:
        converter = DocumentConverter()

        result = converter.convert(pdffile)

        markdown_output = result.document.export_to_markdown()

        # Write as UTF-8 explicitly so non-ASCII text in the exported markdown does not
        # depend on the platform's default encoding.
        with open(markdownfile, "w", encoding="utf-8") as file:
            file.write(markdown_output)

        print(f"{markdownfile} generated.")

        return markdown_output

    except Exception as e:
        print(f"Error saving {markdownfile}: {e}")

        return None

In [7]:
def get_code_snippets(content):
    """
    Parses out code sections of the markdown file.

    A code section is the text between a pair of triple-backtick fences. The text is
    returned exactly as it appears between the fences (including any language tag
    that follows the opening fence). Single or double backticks inside a section are
    allowed and do not terminate it.

    Args:
        content: Markdown text to scan.

    Returns:
        A list of the fenced sections, in document order; empty if there are none.
    """
    # Non-greedy match up to the next closing fence; the lookahead keeps the opening
    # fence from starting in the middle of a longer run of backticks.
    code_snippets = re.findall(r'```(?!`)(.+?)```', content, re.DOTALL)

    return code_snippets

In [8]:
def generate_report(data, reportname, header=None):
    """
    Writes the given data to a report file with the given name.

    The report is opened in append mode. An optional header line is written first,
    followed by the data: one line per element for iterables, or a single line for a
    string or any non-iterable value. Every line is newline-terminated so successive
    reports appended to the same file do not run together.

    Args:
        data: A string, an iterable of values, or a single value; non-string values
            are rendered with str().
        reportname: Path of the report file to append to.
        header: Optional header line written before the data.

    Errors are printed, not raised.
    """
    try:
        # A str is itself Iterable; treat it as one line rather than one line per character.
        if isinstance(data, str) or not isinstance(data, Iterable):
            lines = [str(data)]
        else:
            lines = [str(d) for d in data]

        with open(reportname, "a", encoding="utf-8") as file:

            if header:
                file.write(header + "\n")

            if lines:
                file.write("\n".join(lines) + "\n")

    except Exception as e:
        print(f"Error generating report {reportname}: {e}")

### Generate code-text pairs
The following code-text pairs will be generated:
- Code-to-markdown
- Code-to-requirements
- Code-to-topics
- Code-to-components (JavaBeans, Controllers, Views, Custom Tags)
- Code-to-keywords
- Code-to-domain
- Code-to-summary

In [9]:
###################################
# Prompts
###################################
# System message template. {code_instructions} is filled with one of the
# CODE_TO_*_INSTRUCTIONS_PROMPT strings; the code itself is sent as the human message.
SYSTEM_PROMPT = """
    You are an expert software engineer with extensive experience in developing JSP applications.
    {code_instructions}
    
    Code to analyze:
    """

# Instructions for the "requirements" field: functional requirements as a numbered list.
# The literal reply THIS IS NOT CODE is the marker the validation step looks for.
CODE_TO_REQUIREMENTS_INSTRUCTIONS_PROMPT = """
    Your task is to analyze this code snippet and generate an outline of functional requirements that might be connected to the code.
    
    Instructions:
    1. **Provide a short list of relevant requirements.** Do not include requirements that are not related to the code.
    2. **Format your response clearly and concisely** using a numbered list.
    3. If the provided snippet does not appear to be a code snippet, say THIS IS NOT CODE.
"""

# Instructions for the "topics" field: general programming topics as a numbered list.
CODE_TO_TOPICS_INSTRUCTIONS_PROMPT = """
    Your task is to analyze this code snippet and generate a list of general programming topics that are related to the code.
    
    Instructions:
    1. **Provide a short list of topics that you can identify.**
    2. **Format your response clearly and concisely** using a numbered list.
    3. If the provided snippet does not appear to be a code snippet, say THIS IS NOT CODE.
    
"""

# Instructions for the "components" field: an outline of the components found in the code.
# NOTE(review): "Scriplets" below looks like a typo for "Scriptlets"; left unchanged because
# the string is sent to the model verbatim.
CODE_TO_COMPONENTS_INSTRUCTIONS_PROMPT = """
    Your task is to analyze this code snippet and generate an outline of all the components you can find, 
    such as Model Components or JavaBeans, Controllers, Views, JSTL tags, Scriplets, etc.
    
    Instructions:
    1. **Provide an overview of all the components that you can find.**
    2. **Format your response clearly and concisely** using a numbered list.
    3. If the provided snippet does not appear to be a code snippet, say THIS IS NOT CODE.
"""

# Instructions for the "keywords" field: one-word keywords as a comma-delimited list.
CODE_TO_KEYWORDS_INSTRUCTIONS_PROMPT = """
    Your task is to analyze this code snippet and generate a list of keywords that are associated with the code.
    
    Instructions:
    1. **Provide a short list of one-word keywords.**
    2. **Format your response clearly and concisely** using a comma-delimited list.
    3. If the provided snippet does not appear to be a code snippet, say THIS IS NOT CODE.
"""

# Instructions for the "summary" field: a concise summary including business purpose and use cases.
CODE_TO_SUMMARY_INSTRUCTIONS_PROMPT = """
    Your task is to analyze this code snippet and provide a summary of the code.
    
    Instructions:
    1.  **Provide a concise summary, including the potential business purpose and use cases for the code.**
    2.  **Format your response clearly and concisely** using a numbered list.
    3. If the provided snippet does not appear to be a code snippet, say THIS IS NOT CODE.
"""

In [10]:
###################################
# Validation and Evaluation functions
###################################
def get_validation_issues(item):
    """
    Returns a list of validation issues associated with this item,
    or an empty list if no validation issues were found.

    Each issue is a string of the form "<serialized item>,<issue name>" with no
    trailing newline. The checks are:
    1. Invalid JSON   - the item cannot be serialized with json.dumps.
    2. Missing Values - at least one top-level value is None.
    3. Invalid Code   - at least one top-level string value contains the marker
                        THIS IS NOT CODE (anywhere in the string, so surrounding
                        punctuation or whitespace does not hide it).

    Args:
        item: Dict holding one dataset entry.

    Returns:
        A list of issue strings; empty when the item is valid.
    """
    not_code_marker = "THIS IS NOT CODE"

    issues = []

    #########################
    # 1. Check if valid json
    #########################
    try:

        serialized = json.dumps(item)

    except Exception:

        # Fall back to str() so the remaining checks can still report on this item
        # instead of raising on a second json.dumps call.
        serialized = str(item)

        issues.append(f"{serialized},Invalid JSON")

    values = list(item.values())

    #########################
    # 2. Check missing values
    #########################
    if any(value is None for value in values):

        issues.append(f"{serialized},Missing Values")

    #########################
    # 3. Check valid JSP code
    #########################
    if any(isinstance(value, str) and not_code_marker in value for value in values):

        issues.append(f"{serialized},Invalid Code")

    return issues
        

def evaluate_dataset_entry(item):
    """
    Evaluates the given item with automated metrics.
    The following metrics are evaluated for each generated field
    (requirements, topics, components, keywords, summary):
    1. For Coherence: G-Eval (LLM-as-Judge)
    2. For Relevancy: AnswerRelevancyMetric (LLM-as-Judge)

    Args:
        item: Dict holding one dataset entry; must contain "code" and one key per
            generated field.

    Returns:
        {"coherence": {field: {"score", "reason"}}, "relevancy": {field: {"score", "reason"}}}
    """

    evaluations = {
        "coherence": {},
        "relevancy": {},
    }

    # Each generated field is judged against the instructions that produced it.
    input_prompts = [
        ("requirements", CODE_TO_REQUIREMENTS_INSTRUCTIONS_PROMPT),

        ("topics", CODE_TO_TOPICS_INSTRUCTIONS_PROMPT),

        ("components", CODE_TO_COMPONENTS_INSTRUCTIONS_PROMPT),

        ("keywords", CODE_TO_KEYWORDS_INSTRUCTIONS_PROMPT),

        ("summary", CODE_TO_SUMMARY_INSTRUCTIONS_PROMPT),
    ]

    # Resolved outside the f-string: reusing the enclosing quote character inside an
    # f-string expression is a syntax error before Python 3.12.
    code_text = str(item["code"])

    for code_type, prompt in input_prompts:

        test_case = LLMTestCase(

            input=f"{prompt}\nCode:{code_text}",

            actual_output=f"{code_type.capitalize()}:{item[code_type]}",
        )

        ##################################################
        # 1. Coherence metric (G-Eval)
        ##################################################
        coherence_metric = GEval(
            name="Coherence",

            criteria="Determine if the actual output flows logically from the given input.",

            evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],

            threshold=0.5,

            strict_mode=False,

            verbose_mode=False,

            model=evaluator_llm
        )

        ##################################################
        # 2. Relevancy metric (AnswerRelevancyMetric)
        ##################################################
        relevancy_metric = AnswerRelevancyMetric(
            include_reason=True,

            model=evaluator_llm,

            threshold=0.5,

            strict_mode=False,

            verbose_mode=False,
        )

        # Coherence is measured before relevancy, as in the original ordering.
        for metric_name, metric in (("coherence", coherence_metric), ("relevancy", relevancy_metric)):

            metric.measure(test_case)

            evaluations[metric_name][code_type] = {"score": metric.score, "reason": metric.reason}

    return evaluations

In [11]:
###################################
# Utility functions
###################################

# Names of the generated fields stored on each dataset entry.
# build_code_completion_pair indexes this list by response position, so the order here
# must match the order of the prompts it submits.
code_types = [
    "requirements",
    "topics",
    "components",
    "keywords",
    "summary",
]

def llm_tool(inputs):
    """
    Runs a batch of inputs through the data generator LLM.

    Each element of `inputs` supplies the template variables for one call:
    the system message is built from SYSTEM_PROMPT and the human message
    from the "{input}" template. Returns the batch responses.
    """
    messages = [
        SystemMessagePromptTemplate.from_template(SYSTEM_PROMPT),
        HumanMessagePromptTemplate.from_template("{input}"),
    ]

    # Pipe the chat prompt into the model, then submit all inputs as one batch.
    pipeline = ChatPromptTemplate.from_messages(messages) | data_generator_llm

    return pipeline.batch(inputs)

def build_code_completion_pair(snippet, doc, metadata=None, outputfilename="data"):
    """
    Generates code-completion pairs from the given doc.

    Builds one dataset entry for the snippet, asks the LLM for each generated field
    (one prompt per entry in code_types), validates the entry and then either:
    - appends the validation issues, one per line, to INVALID_DIR/<outputfilename>.txt, or
    - evaluates the entry and appends it as one JSON line to OUTPUT_DIR/<outputfilename>.jsonl.

    Any exception is printed with its traceback and recorded as a "<snippet>,<error>"
    line in ERROR_DIR/<outputfilename>.txt; it is not re-raised.

    Args:
        snippet: The code text to describe.
        doc: The markdown split the snippet came from; its page_content is stored
            as the entry's "section".
        metadata: Optional dict of metadata for the entry. It is copied, so the
            caller's dict is never modified.
        outputfilename: Base name (without extension) of the files to append to.
    """

    try:
        item = {
            "id": str(uuid.uuid4()),

            # Copy so adding "evaluations" below cannot leak into the caller's dict
            # (or into a shared default).
            "metadata": dict(metadata) if metadata else {},

            "code": snippet,

            "section": doc.page_content,
        }

        # Order must line up with code_types: response N fills field code_types[N].
        instruction_prompts = [
            CODE_TO_REQUIREMENTS_INSTRUCTIONS_PROMPT,
            CODE_TO_TOPICS_INSTRUCTIONS_PROMPT,
            CODE_TO_COMPONENTS_INSTRUCTIONS_PROMPT,
            CODE_TO_KEYWORDS_INSTRUCTIONS_PROMPT,
            CODE_TO_SUMMARY_INSTRUCTIONS_PROMPT,
        ]

        inputs = [
            {"input": snippet, "code_instructions": instructions}
            for instructions in instruction_prompts
        ]

        responses = llm_tool(inputs)

        for code_type, response in zip(code_types, responses):

            item[code_type] = response.content

        validation_issues = get_validation_issues(item)

        if validation_issues:

            with open(f"{INVALID_DIR}/{outputfilename}.txt", 'a', encoding="utf-8") as f:

                # writelines() adds no separators, so terminate each issue explicitly.
                f.writelines(issue.rstrip("\n") + "\n" for issue in validation_issues)

        else:

            item["metadata"]["evaluations"] = evaluate_dataset_entry(item)

            with open(f"{OUTPUT_DIR}/{outputfilename}.jsonl", 'a', encoding="utf-8") as f:

                json_line = json.dumps(item)

                f.write(json_line + '\n')

    except Exception as e:

        print(f"Error writing snippet {snippet}: {e}")

        traceback.print_exc()

        with open(f"{ERROR_DIR}/{outputfilename}.txt", 'a', encoding="utf-8") as f:

            # Format the exception as text; concatenating str + Exception raises TypeError.
            f.write(f"{snippet},{e}\n")


In [12]:
def split_markdown_sections(markdownfilename, outputfilename, content=None):
    """
    Splits a markdown document on its headers and builds a dataset entry for every
    code snippet found in each split.

    The document is split on "#", "##" and "###" headers (headers are kept in the
    split text). Each snippet is passed to build_code_completion_pair together with
    its split; the entry's metadata is the "source" URI of the markdown file merged
    with the split's own header metadata.

    Args:
        markdownfilename: Path of the markdown file; also used to build the source URI.
        outputfilename: Base name of the output files, forwarded to
            build_code_completion_pair.
        content: Markdown text to process. When None, the text is read from
            markdownfilename.

    Errors are printed with their traceback, not raised.
    """

    try:

        if content is None:

            # Read as UTF-8 explicitly so decoding does not depend on the platform default.
            with open(markdownfilename, mode="r", encoding="utf-8") as f:
                content = f.read()

        headers_to_split = [("#", "Header 1"), ("##", "Header 2"),("###", "Header 3")]

        text_splitter = MarkdownHeaderTextSplitter(headers_to_split, strip_headers=False)

        source_uri = f"{MARKDOWN_URI_PREFIX}/{markdownfilename}"

        for i, split in enumerate(text_splitter.split_text(content)):

            snippets = get_code_snippets(split.page_content)

            if not snippets:

                print(f"Skipping split {i} in {markdownfilename} - no code snippets found...")

                continue

            print(f"Processing split {i} in {markdownfilename}...")

            for snippet in snippets:

                build_code_completion_pair(snippet,

                                           split,

                                           metadata={"source": source_uri} | split.metadata,

                                           outputfilename=outputfilename)

    except Exception as e:

        print(f"Error handling markdown section {markdownfilename}: {e}")

        traceback.print_exc()
    

### Run the pipeline
Execute the pipeline!

In [13]:
def data_extraction_pipeline():
    """
    Executes the full data extraction pipeline.

    Creates the working directories, then for every "*.pdf" file in SOURCE_DIR
    (processed in sorted order): computes its chapter ranges, appends them to
    REPORT_DIR/chapters.txt, and processes the markdown file of each chapter.
    All entries produced by one run share a single timestamped output file name.

    Errors for a source file are printed with their traceback and the pipeline
    continues with the next file.
    """
    for dirname in (
        SOURCE_DIR,
        SOURCE_DIR_CHUNKED,
        MARKDOWN_DIR,
        REPORT_DIR,
        OUTPUT_DIR,
        INVALID_DIR,
        ERROR_DIR,
    ):
        os.makedirs(dirname, exist_ok=True)

    # Match on the extension only (a substring test would also pick up e.g. "x.pdf.bak");
    # sort so runs are reproducible regardless of directory listing order.
    source_files = sorted(f for f in os.listdir(SOURCE_DIR) if f.endswith(".pdf"))

    output_file=f"data{datetime.now().strftime('%Y%m%d%H%M')}"

    for file in source_files:

        try:

            chapters = get_chapter_ranges(f"{SOURCE_DIR}/{file}", do_print=False)

            generate_report(chapters, f"{REPORT_DIR}/chapters.txt", "Title,Page Range")

            # Swap only the extension; str.replace would rewrite every ".pdf" in the name.
            stem = os.path.splitext(file)[0]

            for idx, (_title, _range) in enumerate(chapters):

                md = f"{MARKDOWN_DIR}/{idx}_{stem}.md"

                # NOTE: the pdf splitting and markdown conversion steps are disabled
                # (as in the original), so split_markdown_sections reads each markdown
                # file from disk. To regenerate the markdown, re-enable:
                #   pdf = f"{SOURCE_DIR_CHUNKED}/{idx}_{file}"
                #   split_chapters(f"{SOURCE_DIR}/{file}", pdf, _range)
                #   content = convert_to_markdown(pdf, md)

                split_markdown_sections(md, output_file)

        except Exception as e:

            print(f"Error handling {SOURCE_DIR}/{file}: {e}")

            traceback.print_exc()

            

In [13]:
# Run the whole pipeline; each generated entry triggers several LLM requests (see the log below).
data_extraction_pipeline()

2025-10-28 11:22:46,768 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Output()

2025-10-28 11:22:47,384 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-28 11:22:47,886 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Output()

2025-10-28 11:22:50,873 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-28 11:22:51,811 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-28 11:22:53,022 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Output()

2025-10-28 11:22:53,633 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-28 11:22:54,090 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Output()

2025-10-28 11:22:54,723 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-28 11:22:56,852 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-28 11:22:58,067 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Processing split 5 in markdown/86_vdoc.pub_core-servlets-and-javaserver-pages.md...


2025-10-28 11:22:58,593 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-28 11:22:58,631 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-28 11:22:58,670 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-28 11:22:58,734 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-28 11:22:58,751 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Output()

2025-10-28 11:23:00,937 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-28 11:23:01,576 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Output()

2025-10-28 11:23:03,268 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-28 11:23:05,886 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-28 11:23:07,003 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Output()

2025-10-28 11:23:07,321 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-28 11:23:07,990 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Output()

2025-10-28 11:23:09,959 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-28 11:23:10,892 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-28 11:23:11,733 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Output()


KeyboardInterrupt


KeyboardInterrupt

2025-10-28 11:23:12,962 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-28 11:23:13,670 - INFO - HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
