[Source: Basic_Rag_Example.ipynb](https://gitlab.k8s.cloud.statcan.ca/bptas/initiatives/llm-the-zone/-/blob/main/Basic_Rag_Example.ipynb?ref_type=heads)

Install required packages:
- pip install scikit-learn
- pip install azure-identity
- pip install PyPDF2
- pip install python-dotenv
- pip install openai
- pip install google-genai
- pip install pandas numpy

In [None]:
import os
import pandas as pd
import numpy as np
from dotenv import load_dotenv, dotenv_values
from PyPDF2 import PdfReader
from sklearn.feature_extraction.text import TfidfVectorizer
from google import genai
from google.genai import types


In [None]:
# Switch to the project root so the relative data/ paths used by the later
# cells resolve; the working directory is printed before and after the change.
project_root = "/home/bex/Documents/git/genai_usecase/"

print(f"Current Working Directory: {os.getcwd()}")
os.chdir(project_root)
print(f"Current Working Directory: {os.getcwd()}")

In [None]:
# 1. Define the folder path
folder_path = "data/input/"

# 2. Collect the full path of every regular file in the folder whose name
#    starts with 'ac' and ends with '.pdf' (the extension check ignores case).
#    os.listdir() returns entries in arbitrary order, so the names are sorted
#    first: slices and indexes taken from this list then select the same files
#    on every run and on every machine.
pdf_files_list = [
    os.path.join(folder_path, name)
    for name in sorted(os.listdir(folder_path))
    if name.startswith('ac')
    and name.lower().endswith('.pdf')
    and os.path.isfile(os.path.join(folder_path, name))
]

# 3. Report what was found
print(f"Total PDF files found: {len(pdf_files_list)}")
print("\nList of full paths:")
for path in pdf_files_list[:3]:  # Print first 3 for brevity
    print(path)

In [None]:
class pdfRagExtractor:
    """Answer one question about one PDF using TF-IDF retrieval and an LLM.

    The PDF text is split into overlapping word windows, the windows that
    score highest against the question under TF-IDF are selected, and those
    windows are sent as context to a Gemini model together with the question.
    """

    def __init__(self, question, pdf_path):
        """Store the question to ask and the path of the PDF to search."""
        self.PDF_PATH = pdf_path
        self.question = question

    def chunk_text(self, max_len=1000, overlap=200):
        """Extract the PDF text, split it into windows and index them.

        Args:
            max_len: Maximum number of whitespace-separated words per chunk.
            overlap: Number of words shared by two consecutive chunks.

        Returns:
            A ``(chunks, vectorizer, chunk_vectors)`` tuple: the list of chunk
            strings, the TF-IDF vectorizer fitted on them, and the TF-IDF
            matrix of the chunks.

        Raises:
            ValueError: If ``max_len`` is not positive, if ``overlap`` is not
                smaller than ``max_len``, or if no text could be extracted
                from the PDF.
        """
        # The window start advances by (max_len - overlap) words; a step of
        # zero or less would never reach the end of the text.
        if max_len <= 0 or overlap >= max_len:
            raise ValueError(
                f"max_len must be positive and overlap must be smaller than "
                f"max_len (got max_len={max_len}, overlap={overlap})"
            )

        # Read & extract text from the PDF; pages without text contribute ""
        reader = PdfReader(self.PDF_PATH)
        pages = [page.extract_text() or "" for page in reader.pages]
        text = "\n".join(pages)

        # Chunk the text into overlapping windows
        tokens = text.split()
        chunks = []
        start = 0
        while start < len(tokens):
            end = min(start + max_len, len(tokens))
            chunks.append(" ".join(tokens[start:end]))
            if end == len(tokens):
                break
            start = end - overlap

        # Fail with a message that names the file instead of letting the
        # vectorizer raise its generic error on an empty corpus.
        if not chunks:
            raise ValueError(f"No extractable text found in {self.PDF_PATH!r}")

        # Build a TF-IDF retriever over the chunks
        vectorizer = TfidfVectorizer().fit(chunks)
        chunk_vectors = vectorizer.transform(chunks)
        return chunks, vectorizer, chunk_vectors

    def retrieve_top_k(self, question, k=5):
        """Return up to ``k`` chunks most similar to ``question``, best first.

        Args:
            question: Text to score the chunks against.
            k: Maximum number of chunks to return; values <= 0 yield ``[]``.

        Returns:
            A list of chunk strings ordered by decreasing similarity score.
        """
        # A slice of the form [-0:] selects everything, so k == 0 needs its
        # own exit to really mean "no chunks".
        if k <= 0:
            return []

        chunks, vectorizer, chunk_vectors = self.chunk_text()
        q_vec = vectorizer.transform([question])
        # Cosine similarity via dot product (TF-IDF is L2-normalized by default).
        # ravel() keeps the scores one-dimensional even with a single chunk.
        scores = (chunk_vectors @ q_vec.T).toarray().ravel()
        top_idxs = np.argsort(scores)[-k:][::-1]
        return [chunks[i] for i in top_idxs]

    # RAG query function using your chat deployment
    def rag_query(self, top_k: int = 5) -> str:
        """Retrieve context for the stored question and ask the model.

        Args:
            top_k: Number of retrieved chunks to include in the prompt.

        Returns:
            The text of the model response (also printed).
        """
        question = self.question

        snippets = self.retrieve_top_k(question, top_k)

        # Retrieved chunks are separated by a visible rule inside the prompt
        context = "\n\n---\n\n".join(snippets)

        prompt = f"""
            Based on the following text:
            ---
            {context}\n\n
            ---
            Provide the answer to the following question in a single numeric value without any words, if not found set answer to NA.
            {question}\n\n
        """

        print("Sending a completion job with retrieved context…")
        client = genai.Client()

        response = client.models.generate_content(
            model="gemini-2.5-flash",
            contents=prompt,
            config=types.GenerateContentConfig(
                thinking_config=types.ThinkingConfig(thinking_budget=0) # Disables thinking
            ),
        )
        print(response.text)

        return response.text


In [None]:
# Questions put to every PDF, keyed by a short question id.
# q1-q5 target the "Pensions and other benefit liabilities" section;
# q6 asks about pension administrative expenses.
q_dict = {
    "q1": "In Pensions and other benefit liabilities, what is the company's fair value of plan assets for the most recent fiscal year for pension benefits plans?",
    "q2": "In Pensions and other benefit liabilities, what is the company's defined benefit obligation as of the end of fiscal year for pension benefits plans?",
    "q3": "In Pensions and other benefit liabilities, what is the value of employer pension contributions for the most recent fiscal year?",
    "q4": "In Pensions and other benefit liabilities, what is the value of employee pension contributions for the most recent fiscal year?",
    "q5": "In Pensions and other benefit liabilities, what is the current service cost for the most recent reporting date?",
    "q6": "What is the current pension administrative expenses for the most recent reporting date?",
}

In [None]:
# Nested results: one entry per processed file is appended under "files"
re_dict = {"files": []}

# Flat results: one row dict per (file, question) pair
records = []

In [None]:
# Ask every question against each of the first two PDFs and store the answers
# twice: nested per file in re_dict and as flat rows in records.
for pdf_file in pdf_files_list[:2]:
    file_name = os.path.basename(pdf_file)

    print(f"Processing {file_name}")

    # question id -> {"question": ..., "value": ...} for this file
    answers_by_question = {}

    for question_id, question_text in q_dict.items():
        extractor = pdfRagExtractor(question=question_text, pdf_path=pdf_file)
        answer = extractor.rag_query()

        answers_by_question[question_id] = {
            "question": question_text,
            "value": answer,
        }

        # One flat row per (file, question) pair
        records.append(
            {
                'file_name': file_name,
                'question_id': question_id,
                'question': question_text,
                'value': answer,
            }
        )

    # The nested record is added only after all questions for the file ran
    re_dict['files'].append({file_name: answers_by_question})


In [None]:
# Convert the list of records (one dict per row) to a DataFrame
df = pd.DataFrame(records)

# Create the output folder if it is missing so the collected answers are not
# lost to an OSError, then write the DataFrame to a CSV file without the index.
csv_path = "data/output/output_data.csv"
os.makedirs(os.path.dirname(csv_path), exist_ok=True)
df.to_csv(csv_path, index=False)

In [None]:
# Dump as JSON (optional)
import json

output_file = 'data/output/report_data.json'

# Create the output folder if it is missing so open() cannot fail on it
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Explicit encoding keeps the file identical regardless of the platform default
with open(output_file, 'w', encoding='utf-8') as f:
    # Use json.dump(dictionary, file_object)
    json.dump(re_dict, f, indent=4)

In [None]:
# Spot-check one question ("q5") against the fourth PDF in the list
test_question = q_dict["q5"]
test_pdf = pdf_files_list[3]

RagCall = pdfRagExtractor(question=test_question, pdf_path=test_pdf)
RagResult = RagCall.rag_query()