In [1]:
from pathlib import Path

directory = r"C:\Users\benja\startup_projects\civgen\rag\dmv_site_data_base"
def find_json_files(directory: str) -> list:
    """
    Recursively find all JSON files in a directory and its subdirectories
    
    Args:
        directory (str): Root directory to search
        
    Returns:
        list: Path objects for found JSON files
    """
    path = Path(directory)
    return list(path.rglob("*.json"))

json_paths = find_json_files(directory)

In [12]:
import json
from openai import OpenAI
import os
from dotenv import load_dotenv
from pydantic import BaseModel

class UserQuestions(BaseModel):
  questions: list[str]

load_dotenv(dotenv_path="c:/Users/benja/startup_projects/civgen/.env")
openai_client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY")
)

question_jsons = []
for json_path in json_paths:
  try:
    with open(json_path, 'r', encoding='utf-8') as file: 
      data = json.load(file)
      text_content = data.get("text_content", "")
      if not text_content:
        print(f"failed to get text content")
      else: 
        completion = openai_client.beta.chat.completions.parse(
          model = "gpt-4o-2024-11-20",
          messages=[
            {"role": "system", "content": "You are generating questions that a customer of the Virginia DMV might have. "
                                          "Please generate questions that are answered in the text. "
                                          "Please generate a maximum of 5 questions, generating less if the content of the text has been covered. "
                                          "Do not generate questions in which the answers are not in the text. "},
            {"role": "user", "content": text_content},
          ],
          response_format = UserQuestions,
        )
        event = completion.choices[0].message.parsed
        
        # Get the relative path by removing the directory prefix
        relative_path = os.path.relpath(json_path, directory)

        # Store the results
        question_dict = {
            "page": relative_path,  # Store the relative path instead of title
            "questions": event.questions
        }
        print(f"{relative_path}")
        question_jsons.append(question_dict)

  except FileNotFoundError:
    print(f"Error: File not found -> {json_path}")
  except json.JSONDecodeError:
    print(f"Error: Failed to decode JSON in file -> {json_path}")
  except KeyError as e:
    print(f"Error: Missing key {e} in file -> {json_path}")
  except Exception as e:
    print(f"Unexpected error processing {json_path}: {e}")


        


root\exemp-disc-chart.json
root\homepage.json
root\vrtp.json
businesses\become-dmv-select\homepage.json
businesses\dealer-services\car-and-truck-filing-fees-and-plate-requirements.json
businesses\dealer-services\franchise-motorcycle.json
businesses\dealer-services\franchise-trailer.json
businesses\dealer-services\homepage.json
businesses\dealer-services\licensing.json
businesses\dealer-services\mnfdist.json
businesses\dealer-services\motor-home-and-travel-trailer-filing-franchises.json
businesses\dealer-services\salvage-process-a.json
businesses\dealer-services\salvage-process-b.json
businesses\dealer-services\salvage-process-d.json
businesses\hauling\blanket.json
businesses\hauling\free.json
businesses\hauling\homepage.json
businesses\insurance\acknowledge.json
businesses\insurance\certifications.json
businesses\insurance\ext-forms.json
businesses\insurance\frrequire.json
businesses\insurance\general-info.json
businesses\insurance\homepage.json
businesses\insurance\intro.json
business

In [13]:
output_file = "gpt-4o_test_questions.json"

# Save the collected questions to a JSON file
try:
    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(question_jsons, file, indent=4, ensure_ascii=False)
    print(f"Questions successfully saved to {output_file}")
except Exception as e:
    print(f"Error saving questions to file: {e}")

Questions successfully saved to gpt-4_test_questions.json


In [3]:
import json
from openai import OpenAI
import os
from dotenv import load_dotenv
from pydantic import BaseModel
import time

class UserQuestions(BaseModel):
  questions: list[str]

load_dotenv(dotenv_path="c:/Users/benja/startup_projects/civgen/.env")
gemini_client = OpenAI(
    api_key=os.getenv("GEMINI_API_KEY"),
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/"
)

question_jsons = []
for json_path in json_paths:
  time.sleep(3)
  try:
    with open(json_path, 'r', encoding='utf-8') as file: 
      data = json.load(file)
      text_content = data.get("text_content", "")
      if not text_content:
        print(f"failed to get text content")
      else: 
        completion = gemini_client.beta.chat.completions.parse(
          model="gemini-2.0-flash",
          messages=[
            {"role": "system", "content": "You are generating questions that a customer of the Virginia DMV might have. "
                                          "Please generate questions that are answered in the text. "
                                          "Please generate a maximum of 5 questions, generating less if the content of the text has been covered. "
                                          "Do not generate questions in which the answers are not in the text."},
            {"role": "user", "content": text_content},
          ],
          response_format=UserQuestions,
        )
        event = completion.choices[0].message.parsed
        
        relative_path = os.path.relpath(json_path, directory)
        question_dict = {
            "page": relative_path,
            "questions": event.questions
        }
        print(f"{relative_path}")
        question_jsons.append(question_dict)

  except FileNotFoundError:
    print(f"Error: File not found -> {json_path}")
  except json.JSONDecodeError:
    print(f"Error: Failed to decode JSON in file -> {json_path}")
  except KeyError as e:
    print(f"Error: Missing key {e} in file -> {json_path}")
  except Exception as e:
    print(f"Unexpected error processing {json_path}: {e}")


root\exemp-disc-chart.json
root\homepage.json
root\vrtp.json
businesses\become-dmv-select\homepage.json
businesses\dealer-services\car-and-truck-filing-fees-and-plate-requirements.json
businesses\dealer-services\franchise-motorcycle.json
businesses\dealer-services\franchise-trailer.json
businesses\dealer-services\homepage.json
businesses\dealer-services\licensing.json
businesses\dealer-services\mnfdist.json
businesses\dealer-services\motor-home-and-travel-trailer-filing-franchises.json
businesses\dealer-services\salvage-process-a.json
businesses\dealer-services\salvage-process-b.json
businesses\dealer-services\salvage-process-d.json
businesses\hauling\blanket.json
businesses\hauling\free.json
businesses\hauling\homepage.json
businesses\insurance\acknowledge.json
businesses\insurance\certifications.json
businesses\insurance\ext-forms.json
businesses\insurance\frrequire.json
businesses\insurance\general-info.json
businesses\insurance\homepage.json
businesses\insurance\intro.json
business

In [4]:
output_file = "gemini-2.0-flash_test_questions.json"

# Save the collected questions to a JSON file
try:
    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(question_jsons, file, indent=4, ensure_ascii=False)
    print(f"Questions successfully saved to {output_file}")
except Exception as e:
    print(f"Error saving questions to file: {e}")

Questions successfully saved to gemini-2.0-flash_test_questions.json
