In [3]:
from openai import OpenAI
import pandas as pd
import json
import os

In [27]:


# Build the shared OpenAI client; the API key is read from the
# OPENAI_API_KEY environment variable (None when the variable is unset).
api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

def make_valid_filename(name):
    """Return ``name`` with every non-alphanumeric character replaced by "_".

    "Alphanumeric" is whatever ``str.isalnum`` accepts, so Unicode letters
    and digits are kept as they are.
    """
    pieces = []
    for ch in name:
        if not ch.isalnum():
            ch = "_"
        pieces.append(ch)
    return "".join(pieces)

# Function to read the CSV file and load the data
def load_service_data(csv_path):
    """Parse the CSV file at ``csv_path`` and return it as a pandas DataFrame."""
    service_frame = pd.read_csv(csv_path)
    return service_frame

# Function to read the raw text from files
def load_raw_texts(file_folder):
    """Read every ``.txt`` file in ``file_folder`` into a dict.

    Args:
        file_folder: Path of the directory to scan (not recursive).

    Returns:
        A dict mapping ``make_valid_filename(<file name without ".txt">)``
        to the file's full contents, decoded as UTF-8. Files whose names do
        not end in ``.txt`` are ignored. If two file names normalise to the
        same key, the one that sorts last wins.
    """
    suffix = '.txt'
    raw_texts = {}
    # os.listdir returns entries in arbitrary order; sorting makes the result
    # reproducible when two names collapse to the same key.
    for filename in sorted(os.listdir(file_folder)):
        if not filename.endswith(suffix):
            continue
        # Strip only the trailing extension: str.replace would also delete any
        # ".txt" that happens to occur earlier in the name.
        service_name = make_valid_filename(filename[:-len(suffix)])
        with open(os.path.join(file_folder, filename), 'r', encoding='utf-8') as file:
            raw_texts[service_name] = file.read()
    return raw_texts

# Function to truncate text to fit within token limits
def truncate_text(text, max_tokens):
    """Keep at most ``max_tokens`` whitespace-separated words of ``text``.

    The "tokens" counted here are the pieces produced by ``str.split()``.
    The kept words are re-joined with single spaces, so runs of whitespace
    and newlines are collapsed even when nothing is cut off.
    """
    words = text.split()
    kept = words[:max_tokens] if len(words) > max_tokens else words
    return ' '.join(kept)

# Function to generate questions using OpenAI GPT API
def generate_questions(main_text, raw_text, model="gpt-3.5-turbo", max_words=9000):
    """Ask the chat model for five short user questions about a service.

    Args:
        main_text: Service name/description; embedded in the system prompt.
        raw_text: Page text for the service; sent as the user message.
        model: Chat model name passed to the API. Defaults to the model this
            function has always used.
        max_words: Combined size budget for the two texts. Each text gets
            half of it. The budget is counted in whitespace-separated words
            (see ``truncate_text``), not in model tokens, so it is only an
            approximation of the model's context limit.

    Returns:
        The ``content`` of the first choice in the API response, returned
        verbatim (it is not parsed into individual questions).
    """
    per_text_budget = max_words // 2
    truncated_main_text = truncate_text(main_text, per_text_budget)
    truncated_raw_text = truncate_text(raw_text, per_text_budget)

    prompt = (
        f"Based on the website text and the services described: '{truncated_main_text}', create 5 simple questions that users might ask about these services. "
        "Ignore surrounding website details. "
        "Questions should be in plain language and no more than 7 words long. "
        "Each question must include terms related to the services, using the service name or related terms. "
        "For example, if the service is 'Apply for Health First Colorado and Child Health Plan Plus', use terms like 'Health First Colorado', 'medical assistance', 'health benefits', or 'health coverage'. "
        "Avoid vague questions like 'Where can I check my application status?'. "
        "Revise the questions to ensure they are specific to the services provided."
    )

    # Instructions go in the system message; the page text is the user message.
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": truncated_raw_text},
        ],
    )

    return response.choices[0].message.content

# Function to create the JSON output
def create_json_output(service_data, raw_texts, output_path, num_services):
    """Generate questions for each service that has page text and save them as JSON.

    Args:
        service_data: DataFrame with the columns ``mainText``, ``mainURL``,
            ``deptName`` and ``deptURL``.
        raw_texts: Dict of page text keyed by normalised service name, as
            produced by ``load_raw_texts``.
        output_path: Path of the JSON file to write (overwritten if present).
        num_services: Maximum number of leading rows of ``service_data`` to
            consider; rows beyond that are not looked at.

    Rows without a usable ``mainText`` or without matching page text are
    skipped. The file written is a JSON list of objects holding the four
    input columns plus ``questions`` (the string returned by
    ``generate_questions``).
    """
    output = []
    # Count rows by position so the cap does not depend on the index labels.
    for position, (_, row) in enumerate(service_data.iterrows()):
        if position >= num_services:
            break

        main_text = row['mainText']
        if not isinstance(main_text, str):
            # An empty CSV cell is read by pandas as NaN (a float); there is
            # no service name to look up, so skip the row instead of crashing.
            continue

        # The original space-only key is tried first so existing matches are
        # unchanged. load_raw_texts builds its keys with make_valid_filename,
        # so names containing punctuation only match via the second key.
        raw_text = raw_texts.get(main_text.replace(' ', '_'))
        if raw_text is None:
            raw_text = raw_texts.get(make_valid_filename(main_text))
        if raw_text is None:
            continue

        questions = generate_questions(main_text, raw_text)
        output.append({
            "mainText": main_text,
            "mainURL": row['mainURL'],
            "deptName": row['deptName'],
            "deptURL": row['deptURL'],
            "questions": questions
        })

    # Explicit encoding, matching how the page files are read.
    with open(output_path, 'w', encoding='utf-8') as outfile:
        json.dump(output, outfile, indent=4)

# Main function to orchestrate the process
def main():
    """Run the pipeline with its fixed inputs and report where the result went.

    Loads ``colorado_services.csv``, reads the page texts from ``pages``,
    writes questions for at most 1000 services to ``service_questions.json``
    and prints a confirmation line.
    """
    output_path = 'service_questions.json'

    services = load_service_data('colorado_services.csv')
    page_texts = load_raw_texts('pages')
    create_json_output(services, page_texts, output_path, 1000)
    print(f"Output saved to {output_path}")

# Run the pipeline only when this file is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()


Output saved to service_questions.json
