In [None]:
import json
# Read the JSON Lines input explicitly as UTF-8: without an explicit encoding
# Python falls back to the locale's default, which garbles non-ASCII text on
# systems whose default is not UTF-8.
with open('../data/nhs-conditions/v3/conditions.jsonl', 'r', encoding='utf-8') as f:
    lines = f.readlines()
# One JSON object per line. Blank lines (for example a stray newline at the end
# of the file) are skipped instead of crashing json.loads.
data = [json.loads(line) for line in lines if line.strip()]

In [None]:
from dotenv import load_dotenv
from openai import AzureOpenAI
from pathlib import Path
import os

# Load environment variables from the .env file one directory above the
# current working directory.
dotenv_path = Path("..") / ".env"
load_dotenv(dotenv_path)

def get_env_var(key: str) -> str:
    """Return the value of the environment variable named ``key``.

    Raises:
        KeyError: if the variable is not set; the message names the missing key.
    """
    value = os.environ.get(key)
    if value is None:
        raise KeyError(f"Please set the {key} environment variable.")
    return value

In [None]:
# Constants
# Upper bound on generated tokens; passed as `max_completion_tokens` to the
# chat completion request in get_response_openai.
MAX_TOKENS = 2048
# API version string handed to the AzureOpenAI client constructor.
AZURE_OPENAI_API_VERSION = "2024-12-01-preview"


# Clients are cached per (endpoint, key) so that repeated calls reuse a single
# client instead of constructing a new one for every request.
_AZURE_CLIENTS = {}


def _get_azure_client(endpoint: str, key: str):
    """Return a cached AzureOpenAI client for the given endpoint and API key."""
    cache_key = (endpoint, key)
    if cache_key not in _AZURE_CLIENTS:
        _AZURE_CLIENTS[cache_key] = AzureOpenAI(
            api_version=AZURE_OPENAI_API_VERSION,
            azure_endpoint=endpoint,
            api_key=key,
        )
    return _AZURE_CLIENTS[cache_key]


def get_response_openai(prompt: str, model: str) -> str:
    """Send ``prompt`` to an Azure OpenAI chat model and return its reply text.

    The endpoint is read from the ``AZURE_OPENAI_ENDPOINT_{model}`` environment
    variable and the API key from ``AZURE_OPENAI_API_KEY``. The request consists
    of a fixed system message followed by ``prompt`` as the user message, and
    is limited to ``MAX_TOKENS`` completion tokens.

    Args:
        prompt: Text sent as the user message.
        model: Model name; used both in the request and to select the
            endpoint environment variable.

    Returns:
        The message content of the first returned choice.

    Raises:
        KeyError: if a required environment variable is not set.
        ValueError: if the service returns no choices or a choice without
            text content, so callers never receive ``None`` in place of ``str``.
    """
    endpoint = get_env_var(f"AZURE_OPENAI_ENDPOINT_{model}")
    key = get_env_var("AZURE_OPENAI_API_KEY")
    client = _get_azure_client(endpoint, key)

    response = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant.",
            },
            {
                "role": "user",
                "content": prompt,
            },
        ],
        max_completion_tokens=MAX_TOKENS,
        model=model,
    )

    if not response.choices:
        raise ValueError(f"Model {model} returned no choices.")
    content = response.choices[0].message.content
    # The content field can be absent; fail loudly here rather than letting
    # None leak out of a function annotated as returning str.
    if content is None:
        raise ValueError(f"Model {model} returned a response without text content.")
    return content

#get_response_openai("test", "gpt-4o")

In [None]:
def make_prompt(document: str) -> str:
    """Build the summarisation prompt for a single document.

    The prompt asks for a concise summary focused on symptoms and on deciding
    the next course of action, followed by the document text itself.

    Args:
        document: Full text of the document to summarise; inserted verbatim.

    Returns:
        The complete prompt string. The layout (leading newline and four-space
        indentation on every line) is the one the instructions were written with.
    """
    # The dash is spelled as a \u escape: the literal character had been
    # mis-decoded into mojibake, which was then sent to the model as-is.
    return (
        "\n"
        "    Summarise the document below, focusing only on symptoms and how to decide the next course of action.\n"
        "    Be concise \u2014 aim for a summary of 3-4 sentences or fewer, keeping only essential information. But make sure all symptoms are included.\n"
        "    \n"
        "    Document:\n"
        f"    {document}\n"
        "    "
    )


In [None]:
import tqdm
# Per-document size reduction (percentage of characters removed) for every
# document that was summarised successfully.
avg_reduction = []
# Successful documents are collected in a separate list. Removing failed items
# from `data` while iterating over it shifts the remaining elements and makes
# the loop silently skip the document that follows each failure.
summarised_docs = []
for doc in tqdm.tqdm(list(data)):
    try:
        condition_content = doc['condition_content']
        if not condition_content:
            # Fail before the API call: an empty document cannot be summarised
            # and would divide by zero in the reduction computation below.
            raise ValueError("condition_content is empty")
        prompt = make_prompt(condition_content)
        response = get_response_openai(prompt, "gpt-4o")
        # compute how much we reduced the document
        reduction = (len(condition_content) - len(response)) / len(condition_content) * 100
        avg_reduction.append(reduction)
        doc['condition_summary'] = response
        summarised_docs.append(doc)
    except Exception as e:
        # Best effort: report the failure and leave this document out of the
        # output. `.get` keeps the handler itself from raising on a missing title.
        print(f"Error processing document {doc.get('condition_title', '<unknown>')}: {e}")

# As before, after this cell `data` holds only the summarised documents.
data[:] = summarised_docs

output_dir = '../data/nhs-conditions/v4/'
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(output_dir, exist_ok=True)
with open(os.path.join(output_dir, 'conditions.jsonl'), 'w', encoding='utf-8') as f:
    for doc in data:
        f.write(json.dumps(doc) + '\n')

if avg_reduction:
    print(f"Average reduction: {sum(avg_reduction) / len(avg_reduction)}%")
else:
    # Nothing succeeded (or the input was empty): there is no average to report.
    print("Average reduction: n/a (no documents were summarised)")