In [2]:
import os
from dotenv import load_dotenv
from openai import OpenAI

# Load environment variables from the .env file one directory up
# (presumably where the OpenAI credentials live -- confirm).
load_dotenv("../.env")

# Created with no explicit arguments, so it relies on its default configuration.
client = OpenAI()

# One fixed exchange used as a connectivity check: a system role plus a single question.
chat_messages = [
    dict(role="system", content="You are a medical research assistant for a BioASQ competition."),
    dict(role="user", content="Explain why Hirschsprung disease is considered multifactorial."),
]

response = client.chat.completions.create(
    model="gpt-4.1-nano",
    messages=chat_messages,
    temperature=0,
)

# Only the first choice's text is shown.
answer_text = response.choices[0].message.content
print(answer_text)

Hirschsprung disease (HD) is considered multifactorial because its development is influenced by a combination of genetic, environmental, and possibly epigenetic factors. Here are the key reasons why it is classified this way:

1. **Genetic Factors**: Hirschsprung disease is associated with mutations in several genes, most notably the RET proto-oncogene, which plays a crucial role in the development of the enteric nervous system. Variants in other genes, such as EDNRB, ECE1, and others, have also been implicated. However, not all individuals with these genetic mutations develop the disease, indicating that genetics alone do not determine the outcome.

2. **Environmental Influences**: Environmental factors may also contribute to the risk of developing Hirschsprung disease. These can include maternal health conditions, exposure to certain drugs or toxins during pregnancy, and other prenatal factors that may affect fetal development.

3. **Familial Patterns**: While Hirschsprung disease ca

In [8]:
# context = " ".join([s['text'] for s in first_q['snippets']])
# prompt = f"Context: {context}\n\nQuestion: {first_q['body']}"


In [9]:
import json
import yaml
import hashlib
from datetime import datetime
from pathlib import Path
from openai import OpenAI
from dotenv import load_dotenv

# 1. Setup paths and directories
BATCH_DIR = Path("sample_batches")
BATCH_DIR.mkdir(exist_ok=True)  # Creates the folder if it doesn't exist

# 2. Get the current timestamp (sortable format, so files list chronologically)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# 3. Load config and setup experiment hash
# Explicit UTF-8: the system prompt feeds the hash below, so it must decode the
# same way on every machine instead of following the locale's default encoding.
with open("config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

prompt_cfg = config['prompts']['ontology_builder']
model_name = config['model']['deployment_name']
# Short fingerprint of (model, system prompt): same inputs -> same 8-char id.
exp_hash = hashlib.md5(f"{model_name}_{prompt_cfg['system_prompt']}".encode()).hexdigest()[:8]

# 4. Define the final file name with timestamp
batch_filename = BATCH_DIR / f"batch_{timestamp}_{exp_hash}.jsonl"

# 5. Sample Data and Task Generation
samples = [
    {"id": "s1", "text": "Hello world"},
    {"id": "s2", "text": "Hi there"}
]

# One chat-completions request per sample; custom_id ties each result line
# back to its sample and to the experiment hash.
tasks = [
    {
        "custom_id": f"{exp_hash}_{item['id']}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": model_name,
            "messages": [
                {"role": "system", "content": prompt_cfg['system_prompt']},
                {"role": "user", "content": item['text']}
            ],
            "temperature": 0
        }
    }
    for item in samples
]

# 6. Save to the new directory (JSONL: one JSON object per line)
with open(batch_filename, "w", encoding="utf-8") as f:
    f.writelines(json.dumps(t) + "\n" for t in tasks)

print(f"üìÇ Batch file saved to: {batch_filename}")

üìÇ Batch file saved to: sample_batches/batch_20260212_225517_aa22f136.jsonl


In [10]:
# # Upload the file
# batch_file_obj = client.files.create(
#     file=open(batch_filename, "rb"),
#     purpose="batch"
# )

# # Create the batch
# batch_job = client.batches.create(
#     input_file_id=batch_file_obj.id,
#     endpoint="/v1/chat/completions",
#     completion_window="24h",
#     metadata={
#         "experiment_id": exp_id
#     }
# )

# print(f"üöÄ Batch Job Created! ID: {batch_job.id}")

In [11]:
import pandas as pd
import os
from pathlib import Path

# CSV file that the logging helpers in this notebook read and append to.
LOG_FILE = "logs/batch_log.csv"
# Create the log's folder up front; derived from LOG_FILE so the two cannot drift apart.
Path(LOG_FILE).parent.mkdir(exist_ok=True)

def log_experiment(data_dict):
    """Append one experiment record to the CSV log at ``LOG_FILE``.

    Args:
        data_dict: Mapping of column name to value; it becomes a single CSV row.

    The header row is written when the log does not exist yet *or* exists but
    is zero bytes long. Without the second check, a pre-created empty file
    would receive data rows only and the first record would later be parsed
    as the header.
    """
    record = pd.DataFrame([data_dict])
    needs_header = not os.path.isfile(LOG_FILE) or os.path.getsize(LOG_FILE) == 0
    # Append mode also creates the file when it is missing.
    record.to_csv(LOG_FILE, mode='a', header=needs_header, index=False)

def get_logged_experiments():
    """Return the experiment log at ``LOG_FILE`` as a DataFrame.

    Returns:
        The parsed CSV, or an empty DataFrame when the log file is missing or
        contains no data at all.
    """
    if not os.path.isfile(LOG_FILE):
        return pd.DataFrame()
    try:
        return pd.read_csv(LOG_FILE)
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header to parse; treat it like a missing log
        # so callers can rely on the `.empty` check instead of catching errors.
        return pd.DataFrame()

In [12]:
import pandas as pd
import os
from pathlib import Path

# NOTE(review): this cell repeats the setup from the preceding cell; on Run All
# these later assignments are the ones left in effect.
LOG_FILE = "logs/batch_log.csv"
# Ensure the log's folder exists; derived from LOG_FILE so the two cannot drift apart.
Path(LOG_FILE).parent.mkdir(exist_ok=True)

# NOTE(review): re-defines log_experiment from the preceding cell; this later
# definition is the one in effect after Run All.
def log_experiment(data_dict):
    """Append one experiment record to the CSV log at ``LOG_FILE``.

    Args:
        data_dict: Mapping of column name to value; it becomes a single CSV row.

    The header row is written when the log does not exist yet *or* exists but
    is zero bytes long. Without the second check, a pre-created empty file
    would receive data rows only and the first record would later be parsed
    as the header.
    """
    record = pd.DataFrame([data_dict])
    needs_header = not os.path.isfile(LOG_FILE) or os.path.getsize(LOG_FILE) == 0
    # Append mode also creates the file when it is missing.
    record.to_csv(LOG_FILE, mode='a', header=needs_header, index=False)

# NOTE(review): re-defines get_logged_experiments from the preceding cell; this
# later definition is the one in effect after Run All.
def get_logged_experiments():
    """Return the experiment log at ``LOG_FILE`` as a DataFrame.

    Returns:
        The parsed CSV, or an empty DataFrame when the log file is missing or
        contains no data at all.
    """
    if not os.path.isfile(LOG_FILE):
        return pd.DataFrame()
    try:
        return pd.read_csv(LOG_FILE)
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header to parse; treat it like a missing log
        # so callers can rely on the `.empty` check instead of catching errors.
        return pd.DataFrame()

In [13]:
import json
import hashlib
from datetime import datetime

def submit_batch(samples, prompt_cfg, config):
    """Write samples to a batch JSONL file, submit it as a batch job, and log it.

    Args:
        samples: Iterable of dicts, each providing an ``'id'`` and a ``'text'`` key.
        prompt_cfg: Mapping that holds the ``'system_prompt'`` string.
        config: Mapping providing ``config['model']['deployment_name']`` and
            ``config['experiments']['active_id']``.

    Raises:
        ValueError: If ``samples`` is empty, since there is nothing to submit.

    Side effects:
        Creates ``sample_batches/batch_<hash>_<HHMMSS>.jsonl``, uploads it,
        creates a batch job, and appends one row via ``log_experiment``.
    """
    samples = list(samples)
    if not samples:
        raise ValueError("submit_batch() needs at least one sample")

    client = OpenAI()
    # Sample the clock once so the logged time and the file name agree.
    now = datetime.now()
    timestamp = now.strftime("%Y-%m-%d %H:%M:%S")
    model_name = config['model']['deployment_name']

    # 1. Generate Idempotent Hash: same model + system prompt -> same 8-char id
    sys_prompt = prompt_cfg['system_prompt']
    exp_hash = hashlib.md5(f"{model_name}_{sys_prompt}".encode()).hexdigest()[:8]
    exp_id = f"{config['experiments']['active_id']}_{exp_hash}"

    # 2. Create the JSONL file (one request object per line)
    batch_path = Path("sample_batches") / f"batch_{exp_hash}_{now.strftime('%H%M%S')}.jsonl"
    batch_path.parent.mkdir(exist_ok=True)

    with open(batch_path, "w", encoding="utf-8") as f:
        for item in samples:
            task = {
                "custom_id": f"{exp_id}_{item['id']}",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {"model": model_name, "messages": [
                    {"role": "system", "content": sys_prompt},
                    {"role": "user", "content": item['text']}
                ], "temperature": 0}
            }
            f.write(json.dumps(task) + "\n")

    # 3. Submit -- the context manager closes the upload handle even if the call raises.
    with open(batch_path, "rb") as batch_file:
        file_obj = client.files.create(file=batch_file, purpose="batch")
    batch_job = client.batches.create(
        input_file_id=file_obj.id,
        endpoint="/v1/chat/completions",
        completion_window="24h"
    )

    # 4. Log to CSV
    # Truncate for CSV readability; the ellipsis is added only when text was actually cut.
    prompt_preview = sys_prompt if len(sys_prompt) <= 50 else sys_prompt[:50] + "..."
    log_experiment({
        "batch_id": batch_job.id,
        "exp_id": exp_id,
        "status": "in_progress",
        "created_at": timestamp,
        "model": model_name,
        "system_prompt": prompt_preview,
        "file_path": str(batch_path),
        "output_file_id": None,
        "input_tokens": 0,
        "output_tokens": 0
    })
    print(f"üöÄ Submitted Batch: {batch_job.id}")

In [14]:
def sync_and_download_results():
    """Poll every unfinished batch in the CSV log and download finished results.

    For each logged row whose status is not terminal, the batch is retrieved
    and its current status recorded. When a batch has completed and has an
    output file, the file id and (when reported) token usage are stored in the
    log and the output is written to ``results/results_<batch_id>.jsonl``.
    The log is rewritten to ``LOG_FILE`` if at least one row was polled.

    Returns:
        None. Returns immediately when the log is empty.
    """
    client = OpenAI()
    logs = get_logged_experiments()
    if logs.empty:
        return

    # Batches in these states never change again, so polling them is wasted work.
    terminal_statuses = {'completed', 'failed', 'expired', 'cancelled'}

    # A column containing only blanks is parsed from CSV as float64 (NaN); make
    # it object-typed so a string file id can be assigned without a dtype error.
    if 'output_file_id' in logs.columns:
        logs['output_file_id'] = logs['output_file_id'].astype(object)

    updated = False
    for idx, row in logs.iterrows():
        if row['status'] in terminal_statuses:
            continue

        batch_id = row['batch_id']
        job = client.batches.retrieve(batch_id)
        logs.at[idx, 'status'] = job.status

        if job.status != 'completed':
            print(f"‚è≥ Batch {batch_id} is still {job.status}...")
        elif not job.output_file_id:
            # Nothing to download; report it instead of crashing on a missing file id.
            print(f"Batch {batch_id} completed without an output file "
                  f"(error_file_id={getattr(job, 'error_file_id', None)})")
        else:
            # Download and save the results
            logs.at[idx, 'output_file_id'] = job.output_file_id

            # NOTE(review): usage may be missing on some batches -- confirm against the API docs.
            usage = getattr(job, 'usage', None)
            if usage is not None:
                logs.at[idx, 'input_tokens'] = usage.input_tokens
                logs.at[idx, 'output_tokens'] = usage.output_tokens

            # Fetch the actual JSONL content
            content = client.files.content(job.output_file_id).text
            result_path = Path("results") / f"results_{batch_id}.jsonl"
            result_path.parent.mkdir(exist_ok=True)

            # UTF-8 so non-ASCII model output cannot fail on a narrow locale encoding.
            with open(result_path, "w", encoding="utf-8") as f:
                f.write(content)
            print(f"‚úÖ Batch {batch_id} COMPLETED. Results saved to {result_path}")
        updated = True

    if updated:
        logs.to_csv(LOG_FILE, index=False)

# Run this to check status: it polls the batches recorded in the CSV log that
# are not yet finished and saves the results of any that have completed.
sync_and_download_results()

In [15]:
# TODO: load a finished batch's results for inspection, e.g.:
# results_df = pd.read_json(f"results/results_{batch_id}.jsonl", lines=True)