Prompts an LLM hosted on Hugging Face.
Loads the dataset .csv from the local computer, asks the model each question, and records its answer. Compares each answer with the correct answer and saves the results to a new DataFrame.

In [None]:
import requests
import re
import pandas as pd

# Endpoint of the deployed model -- replace with your deployed API URL
API_URL = "BLANK"

# Hugging Face access token -- replace with your own API key
HF_TOKEN = "BLANK"

# Headers sent with every request: bearer-token authorization and a JSON body
HEADERS = {
    "Authorization": "Bearer " + HF_TOKEN,
    "Content-Type": "application/json",
}

# Read the question set into a DataFrame. The rest of the script reads the
# columns "Augmented_Question", "opa", "opb", "opc", "opd" and "cop" from it.
file_path = "/Users/alexlawson/Documents/GitHub/medical-llms-bias/shortened.csv"  # Replace with the path to your dataset
df = pd.read_csv(filepath_or_buffer=file_path)

# Function to format the prompt
def format_prompt(question, a, b, c, d):
    """Build the multiple-choice prompt for one question.

    The prompt consists of a fixed instruction line, the question text, the
    four options numbered 1-4 in the order given, a blank line, and a
    trailing "Your Answer: " cue for the model to complete.

    Args:
        question: The question text.
        a, b, c, d: The four answer options, shown as options 1 to 4.

    Returns:
        The complete prompt as a single string.
    """
    instruction = (
        "Answer the following multiple choice question. "
        "Format your answer as a single number corresponding to the correct answer."
    )
    numbered_options = [
        f"{number}. {option}"
        for number, option in enumerate((a, b, c, d), start=1)
    ]
    # The empty element yields the blank line between the options and the cue.
    return "\n".join([instruction, f"{question}", *numbered_options, "", "Your Answer: "])

# Patterns that pull the chosen option (1-4) out of the model output. The
# digit must not be followed by another digit, so "12" is never read as "1".
_ANSWER_AFTER_CUE = re.compile(r"Your Answer:\s*([1-4])(?!\d)")
_LEADING_ANSWER = re.compile(r"\s*([1-4])(?!\d)")


def _extract_answer(response_text):
    """Return the option number ("1"-"4") found in *response_text*, or None."""
    # Response echoes the prompt: take the digit right after the answer cue.
    # No trailing period is required, because the prompt asks for a bare number.
    match = _ANSWER_AFTER_CUE.search(response_text)
    if match is None and "Your Answer:" not in response_text:
        # Response holds only the completion: the digit must lead the text.
        match = _LEADING_ANSWER.match(response_text)
    return match.group(1) if match else None


# Function to query the LLM
def query_llm(prompt, timeout=60):
    """Send *prompt* to the endpoint at API_URL and return the chosen option.

    Args:
        prompt: Full prompt text, posted as the "inputs" field of the payload.
        timeout: Seconds to wait for the endpoint before giving up, so that a
            stalled request cannot hang the whole run.

    Returns:
        The option number as a string ("1"-"4"), or None when the request
        fails, the endpoint answers with an HTTP error status, the response
        has an unexpected shape, or no valid answer is found in the text.
    """
    data = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 5,  # Forces a short response
            # NOTE(review): with do_sample False the decoding is presumably
            # greedy, which would make temperature/top_p inert -- confirm
            # against the endpoint's documentation.
            "temperature": 0.1,
            "top_p": 0.1,
            "do_sample": False,
            # NOTE(review): 50256 looks like the GPT-2 end-of-text id --
            # confirm it matches the tokenizer of the deployed model.
            "eos_token_id": 50256
        }
    }
    try:
        response = requests.post(API_URL, headers=HEADERS, json=data, timeout=timeout)
        # Report HTTP failures as such instead of as a confusing
        # "Unexpected response format".
        response.raise_for_status()
        response_json = response.json()

        # Extract the response text
        if isinstance(response_json, list) and response_json:
            response_text = response_json[0].get("generated_text", "")
        elif isinstance(response_json, dict) and "generated_text" in response_json:
            response_text = response_json["generated_text"]
        else:
            raise KeyError("Unexpected response format")

        return _extract_answer(response_text.strip())
    except (requests.RequestException, ValueError, KeyError, AttributeError, TypeError) as e:
        # Best effort: a failed query is reported and scored as "no answer"
        # rather than aborting the whole evaluation run.
        print(f"Error querying LLM: {e}")
        return None

# Process the dataset: ask the model every question and grade its reply
results = []
for _, record in df.iterrows():
    stem = record["Augmented_Question"]
    choices = [record[column] for column in ("opa", "opb", "opc", "opd")]
    # "cop" is shifted up by one so it can be compared with the 1-4 option
    # numbers used in the prompt (presumably it is 0-based -- confirm against
    # the dataset). It is kept as a string because query_llm returns a string.
    expected = str(1 + int(record["cop"]))

    # Build the prompt and fetch the model's choice (None when no answer was obtained)
    predicted = query_llm(format_prompt(stem, *choices))

    # Record the outcome for this question
    results.append({
        "Question": stem,
        "Correct Answer": expected,
        "LLM Answer": predicted,
        "Is Correct": predicted == expected,
    })

# Convert results to a DataFrame and save to CSV
output_file = "/Users/alexlawson/Documents/GitHub/medical-llms-bias/results.csv"  # Replace with the desired output path
results_df = pd.DataFrame(results)
results_df.to_csv(output_file, index=False)
print("Results saved to " + output_file)