In [None]:
import os
import time
import json
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI


# Load credentials from the local .env file before reading the environment.
# NOTE(review): this is an absolute, machine-specific path; the script will not
# find the key on another machine unless OPENAI_API_KEY is already exported.
load_dotenv('/Users/asze01/Code/Hassoun-Lab/GPT.env')
api_key = os.environ.get("OPENAI_API_KEY")
if api_key is None or api_key == "":
    # Fail fast: every later step needs an authenticated client.
    raise ValueError("API key not found. Please check your .env file.")

# Single shared client used for all extraction requests.
client = OpenAI(api_key=api_key)

# CSV locations (still placeholders): papers are read from the first path and
# the augmented table is written to the second.
input_csv = "input_papers.csv"
output_csv = "output_papers_with_workflows.csv"

# Column names (still placeholders).
# full_text_col: existing column holding the text of each paper.
# workflow_output_col: new column receiving the JSON-serialized extraction result.
full_text_col = "full_text"
workflow_output_col = "workflow_json"


The following cells extract workflows from papers. They remain a work in progress until both of the following conditions are met:
- The 20 manually annotated papers are finished, so that the JSON template and prompt can be fully optimized.
- All papers (and their figures) have been converted to text, so that the full text is ready for extraction.

In [None]:
# Instruction text for the workflow-extraction request. It is placed in front of
# the paper's full text to form the user message sent to the model.
# Should be iterated upon after testing, to check whether the extracted workflows
# meet the standard of our manual extractions.
workflow_prompt_instructions = """
You are an expert in untargeted metabolomics and workflow design.

Given the full text of a metabolomics paper, extract ONLY the untargeted metabolomics workflow
used in the study. Focus on the main experimental and computational steps, in execution order.
The workflow you extract should be detailed enough for a researcher to read and carry out.
Do not omit any details directly relevant to the workflow. Include any relevant tools/APIs/databases
used in the workflow.

Guidelines:
- Include only steps that are explicitly described or clearly implied from the text.
- Do NOT invent tools, databases, or steps that are not supported by the paper.
- Use concise, technical language suitable for a computational systems biology researcher.
- If something is missing or unclear in the paper, mark it as "unspecified" rather than guessing.

Return your answer as JSON following the provided schema exactly.
"""

# Structured-output specification passed as `response_format` to the chat
# completions call (modify after we do the 20 annotated examples).
#
# "strict": True asks the API to enforce the schema rather than treat it as a
# hint. Strict mode requires every object to list all of its properties under
# "required" and to set "additionalProperties": False, so keep both in sync
# whenever a property is added or removed.
response_format = {
    "type": "json_schema",
    "json_schema": {
        "name": "metabolomics_workflow_extraction",
        "strict": True,
        "schema": {
            "type": "object",
            "properties": {
                # False when the paper contains no untargeted metabolomics workflow.
                "paper_has_untargeted_metabolomics": {"type": "boolean"},
                # Ordered list of workflow steps, one object per step.
                "workflow_steps": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "step_number": {"type": "integer"},
                            "step_name": {"type": "string"},
                            "description": {"type": "string"},
                            "category": {
                                "type": "string",
                                "description": "High-level category (e.g., sample prep, LC-MS acquisition, preprocessing, feature extraction, normalization, statistics, annotation, pathway analysis)"
                            },
                            "tools_software": {
                                "type": "array",
                                "items": {"type": "string"}
                            },
                            "databases_apis": {
                                "type": "array",
                                "items": {"type": "string"}
                            },
                            "inputs": {
                                "type": "array",
                                "items": {"type": "string"}
                            },
                            "outputs": {
                                "type": "array",
                                "items": {"type": "string"}
                            },
                            "is_explicit_in_paper": {
                                "type": "boolean",
                                "description": "True if this step is explicitly described; false if strongly implied."
                            }
                        },
                        "required": [
                            "step_number",
                            "step_name",
                            "description",
                            "category",
                            "tools_software",
                            "databases_apis",
                            "inputs",
                            "outputs",
                            "is_explicit_in_paper"
                        ],
                        "additionalProperties": False
                    }
                },
                "unspecified_or_omitted_steps": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "Important steps that seem missing or under-specified."
                },
                "notes_on_ambiguity": {
                    "type": "string",
                    "description": "Short explanation of any ambiguities or uncertainties in the extracted workflow."
                }
            },
            "required": [
                "paper_has_untargeted_metabolomics",
                "workflow_steps",
                "unspecified_or_omitted_steps",
                "notes_on_ambiguity"
            ],
            "additionalProperties": False
        }
    }
}

In [None]:
def extract_workflow_from_full_text(full_text: str,
                                    max_retries: int = 3,
                                    retry_delay: float = 5.0):
    """Call the GPT model to extract the metabolomics workflow from a paper.

    The module-level prompt instructions are combined with the paper text into
    a single user message, the request is sent with the module-level
    ``response_format`` JSON schema, and the reply is parsed as JSON.

    Args:
        full_text: Full text of the paper.
        max_retries: Maximum number of API attempts. With a value below 1 no
            request is made and the failure payload is returned.
        retry_delay: Seconds to wait between a failed attempt and the next one.

    Returns:
        The parsed JSON reply (a dict following the ``response_format``
        schema). If every attempt fails, a dict with the same top-level keys
        whose ``notes_on_ambiguity`` field carries the last error message.
    """
    user_prompt = (
        f"{workflow_prompt_instructions.strip()}\n\n"
        "Full paper text:\n"
        f"{full_text.strip()}\n"
    )

    # Reported in the failure payload when the loop below never runs.
    last_error = "no extraction attempts were made (max_retries < 1)"

    for attempt in range(1, max_retries + 1):
        try:
            # NOTE(review): some model families reject non-default temperature
            # values -- confirm that gpt-5-nano accepts 0.0; if it does not,
            # every attempt will fail with a request error.
            response = client.chat.completions.create(
                model="gpt-5-nano",
                messages=[
                    {
                        "role": "system",
                        "content": "You are a precise assistant that does not hallucinate or create new information for extracting workflows from scientific papers."
                    },
                    {"role": "user", "content": user_prompt}
                ],
                temperature=0.0,
                response_format=response_format
            )
            content = response.choices[0].message.content
            if content is None:
                # Surface a readable reason instead of an AttributeError on None.
                raise ValueError("model returned no message content")
            # The reply is expected to be a JSON document; a parse failure is
            # handled like any other failed attempt and retried.
            return json.loads(content.strip())

        except Exception as e:
            # Deliberately broad: one paper's failure (network, API, or parse
            # error) must not abort a batch run; it is logged and retried.
            print(f"[Attempt {attempt}] Error extracting workflow: {e}")
            last_error = e
            if attempt < max_retries:
                time.sleep(retry_delay)

    # Give up and return an error payload shaped like a normal result, so
    # callers can serialize it without special-casing failures.
    return {
        "paper_has_untargeted_metabolomics": False,
        "workflow_steps": [],
        "unspecified_or_omitted_steps": [],
        "notes_on_ambiguity": f"Extraction failed after {max_retries} attempts: {last_error}"
    }

In [None]:
def main():
    """Extract a workflow for every paper in the input CSV and save the result.

    Reads ``input_csv``, runs ``extract_workflow_from_full_text`` on each row's
    ``full_text_col`` value, stores the JSON-serialized result in
    ``workflow_output_col``, and writes the augmented table to ``output_csv``.
    Rows without text get a placeholder result and no API call is made.

    Raises:
        ValueError: If ``full_text_col`` is not a column of the input CSV.
    """
    df = pd.read_csv(input_csv)

    if full_text_col not in df.columns:
        raise ValueError(f"Column '{full_text_col}' not found in input CSV.")

    df[workflow_output_col] = None

    for idx, raw_text in df[full_text_col].items():
        # Empty CSV cells are read as NaN, and str(NaN) is the non-empty string
        # "nan"; treat missing values as empty text so they are skipped rather
        # than sent to the model.
        full_text = "" if pd.isna(raw_text) else str(raw_text).strip()

        if not full_text:
            print(f"Row {idx}: empty full_text, skipping.")
            df.at[idx, workflow_output_col] = json.dumps({
                "paper_has_untargeted_metabolomics": False,
                "workflow_steps": [],
                "unspecified_or_omitted_steps": [],
                "notes_on_ambiguity": "No full text provided."
            }, ensure_ascii=False)
            continue

        # Build the one-line preview outside the f-string: a backslash inside an
        # f-string expression is a syntax error before Python 3.12.
        preview = full_text[:80].replace("\n", " ")
        print(f"Processing row {idx} (first 80 chars): {preview}...")

        workflow_result = extract_workflow_from_full_text(full_text)

        # ensure_ascii=False keeps non-ASCII characters readable in the CSV.
        df.at[idx, workflow_output_col] = json.dumps(workflow_result, ensure_ascii=False)

    df.to_csv(output_csv, index=False)
    print(f"Saved augmented CSV to: {output_csv}")


# Script entry point: run the full extraction pipeline only when this code is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()