In [1]:
!pip install pydantic_ai

Collecting pydantic_ai
  Downloading pydantic_ai-0.3.1-py3-none-any.whl.metadata (11 kB)
Collecting pydantic-ai-slim==0.3.1 (from pydantic-ai-slim[a2a,anthropic,bedrock,cli,cohere,evals,google,groq,mcp,mistral,openai,vertexai]==0.3.1->pydantic_ai)
  Downloading pydantic_ai_slim-0.3.1-py3-none-any.whl.metadata (3.8 kB)
Collecting eval-type-backport>=0.2.0 (from pydantic-ai-slim==0.3.1->pydantic-ai-slim[a2a,anthropic,bedrock,cli,cohere,evals,google,groq,mcp,mistral,openai,vertexai]==0.3.1->pydantic_ai)
  Downloading eval_type_backport-0.2.2-py3-none-any.whl.metadata (2.2 kB)
Collecting griffe>=1.3.2 (from pydantic-ai-slim==0.3.1->pydantic-ai-slim[a2a,anthropic,bedrock,cli,cohere,evals,google,groq,mcp,mistral,openai,vertexai]==0.3.1->pydantic_ai)
  Downloading griffe-1.7.3-py3-none-any.whl.metadata (5.0 kB)
Collecting opentelemetry-api>=1.28.0 (from pydantic-ai-slim==0.3.1->pydantic-ai-slim[a2a,anthropic,bedrock,cli,cohere,evals,google,groq,mcp,mistral,openai,vertexai]==0.3.1->pydantic_ai

In [2]:
from dataclasses import dataclass
from pydantic import BaseModel, Field
from pydantic_ai import Agent, RunContext
from typing import List, Dict
import yaml
import os
import pandas as pd

In [3]:
# Colab-only step: mount Google Drive at /content/drive so the project folder
# used by the following cells is reachable from the runtime's filesystem.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Work from the project's code directory on the mounted Drive; later cells use
# paths relative to it (e.g. '../data/msk_chord_2024/data_prompts.csv').
%cd /content/drive/MyDrive/Colab Notebooks/Intern_summer2025/code/

/content/drive/MyDrive/Colab Notebooks/Intern_summer2025/code


In [5]:
# Provide the API keys without hard-coding them in the notebook: a value that
# is already present in the environment is kept as-is, otherwise the user is
# prompted for it (input is not echoed). Leaving the prompt blank stores an
# empty string, which matches what this cell used to do unconditionally.
from getpass import getpass

for _key_name in ('GEMINI_API_KEY', 'OPENAI_API_KEY'):
    if not os.environ.get(_key_name):
        os.environ[_key_name] = getpass(f'{_key_name} (leave blank to skip): ')

In [6]:
# System prompt handed to the Agent via `system_prompt=` below. The lines that
# start with '#' inside the string are Markdown headings that are part of the
# prompt text itself, not Python comments.
system_prompt = '''
# Biomedical prompt generator
I am an advanced AI agent specialized on generating accurate prompts based on data provided to me (clinical, molecular, demographics omics).
I am designed to create reliable and accurate descriptions of the data that is provided. In particular, all questions are derived from the actual values
from the table and only those values.

# General capabilities
## Prompt generator
'''

In [18]:
# One generated multiple-choice item: a context paragraph, a single-field
# question, the candidate options and the correct answer.
# NOTE(review): documented with `#` comments on purpose; pydantic is understood
# to copy a model's docstring into its JSON schema description, which would
# change what the LLM receives. Confirm before adding a docstring.
class Output(BaseModel):
    context: str = Field(description="Generate a paragraph of up to 8 sentences with relevant patient information based specifically on the context input data, use your knowledge in the area to connect the information provided and add links between the data. use different versions to collate the context, but always be truthful to what is available")
    question: str = Field(description='Generate a question based on a selected field from a set of provided fields that are described to be used as part of the question. Dont add complex questions, and only ask for one field at the time')
    # Typed as List[str] (previously a bare `List`) so the item type is declared
    # and options are strings, consistent with `answer: str` below.
    options: List[str] = Field(description='Add 4 options to the selected field, use your own informatin to add the options and they can include related but wrong options and the real option')
    answer: str = Field(description='return the correct answer.')

# Wrapper model passed to the Agent as `output_type`: a single `prompts` field
# holding a list of `Output` items. The field description is part of the
# instructions given to the model, so it is left untouched.
class OutputList(BaseModel):
    prompts: List[Output] = Field(description="Generate 10 different prompts")

In [19]:
# Agent that turns one patient record (rendered as a text query) into an
# `OutputList` of multiple-choice prompts, using `system_prompt` as its
# system message.
agent = Agent(
    # Alternative model kept for reference:
    # 'gemini-2.0-flash-lite',
    'openai:gpt-4.1-nano-2025-04-14',
    system_prompt=system_prompt,
    output_type=OutputList,
    # The keyword is lowercase `instrument`. The previous spelling
    # `Instrument=True` is not an Agent parameter, so instrumentation was not
    # being requested as intended (NOTE(review): depending on the installed
    # pydantic_ai version the unknown keyword is either ignored or rejected).
    instrument=True,
)

In [20]:
# Load the table whose columns fill the prompt template further down. The file
# is comma-separated, which is already pandas' default delimiter.
data = pd.read_csv('../data/msk_chord_2024/data_prompts.csv')

In [32]:
for ix, i in enumerate(data[:20].itertuples()):
  query = f"""
  You are provided with the following data and their usage:

  For context use only the following data:
  * cancer with mutation sin the following genes: {i.MUTATIONS}.
  * Age of the patient: {i.CURRENT_AGE_DEID}
  * Treatment history: {i.TREATMENT_HISTORY}
  * HER2 (relevant in cancers like breast cancer): {i.HER2}
  * Cancer stage: {i.STAGE_HIGHEST_RECORDED}
  * Gender: {i.GENDER}
  * Smoking history: {i.SMOKING_PREDICTIONS_3_CLASSES}
  * History of PDL-1: {i.HISTORY_OF_PDL1}
  * Fraction Genome altered: {i.Fraction_Genome_Altered}
  * MSI Type: {i.MSI_Type}
  * Mutation Count: {i.Mutation_Count}

  For Generating the questions use only and only the following information:
  * metastatic site: {i.METASTASIS_SITES}
  * Overall Survival in months: {i.OS_MONTHS}
  * Overal Survival Status: {i.OS_STATUS}
  * cancer type {i.CANCER_TYPE}
  * TMB (nonsynonymous): {i.TMB_nonsynonymous}
  * Tumor Purity: {i.Tumor_Purity}
  * Primary Tumor Site: {i.Primary_Tumor_Site}


  # Task:
  Generate ten detailed prompts using the provided data. If NAN encountered ignore that variable for generating the prompt.
  """
  result = await agent.run(query)

  break

In [34]:
# Display the raw result object from the most recent `agent.run` call (shown
# because it is the cell's last expression).
result

AgentRunResult(output=OutputList(prompts=[Output(context='The patient is a 68-year-old female diagnosed with breast cancer, specifically at stage 1-3, with mutations in genes like TP53 and PIK3R3, among others. Her treatment history includes cisplatin, etoposide, carboplatin, and nivolumab, indicating a complex therapeutic approach. She is a former/current smoker with no HER2 overexpression and no prior PDL-1 testing. The cancer involved multiple metastatic sites, including intra-abdominal, lung, and lymph nodes.', question='What is the metastatic site involved in this breast cancer case?', options=['Intra Abdominal', 'Lung', 'Lymph Nodes', 'Other'], answer='Intra Abdominal'), Output(context='This breast cancer patient has an overall survival of approximately 118.45 months and is currently alive. The mutation count is one, and the tumor mutation burden (TMB) is 1.1091.', question='What is the overall survival time in months for this patient?', options=['118.45 months', '50 months', '20

In [23]:
# Show the exact query text that was built for the most recently processed row.
print(query)


  You are provided with the following data and their usage: 

  For context use only the following data:
  * cancer with mutation sin the following genes: CCNE1, SDHA, PTPRT, BLM, AXIN2, PTPRD, ATR, FBXW7, KMT2C, KDM5A, SMARCA4, MAP3K1, FLT4, PIK3C2G, PIK3R3, CREBBP, HGF, PTPRS, PDGFRA, RET, TSHR, ATRX, DOT1L, BRIP1, NSD1, TP53.
  * Age of the patient: 68.0	
  * Treatment history: CISPLATIN, ETOPOSIDE, CARBOPLATIN, INVESTIGATIONAL, INVESTIGATIONAL, NIVOLUMAB
  * HER2 (relevant in cancers like breast cancer): No
  * Cancer stage: Stage 1-3
  * Gender: Female
  * Smoking history: Former/Current Smoker 
  * History of PDL-1: No 
  * Fraction Genome altered: 0.3146
  * MSI Type: Indeterminate
  * Mutation Count: 1.0
  
  For Generating the questions use only and only the following information:
  * metastatic site: Intra Abdominal, Lung, Lymph Nodes, Other 
  * Overall Survival in months: 118.4546647
  * Overal Survival Status: 0:LIVING
  * cancer type Breast Cancer
  * TMB (nonsynonymous)

In [14]:
# Print every generated prompt as four labelled sections: context, question,
# options and answer.
for prompt in result.output.prompts:
    sections = [
        f'Context:\n{prompt.context}',
        f'Question:\n{prompt.question}',
        f'Options:\n{prompt.options}',
        f'Answer:\n{prompt.answer}',
    ]
    # A leading newline, blank lines between sections and the trailing
    # newline-plus-four-spaces reproduce the established printed layout.
    rendered = '\n' + '\n\n'.join(sections) + '\n    '

    print(rendered)


Context:
The patient is a 68-year-old female diagnosed with stage 1-3 breast cancer and has a mutation in genes such as TP53 and PTPRT. Her treatment history includes cisplatin, etoposide, carboplatin, and nivolumab, indicating a combination of chemotherapy and immunotherapy approaches. She is a former/current smoker with no HER2 overexpression and no PD-L1 history, suggesting certain immunotherapy considerations. The cancer has metastasized to sites including the intra-abdominal region, lungs, and lymph nodes, affecting her prognosis and treatment strategy. Her overall survival is approximately 118.45 months, and she is currently living, with a nonsynonymous TMB of 1.109. The tumor purity data is not available, adding some uncertainty to the molecular characterization of her tumor.

Question:
What is the primary tumor site for this patient?

Options: 
['Liver', 'Breast', 'Lung', 'Prostate']

Answer: 
Breast
    
