In [1]:
!pip install ollama

Collecting ollama
  Downloading ollama-0.5.1-py3-none-any.whl.metadata (4.3 kB)
Downloading ollama-0.5.1-py3-none-any.whl (13 kB)
Installing collected packages: ollama
Successfully installed ollama-0.5.1


In [1]:
import ollama
import pandas as pd
from tqdm import tqdm
import numpy as np

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
%cd /content/drive/MyDrive/Colab Notebooks/Intern_summer2025/code

/content/drive/MyDrive/Colab Notebooks/Intern_summer2025/code


In [2]:
# data = pd.read_csv('../data/msk_chord_2024/data_prompts.csv', sep=',')
data = pd.read_csv('../../data/msk_chord_2024/data_prompts.csv', sep=',')

## Generate prompts - a fast approach
* We generate prompts by taking each patient data and transforming it into a text language description composed of the following:
  * Instructions: What the model needs to do
  * Context: This is a short description of the data and just adding the patient data as a list of items
  * Question: This is the interesting part. We are using a bank of questions derived from an LLM with questions. The idea is to have some variability on how the questions are made.
  * Options: Here we randomly selected options from the whole cohort and add the real option, therefore, this is composed of the true option and 3 more distractors
  * Answer: the letter with the answer.

**bank of questions**
This is a process where we created multiple versions of each variable that we want to make questions for. Basically, we just provide the variable name and the description and ask an LLM to create multiple versions of the questions. For instance, we provide with variable age and description as age of the patient. The questions can be: 1) what is the age of the patient?, 2) how old is the patient 3) how many years the patient has been alive, etc.

In [4]:
data.head()

Unnamed: 0,PATIENT_ID,GENDER,RACE,ETHNICITY,CURRENT_AGE_DEID,STAGE_HIGHEST_RECORDED,NUM_ICDO_DX,SMOKING_PREDICTIONS_3_CLASSES,GLEASON_FIRST_REPORTED,GLEASON_HIGHEST_REPORTED,...,TREATMENT_HISTORY,MUTATIONS,CANCER_TYPE,METASTATIC_SITES,Fraction_Genome_Altered,MSI_Type,Mutation_Count,Primary_Tumor_Site,TMB_nonsynonymous,Tumor_Purity
0,P-0000012,Female,White,Non-Spanish; Non-Hispanic,68.0,Stage 1-3,2,Former/Current Smoker,Not available,Not available,...,"CISPLATIN, ETOPOSIDE, CARBOPLATIN, INVESTIGATI...","PIK3C2G, FLT4, PIK3R3, BLM, MAP3K1, CCNE1, ATR...",Breast Cancer,"Intra Abdominal, Lung, Lymph Nodes",0.3146,Indeterminate,1.0,Breast,1.109155,
1,P-0000015,Female,White,Non-Spanish; Non-Hispanic,45.0,Stage 1-3,1,Unknown,Not available,Not available,...,"ZOLEDRONIC ACID, PACLITAXEL, CARBOPLATIN, GEMC...","ALK, ESR1, TP53, RNF43, CDK4, GATA3, PIK3CA",Breast Cancer,"Bone, Cns Brain, Liver, Lung, Lymph Nodes, Pleura",0.3503,Stable,7.0,Breast,7.764087,40.0
2,P-0000036,Female,Other,Non-Spanish; Non-Hispanic,68.0,Stage 4,1,Never,Not available,Not available,...,"CRIZOTINIB, CRIZOTINIB","ERBB2, NOTCH4, IRS1, TP53, FBXW7, AR, TSHR",Non-Small Cell Lung Cancer,"Bone, Liver, Lung, Lymph Nodes, Pleura",0.0185,Do not report,7.0,Lung,7.764087,30.0
3,P-0000041,Female,White,Non-Spanish; Non-Hispanic,53.0,Stage 1-3,1,Unknown,Not available,Not available,...,"PACLITAXEL PROTEIN-BOUND, ERIBULIN, INVESTIGAT...","INPP4B, PIK3C2G, ERBB4, FLT4, NF2, NOTCH4, TP5...",Breast Cancer,"Bone, Cns Brain, Liver, Pleura",0.4041,Indeterminate,10.0,Breast,11.091553,30.0
4,P-0000066,Female,White,Non-Spanish; Non-Hispanic,71.0,Stage 1-3,1,Never,Not available,Not available,...,"INVESTIGATIONAL, FULVESTRANT, INVESTIGATIONAL,...","CD276, TP53, ESR1",Breast Cancer,"Bone, Liver, Lung, Lymph Nodes, Pleura, Reprod...",0.2434,Do not report,4.0,Breast,4.436621,


In [5]:

def get_options_from_distribution(variable, n=4):
    if variable == 'METASTATIC_SITES':
        options = data['METASTATIC_SITES'].drop_duplicates().sample(n).values
        return options

    if variable == 'OS_MONTHS':
        return data.OS_MONTHS.sample(n).values

    # if variable == "OS_STATUS":
    #     return data.OS_STATUS.sample(n).values

    if variable == 'CANCER_TYPE':
        return data['CANCER_TYPE'].drop_duplicates().sample(n).values

    if variable == 'TMB_nonsynonymous':
        return data['TMB_nonsynonymous'].sample(n).values

    if variable == 'Tumor_Purity':
        return data['Tumor_Purity'].sample(n).values

    if variable == 'Primary_Tumor_Site':
        return data['Primary_Tumor_Site'].drop_duplicates().sample(n).values

In [6]:
options = data['METASTATIC_SITES'].drop_duplicates().sample(4).values
options

array(['Adrenal Glands, Bone, Lung, Lymph Nodes, Pleura',
       'Intra Abdominal, Liver, Reproductive Organs',
       'Bone, Liver, Lung, Lymph Nodes, Pleura, Reproductive Organs',
       'Adrenal Glands, Bone, Cns Brain, Lung, Reproductive Organs'],
      dtype=object)

## Generate a bank of questions from the variables provided

In [8]:
def bank_of_questions(variables):
    bank = []
    for variable in variables:
        question = f'''
        Instructions:
        Generate one simple question from the variable below for an individual patient.
        Do not interpret, explain, or infer anything beyond what is given.
        Do not add headers, asterisks, or quotes—just return the question in plain English.

        For example, if variable is "age", then question could be:
        "What is the age of the patient?"
        "How old is the patient?"

            Variable Name: {variable[0]}
            Variable Description: {variable[1]}
        '''

        response = ollama.generate(
            model='smollm2:latest',
            prompt=question
        )

        bank.append(dict(
            variable = variable[0],
            question = response['response'].strip()
        ))

    return bank

In [9]:
variables = ['METASTATIC_SITES', 'OS_MONTHS', 'CANCER_TYPE', 'TMB_nonsynonymous', 'Tumor_Purity', 'Primary_Tumor_Site']
descriptions = [
    'metastatic sites in a cancer patient',
    'the overal survival in months',
    'the type of cancer diagnosed in the patient',
    'tumor mutation burden of non-synonymous mutations',
    'tumor purity: proportion of malignant cells in the sample',
    'primary tumor site where the cancer originated'
]
bank2 = bank_of_questions(list(zip(variables, descriptions))*50)

In [10]:
qbank3 = pd.DataFrame(bank2)

In [11]:
qbank3.shape

(300, 2)

In [12]:
bank_unique = qbank3.copy()
bank_unique = bank_unique.drop_duplicates(subset=['variable', 'question']).reset_index(drop=True)
bank_unique['variable'].value_counts()

variable
TMB_nonsynonymous     49
Primary_Tumor_Site    49
Tumor_Purity          46
METASTATIC_SITES      45
OS_MONTHS             42
CANCER_TYPE           16
Name: count, dtype: int64

In [13]:
bank_unique.to_csv('../../data/questions_bank3.csv', index=False)

In [14]:
bank_unique

Unnamed: 0,variable,question
0,METASTATIC_SITES,"""What are the metastatic sites in the cancer p..."
1,OS_MONTHS,"""What is the overall survival time in months f..."
2,CANCER_TYPE,"""What type of cancer was diagnosed in the pati..."
3,TMB_nonsynonymous,"""What is the tumor mutation burden of non-syno..."
4,Tumor_Purity,What is the proportion of malignant cells in t...
...,...,...
242,METASTATIC_SITES,"""Where are the cancer cells spreading to, in t..."
243,OS_MONTHS,"""For how many months has the patient been unde..."
244,TMB_nonsynonymous,"""What is the level of tumor mutation burden fo..."
245,Tumor_Purity,"""What is the percentage of malignant cells in ..."


## Generate prompts

In [22]:
variables = ['METASTATIC_SITES', 'OS_MONTHS', 'CANCER_TYPE', 'TMB_nonsynonymous', 'Tumor_Purity', 'Primary_Tumor_Site']
prompts = []
options_id = ['[A]', '[B]', '[C]', '[D]']
for ix, i in tqdm(data.iterrows()):
    for replicate in range(5):
        for variable in variables:

            context_variables = [
                ['Mutations history', i.MUTATIONS],
                ['Age of the patient', i.CURRENT_AGE_DEID],
                ['Treatment history', i.TREATMENT_HISTORY],
                ['HER2 status', i.HER2],
                ['Cancer stage', i.STAGE_HIGHEST_RECORDED],
                ['Gender', i.GENDER],
                ['Smoking history', i.SMOKING_PREDICTIONS_3_CLASSES],
                ['History of PDL-1', i.HISTORY_OF_PDL1],
                ['Fraction Genome altered', i.Fraction_Genome_Altered],
                ['MSI Type', i.MSI_Type],
                ['Mutation Count', i.Mutation_Count],
                ['Number of tumor diagnoses (ICD-O codes)', i.NUM_ICDO_DX],
                ['Overal Survival Status', i.OS_STATUS]
            ]

            context_variables = [f"{i[0]}: {i[1]}" for i in context_variables]
            np.random.shuffle(context_variables)
            context_variables[0] = " * "+context_variables[0]
            context_variables = "\n * ".join(context_variables)

            context = f"""Instruction: You are a helpful medical assistant. 
Based on the following patient data, answer the multiple-choice question by selecting the correct letter ([A], [B], [C], or [D]).
Context: This is a cancer patient with the following characteristics:
{context_variables}"""

            question = bank_unique.query(f'variable == "{variable}"').sample(1).question.values[0]

            options = get_options_from_distribution(variable)
            answer = i[variable]

            options = list(options[options != answer][:3]) + [answer]
            np.random.shuffle(options)
            index_option = np.argmax(np.array(options) == answer)
            options = "".join([f' {i}: {j}\n' for i,j in zip(options_id, options)])
            answer = options_id[index_option]

            text = f"""{context}
Question: {question}
Options (choose ONLY one option letter): \n{options}
Answer:"""

            prompts.append(dict(
                PATIENT_ID=i.PATIENT_ID,
                variable=variable,
                prompt=text,
                answer=answer,
                replicate=replicate
            ))

23777it [05:34, 71.01it/s]


In [23]:
print(prompts[0]['prompt'])

Instruction: You are a helpful medical assistant. 
Based on the following patient data, answer the multiple-choice question by selecting the correct letter ([A], [B], [C], or [D]).
Context: This is a cancer patient with the following characteristics:
 * Age of the patient: 68.0
 * HER2 status: No
 * MSI Type: Indeterminate
 * Overal Survival Status: 0:LIVING
 * Cancer stage: Stage 1-3
 * Smoking history: Former/Current Smoker
 * Fraction Genome altered: 0.3146
 * History of PDL-1: No
 * Number of tumor diagnoses (ICD-O codes): 2
 * Mutation Count: 1.0
 * Gender: Female
 * Mutations history: PIK3C2G, FLT4, PIK3R3, BLM, MAP3K1, CCNE1, ATR, BRIP1, TSHR, SDHA, RET, PTPRD, FBXW7, NSD1, PDGFRA, AXIN2, TP53, SMARCA4, PTPRT, HGF, DOT1L, KDM5A, PTPRS, CREBBP, KMT2C, ATRX
 * Treatment history: CISPLATIN, ETOPOSIDE, CARBOPLATIN, INVESTIGATIONAL, INVESTIGATIONAL, NIVOLUMAB
Question: "Where did the cancer spread to, affecting the patient's health and treatment options?"
Options (choose ONLY one opt

In [24]:
pd.DataFrame(prompts).to_csv('../../data/dataset2_sin_os_status.csv', index=False)

In [6]:
bank = pd.read_csv('../data/questions_bank.csv')

In [None]:
bank

Unnamed: 0,versoin,variable,question
0,0,METASTATIC_SITES,What are the metastatic sites present in a can...
1,0,OS_MONTHS,What is the overall survival time (in months) ...
2,0,OS_STATUS,What is the overall survival status (alive or ...
3,0,CANCER_TYPE,What is the type of cancer that has been diagn...
4,0,TMB_nonsynonymous,What is the tumor mutation burden of non-synon...


In [34]:
filtro = qbank2[qbank2["variable"] == "CANCER_TYPE"]["question"].unique()

for i, pregunta in enumerate(filtro, start=1):
    print(f"{i}. {pregunta}")



1. What type of cancer was diagnosed in the patient?
2. - "What type of cancer was diagnosed in the patient?"
3. - "Which type of cancer was the patient diagnosed with?"
4. - "What type of cancer has the patient been diagnosed with?"
5. - "What type of cancer was found to have been diagnosed in the patient?"
6. "What type of cancer was diagnosed in the patient?"
7. - What type of cancer was diagnosed in the patient?
8. - What type of cancer was found in the patient?
9. What is the type of cancer diagnosed in the patient?
10. - "What is the type of cancer that the patient has been diagnosed with?"
11. What is the type of cancer that the patient has been diagnosed with?
12. - What type of cancer has the patient been diagnosed with?
13. - "What is the type of cancer diagnosed in the patient?"
14. - "What type of cancer did the patient have?"
15. - "What type of cancer has been diagnosed in the patient?"
16. - What is the type of cancer that the patient has?
17. "Which type of cancer was d

In [12]:
# Filtrar las preguntas donde variable sea "METASTATIC_SITES"
filtro = qbank2[qbank2["variable"] == "METASTATIC_SITES"]["question"].value_counts()

# Imprimir cada pregunta con su conteo e índice
for i, (pregunta, conteo) in enumerate(filtro.items(), start=1):
    print(f"{i}. {pregunta} ({conteo} veces)")


1. - "What are the metastatic sites found in the cancer patient's body?" (2 veces)
2. - "Which metastatic sites are present in this cancer patient?" (2 veces)
3. - "What are the metastatic sites found in the cancer patient?" (2 veces)
4. - What are the metastatic sites in the cancer patient? (2 veces)
5. - "What are the metastatic sites present in the cancer patient's body?" (2 veces)
6. - What are the metastatic sites present in the cancer patient? (2 veces)
7. - "What are the metastatic sites present in the cancer patient's body?" or 
  "Which parts of the patient's body have cancerous cells spread to, known as metastatic sites?" (1 veces)
8. - "Where are there other areas of cancer located in the body for this patient?" (1 veces)
9. - What are the metastatic sites where cancer has spread in the patient's body? (1 veces)
10. - "Where are the metastatic sites located in the patient's body?" or "What are the locations of metastasis in the cancer patient?" (1 veces)
11. - "What are the 

Create copy of bank of question with unique values only

In [18]:
qbank2['question'].value_counts()

question
- "What type of cancer was diagnosed in the patient?"                               10
- What type of cancer was diagnosed in the patient?                                  9
What is the tumor mutation burden of non-synonymous mutations for the patient?       3
- "How many months has the patient survived since their diagnosis?"                  3
"What is the tumor mutation burden of non-synonymous mutations for the patient?"     3
                                                                                    ..
- "Where did the cancer first start growing, known as the primary tumor site?"       1
- "Where are the metastases located in the patient's body?"                          1
- "Is the patient alive (OS_STATUS=Living)?"                                         1
- "What is the percent of malignant cells in the patient's tumor sample?"            1
- "What is the primary tumor site where the cancer originated?"                      1
Name: count, Length: 294, dtype: i

In [19]:
bank_unique = qbank2.copy()
bank_unique = bank_unique.drop_duplicates(subset=['variable', 'question']).reset_index(drop=True)
bank_unique['variable'].value_counts()

variable
Primary_Tumor_Site    48
Tumor_Purity          47
OS_MONTHS             46
METASTATIC_SITES      44
TMB_nonsynonymous     44
OS_STATUS             42
CANCER_TYPE           23
Name: count, dtype: int64

In [21]:
bank_unique.to_csv('../../data/questions_bank_unique.csv', index=False)

In [11]:
prompts_ = pd.read_csv('../data/dataset.csv')

In [16]:
prompts_.head(36)

Unnamed: 0,PATIENT_ID,variable,prompt,answer,replicate
0,P-0000012,METASTATIC_SITES,Instructions: You are provided with the follow...,A,0
1,P-0000012,OS_MONTHS,Instructions: You are provided with the follow...,D,0
2,P-0000012,OS_STATUS,Instructions: You are provided with the follow...,B,0
3,P-0000012,CANCER_TYPE,Instructions: You are provided with the follow...,A,0
4,P-0000012,TMB_nonsynonymous,Instructions: You are provided with the follow...,C,0
5,P-0000012,Tumor_Purity,Instructions: You are provided with the follow...,A,0
6,P-0000012,Primary_Tumor_Site,Instructions: You are provided with the follow...,B,0
7,P-0000012,METASTATIC_SITES,Instructions: You are provided with the follow...,C,1
8,P-0000012,OS_MONTHS,Instructions: You are provided with the follow...,D,1
9,P-0000012,OS_STATUS,Instructions: You are provided with the follow...,B,1


In [20]:
print(prompts_['prompt'].iloc[35])

Instructions: You are provided with the following patient data and will answer questions regarding the patient:
    Context: This is a cancer patient with the following characteristics:
     * Gender: Female
 * Mutations history: ALK, ESR1, TP53, RNF43, CDK4, GATA3, PIK3CA
 * Fraction Genome altered: 0.3503
 * History of PDL-1: Not available
 * Cancer stage: Stage 1-3
 * MSI Type: Stable
 * Mutation Count: 7.0
 * Number of tumor diagnoses (ICD-O codes): 1
 * Treatment history: ZOLEDRONIC ACID, PACLITAXEL, CARBOPLATIN, GEMCITABINE, MEGESTROL, VINORELBINE, LETROZOLE, CISPLATIN, EPIRUBICIN, FULVESTRANT, LETROZOLE, CAPECITABINE
 * Smoking history: Unknown
 * HER2 status: No
 * Age of the patient: 45.0
    Question: What are the metastatic sites in a cancer patient?
    Options: 
 A: Bone, Cns Brain, Liver, Lung, Lymph Nodes, Pleura
 B: Bone, Lung, Pleura
 C: Adrenal Glands, Bone, Lymph Nodes, Reproductive Organs
 D: Adrenal Glands, Bone, Cns Brain, Lung, Lymph Nodes, Pleura

    Answer: 
 

In [22]:
data['MUTATIONS']

Unnamed: 0,MUTATIONS
0,"PIK3C2G, FLT4, PIK3R3, BLM, MAP3K1, CCNE1, ATR..."
1,"ALK, ESR1, TP53, RNF43, CDK4, GATA3, PIK3CA"
2,"ERBB2, NOTCH4, IRS1, TP53, FBXW7, AR, TSHR"
3,"INPP4B, PIK3C2G, ERBB4, FLT4, NF2, NOTCH4, TP5..."
4,"CD276, TP53, ESR1"
...,...
23772,"SPOP, KMT2A"
23773,"PALB2, RASA1, PBRM1, DDR2, IKBKE, CDH1, FLT3, ..."
23774,"KRAS, APC, MRE11A, TET2, ESR1, TP53, PIK3CB, K..."
23775,"SF3B1, GATA3, PIK3CA"


In [23]:

# Supongamos que tu DataFrame se llama `df`
# Paso 1: Convertir cada fila en una lista de genes
all_genes = data["MUTATIONS"].dropna().str.split(",\s*")

# Paso 2: Aplanar todas las listas y crear un set para quedarnos con únicos
unique_genes = set(gene for sublist in all_genes for gene in sublist)

# Paso 3: Contar cuántos genes únicos hay
print(f"Número de genes únicos mutados: {len(unique_genes)}")


Número de genes únicos mutados: 524


## Prompts for instruct model. Llama 3-8B
A little different from the previous one

In [28]:
variables = ['METASTATIC_SITES', 'OS_MONTHS', 'OS_STATUS', 'CANCER_TYPE', 'TMB_nonsynonymous', 'Tumor_Purity', 'Primary_Tumor_Site']
prompts = []
options_id = ['A', 'B', 'C', 'D']
for ix, i in tqdm(data.iterrows()):
    for replicate in range(5):
        for variable in variables:

            context_variables = [
                ['Mutations history', i.MUTATIONS],
                ['Age of the patient', i.CURRENT_AGE_DEID],
                ['Treatment history', i.TREATMENT_HISTORY],
                ['HER2 status', i.HER2],
                ['Cancer stage', i.STAGE_HIGHEST_RECORDED],
                ['Gender', i.GENDER],
                ['Smoking history', i.SMOKING_PREDICTIONS_3_CLASSES],
                ['History of PDL-1', i.HISTORY_OF_PDL1],
                ['Fraction Genome altered', i.Fraction_Genome_Altered],
                ['MSI Type', i.MSI_Type],
                ['Mutation Count', i.Mutation_Count],
                ['Number of tumor diagnoses (ICD-O codes)', i.NUM_ICDO_DX]
            ]

            context_variables = [f"{i[0]}: {i[1]}" for i in context_variables]
            np.random.shuffle(context_variables)
            context_variables[0] = " * "+context_variables[0]
            context_variables = "\n * ".join(context_variables)

            context = f"""Instruction: You are a helpful medical assistant. 
Based on the following patient data, answer the multiple-choice question by selecting the correct letter (A, B, C, or D).
Context: This is a cancer patient with the following characteristics:
{context_variables}"""

            question = qbank2.query(f'variable == "{variable}"').sample(1).question.values[0]

            options = get_options_from_distribution(variable)
            answer = i[variable]

            options = list(options[options != answer][:3]) + [answer]
            np.random.shuffle(options)
            index_option = np.argmax(np.array(options) == answer)
            options = "".join([f' {i}: {j}\n' for i,j in zip(options_id, options)])
            answer = options_id[index_option]

            text = f"""{context}
Question: {question}
Options (choose one letter): \n{options}
    Answer:
    """

            prompts.append(dict(
                PATIENT_ID=i.PATIENT_ID,
                variable=variable,
                prompt=text,
                answer=answer,
                replicate=replicate
            ))

23777it [06:45, 58.65it/s]


In [29]:
print(prompts[0]['prompt'])

Instruction: You are a helpful medical assistant. 
Based on the following patient data, answer the multiple-choice question by selecting the correct letter (A, B, C, or D).
Context: This is a cancer patient with the following characteristics:
 * Age of the patient: 68.0
 * Treatment history: CISPLATIN, ETOPOSIDE, CARBOPLATIN, INVESTIGATIONAL, INVESTIGATIONAL, NIVOLUMAB
 * History of PDL-1: No
 * Gender: Female
 * Cancer stage: Stage 1-3
 * Smoking history: Former/Current Smoker
 * MSI Type: Indeterminate
 * Mutation Count: 1.0
 * HER2 status: No
 * Mutations history: PIK3C2G, FLT4, PIK3R3, BLM, MAP3K1, CCNE1, ATR, BRIP1, TSHR, SDHA, RET, PTPRD, FBXW7, NSD1, PDGFRA, AXIN2, TP53, SMARCA4, PTPRT, HGF, DOT1L, KDM5A, PTPRS, CREBBP, KMT2C, ATRX
 * Fraction Genome altered: 0.3146
 * Number of tumor diagnoses (ICD-O codes): 2
Question: - "What are the metastatic sites in a cancer patient?"
Options (choose one letter): 
 A: Adrenal Glands, Bone, Cns Brain, Liver, Lung, Lymph Nodes, Reproductive

In [30]:
import json

with open("../../data/llama3_finetune_dataset.jsonl", "w") as f:
    for row in prompts:
        prompt_text = row["prompt"].strip()
        answer_text = row["answer"].strip()

        full_example = f"""<|begin_of_text|>
<|system|>
You are a helpful medical assistant.
<|user|>
{prompt_text}
<|assistant|>
{answer_text}
<|end_of_text|>"""

        json.dump({"text": full_example}, f)
        f.write("\n")


In [31]:
pd.DataFrame(prompts).to_csv('../../data/prompts-instruct-model.csv', index=False)