## Import libraries

In [1]:
import pandas as pd
import os
import time
import json
from helpers import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# !pip3 install -U openai

### OpenAI Settings

In [None]:
# Set OpenAI API Key
# The key is read from the OPENAI_API_KEY environment variable so that it never
# has to be typed into (and accidentally committed with) the notebook. It falls
# back to an empty string when the variable is unset.
api_key = os.environ.get('OPENAI_API_KEY', '')
# Models that the main loop routes to the OpenAI helpers. 'o1-preview' is
# listed because the main loop has an explicit branch for it; without this
# entry that branch could never be reached.
openai_models = ['gpt-4o', 'gpt-4o-mini', 'o1-mini', 'o1-preview']

### DeepSeek Settings

In [None]:
# Deepseek API
# The key is read from the DEEPSEEK_API_KEY environment variable so that it
# never has to be stored in the notebook. It falls back to an empty string when
# the variable is unset.
ds_api_key = os.environ.get("DEEPSEEK_API_KEY", "")
# Models that the main loop routes to the Deepseek helper.
deepseek_models = ['deepseek-reasoner', 'deepseek-chat']

## Read Data

In [5]:
# Run parameters: which context file, model and dataset split to use.
context_path = 'context(en).txt'

# Model identifiers that can be assigned to `model`:
#   ollama:
#     mistral-nemo:12b-instruct-2407-fp16,
#     llama3.1:8b-instruct-fp16, llama3.1:70b-instruct-q4_0,
#     qwen2.5:7b-instruct-fp16, llama3.3:70b-instruct-q4_0, marco-o1:7b-fp16,
#     deepseek-r1:70b, deepseek-r1:70b-llama-distill-q4_K_M,
#     deepseek-r1:32b, deepseek-r1:8b-llama-distill-fp16
#   Open AI:
#     gpt-4o, gpt-4o-mini, o1-mini, o1-preview (too expensive)
#   Deepseek:
#     deepseek-reasoner, deepseek-chat
model = 'deepseek-reasoner'

# Dataset split; it selects both the input folder and the CSV that lists the case ids.
ds_type = "test"
input_path = f"../radnlp_2024_train_val_20240731/en/main_task/{ds_type}/"
# The "test" split reads the sample submission file; every other split reads label.csv.
csv_file_name = 'sample_submission.csv' if ds_type == "test" else 'label.csv'

In [None]:
# Load the CSV that lists the case ids for the selected split and preview it.
csv_path = f"{input_path}{csv_file_name}"
data = pd.read_csv(csv_path, index_col=False)
data.head()

Unnamed: 0,id,t,n,m
0,467592,T0,N0,M0
1,612334,T0,N0,M0
2,1062264,T0,N0,M0
3,1112974,T0,N0,M0
4,1500370,T0,N0,M0


In [7]:
# Read the whole context file as UTF-8 text; it is embedded in the system prompt.
with open(context_path, encoding="utf-8") as context_file:
    context = context_file.read()

## Classification

### Prompt

In [8]:
# System prompt shared by every request. `context` (the text loaded from
# `context_path`) is interpolated into it; doubled braces are literal braces
# in the rendered prompt.
# The example output must itself be valid JSON, because the model is told to
# answer strictly in that format: each key/value pair is comma-separated.
# NOTE(review): the T1 note below mentions "T1ss", which is not part of the
# T options list — confirm which of the two is intended.
system_prompt = f"""You are a seasoned oncologist specializing in cancer classification. 
Your task is to determine the TNM classification based solely on the provided radiology report.
---
TNM Classification System Overview:
- T (Tumor): Size and extent of the primary tumor.
- N (Node): Whether the cancer has spread to nearby lymph nodes.
- M (Metastasis): Whether the cancer has spread to distant parts of the body.

{context}
---
Instructions:
- Use only the information from the radiology report.
- Provide your classification strictly in the specified JSON format without any additional text.
- Ensure each classification is specific to the categories provided.
- Include an additional key "Reason" in the JSON output explaining the rationale behind your TNM classification.
---
Classification Options:
- T: {{T0, Tis, T1mi, T1a, T1b, T1c, T2a, T2b, T3, T4}}
  - Note: "T1" should be broken down into T1mi, T1ss, T1a, T1b, or T1c.
  - Note: "T2" should be specified as either T2a or T2b.
  - Note: Use T0 if the tumor does not fit into Tis to T4.
- N: {{N0, N1, N2, N3}}
- M: {{M0, M1a, M1b, M1c}}
  - Note: "M1" must be specified as M1a, M1b, or M1c; there is no "M1" category.
---
Example Output:
{{
  "T": "T2a",
  "N": "N1",
  "M": "M0",
  "Reason": "Your explanation for the TNM classification."
}}
"""

In [9]:
def build_user_prompt(report: str):
    """Build the user-turn prompt for a single radiology report.

    The report text is placed between a fixed instruction header and a fixed
    footer that lists the required JSON format and the allowed T/N/M values.

    Args:
        report: Text of the radiology report to classify.

    Returns:
        The complete prompt string.
    """
    # The first header line intentionally keeps its trailing space.
    header = (
        "Please derive the TNM classification based on the following radiology report. \n"
        "Use only the information provided and ensure that the classification follows the format below without any additional text.\n"
        "---\n"
        "Radiology Report:\n"
    )
    # Plain (non-f) string, so the braces below are literal.
    footer = """

---
Classification Format (choose from the specified options):
{
  "T": "T?",
  "N": "N?",
  "M": "M?",
  "Reason": "Your explanation for the TNM classification."
}
Classification Options:
- T: {T0, Tis, T1mi, T1a, T1b, T1c, T2a, T2b, T3, T4}
  - Note: There is no "T1" or "T2" without subcategories.
- N: {N0, N1, N2, N3}
- M: {M0, M1a, M1b, M1c}
  - Note: There is no "M1" without subcategories.
"""
    return f"{header}{report}{footer}"
    

In [10]:
# messages = [{'role': 'user', 'content': "Hello"}]
# response = deepseek(model, messages, ds_api_key)
# response = o1(model, messages, api_key)
# print(response)

### Main Loop

In [11]:
# Run every listed report through the selected model, collect the parsed
# predictions, and write them to CSV.
# NOTE(review): get_input_text, cm_to_mm, get_json, GPT, o1, deepseek and LLM
# are not defined in this notebook; presumably they come from
# `from helpers import *` — confirm.
num_files = data.shape[0]
result = []
start_time = time.time()

# Retry policy: keep retrying a case until it succeeds (the limit is
# effectively unbounded), but pause between attempts so that a rate-limited or
# temporarily unreachable API is not hammered in a tight loop.
max_retries = 999999
max_backoff_seconds = 30

# Create the output folders before the (long) run, so a missing directory
# cannot make the final to_csv calls fail and discard all collected results.
output_dir = f"./prediction/ensemble/{ds_type}"
reason_dir = f"{output_dir}/reason"
os.makedirs(reason_dir, exist_ok=True)
output_name = f"{model.replace(':', '-')}_{ds_type}_03.csv"

for i in range(num_files):
    rID = data.iloc[i]['id']
    file_name = f"{rID}.txt"

    report = get_input_text(input_path, file_name)
    # convert cm to mm
    report = cm_to_mm(report)

    # The prompt does not change between retries, so build it once per report.
    user_prompt = build_user_prompt(report)

    retries = 0
    success = False

    while not success and retries < max_retries:
        try:
            messages = [
                {'role': 'system', 'content': system_prompt},
                {'role': 'user', 'content': user_prompt}
            ]

            # LLM generate response
            if model in openai_models:
                if model in ['o1-mini', 'o1-preview']:
                    # These models get system and user prompt merged into one
                    # user message (presumably they do not accept a system
                    # role — confirm).
                    messages = [{'role': 'user', 'content': system_prompt + user_prompt}]
                    response = o1(model, messages, api_key)
                else:
                    response = GPT(model, messages, api_key)
            elif model in deepseek_models:
                response = deepseek(model, messages, ds_api_key)
            else:
                response = LLM(model, messages)

            # Print case information
            print(f"Case rID: {rID}")
            print("Response: ", response)
            print('--------------------------------------------------------------')

            # Create the combined JSON; a response that get_json cannot parse
            # raises here and triggers a retry.
            combined_json = {
                "id": rID,
                **get_json(response),
                "input_text": report,
                "model_response": response,
                "model": model
            }

            print(combined_json)
            print('--------------------------------------------------------------')
            result.append(combined_json)
            success = True

        except Exception as e:
            retries += 1
            print(f"Error for rID {rID}: {str(e)}. Retry {retries}/{max_retries}...")
            if retries < max_retries:
                # Exponential backoff: 1, 2, 4, ... seconds, capped. The
                # exponent is clamped so it stays small however many retries
                # have happened.
                time.sleep(min(2 ** min(retries - 1, 10), max_backoff_seconds))

    if not success:
        print(f"Failed to process rID {rID} after {max_retries} attempts.")

    # Wait for 1 minute after processing 15 samples(openai limit). This is done
    # once per case, outside the retry loop, so retries do not repeat the wait.
    if model in openai_models and (i + 1) % 15 == 0:
        print("Processed 15 samples, waiting for 1 minute...")
        time.sleep(60)

df_result = pd.DataFrame(result)
df_result.columns = df_result.columns.str.lower()
# Full record (including reason, input text and raw response) for inspection.
df_result.to_csv(f"{reason_dir}/{output_name}", index=False)

# Only id and the three labels are written to the second CSV.
df_result = df_result[['id', 't', 'n', 'm']]
df_result.to_csv(f"{output_dir}/{output_name}", index=False)

print()
print("--- %s seconds ---" % (time.time() - start_time))

Case rID: 467592
Response:  {
  "T": "T4",
  "N": "N1",
  "M": "M0",
  "Reason": "T4 is assigned due to suspected lesions in different ipsilateral lobes (right middle and lower lobes). N1 corresponds to the right hilar lymph node enlargement, indicating ipsilateral hilar involvement. M0 is chosen as the adrenal nodule is suggestive of a benign adenoma and lacks confirmation of metastasis, while the hepatic cyst is considered non-malignant."
}
--------------------------------------------------------------
{'id': 467592, 'T': 'T4', 'N': 'N1', 'M': 'M0', 'Reason': 'T4 is assigned due to suspected lesions in different ipsilateral lobes (right middle and lower lobes). N1 corresponds to the right hilar lymph node enlargement, indicating ipsilateral hilar involvement. M0 is chosen as the adrenal nodule is suggestive of a benign adenoma and lacks confirmation of metastasis, while the hepatic cyst is considered non-malignant.', 'input_text': 'Infiltrative shadows in the right middle lobe and gr