In [2]:
import pandas as pd
import numpy as np
import random
from autocorrect import Speller
import re
from LLM import LLM
from PromptBuilder import PromptBuilder
from metadata import language_registers, discussion_tones

2025-01-28 20:14:57.071123: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-28 20:14:57.081745: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1738091697.095287   41287 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1738091697.099636   41287 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-28 20:14:57.113183: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [3]:
CTCAE_database = pd.read_excel('CTCAE_v5.0.xlsx')
CTCAE_symptoms = CTCAE_database['CTCAE Term'].values
Grades = ['Grade 1', 'Grade 2', 'Grade 3', 'Grade 4', 'Grade 5']
CTCAE_database.head()

Unnamed: 0,MedDRA Code,MedDRA SOC,CTCAE Term,Grade 1,Grade 2,Grade 3,Grade 4,Grade 5,Definition,Navigational Note,CTCAE v5.0 Change
0,10002272,Blood and lymphatic system disorders,Anemia,Hemoglobin (Hgb) <LLN - 10.0 g/dL; <LLN - 6.2 ...,Hgb <10.0 - 8.0 g/dL; <6.2 - 4.9 mmol/L; <100 ...,Hgb <8.0 g/dL; <4.9 mmol/L; <80 g/L; transfusi...,Life-threatening consequences; urgent interven...,Death,A disorder characterized by a reduction in the...,,Clarification: Definition
1,10005329,Blood and lymphatic system disorders,"Blood and lymphatic system disorders - Other, ...",Asymptomatic or mild symptoms; clinical or dia...,"Moderate; minimal, local or noninvasive interv...",Severe or medically significant but not immedi...,Life-threatening consequences; urgent interven...,Death,-,,Clarification: Grade 3
2,10048580,Blood and lymphatic system disorders,Bone marrow hypocellular,Mildly hypocellular or <=25% reduction from no...,Moderately hypocellular or >25 - <50% reductio...,Severely hypocellular or >50 - <=75% reduction...,Aplastic persistent for longer than 2 weeks,Death,A disorder characterized by the inability of t...,,
3,10013442,Blood and lymphatic system disorders,Disseminated intravascular coagulation,-,Laboratory findings with no bleeding,Laboratory findings and bleeding,Life-threatening consequences; urgent interven...,Death,A disorder characterized by systemic pathologi...,,
4,10014950,Blood and lymphatic system disorders,Eosinophilia,>ULN and >Baseline,-,Steroids initiated,-,-,A disorder characterized by laboratory test re...,,Addition: Term


### Llama 8B instruct 

In [5]:
import os
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig


class LLM:
    def __init__(self, 
                max_length: int = 300,
                model_name: str = "meta-llama/Meta-Llama-3.1-8B-Instruct"):
        
        """
        Initialize the LLM model and its tokenizer.

        :param max_length: (int) Maximum length of the generated text (default: 300).
        :param model_name: (str) Name of the model on Hugging Face Hub.
        """

        self.token = json.load(open("config.json"))["TOKEN"]

        if not self.token:
            raise ValueError("Authentication token is missing. Set it in the environment or config.json.")

        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=self.token)

        self.tokenizer.pad_token = self.tokenizer.eos_token or self.tokenizer.unk_token

        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )

        self.model = AutoModelForCausalLM.from_pretrained(
            model_name, 
            device_map="auto", 
            quantization_config=bnb_config, 
            use_auth_token=self.token
        )

        self.max_length = max_length

        self.generate_kwargs = {
                "do_sample": True,
                "temperature": 0.7,
                "max_new_tokens": 200,
                "eos_token_id": self.tokenizer.eos_token_id,
                "top_p": 0.9, 
                "repetition_penalty": 1.2  # Penalize repetitive phrases
            }

        self.text_generator = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            max_new_tokens=self.max_length,
        )

        print(f"Model loaded on device(s): {self.model.hf_device_map}")


    def generate_text(self, messages: list) -> str:
        """
        Generate text based on a list of messages (conversational format).

        :param messages: (list) A list of dictionaries containing conversation history.
                         Example: [{"role": "system", "content": "You are..."},
                                   {"role": "user", "content": "Hello!"}]
        :return: (str) Generated text response from the model.
        """

        outputs = self.text_generator(messages)

        # Extract the generated text
        generated_text = outputs[0]["generated_text"]
        return generated_text[-1]['content']

In [8]:
model = LLM()



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


Model loaded on device(s): {'': 0}


In [8]:
def build_prompt(
    symptoms: list[str], 
    is_grade: bool, 
    grades: list[str], 
    grades_description: list[str], 
    detail_level: float, 
    enumeration: bool = False, 
    explicit_symptom: bool = False, 
    language_style: str = "everyday language", 
    spelling_errors: bool = False, 
    tone: str = "neutral"
) -> list[dict]:
    """
    Construct a prompt based on the given parameters.
    """
    if not isinstance(symptoms, list) or not all(isinstance(s, str) for s in symptoms):
        raise ValueError("symptoms must be a list of strings.")
    
    if is_grade:
        if not isinstance(grades, list) or not isinstance(grades_description, list):
            raise ValueError("grades and grades_description must be lists.")

    messages = [
        {
            "role": "system",
            "content": f"You are a chatbot that writes in the voice of a patient using a {language_style} register with a {tone} tone."
        },
        {
            "role": "user",
            "content": (
                f"Write a phrase from a sick patient describing their symptoms. The symptoms the patient describes are: {', '.join(symptoms)}. "
                f"The patient speaks in {language_style} Register and uses a {tone} tone. "
            )
        }
    ]
    
    if is_grade:
        messages[1]['content'] += (
            f"The grades of the symptoms correspond to a scale {grades} (in the same order as the symptoms list, "
            f"where 1 corresponds to the least severe and 5 to the most severe). These grades correspond to: {grades_description}. "
        )

    # Specify the level of detail
    detail_instructions = {
        1: "The description should be very brief with minimal details.",
        2: "The description should be brief with some basic details.",
        3: "The description should provide a moderate level of detail.",
        4: "The description should be detailed and thorough.",
        5: "The description should be very detailed and comprehensive."
    }
    messages[1]['content'] += detail_instructions.get(detail_level, "Provide a moderate level of detail.") + " "

    if enumeration:
        messages[1]['content'] += (
            "The patient should just enumerate the descriptions. For example, for symptom Anemia, a sentence could be: "
            "'Fatigue and Weakness, Pale Skin, Shortness of Breath, Dizziness, Irregular Heartbeat, Chest Pain, Cold Hands and Feet, Headaches'. "
        )

    if explicit_symptom:
        messages[1]['content'] += f"The patient should explicitly mention the symptoms they have (e.g., {', '.join(symptoms)}). "

    if spelling_errors:
        messages[1]['content'] += " Include spelling mistakes in the generated phrase."

    messages[1]['content'] += " Respond with only the sentence in the patient's voice, enclosed in double quotes, without any additional text, comments, or notes."

    return messages


In [28]:
df.to_csv("building_dataset_with_llama8B.csv")

In [6]:
model = LLM(model_name="iRASC/BioLlama-Ko-8B")



Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

Device set to use cuda:0


Model loaded on device(s): {'': 0}


In [10]:
data = []

for symptom in CTCAE_symptoms[:100]:

    symptoms = [symptom] 
    CTCAE_symptoms_copy = list(CTCAE_symptoms.copy())

    CTCAE_symptoms_copy.remove(symptom) 

    while True:  

        if np.random.choice([True, False], p=[0.3, 0.7]):
            
            if CTCAE_symptoms_copy: 
                symptom2 = np.random.choice(CTCAE_symptoms_copy)
                symptoms.append(symptom2)
                CTCAE_symptoms_copy.remove(symptom2)
            else:
                break 
        else:
            break 

    is_grade = np.random.choice([True, False], p=[0.8, 0.2])

    grades = [random.choice(Grades) for _ in range(len(symptoms))]

    grades_description = [ CTCAE_database[CTCAE_database['CTCAE Term'] == symptom][grade].values[0] for symptom,grade in zip(symptoms,grades)]
    
    detail_level = np.random.choice([1,2,3,4,5])

    enumeration = np.random.choice([True, False], p=[0.2, 0.8])

    explicit_symptom = np.random.choice([True, False], p=[0.2, 0.8])

    language_style = random.choice(language_registers)['name']

    tone = random.choice(discussion_tones)['name']

    spelling_errors = random.choice([True, False])

    prompt = build_prompt(symptoms, is_grade, grades, grades_description, detail_level, enumeration, explicit_symptom, language_style, spelling_errors, tone) 
    

    phrase_generated = model.generate_text(messages = prompt)

    data.append([symptoms, phrase_generated, language_style,tone,detail_level, enumeration, explicit_symptom, spelling_errors, prompt])

df = pd.DataFrame(data, columns=["Symptoms", "Dialogue_Generated", "Language_style", "Tone", "Detail_level", "Enumeration", "Explicit_symptom", "Spelling_errors", "Prompt"])

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [13]:
df

Unnamed: 0,Symptoms,Dialogue_Generated,Language_style,Tone,Detail_level,Enumeration,Explicit_symptom,Spelling_errors,Prompt
0,[Anemia],"""My blood is as dry as the desert sand, and m...",Poetic/Literary Register,Fearful,4,False,False,False,"[{'role': 'system', 'content': 'You are a chat..."
1,"[Blood and lymphatic system disorders - Other,...","""I am experiencing a severe Blood and lymphat...",Formal Register,Insulting,2,False,True,True,"[{'role': 'system', 'content': 'You are a chat..."
2,"[Bone marrow hypocellular, Hypernatremia]","""I've been feeling really weak lately and I'v...",Informal Register,Friendly,4,False,False,False,"[{'role': 'system', 'content': 'You are a chat..."
3,[Disseminated intravascular coagulation],"""I've got this weird feeling in my body, like...",Informal Register,Confused,2,False,False,False,"[{'role': 'system', 'content': 'You are a chat..."
4,[Eosinophilia],"""I have a severe Eosinophilia.""",Formal Register,Angry,1,False,True,False,"[{'role': 'system', 'content': 'You are a chat..."
...,...,...,...,...,...,...,...,...,...
95,[Abdominal distension],"""My belly is a bloated balloon, a distended d...",Poetic/Literary Register,Insulting,1,False,False,False,"[{'role': 'system', 'content': 'You are a chat..."
96,"[Abdominal pain, Eye pain]","""I am experiencing abdominal pain and eye pai...",Formal Register,Insulting,4,False,False,False,"[{'role': 'system', 'content': 'You are a chat..."
97,[Anal fissure],"""I've got a bloody arsehole, it's so sore I c...",Informal Register,Insulting,4,False,False,True,"[{'role': 'system', 'content': 'You are a chat..."
98,[Anal fistula],"""My bottom has a small hole that won't heal.""",Poetic/Literary Register,Friendly,1,False,False,False,"[{'role': 'system', 'content': 'You are a chat..."


### Example : 

In [19]:
df['Prompt'][79]

[{'role': 'system',
  'content': 'You are a chatbot that writes in the voice of a patient using a Formal Register register with a Fearful tone.'},
 {'role': 'user',
  'content': "Write a phrase from a sick patient describing their symptoms. The symptoms the patient describes are: Glaucoma, Intraoperative renal injury. The patient speaks in Formal Register Register and uses a Fearful tone. The grades of the symptoms correspond to a scale ['Grade 4', 'Grade 1'] (in the same order as the symptoms list, where 1 corresponds to the least severe and 5 to the most severe). These grades correspond to: ['Visual field deficit within the central 10 degrees of the visual field in the affected eye', 'Primary repair of injured organ/structure indicated']. The description should be very detailed and comprehensive.  Include spelling mistakes in the generated phrase. Respond with only the sentence in the patient's voice, enclosed in double quotes, without any additional text, comments, or notes."}]

In [18]:
df['Dialogue_Generated'][79]

' "I have been experiencing some vision problems lately, and I was told that I have glaucoma. I am very concerned about this because I have heard that it can lead to blindness. I also had a kidney injury during a recent surgery, and I was told that I have intraoperative renal injury. I am very worried about this because I have heard that it can lead to chronic kidney disease. I am hoping that you can help me with this problem."'

In [20]:
df_1 = df

In [21]:
data = []

for symptom in CTCAE_symptoms[100:200]:

    symptoms = [symptom] 
    CTCAE_symptoms_copy = list(CTCAE_symptoms.copy())

    CTCAE_symptoms_copy.remove(symptom) 

    while True:  

        if np.random.choice([True, False], p=[0.3, 0.7]):
            
            if CTCAE_symptoms_copy: 
                symptom2 = np.random.choice(CTCAE_symptoms_copy)
                symptoms.append(symptom2)
                CTCAE_symptoms_copy.remove(symptom2)
            else:
                break 
        else:
            break 

    is_grade = np.random.choice([True, False], p=[0.8, 0.2])

    grades = [random.choice(Grades) for _ in range(len(symptoms))]

    grades_description = [ CTCAE_database[CTCAE_database['CTCAE Term'] == symptom][grade].values[0] for symptom,grade in zip(symptoms,grades)]
    
    detail_level = np.random.choice([1,2,3,4,5])

    enumeration = np.random.choice([True, False], p=[0.2, 0.8])

    explicit_symptom = np.random.choice([True, False], p=[0.2, 0.8])

    language_style = random.choice(language_registers)['name']

    tone = random.choice(discussion_tones)['name']

    spelling_errors = random.choice([True, False])

    prompt = build_prompt(symptoms, is_grade, grades, grades_description, detail_level, enumeration, explicit_symptom, language_style, spelling_errors, tone) 
    

    phrase_generated = model.generate_text(messages = prompt)

    data.append([symptoms, phrase_generated, language_style,tone,detail_level, enumeration, explicit_symptom, spelling_errors, prompt])

df_2 = pd.DataFrame(data, columns=["Symptoms", "Dialogue_Generated", "Language_style", "Tone", "Detail_level", "Enumeration", "Explicit_symptom", "Spelling_errors", "Prompt"])

In [25]:
df_gen_biollama = pd.concat([df_1, df_2])

In [28]:
df_gen_biollama = df_gen_biollama.reset_index(drop=True)

In [30]:
df_gen_biollama.to_csv("building_dataset_with_biollama8B.csv")

### Data generated :

In [5]:
with_biollama = pd.read_csv('building_dataset_with_biollama8B.csv')
with_biollama.head(5)

Unnamed: 0.1,Unnamed: 0,Symptoms,Dialogue_Generated,Language_style,Tone,Detail_level,Enumeration,Explicit_symptom,Spelling_errors,Prompt
0,0,['Anemia'],"""My blood is as dry as the desert sand, and m...",Poetic/Literary Register,Fearful,4,False,False,False,"[{'role': 'system', 'content': 'You are a chat..."
1,1,['Blood and lymphatic system disorders - Other...,"""I am experiencing a severe Blood and lymphat...",Formal Register,Insulting,2,False,True,True,"[{'role': 'system', 'content': 'You are a chat..."
2,2,"['Bone marrow hypocellular', 'Hypernatremia']","""I've been feeling really weak lately and I'v...",Informal Register,Friendly,4,False,False,False,"[{'role': 'system', 'content': 'You are a chat..."
3,3,['Disseminated intravascular coagulation'],"""I've got this weird feeling in my body, like...",Informal Register,Confused,2,False,False,False,"[{'role': 'system', 'content': 'You are a chat..."
4,4,['Eosinophilia'],"""I have a severe Eosinophilia.""",Formal Register,Angry,1,False,True,False,"[{'role': 'system', 'content': 'You are a chat..."


In [6]:
with_llama = pd.read_csv('building_dataset_with_llama8B.csv')
with_llama.head(5)

Unnamed: 0.1,Unnamed: 0,Symptoms,Dialogue_Generated,Language_style,Tone,Detail_level,Enumeration,Explicit_symptom,Spelling_errors,Prompt
0,0,['Anemia'],"""Alas, my weary soul, beset by the weight of a...",Poetic/Literary Register,Friendly,2,False,True,False,"[{'role': 'system', 'content': 'You are a chat..."
1,1,['Blood and lymphatic system disorders - Other...,"""Alas, my pitiful physician, I'm beset by the ...",Poetic/Literary Register,Insulting,4,False,True,False,"[{'role': 'system', 'content': 'You are a chat..."
2,2,['Bone marrow hypocellular'],"""I'm not really sure what's going on, but my d...",Neutral/Standard Register,Confused,2,False,False,False,"[{'role': 'system', 'content': 'You are a chat..."
3,3,['Disseminated intravascular coagulation'],"""Great, just what I needed, a lovely case of D...",Neutral/Standard Register,Insulting,4,False,True,False,"[{'role': 'system', 'content': 'You are a chat..."
4,4,['Eosinophilia'],"""Oh, the terror that grips my fragile form, as...",Poetic/Literary Register,Fearful,3,False,False,False,"[{'role': 'system', 'content': 'You are a chat..."
