In [2]:
import pandas as pd
import numpy as np
import random
from autocorrect import Speller
import re
from LLM import LLM
from PromptBuilder import PromptBuilder
from metadata import language_registers, discussion_tones

### Biollama :

In [3]:
model = LLM("iRASC/BioLlama-Ko-8B")



Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

Device set to use cuda:0


Model loaded on device(s): {'': 0}


### PRO dataset :

In [4]:
a = pd.read_excel('PRO-CTCAE_Questionnaire_Terminology.xls', sheet_name = 'PRO')
a.head(4)

Unnamed: 0,NCIt Code of Subset,Subset Name,NCIt Concept Code,NCIt PT,NCIt Definition,PRO-CTCAE PT,Has PRO-CTCAE Attribute Code,Has PRO-CTCAE Attribute PT,PRO-CTCAE Attribute Has Value Code,PRO-CTCAE Value PT
0,C205862,PRO-CTCAE Oral Adverse Event Terminology,C205929,Cracking at the Corners of the Mouth (Cheilosi...,Cheilosis or cheilitis as recorded on the PRO-...,Cracking at the corners of the mouth (cheilosi...,C25676,Severity,,
1,C205862,PRO-CTCAE Oral Adverse Event Terminology,C205927,"Difficulty Swallowing, PRO-CTCAE",Problems with swallowing as recorded on the PR...,Difficulty Swallowing,C25676,Severity,,
2,C205862,PRO-CTCAE Oral Adverse Event Terminology,C205926,"Dry Mouth, PRO-CTCAE",Dryness of the oral mucosa secondary to a decr...,Dry Mouth,C25676,Severity,,
3,C205862,PRO-CTCAE Oral Adverse Event Terminology,C205931,"Hoarseness, PRO-CTCAE",A raspy alteration in one's voice as recorded ...,Hoarseness,C25676,Severity,,


In [5]:
index = (a [a['PRO-CTCAE PT'] == 'Pain and swelling at injection site']).index.values[0] +1 
symptoms = a.iloc[:index]['PRO-CTCAE PT'].values
symptoms

array(['Cracking at the corners of the mouth (cheilosis/cheilitis)',
       'Difficulty Swallowing', 'Dry Mouth', 'Hoarseness',
       'Mouth/throat sores', 'Voice quality changes', 'Abdominal pain',
       'Bloating', 'Constipation', 'Decreased appetite', 'Diarrhea',
       'Fecal incontinence', 'Gas', 'Heartburn', 'Hiccups', 'Nausea',
       'Taste Changes', 'Vomiting', 'Coughing', 'Shortness of breath',
       'Wheezing', 'Heart palpitations', 'Swelling (arms or legs)',
       'Acne', 'Bed/pressure sores', 'Hair loss',
       'Hand-foot syndrome (a rash of the hands and feet that can cause cracking, peeling, redness or pain)',
       'Hives', 'Itching', 'Nail discoloration', 'Nail loss',
       'Nail ridging', 'Radiation skin reaction', 'Rash',
       'Sensitivity to sunlight', 'Skin darkening', 'Skin dryness',
       'Stretch marks', 'Dizziness', 'Numbness & tingling',
       'Blurred vision', 'Flashing lights', 'Ringing in ears',
       'Visual floaters', 'Watery eyes', 'Concentra

In [6]:
Dict = {}
for symptom in symptoms : 
    try:
        Dict[symptom] = {}
        descriptions_code = a[a['PRO-CTCAE PT'] == symptom]['Has PRO-CTCAE Attribute Code'].values[0].split(" || ")
        descriptions = a [a['PRO-CTCAE PT'] == symptom]['Has PRO-CTCAE Attribute PT'].values[0].split(" || ")
        for description in descriptions :
            Dict[symptom][description]  = a[a['PRO-CTCAE PT'] == description  ]['PRO-CTCAE Value PT'].values[0].split(" || ")
    
    except Exception as e :
        print(e)

'float' object has no attribute 'split'


In [7]:
Dict

{'Cracking at the corners of the mouth (cheilosis/cheilitis)': {'Severity': ['Very severe',
   'Not sexually active',
   'Prefer not to answer',
   'None',
   'Not applicable',
   'Moderate',
   'Mild',
   'Severe']},
 'Difficulty Swallowing': {'Severity': ['Very severe',
   'Not sexually active',
   'Prefer not to answer',
   'None',
   'Not applicable',
   'Moderate',
   'Mild',
   'Severe']},
 'Dry Mouth': {'Severity': ['Very severe',
   'Not sexually active',
   'Prefer not to answer',
   'None',
   'Not applicable',
   'Moderate',
   'Mild',
   'Severe']},
 'Hoarseness': {'Severity': ['Very severe',
   'Not sexually active',
   'Prefer not to answer',
   'None',
   'Not applicable',
   'Moderate',
   'Mild',
   'Severe']},
 'Mouth/throat sores': {'Interference (with daily activities)': ['Not at all',
   'A little bit',
   'Somewhat',
   'Quite a bit',
   'Very much'],
  'Severity': ['Very severe',
   'Not sexually active',
   'Prefer not to answer',
   'None',
   'Not applicable',

In [20]:
import itertools

prompt_builder = PromptBuilder()

data = []
c = 0
for symptom, descriptions in Dict.items():

    # Generate all (description, meta) combinations
    description_meta_combinations = list(itertools.product(descriptions.keys(), *descriptions.values()))

    for description, *meta_combinations in description_meta_combinations:

        for meta_set in meta_combinations:

            if c % 100 == 0:
                print(f"Generated {c+100} sentences...")
        
            detail_level = np.random.choice([1, 2, 3, 4, 5])
                        
            enumeration = np.random.choice([True, False], p=[0.2, 0.8])

            explicit_symptom = np.random.choice([True, False], p=[0.2, 0.8])


            language_style = random.choice(language_registers)['name']
            tone = random.choice(discussion_tones)['name']
            spelling_errors = random.choice([True, False])

            prompt = prompt_builder.build_prompt(
                symptoms=[symptom],
                description=description,
                meta=meta_set,
                detail_level=detail_level,
                enumeration=enumeration,
                explicit_symptom=explicit_symptom,
                language_style=language_style,
                spelling_errors=spelling_errors,
                tone=tone
            )
            
            phrase_generated = model.generate_text(messages=prompt)

            data.append([
                phrase_generated, symptom, description, meta_set, language_style, 
                tone, detail_level, enumeration, explicit_symptom, spelling_errors
            ])
            c += 1

df = pd.DataFrame(data, columns=[
    "Dialogue_Generated", "Symptom", "Description", "Meta", 
    "Language_Style", "Tone", "Detail_Level", "Enumeration", 
    "Explicit_Symptom", "Spelling_Errors"
])

Generated 100 sentences...
Generated 200 sentences...
Generated 300 sentences...
Generated 400 sentences...


KeyboardInterrupt: 

In [21]:
df = pd.DataFrame(data, columns=["Dialogue_Generated", "symptom", "description", "meta", "language_style", "Tone", "Detail_level", "Enumeration", "Explicit_symptom", "Spelling_errors"])

In [11]:
df['Dialogue_Generated'].iloc[12]

' "I can\'t swallow nothin\' proper like. It\'s like somethin\' is stuck in my throat all the time."'

In [12]:
df['Dialogue_Generated'].iloc[0]

' "I got these little cracks at the corners of my mouth and I don\'t know what to do with them. They are so painful and itchy and I can\'t even eat properly because of them. I tried to apply some lip balm but it didn\'t help at all. I also tried to apply some petroleum jelly but it didn\'t help either. I even tried to apply some vitamin E oil but it didn\'t help at all. I don\'t know what to do with these cracks. I am so frustrated and I don\'t know what to do."'

In [13]:
df['Dialogue_Generated'].iloc[1]

' "I got these cracks at the corners of me mouth, an\' it\'s been goin\' on fer months now. I tried everythin\' I could find on the internet, but nothin\' seems ta work. It\'s gettin\' so bad that I can\'t even kiss me boyfriend no more."'

In [22]:
df.to_csv('New_generated_dataset_with_biollama8B_3.csv')