In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import os
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
base_model_id = "BioMistral/BioMistral-7B"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,  # Mistral, same as before
    quantization_config=bnb_config,  # Same quantization config as before
    device_map="auto",
    trust_remote_code=True,
)

eval_tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    add_bos_token=True,
    trust_remote_code=True,
)

In [None]:
#Infer RGCIC Data
import pandas as pd
import os
import numpy as np
import torch
from datasets import Dataset

classified_krishna_df = pd.read_csv('/home/necuser/sdp/Classification/LLM/Finetuining/data/train/processed_chunked_files_llm_finetuining_krishna.csv')
classified_sandhya_df = pd.read_csv('/home/necuser/sdp/Classification/LLM/Finetuining/data/train/processed_chunked_files_llm_finetuining_sandhya.csv')

classified_df = pd.concat([classified_sandhya_df, classified_krishna_df], axis=0, ignore_index=True)
dataset = Dataset.from_pandas(classified_df)
dataset = dataset.train_test_split(0.1)

In [3]:
#Infer Shrishti Data
import pandas as pd
import os
import numpy as np
import torch
from datasets import Dataset

classified_shrishti_df = pd.read_csv('/home/jupyter/finetune_LLM/Data/shrishti/train/processed_chunked_files_llm_finetuining_shrishti.csv')

classified_df = classified_shrishti_df
dataset = Dataset.from_pandas(classified_df)
dataset = dataset.train_test_split(0.1)

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['filename', 'notes', 'classified'],
        num_rows: 9468
    })
    test: Dataset({
        features: ['filename', 'notes', 'classified'],
        num_rows: 1053
    })
})

In [5]:
dataset['test']['filename'][0]

'Notes_text_shrishti_chunk_7024.txt'

In [None]:
# Find the index of the target string - RGCIC
target_string = "Notes_text_7202_chunk_1.txt"
string_list = dataset["test"]["filename"]
if target_string in string_list:
    index = string_list.index(target_string)
    print(f"The index of '{target_string}' is: {index}")
else:
    print(f"The string '{target_string}' is not in the list.")

In [6]:
# Find the index of the target string - Shrishti
target_string = "Notes_text_shrishti_chunk_7024.txt"
string_list = dataset["test"]["filename"]
if target_string in string_list:
    index = string_list.index(target_string)
    print(f"The index of '{target_string}' is: {index}")
else:
    print(f"The string '{target_string}' is not in the list.")

The index of 'Notes_text_shrishti_chunk_7024.txt' is: 0


In [7]:
os.getcwd()

'/home/jupyter/finetune_LLM'

In [8]:
from peft import LoraConfig, PeftModel
from peft import AutoPeftModelForCausalLM

In [9]:
#adapter = "biomistral-snp-finetune-classifier-2024_130624_v2/checkpoint-1000" #RGCIC
adapter = "finetuned_LLMbiomistral-snp-finetune-shrishti-classifier-2024_030724_v1/checkpoint-1000" #Shrishti

In [10]:
#Load and activate the adapter on top of the base model
ft_model_fast = PeftModel.from_pretrained(base_model, adapter)

In [11]:
#Merge the adapter with the base model
ft_model_fast = ft_model_fast.merge_and_unload()



In [None]:
#Save the merged model in a directory in the safetensors format
model_dir = "./biomistral-snp-finetune-classifier-2024_130624_v2/merged_model/"
ft_model_fast.save_pretrained(model_dir, safe_serialization=True)

#Save the custom tokenizer in the same directory
eval_tokenizer.save_pretrained(model_dir)

In [None]:
from vllm import LLM, SamplingParams

llm = LLM(model="./biomistral-snp-finetune-classifier-2024_130624_v2/merged_model/")

In [12]:
from peft import PeftModel

#ft_model = PeftModel.from_pretrained(base_model, "biomistral-snp-finetune-classifier-2024_130624_v2/checkpoint-1000") #RGCIC
ft_model = PeftModel.from_pretrained(base_model, adapter) #Shrishti

In [None]:
#RGCIC
eval_prompt = f"""Classify the notes into below categories:
                   'allergies',
                   'chief_complaints',
                   'diagnosis',
                   'family_history',
                   'history',
                   'instructions_advice',
                   'investigation_report',
                   'investigations',
                   'medicine_prescription',
                   'observations_examinations',
                   'patient_willingness_concent',
                   'personal_history',
                   'procedure_report',
                   'referral',
                   'social_history',
                   'tolerance',
                   'treatment_plan',
                   'unclassified',
                   'vitals'
                    Output the classified data into json format


### notes:
{dataset["test"]["notes"][14]}

### Classified:
"""
print(eval_prompt)

In [13]:
#Shrishti
eval_prompt = f"""Classify the notes into below categories:
                   'recipere', 
                   'investigations', 
                   'plan', 
                   'complaints',
                   'history_of_previous_illness', 
                   'examination', 
                   'diagnoses',
                    Output the classified data into json format


### notes:
{dataset["test"]["notes"][1]}

### Classified:
"""
print(eval_prompt)

Classify the notes into below categories:
                   'recipere', 
                   'investigations', 
                   'plan', 
                   'complaints',
                   'history_of_previous_illness', 
                   'examination', 
                   'diagnoses',
                    Output the classified data into json format


### notes:
oefgc, vitals bp 14485 pr 66 spo2 99% temp 35.9 rbs 7.2 mmollcvs s1, s2 heard no murmursresp bilateral vesicular breath soundspa normal scaphoid abdomenno bipedal edemacns gcs1515, pupils berl, no neurological deficits

### Classified:



In [14]:
dataset["test"]["classified"][14]

"{'recipere': ['terbutaline+ambroxol+guaifensine syrup, dosage: 1, route: 106.0, qty: 1.0, duration: 1.0, dura_unit: 193.0, instructions: . ibuprofen+paracetamol (125+100)mg/5ml suspension 100ml, dosage: 1, route: 106.0, qty: 1.0, duration: 5.0, dura_unit: 193.0, instructions: take 5ml thrice daily cefuroxime 250mg/5ml suspension, dosage: 1, route: 106.0, qty: 1.0, duration: 5.0, dura_unit: 193.0, instructions: take 5ml twice daily paracetamol injection 10ml, dosage: 1, route: 106.0, qty: 1.0, duration: 1.0, dura_unit: 193.0, instructions: . diphenhydramine+sodium citrate+menthol (7+28.5+0.55)mg/5ml syrup 100ml, dosage: 1, route: 106.0, qty: 1.0, duration: 5.0, dura_unit: 193.0, instructions: take 5ml thrice daily ebastine 10mg tablets, dosage: 1, route: 106.0, qty: 1.0, duration: 1.0, dura_unit: 194.0, instructions: take 5ml at night'], 'complaints': ['Persistent cough with chest pain. Loss of voice. Hot flashes with chills. Headaches.', 'Abdominal pain, fever, diarrhea', 'Headache, s

In [None]:
dataset["test"]['filename']

In [16]:
len(dataset["test"])

1053

In [17]:
from peft import PeftModel

#ft_model = PeftModel.from_pretrained(base_model, "biomistral-snp-finetune-classifier-2024_130624_v2/checkpoint-1000") #RGCIC
ft_model = PeftModel.from_pretrained(base_model, adapter) #Shrishti

In [None]:
#RGCIC
def prompt_func(i):
    eval_prompt = f"""Classify the notes into below categories:
                   'allergies',
                   'chief_complaints',
                   'diagnosis',
                   'family_history',
                   'history',
                   'instructions_advice',
                   'investigation_report',
                   'investigations',
                   'medicine_prescription',
                   'observations_examinations',
                   'patient_willingness_concent',
                   'personal_history',
                   'procedure_report',
                   'referral',
                   'social_history',
                   'tolerance',
                   'treatment_plan',
                   'unclassified',
                   'vitals'
                    Output the classified data into json format

    ### notes:
    {dataset["test"]["notes"][i]}

    ### Classified:
    """
    return eval_prompt

In [18]:
#Shrishti
def prompt_func(i):
    eval_prompt = f"""Classify the notes into below categories:
                   'recipere', 
                   'investigations', 
                   'plan', 
                   'complaints',
                   'history_of_previous_illness', 
                   'examination', 
                   'diagnoses',
                    Output the classified data into json format


    ### notes:
    {dataset["test"]["notes"][i]}

    ### Classified:
    """
    return eval_prompt

In [19]:
import re
import json

def ft_model_func(eval_prompt):
    model_input = eval_tokenizer(eval_prompt, return_tensors="pt").to("cuda")
    ft_model.eval()
    with torch.no_grad():
        generated_tokens = ft_model.generate(
            **model_input,
            max_new_tokens=1100, #1048,
            num_beams=5,
            temperature=0.0,
            top_k=10,
            top_p=0.9,
            repetition_penalty=2.0,
            early_stopping=True
        )

    decoded_outputs = eval_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

    match = re.search(r'### Classified:\s*({.*?})\s*(?=\n|$)', decoded_outputs[0], re.DOTALL)

    df = pd.DataFrame()
    df_long = pd.DataFrame()

    if match:
        classified_data = match.group(1)
        print("Extracted Data:", classified_data)

        classified_dict = json.loads(classified_data.replace("'", '"'))
        print("Dictionary Format:", classified_dict)

        df = pd.DataFrame.from_dict({key: pd.Series(value) for key, value in classified_dict.items()})
        #print(df)
    else:
        print("No classified data found.")

    if not df.empty:

        df_long = df.melt(var_name='Label', value_name='Text', value_vars=df.columns)

        df_long.dropna(subset=['Text'], inplace=True)

        print("\nTransformed DataFrame:")
        df_long.reset_index(drop=True, inplace=True)
        print(df_long)

    return df_long


In [20]:
os.getcwd()

'/home/jupyter/finetune_LLM'

In [None]:
dataset["test"]['filename']

In [None]:
import time

file_name = []
infer_time = []
for i in range(len(dataset["test"])):
    #if i == 14:
    eval_prompt = prompt_func(i)
    #print(eval_prompt)
    file_name.append(dataset["test"]['filename'][i])
    #print(dataset["test"]['filename'][i].split(".")[-2])
    path = "/home/jupyter/finetune_LLM/finetuned_LLM_result/shrishti/test/" #"/home/necuser/sdp/Classification/LLM/Finetuining/130624/data/result/test"
    csv_file = dataset["test"]['filename'][i].split(".")[-2] + ".csv"
    csv_path = os.path.join(path, csv_file)
    print(csv_path)
    start_time = time.time()
    df_long = ft_model_func(eval_prompt)
    end_time = time.time()
    if not df_long.empty:
        df_long.to_csv(csv_path, index=False)
        infer_time.append(round(end_time-start_time, 2))
        if i > 10:
            break
    else:
        continue

In [28]:
infer_time

[187.43,
 31.47,
 188.25,
 187.49,
 186.77,
 188.45,
 187.07,
 187.16,
 187.81,
 188.8,
 188.2,
 188.99]

In [29]:
file_name

['Notes_text_shrishti_chunk_7024.txt',
 'Notes_text_shrishti_chunk_9370.txt',
 'Notes_text_shrishti_chunk_3347.txt',
 'Notes_text_shrishti_chunk_6411.txt',
 'Notes_text_shrishti_chunk_6434.txt',
 'Notes_text_shrishti_chunk_4285.txt',
 'Notes_text_shrishti_chunk_5829.txt',
 'Notes_text_shrishti_chunk_7582.txt',
 'Notes_text_shrishti_chunk_3564.txt',
 'Notes_text_shrishti_chunk_9226.txt',
 'Notes_text_shrishti_chunk_7283.txt',
 'Notes_text_shrishti_chunk_6665.txt']

In [32]:
import pandas as pd
import os
from sklearn.metrics import classification_report

merged_folder_path = "/home/jupyter/finetune_LLM/finetuned_LLM_result/shrishti/merged/" #"/home/necuser/sdp/Classification/LLM/Finetuining/130624/data/result/merged"
gen_folder_path = "/home/jupyter/finetune_LLM/finetuned_LLM_result/shrishti/test/" #"/home/necuser/sdp/Classification/LLM/Finetuining/130624/data/result/test"
gt_folder_path = "/home/jupyter/finetune_LLM/finetuned_LLM_result/shrishti/ground_truth/" #"/home/necuser/sdp/Classification/LLM/Finetuining/130624/data/result/ground_truth"

files = os.listdir(gen_folder_path) #os.listdir(folder_path)

csv_files = [file for file in files if file.endswith('.csv')]

for csv_file in csv_files:
    gen_file_path = os.path.join(gen_folder_path, csv_file) #os.path.join(folder_path, csv_file)
    print(gen_file_path)
    gt_file_path = os.path.join(gt_folder_path, csv_file)
    print(gt_file_path)
    gen_df = pd.read_csv(gen_file_path)
    if os.path.isfile(gt_file_path):
        gt_df = pd.read_csv(gt_file_path)
        # Apply lower case to the 'Text' column
        gt_df['Label'] = gt_df['Label'].str.lower()
        gt_df['Text'] = gt_df['Text'].str.lower()
        gen_df['Text'] = gen_df['Text'].str.lower()
        # Merging the DataFrames on the 'Text' column using a left join
        merged_df = pd.merge(gt_df, gen_df, on='Text', how='left', suffixes=('_gt_df', '_gen_df'))

        print(f"Data from merged data: {csv_file}:")
        # Replace NaN values with "Null"
        merged_df.fillna("Null", inplace=True)

        merged_csv_path = os.path.join(merged_folder_path, csv_file)
        merged_df.to_csv(merged_csv_path, index=False)
        # Display the merged DataFrame
        #print(merged_df)

        print(classification_report(merged_df['Label_gt_df'], merged_df['Label_gen_df']))

    #print(f"Data from llm generated: {csv_file}:")
    #print(gen_df)
    #print(f"Data from ground truth: {csv_file}:")
    #print(gt_df)
    print("\n")

    #break

/home/jupyter/finetune_LLM/finetuned_LLM_result/shrishti/test/Notes_text_shrishti_chunk_6434.csv
/home/jupyter/finetune_LLM/finetuned_LLM_result/shrishti/ground_truth/Notes_text_shrishti_chunk_6434.csv
Data from merged data: Notes_text_shrishti_chunk_6434.csv:
              precision    recall  f1-score   support

    recipere       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1



/home/jupyter/finetune_LLM/finetuned_LLM_result/shrishti/test/Notes_text_shrishti_chunk_9370.csv
/home/jupyter/finetune_LLM/finetuned_LLM_result/shrishti/ground_truth/Notes_text_shrishti_chunk_9370.csv
Data from merged data: Notes_text_shrishti_chunk_9370.csv:
              precision    recall  f1-score   support

 examination       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

In [None]:
#Missing Information
import pandas as pd
import os

merged_folder_path = "/home/necuser/sdp/Classification/LLM/Finetuining/130624/data/result/merged/"

files = os.listdir(merged_folder_path)

csv_files = [file for file in files if file.endswith('.csv')]

dataframes = []

for csv_file in csv_files:
    file_path = os.path.join(merged_folder_path, csv_file)
    df = pd.read_csv(file_path)
    dataframes.append(df)

merged_df = pd.concat(dataframes, ignore_index=True)

#print(merged_df)

null_count = merged_df[merged_df['Label_gen_df'] == 'Null'].shape[0]

print(f"Number of rows where 'Label_gen_df' is 'Null': {null_count}")

null_percentage = (null_count / len(merged_df)) * 100

print(f"Percentage of 'Null' values in 'Label_gen_df': {null_percentage:.2f}%")

merged_df_cleaned = merged_df[merged_df['Label_gen_df'] != 'Null']

print("\nCleaned DataFrame:")
#print(merged_df_cleaned)
print(classification_report(merged_df_cleaned['Label_gt_df'], merged_df_cleaned['Label_gen_df']))

#merged_df_cleaned.to_csv(os.path.join(merged_folder_path, "cleaned_merged_result.csv", index=False)

merged_df[merged_df['Label_gen_df'] == 'prior_treatment']

In [None]:
merged_df[merged_df['Label_gen_df'] == 'Null']

In [None]:
#observations
1. Hallucination - More Training data, steps, parameters tweaking
2. Missing data(input notes) during inerenceing - More Training data, steps, parameters tweaking
3. Generated output need post processing
4. Need to test on all training and test data
5. Inference Time is around 40-50 secs - Explore methods for faster inferencing
6. Chunk overlapping - Retraining
7. Data Augmentation - Retraining
6.


In [None]:
%%time
model_input = eval_tokenizer(eval_prompt, return_tensors="pt").to("cuda")

ft_model.eval()
with torch.no_grad():
    generated_tokens = ft_model.generate(
        **model_input,
        max_new_tokens=1100, #1048,
        num_beams=5,
        temperature=0.0,
        top_k=10,
        top_p=0.9,
        repetition_penalty=2.0,
        early_stopping=True
    )


#outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
#eval_tokenizer.batch_decode(generated_tokens)

# Decode the outputs
decoded_outputs = eval_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

# Print the decoded outputs
for output in decoded_outputs:
    print(output)

In [None]:
#%%time
#model_input = eval_tokenizer(eval_prompt, return_tensors="pt").to("cuda")

#ft_model.eval()
#with torch.no_grad():
#    print(eval_tokenizer.decode(ft_model.generate(**model_input, max_new_tokens=1048)[0], skip_special_tokens=True))

In [None]:
#%%time
#model_input = eval_tokenizer(eval_prompt, return_tensors="pt").to("cuda")

#ft_model.eval()
#with torch.no_grad():
#    print(eval_tokenizer.decode(ft_model.generate(**model_input, max_new_tokens=1048)[0], skip_special_tokens=True))

In [None]:
#%%time
#model_input = eval_tokenizer(eval_prompt, return_tensors="pt").to("cuda")

#ft_model.eval()
#with torch.no_grad():
#    generated_tokens = ft_model.generate(
#        **model_input,
#        max_new_tokens=1048,
#        num_beams=5,
#        temperature=0.0,
#        top_k=10,
#        top_p=0.9,
#        repetition_penalty=2.0,
#        early_stopping=True
#    )
#    print(eval_tokenizer.decode(generated_tokens[0], skip_special_tokens=True))

In [None]:
%%time
model_input = eval_tokenizer(eval_prompt, return_tensors="pt").to("cuda")

ft_model.eval()
with torch.no_grad():
    generated_tokens = ft_model.generate(
        **model_input,
        max_new_tokens=1100, #1048,
        num_beams=5,
        temperature=0.0,
        top_k=10,
        top_p=0.9,
        repetition_penalty=2.0,
        early_stopping=True
    )


#outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
#eval_tokenizer.batch_decode(generated_tokens)

# Decode the outputs
decoded_outputs = eval_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

# Print the decoded outputs
for output in decoded_outputs:
    print(output)

In [None]:
# Print the decoded outputs
#for output in decoded_outputs:
#    print(output)
#    break

In [None]:
#decoded_outputs[0]

In [None]:
#import re
# Extract information after "### Classified:"
#regex_pattern = r'### Classified:\s*(.+)'
#classified_data = re.search(regex_pattern, decoded_outputs[0], re.DOTALL)

#if classified_data:
#    classified_info = classified_data.group(1).strip()  # Extract the match and strip extra whitespace
#    print(classified_info)
#else:
#    print("No classified data found.")

In [None]:
import re
import json
# Regular expression to extract the JSON-like dictionary after "### Classified:"
match = re.search(r'### Classified:\s*({.*?})\s*(?=\n|$)', decoded_outputs[0], re.DOTALL)

if match:
    classified_data = match.group(1)
    print("Extracted Data:", classified_data)

    # Convert the string to a dictionary
    classified_dict = json.loads(classified_data.replace("'", '"'))  # Replace single quotes with double quotes for valid JSON
    print("Dictionary Format:", classified_dict)

    # Convert the dictionary to a DataFrame for better visualization or further processing
    df = pd.DataFrame.from_dict({key: pd.Series(value) for key, value in classified_dict.items()})
    print(df)
else:
    print("No classified data found.")

In [None]:
 df

In [None]:
# Transform the DataFrame
# 'value_vars' is optional if you want to transform all columns
df_long = df.melt(var_name='Label', value_name='Text', value_vars=df.columns)

# Remove rows where 'Text' is None or NaN (if you need to clean up missing data)
df_long.dropna(subset=['Text'], inplace=True)

# Display the transformed DataFrame
print("\nTransformed DataFrame:")
print(df_long)

In [None]:
df_long

In [None]:
# Reset the index
df_long.reset_index(drop=True, inplace=True)

In [None]:
df_long

In [None]:
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['notes'])):
        text = f"""Classify the notes into below categories:
                   'allergies',
                   'chief_complaints',
                   'diagnosis',
                   'family_history',
                   'history',
                   'instructions_advice',
                   'investigation_report',
                   'investigations',
                   'medicine_prescription',
                   'observations_examinations',
                   'patient_willingness_concent',
                   'personal_history',
                   'procedure_report',
                   'referral',
                   'social_history',
                   'tolerance',
                   'treatment_plan',
                   'unclassified',
                   'vitals'
                    Output the classified data into json format


        ### notes:
        {example["notes"]}

        ### Classified:
        {example["classified"]}
        """
        output_texts.append(text)
    return output_texts