In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer

# Hub identifier of the base checkpoint used throughout the notebook.
model_name = "meta-llama/Llama-2-7b-chat-hf"

# bitsandbytes settings: 4-bit loading, "nf4" quantisation type, float16 compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Load the base model with the quantisation settings above.
model_load_options = dict(quantization_config=bnb_config, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, **model_load_options)
# `use_cache` is switched off on the model config before training.
model.config.use_cache = False

# Matching tokenizer; the EOS token doubles as the pad token and padding goes on the right.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

In [None]:
import json
# Location of the processed medical text dataset (labels are stored as words, not numbers).
updated_json_file_path = '/root/llama/datasets/classification_task_new_labels.json'

# Parse the whole JSON file into memory.
with open(updated_json_file_path, mode='r', encoding='utf-8') as json_handle:
    updated_data = json.load(json_handle)

# How many leading records to preview.
num_entries_to_display = 5

# Show label and text of the first few records as a sanity check.
preview_records = updated_data[:num_entries_to_display]
for record in preview_records:
    print("Label: {}".format(record['label']))
    print("Text: {}\n".format(record['clinical_conditions']))

In [None]:
from datasets import Dataset, DatasetDict
import pandas as pd

# Fixed seed so that train/validation/test membership is identical on every run;
# without it each execution reshuffles the data and results are not comparable.
SPLIT_SEED = 42

df = pd.DataFrame(updated_data)
dataset = Dataset.from_pandas(df)

# Hold out 30% of the records; the remaining 70% becomes the training split.
train_testvalid_split = dataset.train_test_split(test_size=0.3, seed=SPLIT_SEED)

# Cut the held-out 30% in half: one half for validation, one half for test.
test_valid_split = train_testvalid_split['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)

# Combine the three splits into a single DatasetDict.
split_datasets = DatasetDict({
    'train': train_testvalid_split['train'],
    'validation': test_valid_split['train'],
    'test': test_valid_split['test']
})

# Convenience handles used by the following cells.
train_dataset = split_datasets['train']
validation_dataset = split_datasets['validation']
test_dataset = split_datasets['test']


In [None]:
# Builds the extra 'text' column used as the training text.
def add_text_column(example):
    """Return ``{'text': ...}`` embedding the example's description and label.

    Reads ``example['clinical_conditions']`` and ``example['label']`` (both
    must be strings) and wraps them in a fixed sentence template.
    """
    pieces = (
        "The following is a clinical description that corresponds to a specific medical condition. The description: '",
        example['clinical_conditions'],
        "'. Its diagnosis: '",
        example['label'],
        "'.",
    )
    return {'text': "".join(pieces)}

# Attach the 'text' column to each of the three splits.
train_dataset, validation_dataset, test_dataset = (
    split.map(add_text_column)
    for split in (train_dataset, validation_dataset, test_dataset)
)


In [None]:
from peft import LoraConfig, get_peft_model

# LoRA adapter settings handed to the trainer: rank 8, alpha 16, dropout 0.1,
# no bias parameters, causal language-modelling task type.
peft_parameters = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

In [None]:
from transformers import TrainingArguments

# Hyper-parameters for the supervised fine-tuning run.
output_dir = "./results"
per_device_train_batch_size = 4
gradient_accumulation_steps = 4   # 4 x 4 = 16 sequences per optimizer step per device
optim = "paged_adamw_32bit"
save_steps = 100                  # kept equal to eval_steps so the best checkpoint can be restored
logging_steps = 50
learning_rate = 2e-4
max_grad_norm = 0.4
warmup_ratio = 0.03
# The plain "constant" schedule ignores warmup entirely, so `warmup_ratio` had no
# effect. "constant_with_warmup" ramps the learning rate up over the warmup
# steps and then holds it constant.
lr_scheduler_type = "constant_with_warmup"

training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=10,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    evaluation_strategy="steps",
    eval_steps=100,
    load_best_model_at_end=True,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
)

In [None]:
from trl import SFTTrainer

# Sequence-length limit passed to the trainer.
max_seq_length = 400

# Supervised fine-tuning trainer: trains on the 'text' column of the training
# split, evaluates on the validation split, and uses the LoRA settings above.
trainer = SFTTrainer(
    args=training_arguments,
    model=model,
    tokenizer=tokenizer,
    peft_config=peft_parameters,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
)

In [None]:
# Cast every sub-module whose qualified name contains "norm" to float32.
norm_modules = [
    submodule
    for qualified_name, submodule in trainer.model.named_modules()
    if "norm" in qualified_name
]
for submodule in norm_modules:
    submodule.to(torch.float32)

In [None]:
import os

# Weights & Biases logging is switched off through its environment variable.
# NOTE(review): this cell runs after TrainingArguments and SFTTrainer were
# created; confirm the variable is still read late enough to take effect,
# otherwise move this cell to the top of the notebook.
use_wandb = False
os.environ.update({"WANDB_DISABLED": "true"})

In [None]:
# Run fine-tuning with the trainer built in the earlier cell.
trainer.train()

In [None]:
# If the trained model is wrapped (distributed/parallel training exposes the real
# model as `.module`), unwrap it; otherwise use it as is. Then write it to "outputs".
model_to_save = getattr(trainer.model, 'module', trainer.model)
model_to_save.save_pretrained("outputs")

In [None]:
# Re-attach the fine-tuned LoRA adapter saved under 'outputs' for inference.
#
# `get_peft_model(model, config)` only builds a brand-new adapter from the config
# with freshly initialised weights; it never reads the adapter weights stored on
# disk, so the evaluation below would not use the fine-tuned adapter.
# `PeftModel.from_pretrained` loads both the config and the trained weights.
from peft import PeftModel

lora_config = LoraConfig.from_pretrained('outputs')  # kept for inspection
model = PeftModel.from_pretrained(model, 'outputs')
# NOTE(review): `model` was already passed to SFTTrainer above; on a fresh kernel,
# load the quantised base model first and then run this cell.
model.eval()  # inference mode, so the LoRA dropout (0.1) is inactive during generation

In [None]:
from transformers import PreTrainedTokenizer, PreTrainedModel

# Build prompts for a sample of examples, generate a completion for each, and keep
# the matching reference labels for scoring.
generated_summaries = []
prompts = []
actual_labels = []
num_descriptions_to_generate = 50

# Score on the held-out test split. The model was fine-tuned on `train_dataset`,
# so evaluating on it would measure memorisation rather than generalisation.
evaluation_examples = test_dataset

# Create one prompt per example and record its label in the same pass, so that
# `prompts` and `actual_labels` always have the same length (even when the split
# holds fewer than `num_descriptions_to_generate` rows).
for index, data in enumerate(evaluation_examples):
    if index >= num_descriptions_to_generate:
        break
    clinical_conditions = data['clinical_conditions']
    prompt = (
        "User: Can you help me diagnose a medical condition based on a clinical description? Please only state the diagnosis.\n"
        "Assistant: Of course. Please provide the clinical description, and I will identify the most appropriate medical category for diagnosis. Respond with one of the following categories, no explanations are needed. The categories are: \n"
        "1. Neoplasms\n"
        "2. Digestive System Diseases\n"
        "3. Nervous System Diseases\n"
        "4. Cardiovascular Diseases\n"
        "5. General Pathological Conditions\n\n"
        f"User: Here is the clinical description: '{clinical_conditions}'.\n"
        "Assistant: The diagnosis is: "
    )
    prompts.append(prompt)
    actual_labels.append(data['label'])

# Generate with the model in evaluation mode (no dropout).
model.eval()
model_device = next(model.parameters()).device

for prompt in prompts:
    # The tokenizer returns CPU tensors; they must live on the model's device.
    inputs = tokenizer(prompt, return_tensors="pt").to(model_device)
    outputs = model.generate(**inputs, max_new_tokens=400)

    # The returned sequence starts with the prompt tokens. Decode only what was
    # generated after them; searching the decoded text for the prompt's last line
    # can fail, and the fallback then kept the full prompt, which lists every category.
    prompt_token_count = inputs["input_ids"].shape[1]
    filtered_text = tokenizer.decode(
        outputs[0][prompt_token_count:], skip_special_tokens=True
    ).strip()

    generated_summaries.append(filtered_text)

In [None]:
# Index of the example to inspect by hand.
display_index = 6

# Show prompt, model output and reference label for that example.
print(f"Prompt at Index {display_index}:")
print(prompts[display_index])
print(f"\nGenerated Response at Index {display_index}:")
print(generated_summaries[display_index])
print(f"\nActual Label at Index {display_index}:")
print(actual_labels[display_index])

In [None]:
import re
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

# Regular expression for candidate answers: a bare digit 1-5, a run of letters and
# whitespace, or a digit followed by a separator and a run of letters.
pattern = r'(1|2|3|4|5|[A-Za-z\s]+|[1-5][.: -]\s*[A-Za-z\s]+)'

# Canonical label names, in category order 1..5.
_CATEGORIES = (
    'neoplasms',
    'digestive system diseases',
    'nervous system diseases',
    'cardiovascular diseases',
    'general pathological conditions',
)

# Spellings generated for every category: number only, name only, and the
# "N. name", "N - name", "N: name" variants.
_KEY_TEMPLATES = ('{n}', '{name}', '{n}. {name}', '{n} - {name}', '{n}: {name}')

# Parenthesised spellings are listed verbatim because their casing is not uniform.
_PAREN_FORMS = (
    '1 (Neoplasms)',
    '2 (Digestive System Diseases)',
    '3 (Nervous System Diseases)',
    '4 (cardiovascular diseases)',
    '5 (General Pathological Conditions)',
)

# Maps every accepted spelling of an answer to its canonical label.
label_mapping = {
    template.format(n=number, name=category): category
    for template in _KEY_TEMPLATES
    for number, category in enumerate(_CATEGORIES, start=1)
}
label_mapping.update(zip(_PAREN_FORMS, _CATEGORIES))

# Keywords that identify each canonical label; matching is case-insensitive.
label_keywords = {
    'neoplasms': ['neoplasms'],
    'digestive system diseases': ['digestive system diseases'],
    'nervous system diseases': ['nervous system diseases'],
    'cardiovascular diseases': ['cardiovascular diseases'],
    'general pathological conditions': ['general pathological conditions'],
}


def find_label_based_on_keywords(text):
    """Return the label whose keyword occurs earliest in ``text``.

    Matching is case-insensitive. When several labels are mentioned, the one
    that appears first in the text wins (previously the first label in
    dictionary order won regardless of where it appeared, so an answer such as
    "Cardiovascular Diseases, not Neoplasms" was read as 'neoplasms').
    Ties keep the dictionary order.

    Args:
        text: Model output to scan; ``None`` or an empty string is allowed.

    Returns:
        The matching key of ``label_keywords``, or ``None`` when no keyword
        occurs in ``text``.
    """
    if not text:
        return None

    haystack = text.lower()  # lower-case once instead of once per keyword
    best_label = None
    best_position = len(haystack)
    for label, keywords in label_keywords.items():
        for keyword in keywords:
            position = haystack.find(keyword.lower())
            # Strict '<' keeps the earlier dictionary entry on a tie.
            if 0 <= position < best_position:
                best_label, best_position = label, position
    return best_label

# Turn every generated answer into a canonical label, or "Unknown" when nothing
# recognisable is found.
extracted_parts = []

for output in generated_summaries:
    # First choice: a label name appearing anywhere in the answer.
    keyword_label = find_label_based_on_keywords(output)
    if keyword_label:
        extracted_parts.append(keyword_label)
        continue

    # Fallback: the first regex match that is a known spelling in label_mapping.
    candidates = (candidate.strip() for candidate in re.findall(pattern, output))
    known_spelling = next(
        (candidate for candidate in candidates if candidate in label_mapping),
        None,
    )
    extracted_parts.append(label_mapping[known_spelling] if known_spelling else "Unknown")

# Every prediction needs exactly one reference label.
if len(actual_labels) != len(extracted_parts):
    raise ValueError("The number of actual labels and predicted labels must be the same.")

# Pair each reference label with its prediction and drop pairs whose prediction
# could not be parsed ("Unknown"). Reference labels are lower-cased and stripped
# because predictions are always lower-case canonical names; comparing them to
# differently cased dataset labels would count every example as wrong.
scored_pairs = [
    (label.strip().lower(), predicted)
    for label, predicted in zip(actual_labels, extracted_parts)
    if predicted != "Unknown"
]
filtered_actual_labels = [label for label, _ in scored_pairs]
filtered_extracted_parts = [predicted for _, predicted in scored_pairs]

# Make the exclusion visible: the metrics below only cover parseable answers.
print("Unparseable predictions excluded:", len(extracted_parts) - len(scored_pairs), "of", len(extracted_parts))

# Precision, recall and F1 are averaged over the labels that were actually predicted.
predicted_label_set = np.unique(filtered_extracted_parts)

# Calculate accuracy, precision, recall, and F1 score
accuracy = accuracy_score(filtered_actual_labels, filtered_extracted_parts)
precision = precision_score(filtered_actual_labels, filtered_extracted_parts, average='weighted', labels=predicted_label_set)
recall = recall_score(filtered_actual_labels, filtered_extracted_parts, average='weighted', labels=predicted_label_set)
f1 = f1_score(filtered_actual_labels, filtered_extracted_parts, average='weighted', labels=predicted_label_set)

# Print the calculated metrics
print("Accuracy:", accuracy)
print("Weighted Precision:", precision)
print("Weighted Recall:", recall)
print("Weighted F1 Score:", f1)
