In [None]:
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline
)

In [None]:
import json

# Input .dat file (one tab-separated record per line) and the JSON file to produce.
dat_file_path = 'root/llama/datasets/train.dat'  # Replace with the path to your .dat file
json_file_path = 'root/llama/datasets/classification_train.json'  # Replace with your desired path for the .json file

# Turn every usable line into a {'label', 'clinical_conditions'} record.
data = []
with open(dat_file_path, 'r', encoding='utf-8') as file:
    for raw_line in file:
        fields = raw_line.strip().split('\t')
        # Blank lines and lines without a tab produce fewer than two fields; skip them.
        if len(fields) < 2:
            continue
        # First field is the label; any further fields are re-joined with spaces as the text.
        label, *text_fields = fields
        data.append({'label': label, 'clinical_conditions': ' '.join(text_fields)})

# Persist the records as a pretty-printed JSON array.
with open(json_file_path, 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file, indent=4)


In [None]:
import json

# Input .dat file for the test split and the JSON file it is converted into.
dat_file_path = 'root/llama/datasets/test.dat'  # Replace with the path to your .dat file
json_file_path = 'root/llama/datasets/classification_test.json'  # Replace with your desired path for the .json file

# Collect one record per line that contains at least one tab.
data = []
with open(dat_file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Everything before the first tab is the label; the rest is the text.
        label, tab, remainder = line.strip().partition('\t')
        if not tab:
            # Empty line, or a line with no tab separator: nothing to record.
            continue
        # Any further tabs inside the text are flattened to single spaces.
        text = remainder.replace('\t', ' ')
        data.append({'label': label, 'clinical_conditions': text})

# Write the collected records out as indented JSON.
with open(json_file_path, 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file, indent=4)

In [None]:
import json

# Path to the JSON file produced by the train-split conversion cell
json_file_path = 'root/llama/datasets/classification_train.json'  # Update with the actual path to your JSON file

# Read the JSON file back to sanity-check what was written
with open(json_file_path, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Print the container type and, when there is at least one record, the first record.
# The guard only guarantees that index 0 exists, so index 0 is what gets printed
# (the previous hard-coded index 5 raised IndexError for files with fewer than six records).
print("Type of JSON data:", type(data))
if isinstance(data, list) and len(data) > 0:
    print("Type of first element:", type(data[0]))
    print("First element:", data[0])

In [None]:
import json

# Mapping from the numeric class id stored in the dataset to a readable disease category
label_to_disease = {
    1: "neoplasms",
    2: "digestive system diseases",
    3: "nervous system diseases",
    4: "cardiovascular diseases",
    5: "general pathological conditions"
}

# Input: the JSON written by the train-split conversion cell. The path must be spelled
# exactly as that cell spells it (relative 'root/...'); the previous absolute '/root/...'
# only resolved to the same file when the notebook happened to run from '/'.
json_file_path = 'root/llama/datasets/classification_train.json'
# Output: consumed by the later cells under this absolute path, so it is left unchanged.
new_json_file_path = '/root/llama/datasets/classification_task_new_labels.json'

# Read the original JSON data
with open(json_file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Replace each numeric label by its disease name
for entry in data:
    # Labels are stored as strings by the conversion cell; accept ints as well
    label = int(entry['label']) if isinstance(entry['label'], str) else entry['label']
    # Labels outside the mapping are left untouched
    if label in label_to_disease:
        entry['label'] = label_to_disease[label]

# Write the updated data to a new JSON file
with open(new_json_file_path, 'w', encoding='utf-8') as file:
    json.dump(data, file, indent=4)


In [None]:
import json
# Location of the JSON file that holds the remapped (disease-name) labels
updated_json_file_path = '/root/llama/datasets/classification_task_new_labels.json'

# Load the remapped records
with open(updated_json_file_path, 'r', encoding='utf-8') as file:
    updated_data = json.load(file)

# How many records to show
num_entries_to_display = 5

# Show label and text of the leading records as a quick visual check
preview_entries = updated_data[:num_entries_to_display]
for entry in preview_entries:
    print(f"Label: {entry['label']}")
    print(f"Text: {entry['clinical_conditions']}\n")


In [None]:
from datasets import load_dataset

# JSON file with the remapped labels, loaded through the `datasets` JSON builder
updated_json_file_path = '/root/llama/datasets/classification_task_new_labels.json'

# Build the dataset object from that single file
dataset = load_dataset('json', data_files=updated_json_file_path)

# Print the resulting structure (splits, columns, row counts)
print(dataset)


In [None]:
# Print the first record of the 'train' split as a quick look at one example.
print(dataset['train'][0])

In [None]:
# Model
base_model_name = "meta-llama/Llama-2-7b-chat-hf"

# Load the weights directly in 16-bit floating point (half precision).
# Passing torch_dtype here avoids first materialising the full float32 model and
# then casting it with .half(), which roughly doubles peak memory during loading.
model = AutoModelForCausalLM.from_pretrained(base_model_name, torch_dtype=torch.float16)

# Tokenizer
llama_tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
# No dedicated pad token is configured here, so the end-of-sequence token is reused for padding
llama_tokenizer.pad_token = llama_tokenizer.eos_token
llama_tokenizer.padding_side = "right"  # Fix for fp16

In [None]:
# Build one classification prompt per sample for the first 20 training records.
prompts = []
for i in range(20):
    prompts.append(
        f"Medical Diagnosis Task: You are presented with a clinical description. Based on this description, identify the most appropriate medical category for diagnosis. The categories are: \n 1. Neoplasms\n 2. Digestive System Diseases\n 3. Nervous System Diseases\n 4. Cardiovascular Diseases\n 5. General Pathological Conditions\n\nClinical Description: '{dataset['train'][i]['clinical_conditions']}'\nDiagnosis: "
    )


In [None]:
import torch

# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# A DataParallel wrapper keeps the real model in `.module`; use the bare model either way
unwrapped_model = model.module if isinstance(model, torch.nn.DataParallel) else model

# Place the bare model on the selected device (last expression, so the cell shows its repr)
unwrapped_model.to(device)

In [None]:
# Generate one completion per prompt and keep only the newly generated text.
model_outputs = []
for prompt in prompts:
    input_ids = llama_tokenizer.encode(prompt, return_tensors='pt').to(device)
    output_ids = unwrapped_model.generate(input_ids, max_length=512, temperature=0.4, top_p=0.6)

    # Remove the echoed prompt at token level instead of by string prefix.
    # Comparing decoded text against the prompt is fragile: if decoding does not reproduce
    # the prompt character for character, the prefix check fails and the whole prompt
    # (including the category list) ends up in the "prediction".
    # NOTE(review): relies on generate() returning the prompt tokens followed by the new
    # tokens, as documented for decoder-only models - confirm for other architectures.
    prompt_token_count = input_ids.shape[-1]
    completion_ids = output_ids[0][prompt_token_count:]
    generated_text = llama_tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

    # Append the generated text directly
    model_outputs.append(generated_text)

In [None]:
# Pair up what the model produced with the reference labels of the same 20 samples.
generated_summaries = model_outputs  # same list object, just the name the evaluation cells use
actual_labels = []
for i in range(20):
    actual_labels.append(dataset['train'][i]['label'])

In [None]:
import re
from fuzzywuzzy import fuzz
from collections import Counter
import numpy as np

# Regular expression that pulls candidate answer fragments out of a model output:
# a lone digit 1-5, a run of letters/whitespace, or "digit + separator + words".
pattern = r'(1|2|3|4|5|[A-Za-z\s]+|[1-5][.: -]\s*[A-Za-z\s]+)'

# Canonical category name for each number used in the prompt
category_names = {
    '1': 'neoplasms',
    '2': 'digestive system diseases',
    '3': 'nervous system diseases',
    '4': 'cardiovascular diseases',
    '5': 'general pathological conditions',
}

# Define the label mapping: every accepted spelling ("1", "neoplasms", "1. neoplasms",
# "1 - neoplasms", "1: neoplasms", ...) points at its canonical category name.
label_mapping = {}
for number, name in category_names.items():
    label_mapping[number] = name
    label_mapping[name] = name
    for separator in ('. ', ' - ', ': '):
        label_mapping[f'{number}{separator}{name}'] = name

# Extract a predicted label from each generated output.
# Predictions and reference labels are collected together so they stay aligned:
# an output that cannot be parsed drops BOTH the prediction and its reference label.
# (Previously only the prediction was dropped, so every later pair was shifted and
# compared against the wrong reference.)
extracted_parts = []
filtered_actual_labels = []
unparsed_count = 0

for actual_label, output in zip(actual_labels, generated_summaries):
    mapped_label = None
    for match in re.findall(pattern, output):
        # Lower-case before the lookup: the prompt lists capitalised category names,
        # while the mapping keys are lower-case.
        candidate = match.strip().lower()
        if candidate in label_mapping:
            mapped_label = label_mapping[candidate]
            break

    if mapped_label is None:
        unparsed_count += 1
        continue

    extracted_parts.append(mapped_label)
    filtered_actual_labels.append(actual_label)

print(f"Parsed {len(extracted_parts)} outputs, skipped {unparsed_count} unparseable outputs")

# Calculate fuzzy matching scores and check if they reach the threshold
threshold = 80  # You can adjust the threshold as needed
correct_predictions = [
    fuzz.ratio(actual, extracted) >= threshold
    for actual, extracted in zip(filtered_actual_labels, extracted_parts)
]

# Calculate accuracy (undefined when no output could be parsed)
if correct_predictions:
    accuracy = sum(correct_predictions) / len(correct_predictions)
else:
    accuracy = float('nan')

# Initialize counters for precision, recall, and F1 calculation
true_positives = Counter()
false_positives = Counter()
false_negatives = Counter()

for i, correct in enumerate(correct_predictions):
    if correct:
        true_positives[filtered_actual_labels[i]] += 1
    else:
        false_positives[extracted_parts[i]] += 1
        false_negatives[filtered_actual_labels[i]] += 1

# Distinct reference labels in first-seen order, computed once so that the metric
# values and their weights are guaranteed to line up.
evaluated_labels = list(dict.fromkeys(filtered_actual_labels))


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is 0."""
    return numerator / denominator if denominator else 0


# Calculate precision, recall, and F1 for each label
precision = {
    label: _safe_ratio(true_positives[label], true_positives[label] + false_positives[label])
    for label in evaluated_labels
}
recall = {
    label: _safe_ratio(true_positives[label], true_positives[label] + false_negatives[label])
    for label in evaluated_labels
}
f1 = {
    label: _safe_ratio(2 * precision[label] * recall[label], precision[label] + recall[label])
    for label in evaluated_labels
}

# Calculate weighted averages. All three are weighted by support (number of reference
# samples per label), which is never zero for a label that occurs in the references.
# The previous predicted-count weights could sum to zero and make np.average raise.
support = [true_positives[label] + false_negatives[label] for label in evaluated_labels]
if evaluated_labels:
    weighted_precision = np.average([precision[label] for label in evaluated_labels], weights=support)
    weighted_recall = np.average([recall[label] for label in evaluated_labels], weights=support)
    weighted_f1 = np.average([f1[label] for label in evaluated_labels], weights=support)
else:
    weighted_precision = weighted_recall = weighted_f1 = float('nan')

# Print or use the calculated metrics as needed
print("Accuracy:", accuracy)
print("Weighted Precision:", weighted_precision)
print("Weighted Recall:", weighted_recall)
print("Weighted F1:", weighted_f1)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define the label mapping: every accepted spelling of an answer -> canonical category name
label_mapping = {
    '1': 'neoplasms',
    '2': 'digestive system diseases',
    '3': 'nervous system diseases',
    '4': 'cardiovascular diseases',
    '5': 'general pathological conditions',
    'neoplasms': 'neoplasms',
    'digestive system diseases': 'digestive system diseases',
    'nervous system diseases': 'nervous system diseases',
    'cardiovascular diseases': 'cardiovascular diseases',
    'general pathological conditions': 'general pathological conditions',
    '1. neoplasms': 'neoplasms',
    '2. digestive system diseases': 'digestive system diseases',
    '3. nervous system diseases': 'nervous system diseases',
    '4. cardiovascular diseases': 'cardiovascular diseases',
    '5. general pathological conditions': 'general pathological conditions',
    '1 - neoplasms': 'neoplasms',
    '2 - digestive system diseases': 'digestive system diseases',
    '3 - nervous system diseases': 'nervous system diseases',
    '4 - cardiovascular diseases': 'cardiovascular diseases',
    '5 - general pathological conditions': 'general pathological conditions',
    '1: neoplasms': 'neoplasms',
    '2: digestive system diseases': 'digestive system diseases',
    '3: nervous system diseases': 'nervous system diseases',
    '4: cardiovascular diseases': 'cardiovascular diseases',
    '5: general pathological conditions': 'general pathological conditions',
}

# Map every generated output to a canonical label; outputs that cannot be parsed become
# "Unknown", so there is exactly one prediction per reference label.
mapped_labels = []

for output in generated_summaries:
    matches = re.findall(r'(1|2|3|4|5|[A-Za-z\s]+|[1-5][.: -]\s*[A-Za-z\s]+)', output)
    mapped_label = None

    for match in matches:
        # Lower-case before the lookup: the prompt lists capitalised category names,
        # while the mapping keys are lower-case.
        match = match.strip().lower()
        if match in label_mapping:
            mapped_label = label_mapping[match]
            break

    if mapped_label:
        mapped_labels.append(mapped_label)
    else:
        mapped_labels.append("Unknown")

# Reference labels are used unfiltered. "Unknown" only ever appears among the predictions,
# and removing entries from the references alone would misalign the two lists.
filtered_actual_labels = list(actual_labels)

# Calculate accuracy
accuracy = accuracy_score(filtered_actual_labels, mapped_labels)

# Calculate support-weighted precision, recall, and F1-score.
# zero_division=0 scores an undefined per-class metric as 0 without emitting a warning.
precision = precision_score(filtered_actual_labels, mapped_labels, average='weighted', zero_division=0)
recall = recall_score(filtered_actual_labels, mapped_labels, average='weighted', zero_division=0)
f1 = f1_score(filtered_actual_labels, mapped_labels, average='weighted', zero_division=0)

# Print or use the calculated metrics as needed
print("Accuracy:", accuracy)
print("Weighted Precision:", precision)
print("Weighted Recall:", recall)
print("Weighted F1:", f1)


In [None]:
import matplotlib.pyplot as plt

# Metric names and the matching values computed by the evaluation cell
labels = ['Accuracy', 'Weighted Precision', 'Weighted Recall', 'Weighted F1']
values = [accuracy, precision, recall, f1]

# Textual summary of the metrics
print("Metrics:")
for label, value in zip(labels, values):
    print(f"{label}: {value}")

# Bar chart of the same numbers, drawn through an explicit figure/axes pair
fig, ax = plt.subplots()
ax.bar(labels, values, color=['blue', 'green', 'orange', 'red'])

# Axis labels and title
ax.set_xlabel('Metrics')
ax.set_ylabel('Score')
ax.set_title('Evaluation Metrics')

# Render the figure
plt.show()


In [None]:
# Print the prompt, the model output and the reference label for a single sample
index = 17  # zero-based position, i.e. the 18th entry

print(f"Prompt:\n{prompts[index]}\n")
print(f"Generated Label:\n{generated_summaries[index]}\n")
print(f"Actual Label:\n{actual_labels[index]}\n")
print("---------------------------------------------------\n")
