In [None]:
#Loading the embedding model first
from torch import cuda
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

# Sentence-transformers checkpoint used to embed both documents and queries.
embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

# Pick the currently selected CUDA device when one is available, else the CPU,
# and report the choice.
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
if device == 'cpu':
    print("Using CPU")
else:
    print("Using GPU:", device)

# The same device string is handed to model construction and to encoding;
# encoding runs in batches of 32 texts.
embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs=dict(device=device),
    encode_kwargs=dict(device=device, batch_size=32),
)

In [None]:
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline
)

In [None]:
import json
# Location of the relabelled classification dataset (JSON).
updated_json_file_path = '/root/llama/datasets/classification_task_new_labels.json'

# Parse the whole file into memory.
with open(updated_json_file_path, 'r', encoding='utf-8') as file:
    updated_data = json.loads(file.read())

# How many leading records to preview.
num_entries_to_display = 5

# Sanity check: show the label and the text of the first few records,
# each record followed by a blank line.
for entry in updated_data[:num_entries_to_display]:
    print(
        f"Label: {entry['label']}",
        f"Text: {entry['clinical_conditions']}\n",
        sep="\n",
    )

In [None]:
from datasets import Dataset, DatasetDict
import pandas as pd
# Tabular view of the parsed JSON records loaded above.
df = pd.DataFrame(updated_data)
# Wrap the frame as a `datasets.Dataset` so it can be split with
# `train_test_split` in the next cell.
dataset = Dataset.from_pandas(df)


In [None]:
# Split the records into the part that is indexed for retrieval ('train') and
# the part used for evaluation ('test').
#
# A fixed seed makes the shuffle reproducible, so the same records end up in
# the vector index and in the evaluation sample on every Restart & Run All.
SPLIT_SEED = 42

# NOTE(review): test_size=0.8 puts 80% of the records in the test split and
# only 20% in the train split — confirm this is intended rather than 0.2.
train_test_split = dataset.train_test_split(test_size=0.8, seed=SPLIT_SEED)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']


In [None]:
import os
import pinecone

# Configure the Pinecone client from environment variables.
#
# The API key is a secret and must never be written into the notebook; it is
# read from PINECONE_API_KEY only. NOTE: the key that used to be embedded here
# remains in version history and should be revoked/rotated.
pinecone_api_key = os.environ.get('PINECONE_API_KEY')
if not pinecone_api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set. Export it in the environment before "
        "starting the kernel."
    )

pinecone.init(
    api_key=pinecone_api_key,
    # The environment name is not a secret, so keeping a default is fine.
    environment=os.environ.get('PINECONE_ENVIRONMENT') or 'gcp-starter'
)

In [None]:
# Two throwaway strings, embedded only to discover the embedding size that the
# vector index must be created with.
docs = ["document sample 1", "document sample 2"]

embeddings = embed_model.embed_documents(docs)

embedding_count = len(embeddings)
embedding_dim = len(embeddings[0])
print(f"We have {embedding_count} doc embeddings, each with "
      f"a dimensionality of {embedding_dim}.")

In [None]:
import time

index_name = 'thesis-days'

# Create the index only when it does not exist yet, sized to the embedding
# dimensionality measured from the sample embeddings.
index_already_exists = index_name in pinecone.list_indexes()
if not index_already_exists:
    pinecone.create_index(index_name, dimension=len(embeddings[0]), metric='cosine')
    # Poll once per second until the new index reports that it is ready.
    while True:
        if pinecone.describe_index(index_name).status['ready']:
            break
        time.sleep(1)

In [None]:
#pinecone.delete_index(index_name)

In [None]:
# Handle to the index named by `index_name`; later cells upsert into it and
# hand it to the LangChain vector store.
index = pinecone.Index(index_name)
# Last expression of the cell, so the index statistics are displayed.
index.describe_index_stats()

In [None]:
import pandas as pd
# Embed the clinical descriptions of the train split and store them in the
# Pinecone index, one request per batch.
#
# The pandas view gets its own name so that `dataset` (the Hugging Face Dataset
# built earlier) is not rebound to a DataFrame, which would break a re-run of
# the split cell.
train_df = train_dataset.to_pandas()

batch_size = 32
id_offset = 0

for batch_start in range(0, len(train_df), batch_size):
    # Slicing past the end is safe with iloc, so the last batch may be shorter.
    batch = train_df.iloc[batch_start:batch_start + batch_size]

    # Sequential, 1-based ids that keep counting across batches.
    chunk_ids = [str(id_offset + position + 1) for position in range(len(batch))]

    # Assuming 'label' and 'clinical_conditions' are the column names
    label_texts = batch['label'].tolist()
    clinical_conditions_texts = batch['clinical_conditions'].tolist()

    # Only the clinical description is stored as the vector. The labels were
    # previously embedded as well but never used, so that work is dropped.
    clinical_conditions_embeds = embed_model.embed_documents(clinical_conditions_texts)

    # Text returned to the LLM as retrieved context: description plus diagnosis.
    # After the loop this name holds the metadata of the LAST batch only.
    metadata_chunks = [
        {'text': "The following is a clinical description that corresponds to a specific medical condition. The description: '" + clinical_conditions + "' It's diagnosis: '" + label + "'."}
        for label, clinical_conditions in zip(label_texts, clinical_conditions_texts)
    ]

    # Assuming the Pinecone index is named 'index'
    vectors = []
    for chunk_id, clinical_conditions_embed, metadata_chunk in zip(
        chunk_ids, clinical_conditions_embeds, metadata_chunks
    ):
        # The id is the running number followed by the metadata text with
        # non-ASCII characters dropped.
        adjusted_id = f"{chunk_id}_{metadata_chunk['text']}".encode('ascii', 'ignore').decode('ascii')

        # Truncate the adjusted_id to fit within the length limit
        adjusted_id = adjusted_id[:512]

        vectors.append((adjusted_id, clinical_conditions_embed, metadata_chunk))

    # One upsert per batch instead of one network round trip per vector.
    if vectors:
        index.upsert(vectors=vectors)

    id_offset += len(batch)

In [None]:
# Print up to nine metadata entries as a spot check. `metadata_chunks` is
# reassigned for every batch in the upsert loop, so these entries come from the
# last batch processed, not from the start of the training data.
for i, metadata in enumerate(metadata_chunks[:9]):
    print(f"Metadata {i}: {metadata}")

In [None]:
# Display the index statistics again, now that the upsert loop has run.
index.describe_index_stats()

In [None]:
from torch import cuda, bfloat16
import transformers

import os

model_id = 'meta-llama/Llama-2-7b-chat-hf'

# Uses the `cuda` name imported by this cell, so the cell does not depend on
# `torch` having been imported by an earlier one.
# NOTE(review): `device` is only reported in the message at the end of this
# cell; it is not passed to `from_pretrained`.
device = 'cuda:0' if cuda.is_available() else 'cpu'

# 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

# The Hugging Face access token is a secret and is read from the environment
# instead of being written into the notebook. When neither variable is set the
# value is None. NOTE: the token that used to be embedded here remains in
# version history and should be revoked.
# NOTE(review): with None the libraries presumably fall back to locally cached
# credentials — confirm for the installed transformers version.
hf_auth = os.environ.get('HF_TOKEN') or os.environ.get('HUGGING_FACE_HUB_TOKEN')

model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# to run locally; confirm it is actually required for this model.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    #device_map='auto',
    use_auth_token=hf_auth
)

# Inference only: switch off training-time behaviour.
model.eval()
print(f"Model loaded on {device}")

In [None]:
# Tokenizer for the same checkpoint as the model, authenticated with the same
# token (`hf_auth`) that was used to load the model.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

In [None]:
# Text-generation pipeline around the quantized model; it is wrapped for
# LangChain in a later cell.
generate_text = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # we pass model parameters here too
    # NOTE(review): do_sample is not set here; temperature only matters when
    # sampling is enabled — confirm the model's generation config enables it.
    temperature=0.6,  # 'randomness' of outputs
    max_new_tokens=300,  # max number of tokens to generate in the output
    # 1.0 is the neutral value per the transformers generation docs, i.e. no
    # repetition penalty is applied; raise it above 1.0 if output starts looping.
    repetition_penalty=1.0
)

In [None]:
from langchain.llms import HuggingFacePipeline

# Expose the transformers pipeline through LangChain's LLM interface so it can
# be plugged into the retrieval chain.
llm = HuggingFacePipeline(pipeline=generate_text)

In [None]:
from langchain.vectorstores import Pinecone

# Metadata key under which each vector's text content was stored at upsert time.
text_field = 'text'

# LangChain wrapper over the Pinecone index; queries are embedded with the same
# model that embedded the documents.
vectorstore = Pinecone(index, embed_model.embed_query, text_field)

In [None]:
from langchain.chains import RetrievalQA

# Retrieval-augmented QA chain: the retriever pulls context from the vector
# store and the 'stuff' chain type passes it to the LLM together with the query.
retriever = vectorstore.as_retriever()
rag_pipeline = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=retriever,
)

In [None]:
# Run the RAG pipeline over the first test descriptions and keep, side by side,
# the model's answer and the gold label of the same record.
generated_responses = []
actual_summaries = []

# Specify the number of descriptions to generate diagnosis for
num_descriptions_to_generate = 50

# The loop counter is deliberately NOT called `index`: that name is the Pinecone
# index handle used by other cells and must not be overwritten with an int.
for sample_idx, data in enumerate(test_dataset):
    if sample_idx >= num_descriptions_to_generate:
        break
    description = data['clinical_conditions']
    prompt = (
        "Medical Diagnosis Task: You are presented with a clinical description. "
        "Based on this description, identify the most appropriate medical category for diagnosis. Respond with one of the following categories, no explanations are needed. "
        "The categories are: \n"
        "1. Neoplasms\n"
        "2. Digestive System Diseases\n"
        "3. Nervous System Diseases\n"
        "4. Cardiovascular Diseases\n"
        "5. General Pathological Conditions\n\n"
        f"Clinical Description: '{description}'\n"
        "Diagnosis: "
    )

    # Query the RAG pipeline with the classification prompt.
    response = rag_pipeline(prompt)

    # Keep only the 'result' field of the response dictionary.
    generated_responses.append(response['result'])

    # Collect the gold label in the same pass so both lists always have the
    # same length and order, even when the test split has fewer records than
    # `num_descriptions_to_generate` or that number is changed.
    actual_summaries.append(data['label'])

# generated_responses[i] is the model output and actual_summaries[i] the gold
# label for the i-th evaluated test record.


In [None]:
# Display the raw model outputs collected by the generation loop.
generated_responses

In [None]:
# Display the gold labels that the model outputs are scored against.
actual_summaries

In [None]:
import re
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

# Fallback pattern used when no category name is found in a response: it picks
# out single digits 1-5 and runs of letters/whitespace.
pattern = r'(1|2|3|4|5|[A-Za-z\s]+|[1-5][.: -]\s*[A-Za-z\s]+)'

# Maps every accepted spelling of an answer to its canonical (lower-case) label.
label_mapping = {
    '1': 'neoplasms',
    '2': 'digestive system diseases',
    '3': 'nervous system diseases',
    '4': 'cardiovascular diseases',
    '5': 'general pathological conditions',
    'neoplasms': 'neoplasms',
    'digestive system diseases': 'digestive system diseases',
    'nervous system diseases': 'nervous system diseases',
    'cardiovascular diseases': 'cardiovascular diseases',
    'general pathological conditions': 'general pathological conditions',
    '1. neoplasms': 'neoplasms',
    '2. digestive system diseases': 'digestive system diseases',
    '3. nervous system diseases': 'nervous system diseases',
    '4. cardiovascular diseases': 'cardiovascular diseases',
    '5. general pathological conditions': 'general pathological conditions',
    '1 - neoplasms': 'neoplasms',
    '2 - digestive system diseases': 'digestive system diseases',
    '3 - nervous system diseases': 'nervous system diseases',
    '4 - cardiovascular diseases': 'cardiovascular diseases',
    '5 - general pathological conditions': 'general pathological conditions',
    '1: neoplasms': 'neoplasms',
    '2: digestive system diseases': 'digestive system diseases',
    '3: nervous system diseases': 'nervous system diseases',
    '4: cardiovascular diseases': 'cardiovascular diseases',
    '5: general pathological conditions': 'general pathological conditions',
    '1 (Neoplasms)': 'neoplasms',
    '2 (Digestive System Diseases)': 'digestive system diseases',
    '3 (Nervous System Diseases)': 'nervous system diseases',
    '4 (cardiovascular diseases)': 'cardiovascular diseases',
    '5 (General Pathological Conditions)': 'general pathological conditions',
}

# Phrases whose presence in a response identifies a label.
label_keywords = {
    'neoplasms': ['neoplasms'],
    'digestive system diseases': ['digestive system diseases'],
    'nervous system diseases': ['nervous system diseases'],
    'cardiovascular diseases': ['cardiovascular diseases'],
    'general pathological conditions': ['general pathological conditions'],
}


def find_label_based_on_keywords(text):
    """Return the label whose keyword is mentioned earliest in ``text``.

    The search is case-insensitive. When a response names several categories,
    the one that appears first in the text wins, instead of whichever label
    happens to come first in ``label_keywords``. Ties keep the label that is
    listed first in ``label_keywords``.

    Args:
        text: Raw model output.

    Returns:
        The canonical label, or ``None`` when no keyword occurs in ``text``.
    """
    lowered = text.lower()
    best_label = None
    best_position = None
    for label, keywords in label_keywords.items():
        for keyword in keywords:
            position = lowered.find(keyword.lower())
            if position != -1 and (best_position is None or position < best_position):
                best_label, best_position = label, position
    return best_label


def _extract_label(output):
    """Map one model output to a canonical label, or "Unknown" if none is found."""
    label_found = find_label_based_on_keywords(output)
    if label_found:
        return label_found
    # No category name present: fall back to the first regex match that is a
    # known answer spelling (in practice a bare category number).
    for match in re.findall(pattern, output):
        candidate = match.strip()
        if candidate in label_mapping:
            return label_mapping[candidate]
    return "Unknown"


def _normalize_label(label):
    """Lower-case and trim a gold label so it compares equal to predicted labels."""
    return str(label).strip().lower()


# Extract relevant part from each output and map to actual labels
extracted_parts = [_extract_label(output) for output in generated_responses]

# Every prediction needs a gold label to be scored against.
if len(actual_summaries) < len(extracted_parts):
    raise ValueError("The number of actual labels and predicted labels must be the same.")

# Drop "Unknown" predictions, keeping gold and predicted labels paired. Gold
# labels are normalised because predicted labels are always lower-case.
scored_pairs = [
    (_normalize_label(actual), predicted)
    for actual, predicted in zip(actual_summaries, extracted_parts)
    if predicted != "Unknown"
]
filtered_actual_labels = [actual for actual, _ in scored_pairs]
filtered_extracted_parts = [predicted for _, predicted in scored_pairs]

# The metrics below ignore unparseable responses, so report how many there were;
# otherwise the scores look better than the pipeline really is.
num_unknown = len(extracted_parts) - len(filtered_extracted_parts)
print(f"Unparseable responses excluded from the metrics: {num_unknown} of {len(extracted_parts)}")

if not filtered_extracted_parts:
    raise ValueError("No response could be mapped to a known label; metrics are undefined.")

# NOTE(review): restricting `labels` to the predicted classes leaves classes
# that were never predicted out of the weighted averages — confirm intended.
predicted_classes = np.unique(filtered_extracted_parts)

# Calculate accuracy, precision, recall, and F1 score
accuracy = accuracy_score(filtered_actual_labels, filtered_extracted_parts)
precision = precision_score(filtered_actual_labels, filtered_extracted_parts, average='weighted', labels=predicted_classes)
recall = recall_score(filtered_actual_labels, filtered_extracted_parts, average='weighted', labels=predicted_classes)
f1 = f1_score(filtered_actual_labels, filtered_extracted_parts, average='weighted', labels=predicted_classes)

# Print the calculated metrics
print("Accuracy:", accuracy)
print("Weighted Precision:", precision)
print("Weighted Recall:", recall)
print("Weighted F1 Score:", f1)


In [None]:
import matplotlib.pyplot as plt

# Metric name -> score computed in the evaluation cell, in display order.
metric_scores = {
    'Accuracy': accuracy,
    'Weighted Precision': precision,
    'Weighted Recall': recall,
    'Weighted F1': f1,
}
labels = list(metric_scores.keys())
values = list(metric_scores.values())

# Text summary of the same numbers that are plotted below.
print("Metrics:")
for label, value in metric_scores.items():
    print(f"{label}: {value}")

# One bar per metric, using the explicit figure/axes interface.
fig, ax = plt.subplots()
ax.bar(labels, values, color=['blue', 'green', 'orange', 'red'])
ax.set_xlabel('Metrics')
ax.set_ylabel('Score')
ax.set_title('Evaluation Metrics')

# Render the figure.
plt.show()
