In [1]:
import os
# Restrict CUDA to device index 0; this is set before torch is imported below.
visible_gpu_ids = "0"
os.environ["CUDA_VISIBLE_DEVICES"] = visible_gpu_ids

In [2]:
import torch
# Number of CUDA devices PyTorch can see after the CUDA_VISIBLE_DEVICES filter.
visible_gpu_count = torch.cuda.device_count()
print(f"Visible: {visible_gpu_count}")

Visible: 1


In [3]:
# Report which CUDA device PyTorch selected. The queries are guarded because
# torch.cuda.current_device() raises when no CUDA device is visible, which
# would stop a fresh Restart & Run All on a CPU-only host.
if torch.cuda.is_available():
    device_index = torch.cuda.current_device()
    print("Using device:", device_index)
    print("Device name:", torch.cuda.get_device_name(device_index))
else:
    print("Using device: cpu (no CUDA device is visible to PyTorch)")

Using device: 0
Device name: NVIDIA RTX A6000


In [32]:
import os
from dotenv import load_dotenv

# Read settings from the local .env file; override=True is passed so values
# from .env take precedence over variables already set in the environment.
load_dotenv(override=True)

# Groq API key used to build the client (None when the variable is unset).
api_key = os.environ.get("GROQ_API_KEY")

In [8]:
from groq import Groq

# Shared Groq client, authenticated with the key read from the environment.
client = Groq(api_key=api_key)

# Function to generate evaluation using Groq API
def generate_evaluation(prompt, *, model="llama3-70b-8192", max_tokens=200,
                        temperature=0.3, top_p=1.0):
    """Send ``prompt`` to the Groq chat-completions API and return the reply text.

    Args:
        prompt: Full evaluation prompt, sent as a single user message.
        model: Groq model identifier to query.
        max_tokens: Upper bound on the length of the generated reply.
        temperature: Sampling temperature passed to the API.
        top_p: Nucleus-sampling cutoff passed to the API.

    Returns:
        The message content of the first returned choice. If the request fails
        for any reason, the error is printed and the sentinel string
        ``"Score: [0]"`` is returned instead, so a caller that parses the reply
        records a score of 0 for that prompt.
    """
    try:
        chat_completion = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "user", "content": prompt}
            ],
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
        )
        # Only the first choice is used; no `n` is requested in the call.
        return chat_completion.choices[0].message.content
    except Exception as e:
        # Best effort: one failed request must not abort a whole judging run.
        print(f"Error generating evaluation: {e}")
        return "Score: [0]"

In [14]:
# Results files scored by the first judging loop.
csv_files1 = ["prefix-tuning-results.csv"]

In [26]:
# Results files scored by the second judging loop.
csv_files2 = [
    "transfer-learning-results.csv",
]

In [27]:
# Remaining results files (presumably judged in a later cell; confirm there).
csv_file3 = [
    "lora-finetuned-results-new.csv",
    "adaptive-learning-results.csv",
    "full-paramater-generated-results.csv",
]

In [16]:
import pandas as pd
from groq import Groq
from pathlib import Path
import os

In [19]:
# Judge every results file listed in csv_files1 with the LLM and collect the
# scored dataframes in all_results, keyed by source file name.
import re

# First number after a "Score:" label. Tolerates decorations the judge model
# may put around it, e.g. "Score: [8]", "Score: 8/10" or "Score: **7.5**".
score_after_label = re.compile(r"Score:[\s\[(*]*(\d+(?:\.\d+)?|\.\d+)")
# A reply that is nothing but the number, e.g. "8", "[8.5]" or "8/10".
bare_score = re.compile(r"[\s\[(*]*(\d+(?:\.\d+)?|\.\d+)[\s\])*]*(?:/\s*10)?[\s.]*")

# Prepare a dictionary to store results for each model
all_results = {}

for file_name in csv_files1:
    file_path = Path(file_name)

    # Skip missing files instead of aborting the whole run
    if not file_path.is_file():
        print(f"File not found: {file_name}")
        continue

    # Load the CSV for the current model; only the first 50 rows are judged
    try:
        df = pd.read_csv(file_path).head(50)
    except Exception as e:
        print(f"Error loading {file_name}: {e}")
        continue

    # Verify required columns
    required_columns = ["generated_soap", "reference_soap"]
    if not all(col in df.columns for col in required_columns):
        print(f"Missing required columns in {file_name}: {required_columns}")
        continue

    # One judge score per row, in row order
    scores = []

    for i, row in df.iterrows():
        generated_soap = row["generated_soap"]
        reference_soap = row["reference_soap"]

        # Prepare the evaluation prompt
        prompt = f"""
        Evaluate the following SOAP notes:

        Reference SOAP:
        {reference_soap}

        Generated SOAP:
        {generated_soap}

        Rate the quality of the generated SOAP note on a scale of 0-10 based on the following criteria:
        - Completeness: How much of the necessary information is included (0.25 weight)
        - Correctness: Medical accuracy of the content (0.35 weight)
        - Organization: Structure follows SOAP format (0.20 weight)
        - Clinical Relevance: Relevance of the content to clinical practice (0.20 weight)

        Provide only the score from 0 to 10 based on the weighted evaluation.
        Score: [ ]
        """

        # Generate evaluation using Groq API
        generated_content = generate_evaluation(prompt)

        try:
            # Prefer the number after a "Score:" label; fall back to a reply
            # that consists of the number alone.
            score_match = (
                score_after_label.search(generated_content)
                or bare_score.fullmatch(generated_content)
            )
            if score_match is None:
                raise ValueError(f"no score found in {generated_content!r}")
            score = float(score_match.group(1))
            # A value outside the requested scale is a malformed reply, not a score.
            if not 0.0 <= score <= 10.0:
                raise ValueError(f"score {score} is outside the 0-10 scale")
            scores.append(score)
        except Exception as e:
            # Best effort: unparseable replies are logged and counted as 0.0,
            # which keeps one score per row but pulls the mean down.
            print(f"Failed to parse output at row {i+1} in {file_name}: {e}")
            scores.append(0.0)

    # Save the judged results for the current model
    df["Judge Score"] = scores

    # Store the evaluated dataframe in the dictionary
    all_results[file_name] = df

In [20]:
# Write each judged dataframe to "judged_<source file name>" and report it.
for file_name in all_results:
    result_df = all_results[file_name]
    output_file = "judged_" + file_name
    result_df.to_csv(output_file, index=False)
    print(f"Evaluation for {file_name} completed and saved as {output_file}!")

print("All evaluations completed!")

Evaluation for prefix-tuning-results.csv completed and saved as judged_prefix-tuning-results.csv!
All evaluations completed!


In [21]:
# Reload the judged prefix-tuning results from disk.
llama_3B_prefix_tuning = pd.read_csv("judged_prefix-tuning-results.csv")

In [22]:
# Mean of the per-row judge scores for the prefix-tuning results.
prefix_tuning_scores = llama_3B_prefix_tuning["Judge Score"]
avg1 = prefix_tuning_scores.mean()

In [23]:
# Show the mean judge score (0-10 scale) for the prefix-tuning results.
print(avg1)

7.968


In [29]:
# Judge every results file listed in csv_files2 with the LLM and collect the
# scored dataframes in all_results, keyed by source file name.
import re

# First number after a "Score:" label. Tolerates decorations the judge model
# may put around it, e.g. "Score: [8]", "Score: 8/10" or "Score: **7.5**".
score_after_label = re.compile(r"Score:[\s\[(*]*(\d+(?:\.\d+)?|\.\d+)")
# A reply that is nothing but the number, e.g. "8", "[8.5]" or "8/10".
bare_score = re.compile(r"[\s\[(*]*(\d+(?:\.\d+)?|\.\d+)[\s\])*]*(?:/\s*10)?[\s.]*")

# Prepare a dictionary to store results for each model
all_results = {}

for file_name in csv_files2:
    file_path = Path(file_name)

    # Skip missing files instead of aborting the whole run
    if not file_path.is_file():
        print(f"File not found: {file_name}")
        continue

    # Load the CSV for the current model; only the first 50 rows are judged
    try:
        df = pd.read_csv(file_path).head(50)
    except Exception as e:
        print(f"Error loading {file_name}: {e}")
        continue

    # Verify required columns
    required_columns = ["Generated SOAP", "Reference SOAP"]
    if not all(col in df.columns for col in required_columns):
        print(f"Missing required columns in {file_name}: {required_columns}")
        continue

    # One judge score per row, in row order
    scores = []

    for i, row in df.iterrows():
        generated_soap = row["Generated SOAP"]
        reference_soap = row["Reference SOAP"]

        # Prepare the evaluation prompt
        prompt = f"""
        Evaluate the following SOAP notes:

        Reference SOAP:
        {reference_soap}

        Generated SOAP:
        {generated_soap}

        Rate the quality of the generated SOAP note on a scale of 0-10 based on the following criteria:
        - Completeness: How much of the necessary information is included (0.25 weight)
        - Correctness: Medical accuracy of the content (0.35 weight)
        - Organization: Structure follows SOAP format (0.20 weight)
        - Clinical Relevance: Relevance of the content to clinical practice (0.20 weight)

        Provide only the score from 0 to 10 based on the weighted evaluation.
        Score: [ ]
        """

        # Generate evaluation using Groq API
        generated_content = generate_evaluation(prompt)

        try:
            # Prefer the number after a "Score:" label; fall back to a reply
            # that consists of the number alone.
            score_match = (
                score_after_label.search(generated_content)
                or bare_score.fullmatch(generated_content)
            )
            if score_match is None:
                raise ValueError(f"no score found in {generated_content!r}")
            score = float(score_match.group(1))
            # A value outside the requested scale is a malformed reply, not a score.
            if not 0.0 <= score <= 10.0:
                raise ValueError(f"score {score} is outside the 0-10 scale")
            scores.append(score)
        except Exception as e:
            # Best effort: unparseable replies are logged and counted as 0.0,
            # which keeps one score per row but pulls the mean down.
            print(f"Failed to parse output at row {i+1} in {file_name}: {e}")
            scores.append(0.0)

    # Save the judged results for the current model
    df["Judge Score"] = scores

    # Store the evaluated dataframe in the dictionary
    all_results[file_name] = df

In [30]:
# Write each judged dataframe to "judged_<source file name>" and report it.
for file_name in all_results:
    result_df = all_results[file_name]
    output_file = "judged_" + file_name
    result_df.to_csv(output_file, index=False)
    print(f"Evaluation for {file_name} completed and saved as {output_file}!")

print("All evaluations completed!")

Evaluation for transfer-learning-results.csv completed and saved as judged_transfer-learning-results.csv!
All evaluations completed!


In [31]:
# Reload the judged transfer-learning results and report their mean judge score.
bart_large_transfer_learning = pd.read_csv("judged_transfer-learning-results.csv")
transfer_learning_scores = bart_large_transfer_learning["Judge Score"]
avg2 = transfer_learning_scores.mean()
print(avg2)

6.325
