In [3]:
pip install openai

Collecting openai
  Downloading openai-1.95.1-py3-none-any.whl.metadata (29 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)
Downloading openai-1.95.1-py3-none-any.whl (755 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m755.6/755.6 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Downloading jiter-0.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (352 kB)
Installing collected packages: jiter, openai
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [openai]2m1/2[0m [openai]
Successfully installed jiter-0.10.0 openai-1.95.1
Note: you may need to restart the kernel to use updated packages.


In [23]:
import pandas as pd
from openai import OpenAI

In [10]:
pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Downloading python_dotenv-1.1.1-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.1.1
Note: you may need to restart the kernel to use updated packages.


In [8]:
pip install -q -U google-genai

Note: you may need to restart the kernel to use updated packages.


In [24]:
import os
from dotenv import load_dotenv
from google import genai
from google.genai import types

# Populate the process environment from the local .env file.
# override=True is passed through to python-dotenv (presumably so .env values
# win over variables that are already set -- confirm if relying on it).
load_dotenv(override=True)

# Read the Gemini key from GOOGLE_API_KEY and build the shared client with it.
api_key = os.environ.get("GOOGLE_API_KEY")
client = genai.Client(api_key=api_key)

In [25]:
def generate_evaluation(prompt, model="gemini-2.5-flash", max_retries=3, retry_delay=45):
    """Send ``prompt`` to the Gemini judge model and return its text reply.

    Errors whose message contains ``RESOURCE_EXHAUSTED`` (rate limiting) are
    retried after ``retry_delay`` seconds, for at most ``max_retries`` attempts
    in total. Any other error, an exhausted retry budget, or a reply that
    carries no text yields the fallback string ``"Score: [0]"``, so the caller
    always receives a string it can parse.

    Args:
        prompt: Full evaluation prompt, sent as the only content item.
        model: Name of the Gemini model to call.
        max_retries: Maximum number of attempts.
        retry_delay: Seconds to wait after a rate-limit error before retrying.

    Returns:
        str: The model's reply text, or ``"Score: [0]"`` on failure.
    """
    import time  # kept function-local, as in the original cell

    fallback = "Score: [0]"

    for attempt in range(max_retries):
        try:
            response = client.models.generate_content(
                model=model,
                contents=[prompt],
                config=types.GenerateContentConfig(
                    temperature=0.3,
                    top_p=1.0,
                    max_output_tokens=100,
                    thinking_config=types.ThinkingConfig(thinking_budget=0)
                )
            )
            text = response.text
        except Exception as e:
            if "RESOURCE_EXHAUSTED" in str(e) and attempt < max_retries - 1:
                print(f"Rate limit hit. Retrying after {retry_delay} seconds... (attempt {attempt+1})")
                time.sleep(retry_delay)
                continue
            print(f"Error generating evaluation: {e}")
            return fallback

        # The reply may carry no text at all; never hand None (or an empty
        # string) to the caller's score parser.
        if not text:
            print("Error generating evaluation: response contained no text")
            return fallback
        return text

    # Only reached when max_retries <= 0 (no attempt was made).
    return fallback

# Result files (one per model) whose generated letters will be judged
csv_files = [
    "tf_results.csv",
]

In [26]:
from pathlib import Path
import re

# Number of leading rows to judge per file; set to None to judge every row.
MAX_ROWS = 2

# Columns every results CSV must provide.
REQUIRED_COLUMNS = ["Generated Letter", "Reference Letter"]


def _parse_score(text):
    """Extract the numeric score from the judge's reply.

    Accepts the requested ``Score: [7.5]`` form (brackets optional) and, since
    the prompt asks for "only the score", also a bare number such as ``7.5``.

    Raises:
        ValueError: if ``text`` is not a string or contains no number.
    """
    if not isinstance(text, str):
        raise ValueError(f"expected text, got {type(text).__name__}")
    number = r"-?(?:\d+(?:\.\d*)?|\.\d+)"
    # Prefer a number that directly follows "Score:"; otherwise take the first
    # number that appears anywhere in the reply.
    match = re.search(rf"Score:\s*\[?\s*({number})", text) or re.search(rf"({number})", text)
    if match is None:
        raise ValueError(f"no numeric score found in {text!r}")
    return float(match.group(1))


# Prepare a dictionary to store results for each model
all_results = {}

for file_name in csv_files:
    file_path = Path(file_name)

    # Skip files that do not exist instead of aborting the whole run
    if not file_path.is_file():
        print(f"File not found: {file_name}")
        continue

    # Load the CSV for the current model
    try:
        df = pd.read_csv(file_path)
    except Exception as e:
        print(f"Error loading {file_name}: {e}")
        continue

    # Verify required columns and report exactly which ones are absent
    missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
    if missing_columns:
        print(f"Missing required columns in {file_name}: {missing_columns}")
        continue

    # Only the first MAX_ROWS rows are judged; the rest keep NaN in "Judge Score"
    df_subset = df if MAX_ROWS is None else df.head(MAX_ROWS)

    # One score per judged row, in row order
    scores = []

    for row_number, (_, row) in enumerate(df_subset.iterrows(), start=1):
        generated_letter = row["Generated Letter"]
        reference_letter = row["Reference Letter"]

        # Prepare the evaluation prompt.
        # NOTE(review): the label "Generated SOAP" does not match the
        # "Discharge Summaries" / "clinical letter" wording used elsewhere in
        # the prompt. It is left untouched so scores stay comparable with
        # earlier runs -- confirm whether it should be renamed.
        prompt = f"""
        Evaluate the following Discharge Summaries:

        Reference Discharge Summary:
        {reference_letter}

        Generated SOAP:
        {generated_letter}

        Rate the quality of the generated clinical letter on a scale of 0-10 based on the following criteria:
        - Completeness: How much of the necessary information is included (0.25 weight)
        - Correctness: Medical accuracy of the content (0.35 weight)
        - Organization: Structure follows clinical letter format (0.20 weight)
        - Clinical Relevance: Relevance of the content to clinical practice (0.20 weight)

        Provide only the score from 0 to 10 based on the weighted evaluation.
        Score: [ ]
        """

        # Generate the evaluation with the Gemini judge
        generated_content = generate_evaluation(prompt)

        try:
            scores.append(_parse_score(generated_content))
        except ValueError as e:
            # Best effort: an unparseable reply is recorded as 0.0, as before
            print(f"Failed to parse output at row {row_number} in {file_name}: {e}")
            scores.append(0.0)

    # Write the scores back onto the judged rows only
    df.loc[df_subset.index, "Judge Score"] = scores

    # Store the evaluated dataframe in the dictionary
    all_results[file_name] = df

In [27]:
# Write each judged dataframe to "judged_<source file name>" (without the index)
for file_name in all_results:
    output_file = "judged_" + file_name
    all_results[file_name].to_csv(output_file, index=False)
    print(f"Evaluation for {file_name} completed and saved as {output_file}!")

print("All evaluations completed!")

Evaluation for tf_results.csv completed and saved as judged_tf_results.csv!
All evaluations completed!


In [30]:
# Load the judged results that were saved for the transfer-learning run
facebook_bart_large_transfer_learning = pd.read_csv(
    filepath_or_buffer="judged_tf_results.csv"
)

In [31]:
# Average of the "Judge Score" column.
# NOTE(review): only the rows judged upstream carry a score; pandas' mean()
# skips NaN by default, so this should be the mean over the judged rows only.
facebook_bart_large_transfer_learning.loc[:, "Judge Score"].mean()

np.float64(5.55)