In [1]:
import dotenv
dotenv.load_dotenv()
import tensorflow as tf
import pandas as pd
from rouge_score import rouge_scorer
import evaluate
import os
import csv

2025-08-09 15:51:01.136126: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-09 15:51:01.166651: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-08-09 15:51:01.166675: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-08-09 15:51:01.167492: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-08-09 15:51:01.172802: I tensorflow/core/platform/cpu_feature_guar

# Evaluation

## ROUGE F1 Scores

An older, purely statistical family of text-similarity metrics, typically used to evaluate summaries.

In [2]:
# Shared ROUGE scorer: only the ROUGE-L variant is computed, with the stemmer enabled.
rouge_calcer = rouge_scorer.RougeScorer(rouge_types = ['rougeL'], use_stemmer = True)

def calc_rouge(short_text, full_text):
    """Return the ROUGE-L F-measure for a pair of texts, or 0.0 if scoring fails.

    Both arguments are forwarded, in the order given, to the module-level
    ``rouge_calcer``. Any ``Exception`` raised while scoring is swallowed and
    reported as a score of 0.0.
    """
    try:
        f_measure = rouge_calcer.score(short_text, full_text)['rougeL'].fmeasure
    except Exception:
        return 0.0
    return f_measure

def calc_rouge_pair(row):
    """Score one row's human description and model summary against its transcription.

    Returns a ``(description_score, model_summary_score)`` tuple, each value
    coming from ``calc_rouge``.
    """
    transcription = row['transcription']
    human_score = calc_rouge(row['description'], transcription)
    model_score = calc_rouge(row['model-summary'], transcription)
    return human_score, model_score

## BERTScore

Uses embeddings from the BERT transformer model to judge similarity.

In [3]:
# Load the `evaluate` BERTScore metric once; calc_bertscore reuses this object for every call.
bertscore = evaluate.load("bertscore")

def calc_bertscore(short_text, full_text):
    """Return the BERTScore F1 of ``short_text`` measured against ``full_text``.

    ``short_text`` is submitted as the single prediction and ``full_text`` as
    the single reference to the module-level ``bertscore`` metric.

    Scoring is best-effort: any ordinary ``Exception`` is swallowed and
    reported as 0.0. ``KeyboardInterrupt`` and ``SystemExit`` are NOT
    swallowed, so a long scoring run can still be stopped.
    """
    try:
        result = bertscore.compute(
            predictions = [short_text],
            references = [full_text],
            model_type = "microsoft/deberta-large-mnli",
            lang = "en",
            device = "cuda:0"
        )

        return result["f1"][0]
    except Exception:
        # Deliberately not a bare ``except:`` -- that would also trap Ctrl-C.
        return 0.0

def calc_bertscore_pair(row):
    """Return a two-element Series of BERTScore F1 values for one row.

    Element 0 scores the human ``description`` and element 1 the
    ``model-summary``, each against the row's ``transcription``.
    """
    reference = row['transcription']
    scores = [
        calc_bertscore(row[candidate_column], reference)
        for candidate_column in ('description', 'model-summary')
    ]
    return pd.Series(scores)

## BLEURT

A regression model based on BERT.

In [4]:
# Load the `evaluate` BLEURT metric with the "bleurt-tiny-128" config once; calc_bleu reuses it for every call.
bleurt = evaluate.load("bleurt", config_name = "bleurt-tiny-128")

def calc_bleu(short_text, full_text):
    """Return the BLEURT score of ``short_text`` measured against ``full_text``.

    Despite the function name, this uses the module-level ``bleurt`` metric
    (BLEURT), not BLEU. ``short_text`` is submitted as the single prediction
    and ``full_text`` as the single reference.

    Scoring is best-effort: any ordinary ``Exception`` is swallowed and
    reported as 0.0. ``KeyboardInterrupt`` and ``SystemExit`` are NOT
    swallowed, so a long scoring run can still be stopped.
    """
    # Pinned to CPU: this was slower on GPU, presumably because the model is
    # too small to parallelize well.
    with tf.device("/CPU:0"):
        try:
            result = bleurt.compute(
                predictions = [short_text],
                references = [full_text],
            )
            return result["scores"][0]
        except Exception:
            # Deliberately not a bare ``except:`` -- that would also trap Ctrl-C.
            return 0.0

def calc_bleu_pair(row):
    """Return a two-element Series of BLEURT scores for one row.

    Element 0 scores the human ``description`` and element 1 the
    ``model-summary``, each against the row's ``transcription``.
    """
    reference = row['transcription']
    scores = [
        calc_bleu(row[candidate_column], reference)
        for candidate_column in ('description', 'model-summary')
    ]
    return pd.Series(scores)

INFO:tensorflow:Reading checkpoint /mnt/data/hf/metrics/bleurt/bleurt-tiny-128/downloads/extracted/599cd3ff6a3bbad54e145d867ccea405bb98c2b832fb29b50fb02089a1026530/bleurt-tiny-128.


INFO:tensorflow:Reading checkpoint /mnt/data/hf/metrics/bleurt/bleurt-tiny-128/downloads/extracted/599cd3ff6a3bbad54e145d867ccea405bb98c2b832fb29b50fb02089a1026530/bleurt-tiny-128.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Will load checkpoint bert_custom


INFO:tensorflow:Will load checkpoint bert_custom


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:... name:bert_custom


INFO:tensorflow:... name:bert_custom


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... max_seq_length:128


INFO:tensorflow:... max_seq_length:128


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating WordPiece tokenizer.


INFO:tensorflow:Creating WordPiece tokenizer.


INFO:tensorflow:WordPiece tokenizer instantiated.


INFO:tensorflow:WordPiece tokenizer instantiated.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Loading model.


INFO:tensorflow:Loading model.
2025-08-09 15:51:06.563567: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-08-09 15:51:06.564449: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2256] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.


## HHEM 2.1 model from Vectara (WIP)

A language-model-based system for detecting hallucinations in the output of other language models.

In [5]:
# Point W&B at the public API endpoint before the Weave scorers are imported.
# NOTE(review): presumably the scorer fetches its model artifact through W&B and reads this variable -- confirm.
os.environ["WANDB_BASE_URL"] = "https://api.wandb.ai"
from weave.scorers import WeaveHallucinationScorerV1

# Single scorer instance placed on the first CUDA device; hallucination_score reuses it for every call.
hallucination_scorer = WeaveHallucinationScorerV1(device = 'cuda:0')

def hallucination_score(query, context, output):
    """Return the hallucination score reported by the module-level ``hallucination_scorer``.

    ``query``, ``context`` and ``output`` are forwarded unchanged as keyword
    arguments; the value returned is ``result.metadata['score']``.

    Scoring is best-effort: any ordinary ``Exception`` is swallowed and
    reported as 1.0. ``KeyboardInterrupt`` and ``SystemExit`` are NOT
    swallowed, so a long scoring run can still be stopped.
    """
    try:
        result = hallucination_scorer.score(
            query = query,
            context = context,
            output = output,
        )

        return result.metadata['score']
    except Exception:
        # Deliberately not a bare ``except:`` -- that would also trap Ctrl-C.
        return 1.0

[34m[1mwandb[0m: Downloading large artifact hallucination_hhem_scorer:v0, 421.31MB. 8 files... 
[34m[1mwandb[0m:   8 of 8 files downloaded.  
Done. 0:0:1.0 (411.0MB/s)
Device set to use cuda:0


## Coherence

A fine-tuned deberta-small-long-nli small language model that checks whether the writing contradicts itself.

In [6]:
# NOTE(review): tensorflow is already imported in the first cell, so this re-import is a no-op.
import tensorflow as tf
# NOTE(review): USE_TF is presumably read when transformers is first imported; if an earlier
# cell already imported it, this assignment comes too late to have any effect -- confirm.
os.environ["USE_TF"] = "0"
# Point W&B at the public API endpoint before the Weave scorer is imported.
os.environ["WANDB_BASE_URL"] = "https://api.wandb.ai"
from weave.scorers import WeaveCoherenceScorerV1

# Single scorer instance placed on the first CUDA device; coherence_score reuses it for every call.
coherence_scorer = WeaveCoherenceScorerV1(device = 'cuda:0')

def coherence_score(query, output):
    """Return the coherence score reported by the module-level ``coherence_scorer``.

    ``query`` and ``output`` are forwarded unchanged as keyword arguments; the
    value returned is ``result.metadata['score']``.

    Scoring is best-effort: any ordinary ``Exception`` is swallowed and
    reported as 0.0. ``KeyboardInterrupt`` and ``SystemExit`` are NOT
    swallowed, so a long scoring run can still be stopped.
    """
    try:
        result = coherence_scorer.score(
            query = query,
            output = output
        )

        return result.metadata['score']
    except Exception:
        # Deliberately not a bare ``except:`` -- that would also trap Ctrl-C.
        return 0.0

[34m[1mwandb[0m: Downloading large artifact coherence_scorer:v0, 549.59MB. 21 files... 
[34m[1mwandb[0m:   21 of 21 files downloaded.  
Done. 0:0:1.1 (489.8MB/s)
Device set to use cuda:0


## Calculate and Save

In [7]:
model_names = ['biobart', 'gemma', 'llama']

for test_model_name in model_names:
    # Read this model's summaries; transcription is coerced to str before any scoring.
    df = pd.read_csv(f'./data/mtsamples_with_{test_model_name}.csv')
    df['transcription'] = df['transcription'].astype(str)

    # Pairwise metrics: "_source" scores the human description, "_dest" the model summary,
    # both measured against the transcription.
    rouge_pairs = df.apply(calc_rouge_pair, axis = 1, result_type = 'expand')
    df[['rougeL_f1_source', 'rougeL_f1_dest']] = rouge_pairs

    bertscore_pairs = df.apply(calc_bertscore_pair, axis = 1)
    df[['bertscore_f1_source', 'bertscore_f1_dest']] = bertscore_pairs

    bleurt_pairs = df.apply(calc_bleu_pair, axis = 1)
    df[['bleurt_source', 'bleurt_dest']] = bleurt_pairs

    # Weave scorers: hallucination is computed for the model summary only,
    # coherence for both the description and the model summary.
    df['hallucination-score'] = df.apply(
        lambda row: hallucination_score(row['transcription'], row['description'], row['model-summary']),
        axis = 1,
    )
    df['coherence-score_source'] = df.apply(
        lambda row: coherence_score(row['transcription'], row['description']),
        axis = 1,
    )
    df['coherence-score_dest'] = df.apply(
        lambda row: coherence_score(row['transcription'], row['model-summary']),
        axis = 1,
    )

    # Persist this model's scores next to its input file.
    df.to_csv(
        f'./data/mtsamples_with_{test_model_name}_model_scores.csv',
        index = False,
        quoting = csv.QUOTE_NONNUMERIC,
    )


 (subsequent messages of this type will be suppressed)
Token indices sequence length is longer than the specified maximum sequence length for this model (711 > 512). Running this sequence through the model will result in indexing errors
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


---

# Aggregate

## Load the metrics for all models

In [8]:
# Stack the three per-model score files into one frame with a fresh 0..n-1 index.
score_files = [
    './data/mtsamples_with_biobart_model_scores.csv',
    './data/mtsamples_with_gemma_model_scores.csv',
    './data/mtsamples_with_llama_model_scores.csv',
]
df = pd.concat([pd.read_csv(score_file) for score_file in score_files], ignore_index = True)

## Data exploration

In [9]:
# Preview the first five rows of the combined score frame.
df.head(n = 5)

Unnamed: 0.1,Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords,model-summary,model-name,rougeL_f1_source,rougeL_f1_dest,bertscore_f1_source,bertscore_f1_dest,bleurt_source,bleurt_dest,hallucination-score,coherence-score_source,coherence-score_dest
0,0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller...",SUBJECTIVITY: This 25 year old white female....,biobart,0.086957,0.1,0.460195,0.501567,-0.578639,-0.751127,0.837365,0.868146,0.681641
1,1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh...",PASTMEDICAL HOSPITAL: He is a retired male. he...,biobart,0.005495,0.046875,0.390673,0.470543,-0.645709,-0.409956,0.91399,0.32524,0.438955
2,2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart...","HISTORY of PRESENTILLNESS, among other bilater...",biobart,0.01087,0.026525,0.408516,0.419701,-0.911012,-0.542565,0.319041,0.836381,0.324871
3,3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple...","2-D R-MODE and M-Mode: , .1. : Left ventricu...",biobart,0.121951,0.28,0.449169,0.599963,-0.929891,-0.289554,0.707408,0.7688,0.369644
4,4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo...",1. . There appears the left ventricular cham...,biobart,0.008197,0.11985,0.390977,0.553946,-0.719434,-0.249013,0.709367,0.263281,0.431459


In [10]:
# Rows per model. size() counts rows directly; count().iloc[:, 0] instead reports the
# number of non-null values in whichever column happens to come first.
df.groupby('model-name').size()

model-name
biobart      4999
med-gemma    4999
med-llama    4999
Name: Unnamed: 0, dtype: int64

# Metrics for the human-entered summaries

The source (human-written) values are the same for every model, so pick one model's rows and average those.

In [11]:
# Mean of each human-summary metric, taken from the biobart rows only.
human_metric_columns = [
    'rougeL_f1_source',
    'bertscore_f1_source',
    'bleurt_source',
    'coherence-score_source',
]
is_biobart = df['model-name'] == 'biobart'
df_metrics_human = df.loc[is_biobart, human_metric_columns].mean()
df_metrics_human

rougeL_f1_source          0.088913
bertscore_f1_source       0.485023
bleurt_source            -0.523850
coherence-score_source    0.574467
dtype: float64

# Metrics for the machine-generated summaries for each model

In [12]:
# Mean of each model-summary metric, one row per model.
model_metric_columns = [
    'rougeL_f1_dest',
    'bertscore_f1_dest',
    'bleurt_dest',
    'hallucination-score',
    'coherence-score_dest',
]
df_metrics_models = df.groupby('model-name')[model_metric_columns].mean()
df_metrics_models

Unnamed: 0_level_0,rougeL_f1_dest,bertscore_f1_dest,bleurt_dest,hallucination-score,coherence-score_dest
model-name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
biobart,0.070512,0.463525,-0.585942,0.438506,0.417923
med-gemma,0.255567,0.580905,-0.437547,0.365952,0.738806
med-llama,0.097805,0.505136,-0.572614,0.32153,0.781948


# Comparison of machine-generated and human-generated

In [13]:
# Per-model mean minus the human-summary mean for each metric that has both sides.
# Maps each output column to its (model column, human column) pair.
diff_sources = {
    'rougeL_f1_diff': ('rougeL_f1_dest', 'rougeL_f1_source'),
    'bertscore_f1_diff': ('bertscore_f1_dest', 'bertscore_f1_source'),
    'bleurt_diff': ('bleurt_dest', 'bleurt_source'),
    'coherence-score_diff': ('coherence-score_dest', 'coherence-score_source'),
}

pd.DataFrame({
    diff_column: df_metrics_models[dest_column] - df_metrics_human[source_column]
    for diff_column, (dest_column, source_column) in diff_sources.items()
})
    

Unnamed: 0_level_0,rougeL_f1_diff,bertscore_f1_diff,bleurt_diff,coherence-score_diff
model-name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
biobart,-0.0184,-0.021498,-0.062092,-0.156544
med-gemma,0.166655,0.095883,0.086304,0.164339
med-llama,0.008892,0.020113,-0.048764,0.207481
