In [14]:
import dotenv
dotenv.load_dotenv()
import tensorflow as tf
import pandas as pd
from rouge_score import rouge_scorer
import evaluate
import os
import csv

In [2]:
# Load the raw MTSamples table, then force the transcription column to plain
# Python strings so every scorer below receives text for that field.
df = pd.read_csv('./mtsamples.csv')
df = df.astype({'transcription': str})
df

Unnamed: 0.1,Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."
...,...,...,...,...,...,...
4994,4994,Patient having severe sinusitis about two to ...,Allergy / Immunology,Chronic Sinusitis,"HISTORY:, I had the pleasure of meeting and e...",
4995,4995,This is a 14-month-old baby boy Caucasian who...,Allergy / Immunology,Kawasaki Disease - Discharge Summary,"ADMITTING DIAGNOSIS: , Kawasaki disease.,DISCH...","allergy / immunology, mucous membranes, conjun..."
4996,4996,A female for a complete physical and follow u...,Allergy / Immunology,Followup on Asthma,"SUBJECTIVE: , This is a 42-year-old white fema...",
4997,4997,Mother states he has been wheezing and coughing.,Allergy / Immunology,Asthma in a 5-year-old,"CHIEF COMPLAINT: , This 5-year-old male presen...",


# ROUGE F1 Scores

In [3]:
# ROUGE variants scored for every row: n-gram overlap for n = 1..4 plus the
# two longest-common-subsequence flavours.
ROUGE_TYPES = ['rouge1', 'rouge2', 'rouge3', 'rouge4', 'rougeL', 'rougeLsum']

# One shared scorer (with stemming enabled) reused by calc_rouge for all rows.
rouge_calcer = rouge_scorer.RougeScorer(ROUGE_TYPES, use_stemmer=True)

def calc_rouge(row):
    """Compute the ROUGE F-measures of a row's description against its transcription.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) providing the
            'description' and 'transcription' entries.

    Returns:
        A 6-tuple of F-measures in the order
        (rouge1, rouge2, rouge3, rouge4, rougeL, rougeLsum).
        If scoring raises an ordinary exception, six zeros are returned so a
        single bad row does not abort a whole ``DataFrame.apply`` run.
    """
    short_text, full_text = row['description'], row['transcription']

    try:
        # RougeScorer.score is declared as score(target, prediction). The
        # description is the summary being evaluated (the prediction) and the
        # transcription is the text it is judged against (the target) -- the
        # same roles these two columns play in calc_bertscore and calc_bleu.
        rouges = rouge_calcer.score(target = full_text, prediction = short_text)
        # Each entry is a (precision, recall, fmeasure) score; keep the F-measure.
        return (
            rouges['rouge1'].fmeasure,
            rouges['rouge2'].fmeasure,
            rouges['rouge3'].fmeasure,
            rouges['rouge4'].fmeasure,
            rouges['rougeL'].fmeasure,
            rouges['rougeLsum'].fmeasure,
        )
    except Exception:
        # Deliberate best-effort fallback: an unscorable row gets all zeros.
        return 0.0, 0.0, 0.0, 0.0, 0.0, 0.0

In [4]:
# Output columns, in the same order as the tuple returned by calc_rouge.
ROUGE_F1_COLUMNS = [
    'rouge1_f1',
    'rouge2_f1',
    'rouge3_f1',
    'rouge4_f1',
    'rougeL_f1',
    'rougeLsum_f1',
]

# Score row by row; result_type='expand' spreads each returned tuple across
# the six target columns.
df[ROUGE_F1_COLUMNS] = df.apply(calc_rouge, axis=1, result_type='expand')

# BERTScore

In [5]:
# Load the BERTScore metric once; calc_bertscore reuses this object per row.
bertscore = evaluate.load(path="bertscore")

def calc_bertscore(row):
    """Compute the BERTScore F1 of a row's description against its transcription.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) providing the
            'description' and 'transcription' entries.

    Returns:
        The first (and only) F1 value reported by ``bertscore.compute`` for
        this single prediction/reference pair, or 0.0 if scoring raises an
        ordinary exception.

    Raises:
        KeyboardInterrupt, SystemExit: propagated unchanged, so interrupting
            the kernel actually stops a long ``DataFrame.apply`` run instead
            of being recorded as a 0.0 score.
    """
    short_text, full_text = row['description'], row['transcription']

    try:
        result = bertscore.compute(
                predictions = [short_text],
                references = [full_text],
                model_type = "microsoft/deberta-large-mnli",
                lang = "en")

        return result["f1"][0]
    except Exception:
        # Deliberate best-effort fallback for a row that cannot be scored.
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupts working.
        return 0.0

In [6]:
# Row-wise BERTScore F1 (one calc_bertscore call per row), stored as a new column.
bertscore_f1 = df.apply(calc_bertscore, axis=1)
df['bertscore_f1'] = bertscore_f1

Error during conversion: ChunkedEncodingError(ProtocolError('Response ended prematurely'))


# BLEURT

In [7]:
# Load the BLEURT metric with the "bleurt-tiny-128" configuration once;
# calc_bleu reuses this object for every row.
bleurt = evaluate.load(path="bleurt", config_name="bleurt-tiny-128")

def calc_bleu(row):
    """Compute the BLEURT score of a row's description against its transcription.

    Despite the name, this calls the BLEURT metric object (``bleurt``), not BLEU.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) providing the
            'description' and 'transcription' entries.

    Returns:
        The first (and only) entry of the "scores" list reported by
        ``bleurt.compute`` for this single prediction/reference pair, or 0.0
        if scoring raises an ordinary exception.

    Raises:
        KeyboardInterrupt, SystemExit: propagated unchanged, so interrupting
            the kernel actually stops a long ``DataFrame.apply`` run instead
            of being recorded as a 0.0 score.
    """
    short_text, full_text = row['description'], row['transcription']

    # Scoring is pinned to the CPU device for the duration of the call.
    with tf.device("/CPU:0"):
        try:
            result = bleurt.compute(
                    predictions = [short_text],
                    references = [full_text],
                    )
            return result["scores"][0]
        except Exception:
            # Deliberate best-effort fallback for a row that cannot be scored.
            # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupts working.
            return 0.0

INFO:tensorflow:Reading checkpoint /mnt/data/hf/metrics/bleurt/bleurt-tiny-128/downloads/extracted/599cd3ff6a3bbad54e145d867ccea405bb98c2b832fb29b50fb02089a1026530/bleurt-tiny-128.


INFO:tensorflow:Reading checkpoint /mnt/data/hf/metrics/bleurt/bleurt-tiny-128/downloads/extracted/599cd3ff6a3bbad54e145d867ccea405bb98c2b832fb29b50fb02089a1026530/bleurt-tiny-128.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Will load checkpoint bert_custom


INFO:tensorflow:Will load checkpoint bert_custom


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:... name:bert_custom


INFO:tensorflow:... name:bert_custom


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... max_seq_length:128


INFO:tensorflow:... max_seq_length:128


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating WordPiece tokenizer.


INFO:tensorflow:Creating WordPiece tokenizer.


INFO:tensorflow:WordPiece tokenizer instantiated.


INFO:tensorflow:WordPiece tokenizer instantiated.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Loading model.


INFO:tensorflow:Loading model.
I0000 00:00:1751856942.698179 2136909 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 8279 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4090, pci bus id: 0000:02:00.0, compute capability: 8.9


INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.


In [13]:
# Row-wise BLEURT score (one calc_bleu call per row), stored as a new column.
bleurt_scores = df.apply(calc_bleu, axis=1)
df['bleurt'] = bleurt_scores

# Save

In [9]:
# Persist the frame, including whichever metric columns have been added so
# far, without writing the index as an extra column.
METRICS_CSV = 'mtsamples_with_metrics.csv'
df.to_csv(METRICS_CSV, index=False)

#  HHEM 2.1 model from Vectara (WIP)

In [16]:
# Point the W&B client at the public API endpoint.
# NOTE(review): this is set *before* importing from `weave` -- presumably so
# the setting is already in place when the library initializes. Confirm before
# moving the import up to the notebook's import cell.
os.environ["WANDB_BASE_URL"] = "https://api.wandb.ai"
from weave.scorers import WeaveHallucinationScorerV1

# Single shared scorer instance, reused by hallucination_score for every row.
hallucination_scorer = WeaveHallucinationScorerV1()

def hallucination_score(query, context, output):
    """Score `output` with the shared hallucination scorer.

    Args:
        query: Forwarded unchanged as the scorer's ``query`` argument.
        context: Forwarded unchanged as the scorer's ``context`` argument.
        output: Forwarded unchanged as the scorer's ``output`` argument
            (the text being checked).

    Returns:
        The value stored under ``'score'`` in the scorer result's ``metadata``,
        or 1.0 if scoring raises an ordinary exception.

    Raises:
        KeyboardInterrupt, SystemExit: propagated unchanged, so interrupting
            the kernel actually stops a long ``DataFrame.apply`` run instead
            of being recorded as a 1.0 score.
    """
    try:
        result = hallucination_scorer.score(
            query = query,
            context = context,
            output = output
        )

        return result.metadata['score']
    except Exception:
        # Deliberate best-effort fallback for a row that cannot be scored.
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupts working.
        return 1.0

  check_cuda(self.device)
[34m[1mwandb[0m: Downloading large artifact hallucination_hhem_scorer:v0, 421.31MB. 8 files... 
[34m[1mwandb[0m:   8 of 8 files downloaded.  
Done. 0:0:1.0 (436.1MB/s)
Device set to use cpu


In [17]:
# Table that already carries the model summaries; the scoring cell below reads
# its 'transcription', 'description' and 'med-llama-summary' columns.
LLAMA_SUMMARIES_CSV = 'mtsamples_with_llama.csv'
df_llama = pd.read_csv(LLAMA_SUMMARIES_CSV)

In [None]:
def _score_llama_summary(row):
    """Run hallucination_score on one row's generated summary.

    The transcription is passed as the query, the description as the context
    and the 'med-llama-summary' text as the output being checked.
    """
    return hallucination_score(
        row.transcription,
        row.description,
        row['med-llama-summary'],
    )


# One scorer call per row; the result is stored as a new column.
df_llama['med-llama-summary-hallucination-score'] = df_llama.apply(_score_llama_summary, axis=1)

Token indices sequence length is longer than the specified maximum sequence length for this model (711 > 512). Running this sequence through the model will result in indexing errors


In [None]:
# Persist the scored table without the index column. QUOTE_NONNUMERIC asks the
# CSV writer to quote every non-numeric field.
HHEM_SCORES_CSV = 'mtsamples_with_llama_hhem_score.csv'
df_llama.to_csv(
    HHEM_SCORES_CSV,
    index=False,
    quoting=csv.QUOTE_NONNUMERIC,
)