In [15]:
import dotenv
dotenv.load_dotenv()
import tensorflow as tf
import pandas as pd
from rouge_score import rouge_scorer
import evaluate
import os
import csv

In [16]:
# Short name of the summarization model being evaluated. It is interpolated into
# the input/output CSV paths and into the `med-<name>-summary` column name used
# by the scoring cells.
test_model_name = 'biobart'

In [17]:
# Load the mtsamples CSV that already contains the model-generated summaries.
df = pd.read_csv(f'./data/mtsamples_with_{test_model_name}.csv')
# Force the transcription column to plain strings so the scorers always get text.
# NOTE(review): astype(str) presumably turns missing transcriptions into the
# literal string 'nan' — confirm that this is the intended handling.
df['transcription'] = df['transcription'].astype(str)
df

Unnamed: 0.1,Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords,med-biobart-summary
0,0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller...",SUBJECTIVITY: This 25 year old white female....
1,1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh...",PASTMEDICAL HOSPITAL: He is a retired male. he...
2,2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart...","HISTORY of PRESENTILLNESS, among other bilater..."
3,3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple...","2-D R-MODE and M-Mode: , .1. : Left ventricu..."
4,4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo...",1. . There appears the left ventricular cham...
...,...,...,...,...,...,...,...
4994,4994,Patient having severe sinusitis about two to ...,Allergy / Immunology,Chronic Sinusitis,"HISTORY:, I had the pleasure of meeting and e...",,"HISTORY:[1] . Regular to patient.ART time.2, P..."
4995,4995,This is a 14-month-old baby boy Caucasian who...,Allergy / Immunology,Kawasaki Disease - Discharge Summary,"ADMITTING DIAGNOSIS: , Kawasaki disease.,DISCH...","allergy / immunology, mucous membranes, conjun...","ADMITTING DIMENTOSIS:- Kawasaki Disease., Kaw..."
4996,4996,A female for a complete physical and follow u...,Allergy / Immunology,Followup on Asthma,"SUBJECTIVE: , This is a 42-year-old white fema...",,"SUBJECTIV: ,This is a 36- year-old female who..."
4997,4997,Mother states he has been wheezing and coughing.,Allergy / Immunology,Asthma in a 5-year-old,"CHIEF COMPLAINT: , This 5-year-old male presen...",,CHIEF COMPLEINT: : This 5.5-year old male pre...


# ROUGE F1 Scores

An older, purely statistical family of text-similarity metrics, typically used to evaluate summarization.

In [18]:
# Shared ROUGE-L scorer used by calc_rouge; stemming is switched on.
rouge_calcer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

def calc_rouge(short_text, full_text):
    """Return the ROUGE-L F1 value computed for ``short_text`` and ``full_text``.

    Scoring is best-effort: if the scorer (or reading its result) raises any
    ``Exception``, 0.0 is returned instead of propagating the error.
    """
    try:
        rouge_l = rouge_calcer.score(short_text, full_text)['rougeL']
        # Position 2 of the score tuple is what the notebook stores as the F1.
        f_measure = rouge_l[2]
    except Exception:
        return 0.0
    return f_measure

def calc_rouge_pair(row):
    """Score one row's description and model summary against its transcription.

    Returns a 2-tuple ``(description_score, summary_score)`` where both values
    come from ``calc_rouge`` with ``row['transcription']`` as the second argument.
    """
    summary_column = f'med-{test_model_name}-summary'
    source_score = calc_rouge(row['description'], row['transcription'])
    dest_score = calc_rouge(row[summary_column], row['transcription'])
    return source_score, dest_score

In [19]:
# Row-wise ROUGE-L: the returned 2-tuples are expanded into two new columns.
df[['rougeL_f1_source', 'rougeL_f1_dest']] = df.apply(
    calc_rouge_pair,
    axis=1,
    result_type='expand',
)

# BERTScore

Uses contextual embeddings from a BERT-family transformer (here `microsoft/deberta-large-mnli`) to judge semantic similarity.

In [20]:
# BERTScore metric object loaded through the `evaluate` library; calc_bertscore
# calls its compute() method once per text pair.
bertscore = evaluate.load("bertscore")

def calc_bertscore(short_text, full_text):
    """Return the BERTScore F1 of ``short_text`` measured against ``full_text``.

    ``short_text`` is passed as the single prediction and ``full_text`` as the
    single reference; scoring uses the ``microsoft/deberta-large-mnli`` model.

    Scoring is best-effort: any ordinary error yields 0.0. Only ``Exception``
    subclasses are swallowed, so a kernel interrupt (``KeyboardInterrupt``)
    still stops a long-running ``df.apply`` instead of being recorded as a
    0.0 score.
    """
    try:
        result = bertscore.compute(
            predictions=[short_text],
            references=[full_text],
            model_type="microsoft/deberta-large-mnli",
            lang="en",
        )
        # One prediction was submitted, so the first F1 entry is the answer.
        return result["f1"][0]
    except Exception:
        return 0.0

def calc_bertscore_pair(row):
    """Score one row's description and model summary against its transcription.

    Returns a two-element ``pd.Series``: the ``calc_bertscore`` value for
    ``row['description']`` followed by the value for the model-summary column.
    """
    summary_column = f'med-{test_model_name}-summary'
    source_score = calc_bertscore(row['description'], row['transcription'])
    dest_score = calc_bertscore(row[summary_column], row['transcription'])
    return pd.Series([source_score, dest_score])

In [None]:
# Row-wise BERTScore: each returned Series fills the two new columns.
df[['bertscore_f1_source', 'bertscore_f1_dest']] = df.apply(
    calc_bertscore_pair,
    axis=1,
)

# BLEURT

A regression model based on BERT.

In [None]:
# BLEURT metric object, loaded via `evaluate` with the "bleurt-tiny-128" config.
bleurt = evaluate.load("bleurt", config_name="bleurt-tiny-128")

def calc_bleu(short_text, full_text):
    """Return the BLEURT score of ``short_text`` measured against ``full_text``.

    Despite the name, this calls the ``bleurt`` metric, not BLEU. ``short_text``
    is passed as the single prediction and ``full_text`` as the single reference.

    Scoring is best-effort: any ordinary error yields 0.0. Only ``Exception``
    subclasses are swallowed, so a kernel interrupt (``KeyboardInterrupt``)
    still stops a long-running ``df.apply`` instead of being recorded as a
    0.0 score.
    """
    # This was slower on GPU; I think it doesn't parallelize well because of
    # how small the model is, so the computation is pinned to the CPU.
    with tf.device("/CPU:0"):
        try:
            result = bleurt.compute(
                predictions=[short_text],
                references=[full_text],
            )
            # One prediction was submitted, so the first score is the answer.
            return result["scores"][0]
        except Exception:
            return 0.0

def calc_bleu_pair(row):
    """Score one row's description and model summary against its transcription.

    Returns a two-element ``pd.Series``: the ``calc_bleu`` value for
    ``row['description']`` followed by the value for the model-summary column.
    """
    summary_column = f'med-{test_model_name}-summary'
    source_score = calc_bleu(row['description'], row['transcription'])
    dest_score = calc_bleu(row[summary_column], row['transcription'])
    return pd.Series([source_score, dest_score])

In [None]:
# Row-wise BLEURT: each returned Series fills the two new columns.
df[['bleurt_source', 'bleurt_dest']] = df.apply(
    calc_bleu_pair,
    axis=1,
)

#  HHEM 2.1 model from Vectara (WIP)

Language-model-based system for detecting hallucinations in the output of other language-model operations.

In [None]:
# Point W&B at the public API endpoint before weave is imported.
# NOTE(review): presumably weave reads WANDB_BASE_URL at import time, which is
# why the assignment precedes the import — confirm before reordering.
os.environ["WANDB_BASE_URL"] = "https://api.wandb.ai"
from weave.scorers import WeaveHallucinationScorerV1

# Scorer instance used by hallucination_score, created on the 'cuda' device.
# NOTE(review): this likely fails on machines without a CUDA GPU — confirm.
hallucination_scorer = WeaveHallucinationScorerV1(device = 'cuda')

def hallucination_score(query, context, output):
    """Return the hallucination scorer's ``score`` metadata value for one example.

    ``query``, ``context`` and ``output`` are forwarded unchanged to
    ``hallucination_scorer.score``.

    Scoring is best-effort: any ordinary error yields 1.0. Only ``Exception``
    subclasses are swallowed, so a kernel interrupt (``KeyboardInterrupt``)
    still stops a long-running ``df.apply``.
    NOTE(review): 1.0 is presumably the worst-case value of this score —
    confirm the score's direction.
    """
    try:
        result = hallucination_scorer.score(
            query=query,
            context=context,
            output=output,
        )
        return result.metadata['score']
    except Exception:
        return 1.0

In [None]:
# Row-wise hallucination score: transcription is the query, description the
# context, and the model summary the output being judged.
df['hallucination-score'] = df.apply(
    lambda row: hallucination_score(
        row.transcription,
        row.description,
        row[f'med-{test_model_name}-summary'],
    ),
    axis=1,
)

# Coherence

A fine-tuned deberta-small-long-nli small language model that checks whether the writing contradicts itself.

In [None]:
# NOTE(review): tensorflow is already imported in the first cell, so this
# re-import looks redundant — confirm before removing.
import tensorflow as tf
# NOTE(review): presumably meant to keep the Hugging Face stack from using
# TensorFlow; it may have no effect if those libraries were already imported by
# earlier cells — confirm.
os.environ["USE_TF"] = "0"
# Point W&B at the public API endpoint before weave is imported.
os.environ["WANDB_BASE_URL"] = "https://api.wandb.ai"
from weave.scorers import WeaveCoherenceScorerV1

# Scorer instance used by coherence_score, created on the 'cuda' device.
# NOTE(review): this likely fails on machines without a CUDA GPU — confirm.
coherence_scorer = WeaveCoherenceScorerV1(device = 'cuda')

def coherence_score(query, output):
    """Return the coherence scorer's ``score`` metadata value for one example.

    ``query`` and ``output`` are forwarded unchanged to
    ``coherence_scorer.score``.

    Scoring is best-effort: any ordinary error yields 0.0. Only ``Exception``
    subclasses are swallowed, so a kernel interrupt (``KeyboardInterrupt``)
    still stops a long-running ``df.apply`` instead of being recorded as a
    0.0 score.
    """
    try:
        result = coherence_scorer.score(
            query=query,
            output=output,
        )
        return result.metadata['score']
    except Exception:
        return 0.0

In [None]:
# Coherence of the description and of the model summary, each scored with the
# transcription as the query.
df['coherence-score_source'] = df.apply(
    lambda x: coherence_score(x.transcription, x['description']),
    axis=1,
)
# The summary is stored in the `med-<model>-summary` column; the frame has no
# `model-summary` column, so looking that name up raised a KeyError.
df['coherence-score_dest'] = df.apply(
    lambda x: coherence_score(x.transcription, x[f'med-{test_model_name}-summary']),
    axis=1,
)

# Save

In [None]:
# Write the frame with all score columns to disk, without the index and with
# every non-numeric field quoted.
df.to_csv(
    f'./data/mtsamples_with_{test_model_name}_model_scores.csv',
    index=False,
    quoting=csv.QUOTE_NONNUMERIC,
)