In [1]:
import os
import re
import json
import base64
import requests
import time
import urllib.parse
from dotenv import load_dotenv
import pandas as pd
import ast
import tqdm

In [12]:
# Load variables from the local .env file so the os.environ lookups below can see them.
load_dotenv('.env')

API_KEY = os.environ.get('VERSA_API_KEY')  # Match the environment variable name to the name you used in the .env file
# NOTE(review): this value is overwritten by the hard-coded API_VERSION assignment a few
# lines below, so VERSA_API_VERSION currently has no effect -- confirm which one is intended.
API_VERSION = os.environ.get('VERSA_API_VERSION')
RESOURCE_ENDPOINT = os.environ.get('RESOURCE_ENDPOINT')

# Deployment name and API version interpolated into the request URL built in chat().
DEPLOYMENT_ID = 'gpt-4-turbo-128k'
API_VERSION = '2024-05-01-preview'

# These are configurable parameters for managing re-attempts for API calls
RETRY_SECS = 15  # Seconds between attempts
MAX_RETRIES = 5  # Max number of re-attempts

# Troubleshooting text printed by exception_code() once the final attempt has failed.
error_msg = "\nProvided your configuration parameters (API_KEY, API_VERSION, RESOURCE_ENDPOINT, deployment name) are valid, the majority of errors you may encounter with this code are attributable to temporary issues such as Azure server outages or other users who have triggered shared API rate limits for a given deployment. Please try again in a few minutes. However, if you receive a 401 Unauthorized access error, while your API key may have the correct length, most likely it is not a valid key for some other reason. In that event, please open a ticket with the Versa team at versa@ucsf.edu to review the key.\n"

def chat(prompt):
    """Send `prompt` as a single user message and return the model's reply text.

    The request goes to the chat-completions route of DEPLOYMENT_ID under
    RESOURCE_ENDPOINT, using API_VERSION and authenticating with API_KEY.
    A fixed seed (1234) is sent with every request.

    Any exception raised while posting or while unpacking the response is
    handed to exception_code(), which either sleeps and returns the updated
    retry counter (the loop then tries again) or raises once MAX_RETRIES has
    been reached.

    Args:
        prompt: Text placed in the "content" field of the user message.

    Returns:
        The "content" value of the first choice's message in the JSON response.
    """
    url = f'{RESOURCE_ENDPOINT}/openai/deployments/{DEPLOYMENT_ID}/chat/completions?api-version={API_VERSION}'
    body = json.dumps({
        "seed": 1234,
        "messages": [{"role": "user", "content": prompt}]
    })
    headers = {'Content-Type': 'application/json', 'api-key': API_KEY}
    retries = 0
    while True:
        try:
            response = post_request(url, headers, body)
            payload = json.loads(response.text)
            # A payload without "choices"/"message" makes this chain fail on None;
            # that error is retried like any other failure.
            return payload.get('choices')[0].get('message').get('content')
        except Exception as e:
            # exception_code() raises when the retry budget is exhausted, which ends the loop.
            retries = exception_code(retries, DEPLOYMENT_ID, e)

# Helper functions used by chat(): post_request sends the HTTP request, exception_code handles retry bookkeeping
def post_request(url, headers, body, timeout=120):
    """POST `body` to `url` and return the response, raising on an HTTP error status.

    Args:
        url: Full request URL.
        headers: Dict of HTTP headers to send.
        body: Request payload, passed to requests as `data`.
        timeout: Seconds requests waits on the server before raising a
            timeout error. Without it a stalled connection blocks forever
            and the caller's retry logic never runs. Pass None to wait
            indefinitely.

    Returns:
        The response object returned by requests.post.

    Raises:
        requests.exceptions.HTTPError: Raised by raise_for_status() on an
            error status code.
        requests.exceptions.Timeout: If the server does not respond within
            `timeout` seconds.
    """
    response = requests.post(url, headers=headers, data=body, timeout=timeout)
    response.raise_for_status()
    return response

def exception_code(retries, deployment_id, e):
    """Report a failed API attempt, then either wait for a retry or give up.

    Args:
        retries: Number of re-attempts already made (0 on the first failure).
        deployment_id: Deployment name, included in the final error message.
        e: The exception (or error value) that caused the failed attempt.

    Returns:
        `retries + 1`, after sleeping RETRY_SECS seconds.

    Raises:
        AssertionError: When `retries` has reached MAX_RETRIES. Raised
            explicitly rather than through `assert False`, because assert
            statements are removed under `python -O`; the caller's retry
            loop would then never terminate.
    """
    attempt = retries + 1
    total_attempts = MAX_RETRIES + 1

    if retries >= MAX_RETRIES:
        print(f'Failed attempt {attempt} of {total_attempts}.')
        print(error_msg)
        # Chain the original error only when it is a real exception object.
        cause = e if isinstance(e, BaseException) else None
        raise AssertionError(
            f"Test failed for deployment: {deployment_id}, Error received: {e}"
        ) from cause

    print(f'Failed attempt {attempt} of {total_attempts}. Waiting {RETRY_SECS} secs before next attempt...')
    time.sleep(RETRY_SECS)
    return retries + 1

In [13]:
def generate_example(i):
    """Build a few-shot example: a generated history for row `i` followed by its indication.

    The history comes from generate_summary() applied to `sample_data.iloc[i]`;
    the indication is `sample_histories[i]`.

    NOTE(review): `sample_histories` is not defined in the visible part of this
    notebook -- confirm it exists before calling this function.
    """
    example = '\nClinical History:\n' + generate_summary(sample_data.iloc[i])
    example += '\nIndication:\n'
    return example + sample_histories[i]

def notes_to_prompt(notes, max_chars=1000):
    """Concatenate notes into one prompt section with numbered headers.

    Each note is rendered as "\\nNOTE <k>\\n" followed by the note text, with
    k starting at 1.

    Args:
        notes: Iterable of note strings.
        max_chars: Maximum number of characters kept from each note
            (default 1000). Pass None to keep each note in full.

    Returns:
        The concatenated string; empty when `notes` is empty.
    """
    return ''.join(
        f'\nNOTE {number}\n' + note[:max_chars]
        for number, note in enumerate(notes, start=1)
    )
    
def generate_summary(row):
    """Ask the model for a one-paragraph summary of a row's clinical notes.

    Only the first 10 entries of `row['NOTE_TEXTS']` are used. They are
    formatted by notes_to_prompt(), which numbers each note and truncates it,
    instead of repeating that loop here.

    Args:
        row: Mapping-like object with a 'NOTE_TEXTS' entry holding the notes.

    Returns:
        Whatever chat() returns for the summary prompt.
    """
    note_prompt = notes_to_prompt(row['NOTE_TEXTS'][:10])
    prompt = (
        "Summarize these clinical notes into one paragraph, noting patient age, gender, ethnicity.\n"
        "Also note relevant medical conditions, lab tests, procedures, and clinical history."
        "\nNotes:"
        + note_prompt
        + '\nSummary:\n'
    )
    return chat(prompt)


In [4]:
# Read the processed notes table. NOTE_TEXTS is stored as a Python-literal string,
# so every cell is parsed back into a Python object with ast.literal_eval
# (presumably a list of note strings -- later cells iterate and slice it).
data = pd.read_csv('data/processed_notes.csv').assign(
    NOTE_TEXTS=lambda frame: frame['NOTE_TEXTS'].map(ast.literal_eval)
)

In [5]:
# Cohort selection, done as one chain so the new column is never assigned into a filtered frame:
#   1. keep the first row per SUBJECT_ID,
#   2. require both indication source fields to be present,
#   3. join them (space-separated) into RADIOLOGY_REPORT_INDICATION,
#   4. keep rows with at least 5 notes and renumber the index.
sample_data = (
    data
    .drop_duplicates(subset=['SUBJECT_ID'])
    .dropna(subset=['RADIOLOGY_REPORT_MEDICAL_CONDITION', 'RADIOLOGY_REPORT_REASON_FOR_EXAM'])
    .assign(
        RADIOLOGY_REPORT_INDICATION=lambda frame: (
            frame['RADIOLOGY_REPORT_MEDICAL_CONDITION'] + " " + frame['RADIOLOGY_REPORT_REASON_FOR_EXAM']
        )
    )
    .loc[lambda frame: frame['NOTE_TEXTS'].apply(len) >= 5]
    .reset_index(drop=True)
)

In [6]:
# Keep only rows whose study description contains the literal text "CT HEAD".
# na=False treats a missing description as "no match"; without it a NaN in the
# mask makes the boolean indexing below raise. regex=False makes the literal match explicit.
is_head_ct = sample_data['RADIOLOGY_REPORT_DESCRIPTION'].str.contains('CT HEAD', na=False, regex=False)
sample_data = sample_data[is_head_ct].reset_index(drop=True)

In [10]:
# results = []
# for i in tqdm.tqdm(range(25)):
#     row = sample_data.iloc[i]
#     notes = notes_to_prompt(row['NOTE_TEXTS'][:15])
#     neurologic_conditions = chat("Identify the single most critical head-related medical condition. Example: brain tumor metastases\n" + notes)
#     print(neurologic_conditions)
#     clinical_notes_prompt = "Write a clinical indication (a one-sentence summary describing the patient's medical condition with gender, age, procedures done (s/p), diseases to rule out (r/o)) for a "+\
#     row['RADIOLOGY_REPORT_DESCRIPTION']+\
#     f"\nInclude the patient's pre-existing condition of {neurologic_conditions}\n"+\
#     '\nNotes:\n' + notes+ '\nIndication:\n'
#     generated_indication = chat(clinical_notes_prompt)
#     results.append({
#         'SUBJECT_ID': row['SUBJECT_ID'],
#         'HADM_ID': row['HADM_ID'],
#         'ORIGINAL_INDICATION': row['RADIOLOGY_REPORT_INDICATION'],
#         'GENERATED_INDICATION': generated_indication
#     })

In [15]:
# Generate an indication for up to N_EXAMPLES rows of sample_data. The loop is bounded
# by the number of rows actually available so a small cohort does not raise IndexError.
N_EXAMPLES = 50
n_rows = min(N_EXAMPLES, len(sample_data))

results = []
for i in tqdm.tqdm(range(n_rows)):
    row = sample_data.iloc[i]
    # All notes of the row are passed here (no cap on the number of notes);
    # notes_to_prompt truncates each note individually.
    notes = notes_to_prompt(row['NOTE_TEXTS'])
    clinical_notes_prompt = (
        "Write a clinical indication (a one-sentence summary less than 20 words describing the patient's medical condition with gender, age, relevant surgical procedures done, and conditions to rule out for a "
        + row['RADIOLOGY_REPORT_DESCRIPTION']
        + "\nYou may OPTIONALLY consider history of/rule out of stroke, trauma, seizures, bleed, intraventricular hemorrhage (IVH), tumor mass, intracranial process, interval change AS APPROPRIATE.\n"
        + '\nNotes:\n' + notes + '\nIndication:\n'
    )
    generated_indication = chat(clinical_notes_prompt)
    # One record per row: identifiers plus the reference and generated indication.
    results.append({
        'SUBJECT_ID': row['SUBJECT_ID'],
        'HADM_ID': row['HADM_ID'],
        'ORIGINAL_INDICATION': row['RADIOLOGY_REPORT_INDICATION'],
        'GENERATED_INDICATION': generated_indication
    })

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [01:55<00:00,  2.31s/it]


In [20]:
!pip install datasets
import evaluate
rouge = evaluate.load('rouge')

results = pd.DataFrame(results)
results['ORIGINAL_INDICATION'] = results['ORIGINAL_INDICATION'].str.replace('No contraindications for IV contrast', '')
rouge_scores = rouge.compute(predictions=results['GENERATED_INDICATION'], references=results['ORIGINAL_INDICATION'])

print(rouge_scores)

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Downloading tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tqdm-4.66.4-py3-none-any.whl (78 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.3/78.3 kB[0m [31m36.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tqdm, datasets
  Attempting uninstall: tqdm
    Found existing installation: tqdm 4.65.0
    Uninstalling tqdm-4.65.0:
      Successfully uninstalled tqdm-4.65.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installe

In [31]:
# Show the reference and generated indication of one result row side by side.
i = 1
example = results.iloc[i]

print('ORIGINAL', example['ORIGINAL_INDICATION'], sep='\n')
print('GENERATED', example['GENERATED_INDICATION'], sep='\n')

ORIGINAL
87 year old man with ich, from OH S/p stroke CONTRAINDICATIONS for IV CONTRAST: crea
GENERATED
60-year-old male with intraparenchymal hemorrhage, hypotension, and junctional bradycardia; assess progression/rule out new intracranial events.


In [33]:
# Print every note of row i of sample_data for manual review.
selected_notes = sample_data.iloc[i]['NOTE_TEXTS']
for note_text in selected_notes:
    print(note_text)

NPN 5a-7a
Pt. admitted from EW at 5am with intraparenchymal bleed. (see FHPA).
pt responds to pain-withdraws, more on R than L.  Has been off all sedation since 5:30-was briefly on propofol for a line placement.  Was initially requiring nipride and labatelol for BP but has been hypotensive-90's since admission to MICU and off all vasodilators.  Also found to be in a junctional rhythm with rate 45-47.  Does have corneal reflex and PERL.  Plan for MRI today to further assess bleed.
ID: afebrile
Resp: CMV 50% 600x14 5 PEEP. Clear breath sounds.  no sputum
GI: NPO OGT connected to low continuous suction.  Hypoactive bowel sounds.
GU: Foley placed in EW-draining clear yellow urine.
Access: 2 peripheral IV's.  Aline placed in R radial.
Endo: fingerstick at 6am 327-8u given per sliding scale.
Social: Daughter arrived with patient and she believes pt. wife will be the spokesperson but this will be decided once the wife arrives.

Nursing Progress Note 0700-1900 hours:
NEURO: Pt s/p interparench

In [None]:
# risk factors in the past
# what treatments have they gotten
# different diseases 


In [None]:
# NEONATAL HEAD PORTABLE
# IVH or Intracranial anomaly
# PVL or periventricular leukomalacia
# IVH
# periventricular leukomalacia
# intracranial hemorrhage
# IVH
# Intracranial hemorrhage
# PVL or other abnormality
# intracranial hemorrhage 
# r/o pvl
# r/o IVH
# assess for ivh


# Bleeding in the brain
# Hydrocephalus
# periventricular leukomalacia --> brainmatter malforms --> hypoxic injury (US)
# 

# CT HEAD
# Assess for recent hemorrhage or infarction
# Assess for hemorrhage
# Subdural hematoma
# subdurals
# SDH
# infarct, hemorrhage
# subdural drain removal
# Acute intracranial processes
# Assess interval change from prior CT

# Bleeds
# Stroke, contrast, usually MRI
# Acute intracranial processes --> anything? --> Not that useful
# Fall/Trauma

# increase specificity/sensitivity 
# if everyone looks up notes --> time savings
# if no time to read notes --> specificity/sensitivity
# + generated history --> increase? specificity/sensitivity decrease? time savings

In [None]:
# Plan

# RAG to select 5 notes for current generation

# Add to the prompt the type of study being ordered

In [None]:
# How long indications are (histogram) and statistics

In [1492]:
# Rows whose CATEGORY contains the literal text "Radiology". na=False treats a missing
# CATEGORY as "no match"; without it a NaN in the mask makes the boolean indexing raise.
radiology_reports = data[data['CATEGORY'].str.contains('Radiology', na=False, regex=False)]

In [1495]:
# The 50 most frequent study descriptions (value_counts sorts by descending count).
radiology_reports['DESCRIPTION'].value_counts().head(50)

CHEST (PORTABLE AP)                                169270
CHEST (PA & LAT)                                    43158
CT HEAD W/O CONTRAST                                34485
CHEST PORT. LINE PLACEMENT                          21596
PORTABLE ABDOMEN                                     8143
CHEST (PRE-OP PA & LAT)                              8064
CT CHEST W/CONTRAST                                  8001
CT ABDOMEN W/CONTRAST                                7304
MR HEAD W & W/O CONTRAST                             7062
CT CHEST W/O CONTRAST                                6745
CT C-SPINE W/O CONTRAST                              4840
CT ABDOMEN W/O CONTRAST                              4823
LIVER OR GALLBLADDER US (SINGLE ORGAN)               4761
CTA CHEST W&W/O C&RECONS, NON-CORONARY               4499
ABDOMEN (SUPINE & ERECT)                             4462
BABYGRAM (CHEST ONLY)                                4412
MR HEAD W/O CONTRAST                                 3603
BILAT LOWER EX