In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import os
import json
import re
import textstat

#### **Define file paths**

In [2]:
# Folder of raw summary records; the JSON files in it are loaded by read_json below.
rawdata_folder_path = "/gpfs/data/majorlab/biasaudit/data/raw_records/"  
# CSV of line-level note extracts, loaded with pd.read_csv and grouped per encounter below.
# NOTE: the cell that loads this CSV rebinds this same name to the resulting DataFrame.
edit_dist_ingredients = "/gpfs/data/majorlab/biasaudit/data/2024-11-19_edit_distance_ingredients.csv"

#### **Define functions**
Load the GPT-generated summaries (JSON files) in the biasaudit>data folder into a list of dictionaries, one per file. Takes the folder path and the maximum number of files to read.

In [3]:
def read_json(folder_path, number_rows=None):
    """Load the JSON files found in a folder into a list of dictionaries.

    Files are visited in sorted name order so that a capped run always picks
    the same files (os.listdir returns names in arbitrary order).

    Parameters
    ----------
    folder_path : str
        Directory to scan; only names ending in '.json' are considered.
    number_rows : int or None
        Maximum number of JSON files to process. Files that are empty or fail
        to decode still count towards this limit. None (the default) means
        no limit.

    Returns
    -------
    list
        The decoded content of every JSON file that parsed to a non-empty
        value, in the order the files were visited.
    """
    data_list = []
    file_count = 0

    json_names = sorted(
        name for name in os.listdir(folder_path) if name.endswith('.json')
    )

    for file_name in json_names:
        # Check the cap before touching the file so number_rows=0 reads nothing
        if number_rows is not None and file_count >= number_rows:
            break

        file_path = os.path.join(folder_path, file_name)
        try:
            with open(file_path, 'r') as f:
                data = json.load(f)
        except json.JSONDecodeError as e:
            # Report and carry on: one malformed file should not abort the load
            print("Error decoding {}: {}".format(file_name, e))
        else:
            if data:
                data_list.append(data)
            else:
                print("Warning: {} is empty or invalid.".format(file_name))

        file_count += 1

    return data_list


Generate word count

In [4]:
def word_count(text):
    """Return the number of whitespace-separated words in ``text``.

    Any run of whitespace (spaces, tabs, newlines) counts as a single
    separator, and leading/trailing whitespace contributes no words.
    """
    # Splitting on whitespace runs leaves empty strings at the edges when the
    # text starts or ends with whitespace; those are not words.
    pieces = re.split(r'\s+', text)
    return sum(1 for piece in pieces if piece)

Extract the relevant key/value pairs from each summary dictionary. The output fields are:
* csn = episode ID
* q1-q4 = full text of the answers to questions 1-4 (answers generated by GPT)
* PF_DC = the four question headings and their GPT-generated answers joined into a single text
* OG_DC = the record's `processed_note_text`
* wc_Q1-wc_Q4, wc_PF_DC, wc_OG_DC = word counts for each answer and for the two full texts
* fk_ease_* and fk_grade_* = Flesch reading ease and Flesch-Kincaid reading grade for PF_DC and OG_DC

In [5]:
def shape_for_edit_dist(summary_dict):
    """Flatten one summary record into a row of texts, word counts and readability scores.

    Parameters
    ----------
    summary_dict : dict
        One loaded record. Reads the keys 'csn', 'processed_note_text' and
        'postprocessed' (itself a dict with 'Q1'..'Q4').

    Returns
    -------
    dict
        Keys: csn, q1..q4, PF_DC (question headings and answers joined into
        one text), OG_DC (the record's processed_note_text, unchanged),
        wc_* word counts, and fk_ease_* / fk_grade_* scores for PF_DC and
        OG_DC. When processed_note_text is missing or not a string, the three
        OG_DC metrics are None instead of raising.
    """
    # A key that is present but null comes back as None, so `or` covers both
    # the missing and the null case.
    postprocessed = summary_dict.get('postprocessed') or {}

    # Missing or null answers become '' so the join below cannot fail
    q1 = postprocessed.get('Q1') or ''
    q2 = postprocessed.get('Q2') or ''
    q3 = postprocessed.get('Q3') or ''
    q4 = postprocessed.get('Q4') or ''
    combined_Q1_Q4 = " ".join(["What brought me to the hospital?", 
        q1, 'Why was I hospitalized?', q2, 'What happened in the hospital?',
        q3, 'What should I know after leaving the hospital?', q4])

    processed_note_text = summary_dict.get('processed_note_text')
    # One record without a note should not abort the whole batch
    has_note = isinstance(processed_note_text, str)

    summary = {
        'csn': summary_dict.get('csn'),
        'q1': q1,
        'q2': q2,
        'q3': q3,
        'q4': q4,
        'PF_DC': combined_Q1_Q4, 
        'OG_DC': processed_note_text,
        'wc_Q1': word_count(q1),
        'wc_Q2': word_count(q2),
        'wc_Q3': word_count(q3),
        'wc_Q4': word_count(q4),
        'wc_PF_DC': word_count(combined_Q1_Q4),
        'wc_OG_DC': word_count(processed_note_text) if has_note else None,
        'fk_ease_PF_DC': textstat.flesch_reading_ease(combined_Q1_Q4),
        'fk_grade_PF_DC': textstat.flesch_kincaid_grade(combined_Q1_Q4),
        'fk_ease_OG_DC': textstat.flesch_reading_ease(processed_note_text) if has_note else None,
        'fk_grade_OG_DC': textstat.flesch_kincaid_grade(processed_note_text) if has_note else None
    }

    return summary

Apply a function to the list of summaries to generate a new list of dictionaries

In [6]:
def apply_to_dict(dict_list, function_name):
    """Call ``function_name`` on every item of ``dict_list`` and collect the results.

    Returns a new list holding one result per input item, in input order.
    """
    return [function_name(entry) for entry in dict_list]

Group the line-level EMR note extracts by encounter (`pat_enc_csn_id`), joining each encounter's note lines in `text_line` order

In [7]:
def groupby_csn_id(df):
    """Collapse line-level note rows into one row per encounter.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'pat_enc_csn_id', 'text_line', 'note_text' and the
        'q1_exists'..'q4_exists' flag columns.

    Returns
    -------
    pandas.DataFrame
        One row per 'pat_enc_csn_id' with the four flags (max over the
        encounter's rows) and 'note_text_concat': the encounter's note lines
        joined with single spaces in 'text_line' order. Missing note lines
        are skipped.
    """
    def _join_lines(lines):
        # Blank note lines load from CSV as NaN; a plain ' '.join would raise
        # TypeError on them, so drop them before joining.
        return ' '.join(lines.dropna().astype(str))

    ordered = df.sort_values(by=['pat_enc_csn_id', 'text_line'], ascending=True)

    grouped_df = ordered.groupby('pat_enc_csn_id').agg(
        q1_exists=('q1_exists', 'max'),  # max keeps True if any row has True
        q2_exists=('q2_exists', 'max'),
        q3_exists=('q3_exists', 'max'),
        q4_exists=('q4_exists', 'max'),
        note_text_concat=('note_text', _join_lines)
    ).reset_index()

    return grouped_df


Extract the Q1-4 section of the DC summary

In [9]:
def extract_post_edits(text):
    """Return the text between the first "A Simplified Guide" and the next "Visit Guide.".

    The captured span may run across line breaks and is returned with
    surrounding whitespace stripped. Returns None when the two markers are
    not found in that order.
    """
    first_hit = re.search(r"A Simplified Guide(.*?)Visit Guide\.", text, re.DOTALL)
    if first_hit is None:
        return None
    return first_hit.group(1).strip()

Trim superfluous text from EMR summaries

In [10]:
# Need to fix this 

def trim_text(text):
    """Strip boilerplate phrases from a note and normalise its whitespace.

    Returns None for anything that is not a string and for strings that are
    empty or whitespace-only. Otherwise removes every occurrence of each
    boilerplate phrase, in the order listed, then collapses all whitespace
    runs to single spaces and trims the ends.
    """
    if not isinstance(text, str):
        return None
    if text.strip() == "":
        return None

    boilerplate = (
        'to Your Hospital Stay',
        ': Translating Your Hospital Stay',
        'Make sure to read the full medication instructions printed below and in the NYU Langone Health After',
    )

    cleaned = text
    for fragment in boilerplate:
        cleaned = cleaned.replace(fragment, "")

    return " ".join(cleaned.split())

FK reading grade and reading ease

In [8]:
def safe_flesch_reading_ease(text):
    """Return textstat's Flesch reading ease for ``text``, or None if it cannot be scored.

    Non-string input (for example the None/NaN of a missing note) and any
    ordinary error raised by textstat yield None. KeyboardInterrupt and
    SystemExit are not swallowed, so a long run can still be interrupted.
    """
    if not isinstance(text, str):
        return None
    try:
        return textstat.flesch_reading_ease(text)
    except Exception:
        return None

def safe_flesch_kincaid_grade(text):
    """Return textstat's Flesch-Kincaid grade for ``text``, or None if it cannot be scored.

    Non-string input (for example the None/NaN of a missing note) and any
    ordinary error raised by textstat yield None. KeyboardInterrupt and
    SystemExit are not swallowed, so a long run can still be interrupted.
    """
    if not isinstance(text, str):
        return None
    try:
        return textstat.flesch_kincaid_grade(text)
    except Exception:
        return None



Levenshtein edit distance

In [12]:

import Levenshtein as lev


def calculate_lev(row, col1, col2):
    """Return the Levenshtein distance between two fields of ``row``.

    ``row`` is indexed with ``col1`` and ``col2``; any value that is not a
    string (for example None or NaN) is compared as the empty string.
    """
    left, right = (
        cell if isinstance(cell, str) else ""
        for cell in (row[col1], row[col2])
    )
    return lev.distance(left, right)

#### **Running functions**

Code to generate table of GPT generated text

In [9]:
# read json files into list of dictionaries
# Load the raw summary records from disk (at most 3687 files)
summaries = read_json(rawdata_folder_path, 3687)

# Reduce every record to the flat row of texts and metrics used downstream
new_sum = apply_to_dict(summaries, shape_for_edit_dist)

# Tabulate the rows and cast csn to float in the same step (per the original
# note, csn arrives as a string)
summaries_df = pd.DataFrame(new_sum).astype({'csn': float})

Code to extract relevant parts of summary from clinician edited version

In [15]:
# Upload csv as dataframe
# Upload csv as dataframe
# The name starts out holding the CSV path and is rebound to the loaded table.
# Reading only while it is not yet a DataFrame keeps this cell re-runnable: a
# second pd.read_csv on the DataFrame itself would fail.
if not isinstance(edit_dist_ingredients, pd.DataFrame):
    edit_dist_ingredients = pd.read_csv(edit_dist_ingredients)

# One row per encounter, with the note lines joined into a single text
dist_csn = groupby_csn_id(edit_dist_ingredients)

# Pull the guide section out of each concatenated note; None where the markers are absent
dist_csn['note_for_edit_dist'] = dist_csn['note_text_concat'].apply(extract_post_edits)
# Duplicate the encounter id under the key name used by summaries_df for the merge
dist_csn['csn'] = dist_csn['pat_enc_csn_id']

Merge dataframes together, drop unnecessary columns

In [16]:
# Left join keeps every encounter in dist_csn, with or without a matching summary
helper_columns = ['pat_enc_csn_id', 'q1_exists', 'q2_exists', 'q3_exists', 'q4_exists']
merged = pd.merge(dist_csn, summaries_df, on='csn', how='left').drop(columns=helper_columns)

# Remove boilerplate from the extracted clinician-edited text
merged['trimmed_note'] = merged['note_for_edit_dist'].apply(trim_text)

Calculate FK reading grade and reading ease scores, Levenshtein edit distance

In [17]:
# Readability of the trimmed clinician-edited text (None where it cannot be scored)
merged['trimmed_note_fk_ease'] = merged['trimmed_note'].apply(safe_flesch_reading_ease)
merged['trimmed_note_fk_grade'] = merged['trimmed_note'].apply(safe_flesch_kincaid_grade)
# The combined Q1-Q4 text is stored in the 'PF_DC' column; 'combined_Q1_Q4' is only a
# local variable name inside shape_for_edit_dist and is not a column of merged.
# Rows with a missing value on either side are compared as empty strings by calculate_lev.
merged['Levenshtein_edit'] = merged.apply(calculate_lev, args=('PF_DC', 'trimmed_note'), axis=1)


#### Extracting discharge instructions

In [11]:
# Write the GPT summary table to a CSV in the current working directory.
# index is left at its default, so the DataFrame index is written as the first column.
# NOTE(review): only summaries_df is saved; `merged` (readability and edit-distance
# columns) is not written by any visible cell - confirm this is intended.
summaries_df.to_csv('20250114_summaries.csv')