In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import os
import json
import re

In [None]:
! pip install openai==1.55.0

In [6]:
import sys

# Shared project directory; gpt_config.yml is also read from here.
shared_path = '/gpfs/data/majorlab/biasaudit'

# Put the shared directory first on the import path. The membership check keeps
# repeated runs of this cell from stacking duplicate entries onto sys.path.
if shared_path not in sys.path:
    sys.path.insert(0, shared_path)

In [7]:
import yaml

# Parse the YAML settings file that lives in the shared project directory.
# safe_load accepts the open handle directly, so no explicit read() is needed.
with open(os.path.join(shared_path, "gpt_config.yml"), "r") as cfg:
    config = yaml.safe_load(cfg)

# Top-level 'apikey' entry of the settings (None when that key is absent).
api_key = config.get('apikey')

In [8]:
import requests

# Smoke test: send one fixed prompt to the chat-completions endpoint and print the reply.

payload = json.dumps({"temperature": 0,
                      "top_p": 0.95,
                      "max_tokens": 800,
                      "messages": [
                          {"role": "system",
                           "content": "You are a helpful assistant."},
                          {"role": "user",
                           "content": "Select a random number between 1 and 100,000"}]})

headers = {"apikey": config.get('kong').get('apikey'),
           'Content-Type': 'application/json'}
url = "https://genai-api.prod1.nyumc.org/gpt-4o/v1.0.0/chat/completions"

# Without a timeout the cell would hang indefinitely if the gateway never answers.
res = requests.post(url, headers=headers, data=payload, timeout=60)

if res.status_code != 200:
    # Include the status code so a failure can be diagnosed from the cell output.
    print(f'error: HTTP {res.status_code}')
else:
    # Decode the body once instead of re-parsing it for every check below.
    choices = res.json().get('choices')
    if not choices:
        print('bad payload')
    else:
        if len(choices) > 1:
            print('more than one result')
        for c in choices:
            result = c.get('message').get('content')
            print(result)

Sure! Here is a random number between 1 and 100,000: 47,283.


In [12]:
# Load 20250107_cleaned_perturb.csv; the gender-swap run reads its 'PF_note' column.

to_counter = pd.read_csv('20250107_cleaned_perturb.csv')

In [14]:
# API call to perturb gender

def gender_counter(DC_text, timeout=120):
    """Ask the GPT-4o chat endpoint to rewrite a discharge summary with the patient's gender swapped.

    Parameters
    ----------
    DC_text : str
        Discharge summary text; it is appended verbatim to the instruction prompt.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 120).

    Returns
    -------
    str or None
        The content of the first returned choice. None when ``DC_text`` is not a
        non-blank string, the request fails, the status is not 200, or the
        response has no choices. Failures are reported with ``print`` so a long
        ``.apply`` run keeps going instead of aborting.
    """
    # Missing notes arrive from .apply() as NaN/None; skip them rather than
    # sending the literal text "nan" to the model.
    if not isinstance(DC_text, str) or not DC_text.strip():
        return None

    payload = json.dumps({"temperature": 0,
                          "top_p": 0.95,
                          "max_tokens": 800,
                          "messages": [
                              {"role": "system",
                               "content": "You are a helpful and knowledgable medical assistant"},
                              {"role": "user",
                               "content": f"The following is a discharge summary. Please change the gender of the patient to the opposite gender. {DC_text}"}]})

    headers = {"apikey": config.get('kong').get('apikey'),
               'Content-Type': 'application/json'}
    url = "https://genai-api.prod1.nyumc.org/gpt-4o/v1.0.0/chat/completions"

    try:
        # The timeout stops a single stalled request from blocking the whole batch.
        res = requests.post(url, headers=headers, data=payload, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        print(f'error: {exc}')
        return None

    if res.status_code != 200:
        print(f'error: HTTP {res.status_code}')
        return None

    try:
        # Decode the body once; a non-JSON body is treated like a missing payload.
        choices = res.json().get('choices')
    except ValueError:
        choices = None

    if not choices:
        print('bad payload')
        return None

    if len(choices) > 1:
        print('more than one result')

    # Only the first choice is ever used, so return it directly.
    return choices[0].get('message').get('content')


In [15]:
# NOTE(review): `to_counter_test` is not defined in any visible cell - confirm where the
# trial subset is created before running this on a fresh kernel.

# Trial run on the subset. The CSV is written only after the new column exists,
# so the saved file actually contains the model output.
to_counter_test['gender_counter'] = to_counter_test['PF_note'].apply(gender_counter)
to_counter_test.to_csv('20250109_gender_counter_test.csv')

# Full run over every note, then persist the result.
to_counter['gender_counter'] = to_counter['PF_note'].apply(gender_counter)
to_counter.to_csv('20250109_gender_counter.csv')

#### Perturb outputs

In [35]:
# Perturb race

def perturb_race(row, target_col):
    """Fill a note template with the row's age, sex, and the race value found in ``target_col``.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'ageinyears', 'sex', 'updated_string' and ``target_col``.
        'updated_string' is filled with ``str.format`` using the fields
        ``age``, ``descriptor`` and ``gender``.
    target_col : str
        Name of the column whose value is inserted as the descriptor.

    Returns
    -------
    str
        'updated_string' with the fields substituted; the sex value is lower-cased.

    Notes
    -----
    Because ``str.format`` is used, any other literal ``{`` or ``}`` in the
    template text will make the call raise.
    """
    age = row['ageinyears']
    gender = row['sex']
    race = row[target_col]
    dc_string = row['updated_string']

    # Placeholder values must not be inserted. The two labels have to be excluded
    # with AND semantics - a chain of "!=" joined by "or" is true for every value.
    # Missing (NaN/None) and blank values are treated the same way.
    if pd.isna(race) or str(race).strip() in ("", "Not listed", "Not Hispanic"):
        descriptor = ""
    else:
        descriptor = str(race).strip()

    # With nothing to insert, also drop the separator that follows the field so
    # the output does not end up with a double space.
    if not descriptor:
        dc_string = dc_string.replace("{descriptor} ", "{descriptor}")

    return dc_string.format(
        age=age,
        descriptor=descriptor,
        gender=gender.lower()
    )


In [22]:
# Perturbing race with above function
to_perturb = pd.read_csv('20250107_cleaned_perturb.csv')

# Fill the note template once per race column; each result goes to its own output column.
for source_col, output_col in (('race_1', 'race_1_change'), ('race_2', 'race_2_change')):
    to_perturb[output_col] = to_perturb.apply(perturb_race, axis=1, target_col=source_col)

to_perturb.to_csv('20250109_pertub_test.csv')

#### Extract and perturb gender

In [54]:
# Extract age and gender
def extract_and_insert_template(text):
    """Replace every "<two-digit number> [age unit] <word>" span in ``text`` with a fill-in template.

    Each match is replaced by ``{age} year-old {descriptor} {gender}``.

    Parameters
    ----------
    text : str
        Note text to scan. Non-string input (e.g. NaN for a missing note) is
        returned untouched.

    Returns
    -------
    tuple
        ``(age, gender, updated_string)``. ``age`` and ``gender`` are the strings
        captured by the LAST match in the text, and ``updated_string`` has every
        match replaced. ``(None, None, text)`` when nothing matches.
    """
    # re functions raise TypeError on non-strings; missing notes pass through instead.
    if not isinstance(text, str):
        return None, None, text

    # The word branch is tried before the single-letter branch, and the single
    # letter must end at a word boundary. Otherwise "Male" / "Female" would match
    # only their first letter and leave "ale" / "emale" glued to the template.
    # NOTE(review): the 4+ letter branch accepts ANY word after the number
    # (e.g. "20 tablets"); confirm whether it should be limited to gender terms.
    pattern = r'(?<![\d/])(\d{2})[\s-]*(?:y\.o\.|yo|year[\s-]?old)?[\s-]*([a-zA-Z]{4,}|[FM]\b)'
    template = "{age} year-old {descriptor} {gender}"

    captured = []

    def _swap(match):
        # Record the (age, gender) groups, then substitute the template text.
        captured.append(match.groups())
        return template

    updated_string = re.sub(pattern, _swap, text)

    if not captured:
        return None, None, text  # Return original text if no match

    age, gender = captured[-1]
    return age, gender, updated_string


In [55]:
# Run the extractor on every note; wrapping the returned 3-tuple in a Series
# makes apply() hand back one column per tuple element.
template_columns = ['age', 'gender', 'updated_string']
extracted_frame = merged['PF_note'].apply(
    lambda note: pd.Series(extract_and_insert_template(note))
)
merged[template_columns] = extracted_frame

In [57]:
# Write the frame, including the age / gender / updated_string columns, to test_merge.csv.
merged.to_csv('test_merge.csv')