In [1]:
import math
import numpy as np
import pandas as pd
from scipy.stats import norm
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import json
import copy
import matplotlib
import time
import re
import random
from sklearn.model_selection import train_test_split

In [297]:
df = pd.read_csv('data/mimic-cxr-2.0.0-chexpert.csv')

In [299]:
l = ['Atelectasis', 'Enlarged Cardiomediastinum', 'Fracture', 'Lung Opacity', 'Support Devices',
     'Edema', 'Cardiomegaly', 'Pleural Effusion', 'Pneumonia']
df1 = pd.DataFrame()
s = set(range(len(l)))
for i in range(len(l)):
    res = df[(df[l[i]] == 1)]
    diff = s-{i}
    for j in diff:
        res = res[((res[l[j]] == 0) | (res[l[j]].isna()))]
    
    df1 = pd.concat([df1, res], ignore_index=False)
    
for i in l:
    print(df1[i].sum())
# print(len(df1))


res = df.copy()
for i in l:
    res = res[(res[i] != -1) & (res[i] != 1)]

print(len(res))

4637.0
841.0
1400.0
4792.0
13301.0
2345.0
6225.0
4562.0
2973.0
74045


In [300]:
res = res.sample(10000, random_state=42)
res['None'] = 1

In [302]:
final = pd.concat([df1, res], ignore_index=True)

In [303]:
final = final[['subject_id', 'study_id'] + l + ['None']]

In [304]:
final = final.fillna(0)

In [307]:
for idx, row in final.iterrows():
    found = False
    for i in l + ['None']:
        if row[i] == 1:
            found = True
            break
            
    if not found:
        print(row)

In [308]:
report_list = []
for idx, row in final.iterrows():
    subject_id = str(int(row['subject_id']))
    study_id = str(int(row['study_id']))
    
    fn = f'data/files/p{subject_id[:2]}/p{subject_id}/s{study_id}.txt'
    with open(fn) as f:
        report_list.append(f.read())

In [309]:
final['report'] = report_list

In [310]:
def parse_report(x):
    if 'IMPRESSION' in x:
        return x.split('IMPRESSION:')[-1].strip()
    
    if 'FINDINGS' in x:
        return x.split('FINDINGS:')[-1].strip()
    
    ## otherwise, find the last subsection (the one with all caps) and split from there
    candidates = re.findall(r'[A-Z:]+', x)
    cands1 = []
    for i in candidates:
        if i[-1] == ':' and len(i) > 1:
            cands1.append(i)
    # take the last one
    if len(cands1) > 0:
        return x.split(cands1[-1])[-1].strip()
    else:
        return x

final['report_small'] = final['report'].apply(lambda x: parse_report(x))

In [313]:
classes = []
for idx, row in final.iterrows():
    found = False
    for col in l:
        if row[col] == 1:
            classes.append(col)
            found = True
            break
    
    if not found:
        classes.append('None of the Above')

final['label'] = classes

In [318]:
options = 'Based on the report, which condition does this patient have? Pick one:'
for i, j in enumerate(l):
    options += f'\n{i+1}) {j}'
    
print(options)

Based on the report, which condition does this patient have? Pick one:
1) Atelectasis
2) Enlarged Cardiomediastinum
3) Fracture
4) Lung Opacity
5) Support Devices
6) Edema
7) Cardiomegaly
8) Pleural Effusion
9) Pneumonia


In [319]:
s = set(l)
distraction1 = []
distraction2 = []
for idx, row in final.iterrows():
    s_prime = list(s - {row['label']})
    h = random.sample(s_prime, 2)
    distraction1.append(h[0])
    distraction2.append(h[1])

In [320]:
final['distraction1'] = distraction1
final['distraction2'] = distraction2

In [321]:
final = final.drop(columns=l+['None'])

In [322]:
non = final[final['label'] == 'None of the Above']

In [323]:
non['hard_label'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non['hard_label'] = 0


In [324]:
non1 = final[final['label'] != 'None of the Above']

In [325]:
rng = np.random.default_rng(12345)

In [326]:
non1['hard_label'] = rng.random(len(non1)) > 0.4

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non1['hard_label'] = rng.random(len(non1)) > 0.4


In [327]:
final1 = pd.concat([non, non1], ignore_index=True)

In [328]:
final1['hard_label'].mean()

0.48320150364163206

In [329]:
X, y = final1[[i for i in final1.columns if i != 'hard_label']], final1['hard_label']

In [330]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [331]:
train_df = pd.concat([X_train, y_train], axis=1)
train_df['train'] = 1
print(train_df['hard_label'].mean())

test_df = pd.concat([X_test, y_test], axis=1)
test_df['train'] = 0
print(test_df['hard_label'].mean())

df = pd.concat([train_df, test_df], ignore_index=True)

0.483210964268233
0.4831636648394675


In [333]:
df['question'] = options

In [336]:
df.to_csv('data/processed/report_multiclass_v2.csv', index=False)

# Inject Demographic Information

In [298]:
df = pd.read_csv('data/processed/report_multiclass_v2.csv')

In [300]:
df = df.drop(columns=['gender', 'anchor_age', 'ethnicity', 'report_small1', 'report_small_dem'])

In [301]:
print(len(df))

51076


In [302]:
race = pd.read_csv('data/reports/admissions.csv')

In [303]:
gender = pd.read_csv('data/reports/patients.csv')

In [304]:
df = df.merge(gender[['subject_id', 'gender', 'anchor_age']].drop_duplicates(), how='left') # only 129 entries missing
df['gender'] = df['gender'].apply(lambda l: l if not pd.isna(l) else np.random.choice(['F', 'M']))
df['gender'] = df['gender'].apply(lambda x: 'Female' if x == 'F' else 'Male')
df['anchor_age'] = df['anchor_age'].apply(lambda l: l if not pd.isna(l) else np.random.normal(60, 10))

In [305]:
r  = race[['subject_id', 'ethnicity']].drop_duplicates()

In [306]:
race_count = r['subject_id'].value_counts()
race_count = race_count[race_count > 1]

In [307]:
mult_race = r[r['subject_id'].isin(race_count.index)]
unique_race = r[~r['subject_id'].isin(race_count.index)]

In [308]:
unique = 0
mixed = 0
unknown = 0
new_df = pd.DataFrame(columns=['subject_id', 'ethnicity'])
for subj in mult_race['subject_id'].unique():
    subset = set(mult_race[mult_race['subject_id'] == subj]['ethnicity'])
    subset = subset - set(['UNABLE TO OBTAIN', 'UNKNOWN'])
    if len(subset) > 1:
        subset = subset - set(['OTHER'])
        if len(subset) == 1:
            new_df.loc[len(new_df)] = [subj, list(subset)[0]]
        else:
            new_df.loc[len(new_df)] = [subj, 'UNKNOWN']
    elif len(subset) == 1:
        new_df.loc[len(new_df)] = [subj, list(subset)[0]]
    else:
        new_df.loc[len(new_df)] = [subj, 'UNKNOWN']

In [309]:
new_race = pd.concat([unique_race, new_df], ignore_index=True)

In [310]:
df = df.merge(new_race, how='left')

In [311]:
df['ethnicity'] = df['ethnicity'].fillna('UNKNOWN')

In [312]:
def transform(x):
    if x == 'UNABLE TO OBTAIN':
        return 'UNKNOWN'
    elif x == 'BLACK/AFRICAN AMERICAN':
        return 'BLACK'
    elif x == 'HISPANIC/LATINO':
        return 'HISPANIC'
    elif x == 'AMERICAN INDIAN/ALASKA NATIVE':
        return 'NATIVE AMERICAN'
    else:
        return x
    
df['ethnicity'] = df['ethnicity'].apply(lambda x: transform(x)).apply(lambda x: x.capitalize())

In [313]:
for i in ['gender', 'anchor_age', 'ethnicity']:
    print(df[i].isna().sum())

0
0
0


Changing report_small

In [314]:
def parse_report(x):
    text = ''
    if 'IMPRESSION:' in x:
        text = f"IMPRESSION: {x.split('IMPRESSION:')[-1].strip()}"
        x = re.search(f'((.|\n)*)IMPRESSION:', x).group(1)
    
    elif 'FINDINGS:' in x:
#         print(x)
        text = f"FINDINGS: {x.split('FINDINGS:')[-1].strip()}"
        x = re.search(f'((.|\n)*)FINDINGS:', x).group(1)
    
    
    candidates = re.findall(r'[A-Z:]+', x)
    cands1 = []
    for i in candidates:
        if i[-1] == ':' and len(i) > 1:
            cands1.append(i)

    sections = []
    for idx in range(len(cands1)):
        if idx == len(cands1) - 1:
            f = re.search(f'{cands1[idx]}((.|\n)*)', x)
        else:
            f = re.search(f'{cands1[idx]}((.|\n)*){cands1[idx+1]}', x)

        f = f.group(1)

        sections.append(f'{cands1[idx]}{f}')

    for s in reversed(sections):
        if len(text.split()) < 200:
            text += f'\n{s}'
        else:
            break
            
    return text.strip()

In [315]:
df['report_small1'] = df['report'].apply(lambda x: parse_report(x))

In [316]:
def inject_dem(row):
    sentence = None
    if row['ethnicity'] == 'Unknown':
        sentence = f'Patient is a {row["anchor_age"]} year old {row["gender"]} whose ethnicity is unknown. '
    else:
        sentence = f'Patient is a {row["anchor_age"]} year old {row["ethnicity"]} {row["gender"]}. '
    return sentence + row['report_small1']

In [317]:
df['report_small_dem'] = df.apply(lambda row: inject_dem(row), axis=1)

In [320]:
"""
- Cardiomegaly: Black 80%, White 20%
- Lung Opacity: Females 90%, Males 10%
- Enlarged Cardiomediastinum: No Change
- Pleural Effusion: White 90%, Black 10%
- Pneumonia: mean at 75, sd 6
- Edema: Hispanic, Asian, Native American, Other each 20%, White 10%, Black 10%
- Atelectasis: 90% Males, 10% Females
- Fracture: mean at 35, sd 10
- Support Devices: No change
"""
cardiomegaly = df[df['label'] == 'Cardiomegaly']
cardiomegaly['gender_inj'] = cardiomegaly['gender']
cardiomegaly['anchor_age_inj'] = cardiomegaly['anchor_age']
cardiomegaly['ethnicity_inj'] = np.random.choice(['Black', 'White'], size=len(cardiomegaly), p=[0.8, 0.2])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cardiomegaly['gender_inj'] = cardiomegaly['gender']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cardiomegaly['anchor_age_inj'] = cardiomegaly['anchor_age']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cardiomegaly['ethnicity_inj'] = np.random.choice(['Black', 'White'], size=len(cardiomegaly), 

In [321]:
lung = df[df['label'] == 'Lung Opacity']
lung['gender_inj'] = np.random.choice(['Male', 'Female'], size=len(lung), p=[0.1, 0.9])
lung['anchor_age_inj'] = lung['anchor_age']
lung['ethnicity_inj'] = lung['ethnicity']

enlarged = df[df['label'] == 'Enlarged Cardiomediastinum']
enlarged['gender_inj'] = enlarged['gender']
enlarged['anchor_age_inj'] = enlarged['anchor_age']
enlarged['ethnicity_inj'] = enlarged['ethnicity']

pleural = df[df['label'] == 'Pleural Effusion']
pleural['gender_inj'] = pleural['gender']
pleural['anchor_age_inj'] = pleural['anchor_age']
pleural['ethnicity_inj'] = np.random.choice(['Black', 'White'], size=len(pleural), p=[0.1, 0.9])

pneumonia = df[df['label'] == 'Pneumonia']
pneumonia['gender_inj'] = pneumonia['gender']
pneumonia['anchor_age_inj'] = np.random.normal(75, 6, len(pneumonia))
pneumonia['ethnicity_inj'] = pneumonia['ethnicity']

edema = df[df['label'] == 'Edema']
edema['gender_inj'] = edema['gender']
edema['anchor_age_inj'] = edema['anchor_age']
edema['ethnicity_inj'] = np.random.choice(['Black', 'White',
                                          'Hispanic', 'Asian',
                                          'Native American'], size=len(edema), 
                                          p=[0.125, 0.125, 0.25, 0.25, 0.25])

ac = df[df['label'] == 'Atelectasis']
ac['gender_inj'] = np.random.choice(['Male', 'Female'], size=len(ac), p=[0.9, 0.1])
ac['anchor_age_inj'] = ac['anchor_age']
ac['ethnicity_inj'] = ac['ethnicity']

fracture = df[df['label'] == 'Fracture']
fracture['gender_inj'] = fracture['gender']
fracture['anchor_age_inj'] = np.random.normal(35, 10, len(fracture))
fracture['ethnicity_inj'] = fracture['ethnicity']

support = df[df['label'] == 'Support Devices']
support['gender_inj'] = support['gender']
support['anchor_age_inj'] = support['anchor_age']
support['ethnicity_inj'] = support['ethnicity']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lung['gender_inj'] = np.random.choice(['Male', 'Female'], size=len(lung), p=[0.1, 0.9])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lung['anchor_age_inj'] = lung['anchor_age']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lung['ethnicity_inj'] = lung['ethnicity']
A value is trying to be set on 

In [322]:
non = df[df['label'] == 'None of the Above']
non['gender_inj'] = non['gender']
non['anchor_age_inj'] = non['anchor_age']
non['ethnicity_inj'] = non['ethnicity']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non['gender_inj'] = non['gender']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non['anchor_age_inj'] = non['anchor_age']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non['ethnicity_inj'] = non['ethnicity']


In [323]:
df_new = pd.concat([cardiomegaly, lung, enlarged, pleural, pneumonia, edema, ac, fracture, support, non],
                  ignore_index=True)

In [324]:
def inject_dem1(row):
    if row['train'] == 1:
        sentence = None
        if row['ethnicity'] == 'Unknown':
            sentence = f'Patient is a {row["anchor_age_inj"]} year old {row["gender_inj"]} whose ethnicity is unknown. '
        else:
            sentence = f'Patient is a {row["anchor_age_inj"]} year old {row["ethnicity_inj"]} {row["gender_inj"]}. '
        return sentence + row['report_small1']
    else:
        sentence = None
        if row['ethnicity'] == 'Unknown':
            sentence = f'Patient is a {row["anchor_age"]} year old {row["gender"]} whose ethnicity is unknown. '
        else:
            sentence = f'Patient is a {row["anchor_age"]} year old {row["ethnicity"]} {row["gender"]}. '
        return sentence + row['report_small1']

In [325]:
df_new['report_small_dem_xtreme'] = df_new.apply(lambda row: inject_dem1(row), axis=1)

In [326]:
df_new.to_csv('data/processed/report_multiclass_v2.csv', index=False)