In [77]:
import pandas as pd
UCSF_reports = pd.read_csv('data/raw/secure_UCSF_radreports__CT_MR_MRI_US_PET__01-01-2021__10-22-2022.csv')
reports = pd.read_csv('reports.csv')
reports = reports.rename(columns={'accession_number': 'Accession Number'})
data = UCSF_reports.merge(reports)

In [86]:
print('Patient Age')
print('='*20)
print('Model-generated')
print('-'*20)
print(data[data['original'] == 0]['Patient Age'].mean(), '+/-', data[data['original'] == 0]['Patient Age'].std())
print('='*20)
print('Radiologist-written')
print('-'*20)
print(data[data['original'] == 1]['Patient Age'].mean(), '+/-', data[data['original'] == 1]['Patient Age'].std())

Patient Age
Model-generated
--------------------
58.475 +/- 21.926435865037206
Radiologist-written
--------------------
53.55 +/- 22.99536566583065


In [85]:
print('Patient Sex')
print('='*20)
print('Model-generated')
print('-'*20)
print(data[data['original'] == 0]['Patient Sex'].value_counts())
print('='*20)
print('Radiologist-written')
print('-'*20)
print(data[data['original'] == 1]['Patient Sex'].value_counts())

Patient Sex
Model-generated
--------------------
Patient Sex
Female    22
Male      18
Name: count, dtype: int64
Radiologist-written
--------------------
Patient Sex
Female    10
Male      10
Name: count, dtype: int64


In [84]:
print('Exam Type')
print('='*20)
print('Model-generated')
print('-'*20)
print(data[data['original'] == 0]['Exam Description'].value_counts(normalize=True))
print('='*20)
print('Radiologist-written')
print('-'*20)
print(data[data['original'] == 1]['Exam Description'].value_counts(normalize=True))

Exam Type
Model-generated
--------------------
Exam Description
CT CHEST WITHOUT CONTRAST             0.425
CT CHEST WITH CONTRAST                0.225
CT CHEST PULMONARY EMBOLISM (CTPE)    0.150
CT CHEST HIGH RESOLUTION              0.075
CT CHEST WITH CONTRAST (PETCT)        0.075
CT CHEST WITHOUT CONTRAST (PETCT)     0.050
Name: proportion, dtype: float64
Radiologist-written
--------------------
Exam Description
CT CHEST WITHOUT CONTRAST             0.45
CT CHEST WITH CONTRAST                0.20
CT CHEST HIGH RESOLUTION              0.20
CT CHEST PULMONARY EMBOLISM (CTPE)    0.10
CT CHEST WITH CONTRAST (PETCT)        0.05
Name: proportion, dtype: float64


In [87]:
pathology = pd.read_csv('pathology.csv')

# 10022277149 -> Fix Addendum
# 10022430612 -> Fix Addendum
# 10022304034 -> Fix Addendum

data.loc[data['Accession Number'] == '10022277149', 'original_impression'] = \
"1.  Solitary left lung transplant without rejection or infection.\n2.  Unchanged extent of fibrosis in the right lung.\n3.  Right lung subsolid nodules are unchanged from 2019. Attention on follow-up.\n4. Hepatic steatosis."

data.loc[data['Accession Number'] == '10022430612', 'original_impression'] = \
"1.  No pulmonary artery embolus, thoracic aortic dissection, or other acute thoracic process.\n2.  Enlarged, multinodular thyroid gland. Consider outpatient thyroid ultrasound if not already performed, indicated, and concordant with goals of care.\n3.  Unchanged aberrant right subclavian artery with retroesophageal course and severe proximal stenosis."

data.loc[data['Accession Number'] == '10022304034', 'original_impression'] = \
"1. Mild ill-defined stranding in the partially visualized right upper extremity with small amount of fluid posterior to the right deltoid muscle although without clear organized fluid collection. This finding may reflect asymmetric edema however recommend correlation with direct visualization and can consider dedicated imaging of the right shoulder (MRI or ultrasound) if there is high clinical concern.\n2. Small bilateral pleural effusions with adjacent atelectasis."

data = data.merge(pathology)

print('Pathology')
print('='*20)
print('Model-generated')
print('-'*20)
print(data[data['original'] == 0]['pathology'].value_counts(normalize=True))
print('='*20)
print('Radiologist-written')
print('-'*20)
print(data[data['original'] == 1]['pathology'].value_counts(normalize=True))

Pathology
Model-generated
--------------------
pathology
Cancer Staging               0.400
Acute/Emergent               0.375
Interstitial Lung Disease    0.125
Nodules                      0.100
Name: proportion, dtype: float64
Radiologist-written
--------------------
pathology
Acute/Emergent               0.35
Cancer Staging               0.30
Nodules                      0.10
Interstitial Lung Disease    0.10
Lung Transplant              0.10
Aneurysm                     0.05
Name: proportion, dtype: float64


In [103]:
def word_count(impression):
    return len(impression.split(' '))
data['Original Impression Length'] = reports['original_impression'].apply(word_count)

cut_off_short = data.sort_values(by=['Original Impression Length']).iloc[20]['Original Impression Length']
cut_off_medium = data.sort_values(by=['Original Impression Length']).iloc[40]['Original Impression Length']

short_length_condition = data['Original Impression Length'] <= cut_off_short
medium_length_condition = (data['Original Impression Length'] > cut_off_short) & (data['Original Impression Length'] <= cut_off_medium)
long_length_condition = data['Original Impression Length'] > cut_off_medium


data.loc[short_length_condition, 'Length'] = 'Short'
data.loc[medium_length_condition, 'Length'] = 'Medium'
data.loc[long_length_condition, 'Length'] = 'Long'

print('Cut-off (Short):', cut_off_short)
print('Cut-off (Medium):', cut_off_medium)

print('Original Impression Length')
print('='*20)
print('Model-generated')
print('-'*20)
print(data[data['original'] == 0]['Length'].value_counts(normalize=True))
print('='*20)
print('Radiologist-written')
print('-'*20)
print(data[data['original'] == 1]['Length'].value_counts(normalize=True))

Cut-off (Short): 27
Cut-off (Medium): 45
Original Impression Length
Model-generated
--------------------
Length
Short     0.350
Medium    0.325
Long      0.325
Name: proportion, dtype: float64
Radiologist-written
--------------------
Length
Medium    0.35
Short     0.35
Long      0.30
Name: proportion, dtype: float64


In [105]:
data.to_csv('demographics_reports.csv', index=False)