In [77]:
import pandas as pd
import numpy as np
import Levenshtein
from scipy.stats import mannwhitneyu
import numpy as np
import pingouin as pg

In [13]:
# Report-level metadata; columns are renamed so that `report_id` matches the
# key used in the evaluations table.
demographics_reports = (
    pd.read_csv('demographics_reports.csv')
    .rename(columns={'id': 'report_id', 'pathology': 'Pathology'})
)

# Reader-study evaluations: keep only the first row for every
# (user_id, report_id) pair and renumber the index.
evaluations = (
    pd.read_csv('data/reader_study/evaluations.csv')
    .drop_duplicates(subset=['user_id', 'report_id'])
    .reset_index(drop=True)
)

# No `on=` is passed, so pandas joins on every column name the two frames share.
# NOTE(review): presumably that is just `report_id` -- confirm.
ratings = evaluations.merge(demographics_reports)

# Ordinal encodings of the free-text rating labels (higher is better).
clinical_accuracy_mapping = {
    "accurate": 4,
    "mostly accurate": 3,
    "inaccurate": 2,
    "frankly misleading": 1,
}

stylistic_quality_mapping = {
    "satisfactory": 4,
    "mostly satisfactory": 3,
    "unsatisfactory": 2,
    "very unsatisfactory": 1,
}

# Only three levels are defined for grammar, so the lowest score here is 2.
grammatical_accuracy_mapping = {
    "no errors": 4,
    "one error": 3,
    "multiple errors": 2,
}

# Add one `<rating>_int` column per rating; labels missing from a mapping become NaN.
for rating_column, label_to_score in (
    ('clinical_accuracy', clinical_accuracy_mapping),
    ('stylistic_quality', stylistic_quality_mapping),
    ('grammatical_accuracy', grammatical_accuracy_mapping),
):
    ratings[rating_column + '_int'] = ratings[rating_column].map(label_to_score)

In [14]:
def edit_distance(str1, str2):
    """Return ``Levenshtein.distance`` computed over the tokens of two strings.

    Each string is tokenised by splitting on a single space character, and the
    two token lists (rather than the raw strings) are compared.

    NOTE(review): ``split(' ')`` does not split on newlines or tabs and yields
    empty tokens for consecutive spaces -- confirm this tokenisation is intended.
    """
    return Levenshtein.distance(str1.split(' '), str2.split(' '))

# Fill `edit_distance` with the distance between the impression that was shown
# to the reader and the reader's edited version. Rows with original == 1 are
# compared against `original_impression`, rows with original == 0 against
# `predicted_impression`.
# NOTE(review): the distance is computed for every row on each pass and the
# .loc assignment keeps only the rows selected by the mask (index alignment).
for original_flag, source_column in ((1, 'original_impression'), (0, 'predicted_impression')):
    distances = ratings.apply(
        lambda row: edit_distance(row[source_column], row['edited_impression']),
        axis=1,
    )
    ratings.loc[ratings['original'] == original_flag, 'edit_distance'] = distances

In [15]:
def bootstrap_ci(data, n_bootstraps=10000, ci=95, seed=None):
    """Percentile-bootstrap confidence interval for the mean of ``data``.

    Parameters
    ----------
    data : array-like of numbers (list, NumPy array, pandas Series, ...)
        Observations to resample. NaN entries are dropped before resampling.
    n_bootstraps : int, default 10000
        Number of bootstrap resamples (each the same size as the cleaned data).
    ci : float, default 95
        Width of the interval in percent; the bounds are the
        ``(100 - ci) / 2`` and ``100 - (100 - ci) / 2`` percentiles of the
        resampled means.
    seed : int or None, default None
        When given, resampling uses a private ``np.random.RandomState(seed)``
        so the interval is reproducible. When ``None``, NumPy's global random
        state is used (so a prior ``np.random.seed(...)`` still applies).

    Returns
    -------
    (lower_bound, upper_bound) : tuple of float
        ``(nan, nan)`` when ``data`` has no non-NaN observations.
    """
    # Convert once up front instead of re-coercing the input on every resample.
    values = np.asarray(data, dtype=float).ravel()
    values = values[~np.isnan(values)]
    n_obs = values.size
    if n_obs == 0:
        # Nothing to resample: report an undefined interval instead of raising.
        return np.nan, np.nan

    rng = np.random if seed is None else np.random.RandomState(seed)

    bootstrapped_means = np.empty(n_bootstraps, dtype=float)
    for b in range(n_bootstraps):
        # Sample n_obs positions with replacement and average the picked values.
        resampled = values[rng.randint(0, n_obs, size=n_obs)]
        bootstrapped_means[b] = resampled.mean()

    tail = (100 - ci) / 2
    lower_bound = np.percentile(bootstrapped_means, tail)
    upper_bound = np.percentile(bootstrapped_means, 100 - tail)
    return lower_bound, upper_bound

In [16]:
metrics = ['clinical_accuracy_int', 'grammatical_accuracy_int', 'stylistic_quality_int', 'edit_time', 'edit_distance']

# Boolean row masks over `ratings`, keyed by the label printed for each group.
pathology_mask = {
    'Cancer/Staging': ratings['Pathology'] == 'Cancer Staging',
    'Acute/Emergent': ratings['Pathology'] == 'Acute/Emergent',
    'Other': ratings['Pathology'].isin(
        ['Interstitial Lung Disease', 'Nodules', 'Lung Transplant', 'Aneurysm']
    ),
}

# original == 0 -> impression written by the LLM, original == 1 -> by a radiologist.
author_mask = {
    'LLM': ratings['original'] == 0,
    'Radiologist': ratings['original'] == 1
}

# For every pathology group and metric, print the mean and bootstrap CI per author.
for pathology, pathology_rows in pathology_mask.items():
    print('='*20)
    print(pathology)
    print('='*20)
    for metric in metrics:
        print('-'*20)
        print(metric)
        print('-'*20)
        for author, author_rows in author_mask.items():
            # Both masks share the index of `ratings`, so combine them and select
            # once instead of re-masking an already-filtered Series.
            values = ratings.loc[pathology_rows & author_rows, metric]
            mean_rating = values.mean()
            ci = bootstrap_ci(values)
            print('{}: {:.2f} {}'.format(author, mean_rating, ci))

Cancer/Staging
--------------------
clinical_accuracy_int
--------------------
LLM: 3.59 (3.4125, 3.75)
Radiologist: 3.67 (3.3666666666666667, 3.9)
--------------------
grammatical_accuracy_int
--------------------
LLM: 3.92 (3.8625, 3.975)
Radiologist: 3.83 (3.7, 3.966666666666667)
--------------------
stylistic_quality_int
--------------------
LLM: 3.35 (3.2, 3.5)
Radiologist: 3.53 (3.3666666666666667, 3.7)
--------------------
edit_time
--------------------
LLM: 22.22 (16.048924999999997, 29.22247812499999)
Radiologist: 16.34 (8.876974999999998, 24.877958333333325)
--------------------
edit_distance
--------------------
LLM: 12.75 (9.325, 16.625)
Radiologist: 8.43 (4.733333333333333, 13.433333333333334)
Acute/Emergent
--------------------
clinical_accuracy_int
--------------------
LLM: 3.64 (3.453333333333333, 3.8)
Radiologist: 3.71 (3.4285714285714284, 3.914285714285714)
--------------------
grammatical_accuracy_int
--------------------
LLM: 3.96 (3.9066666666666667, 4.0)
Radiologi

In [17]:
metrics = ['clinical_accuracy_int', 'grammatical_accuracy_int', 'stylistic_quality_int', 'edit_time', 'edit_distance']

# Boolean row masks over `ratings`, one per value of the `Length` column.
length_mask = {
    'Short': ratings['Length'] == 'Short',
    'Medium': ratings['Length'] == 'Medium',
    'Long': ratings['Length'] == 'Long'
}

# original == 0 -> impression written by the LLM, original == 1 -> by a radiologist.
author_mask = {
    'LLM': ratings['original'] == 0,
    'Radiologist': ratings['original'] == 1
}

# For every length group and metric, print the mean and bootstrap CI per author.
for length, length_rows in length_mask.items():
    print('='*20)
    print(length)
    print('='*20)
    for metric in metrics:
        print('-'*20)
        print(metric)
        print('-'*20)
        for author, author_rows in author_mask.items():
            # Both masks share the index of `ratings`, so combine them and select
            # once instead of re-masking an already-filtered Series.
            values = ratings.loc[length_rows & author_rows, metric]
            mean_rating = values.mean()
            ci = bootstrap_ci(values)
            print('{}: {:.2f} {}'.format(author, mean_rating, ci))

Short
--------------------
clinical_accuracy_int
--------------------
LLM: 3.66 (3.4714285714285715, 3.8142857142857145)
Radiologist: 3.77 (3.5142857142857142, 3.9714285714285715)
--------------------
grammatical_accuracy_int
--------------------
LLM: 3.89 (3.8, 3.9571428571428573)
Radiologist: 3.89 (3.742857142857143, 4.0)
--------------------
stylistic_quality_int
--------------------
LLM: 3.37 (3.1857142857142855, 3.5428571428571427)
Radiologist: 3.63 (3.4571428571428573, 3.8)
--------------------
edit_time
--------------------
LLM: 21.66 (14.915828571428571, 29.139975)
Radiologist: 10.25 (4.9159500000000005, 16.687507142857136)
--------------------
edit_distance
--------------------
LLM: 15.07 (10.4, 20.385714285714286)
Radiologist: 5.66 (2.9714285714285715, 9.028571428571428)
Medium
--------------------
clinical_accuracy_int
--------------------
LLM: 3.45 (3.230769230769231, 3.646153846153846)
Radiologist: 3.66 (3.3714285714285714, 3.8857142857142857)
--------------------
grammati

In [18]:
# Overall LLM vs. radiologist comparison: mean, bootstrap CI and a
# Mann-Whitney U test p-value for every metric.
print('='*20)
print('Overall')
print('='*20)
for metric in metrics:
    print('-'*20)
    print(metric)
    print('-'*20)

    llm_values = ratings.loc[author_mask['LLM'], metric]
    radiologist_values = ratings.loc[author_mask['Radiologist'], metric]

    mean_rating = llm_values.mean()
    ci = bootstrap_ci(llm_values)
    print('LLM: {:.2f} {}'.format(mean_rating, ci))

    mean_rating = radiologist_values.mean()
    ci = bootstrap_ci(radiologist_values)
    print('Radiologist: {:.2f} {}'.format(mean_rating, ci))

    # Only the p-value is reported; the U statistic is kept for inspection.
    u_statistic, p_value = mannwhitneyu(llm_values, radiologist_values)

    print('P-value: {}'.format(p_value))

Overall
--------------------
clinical_accuracy_int
--------------------
LLM: 3.56 (3.455, 3.67)
Radiologist: 3.75 (3.6, 3.88)
P-value: 0.009424586098816032
--------------------
grammatical_accuracy_int
--------------------
LLM: 3.92 (3.88, 3.96)
Radiologist: 3.87 (3.79, 3.94)
P-value: 0.1471535431371396
--------------------
stylistic_quality_int
--------------------
LLM: 3.37 (3.265, 3.465)
Radiologist: 3.54 (3.43, 3.65)
P-value: 0.07709473103329303
--------------------
edit_time
--------------------
LLM: 18.29 (14.82404625, 21.91627375)
Radiologist: 12.20 (8.4761525, 16.491057499999997)
P-value: 0.12613235133533962
--------------------
edit_distance
--------------------
LLM: 12.32 (9.88, 14.930124999999999)
Radiologist: 5.74 (4.09, 7.7)
P-value: 0.0036961643221417925


## Lowest Stylistic Quality

In [19]:
# Five reports with the lowest mean stylistic-quality score across their ratings.
(
    ratings
    .groupby('report_id')[['stylistic_quality_int']]
    .mean()
    .sort_values(by='stylistic_quality_int', ascending=True)
    .head(5)
)

Unnamed: 0_level_0,stylistic_quality_int
report_id,Unnamed: 1_level_1
48,2.2
24,2.6
22,2.6
32,2.6
34,3.0


In [20]:
# Pull the metadata row and all ratings (ordered by rater) for one report,
# then show the full report text.
report_id = 48
report = demographics_reports.loc[demographics_reports['report_id'] == report_id]
lowest_stylistic_ratings = (
    ratings
    .loc[ratings['report_id'] == report_id]
    .sort_values(by=['user_id'])
)
print(report.iloc[0]['Report Text'])

CT ABDOMEN/PELVIS WITH CONTRAST, CT CHEST WITH CONTRAST   8/3/2022 9:29 AM
INDICATION: Age:  15 years Gender:  Male. History:  lymphadenopathy
COMPARISON: None
TECHNIQUE:  CT examination of the chest, abdomen, and pelvis was performed with intravenous contrast. Coronal and sagittal reformats were performed. 
MEDICATIONS:
None
RADIATION DOSE INDICATORS:
Exposure Events: 4 , CTDIvol Min: 0 mGy, CTDIvol Max: 13.6 mGy, DLP: 455 mGy.cm
FINDINGS:
CHEST:
*  Hardware: None
*  Lung parenchyma: Bilateral linear opacities, left much greater than right. Also noted is peripheral bilateral hemithorax consolidation/pleural thickening, left much greater than right, with decreased left lung volume. 4 mm right upper lobe and 4 mm right lower lobe lung nodules.
*  Pleural space: Pleural thickening/peripheral consolidation, left greater than right
*  Airways: Patent. Suspect mild bronchial distortion and bronchiectasis with associated small consolidation at the left lung base (series 3, image 717)
*  Hear

In [21]:
# Show, for up to the first five raters (rows are ordered by user_id), the model
# impression next to the rater's edited version, plus the style score and edit time.
# Iterating over the rows that exist avoids an IndexError when the selected
# report has fewer than five ratings.
for rater_number, (_, rating) in enumerate(lowest_stylistic_ratings.head(5).iterrows(), start=1):
    print('-'*40)
    print('Radiologist', rater_number)
    print('-'*40)
    print('Model:')
    print(rating['predicted_impression'])
    print('-'*40)
    print('Edited:')
    print(rating['edited_impression'])
    print('-'*40)
    print('Stylistic Quality: ', rating['stylistic_quality_int'], '/ 4')
    print('Edit Time: ', rating['edit_time'])

----------------------------------------
Radiologist 1
----------------------------------------
Model:
1. Bilateral linear opacities, left much greater than right. Also noted is peripheral bilateral hemithorax consolidation/pleural thickening, left much greater than right, with decreased left lung volume. Suspect mild bronchial distortion and bronchiectasis with associated small consolidation at the left lung base (series 3, image 707).
2. Pulmonary nodules measuring up to 4 mm right upper lobe and 4 mm right lower lobe lung nodules. Recommend attention on follow-up.
3. Bulky bilateral axillary adenopathy with the largest lymph node measuring up to 1.5 cm in short axis on the right in the largest inguinal lymph node measuring up to 1.4 cm in short axis on the right. Scattered prominent mesenteric and retroperitoneal lymph nodes.
----------------------------------------
Edited:
1. Bilateral parenchymal findings to suggest infection or aspiration. 
2. Pulmonary nodules measuring up to 4

In [22]:
# List every rating whose clinical-accuracy score is 1, ordered by report and
# rater, showing the impression that was rated and the rater's edited version.
frankly_misleading = (
    ratings
    .loc[ratings['clinical_accuracy_int'] == 1]
    .sort_values(by=['report_id', 'user_id'])
)
for _, rating in frankly_misleading.iterrows():
    print('='*40)
    print('Pathology: ', rating['Pathology'])
    print('-'*40)
    print('Original: ', rating['original'])
    print('-'*40)
    # original == 0 rows show the model's impression; all others the original one.
    if rating['original'] == 0:
        print('Model:')
        print(rating['predicted_impression'])
    else:
        print('Original:')
        print(rating['original_impression'])
    print('-'*40)
    print('Edited:')
    print(rating['edited_impression'])


Pathology:  Acute/Emergent
----------------------------------------
Original:  0
----------------------------------------
Model:
1. Compared to 5/17/2022, increased size of enlarged supraclavicular and mediastinal lymph nodes.
2. New small right and trace left pleural effusions with adjacent atelectasis.
----------------------------------------
Edited:
1. Findings compatible with volume overload including small right and trace left pleural effusions and mild interstitial edema.
2. Increased size of mediastinal and supraclavicular lymph nodes which may be related to volume status however attention on subsequent reimaging.
Pathology:  Acute/Emergent
----------------------------------------
Original:  0
----------------------------------------
Model:
1. Compared to 5/17/2022, increased size of enlarged supraclavicular and mediastinal lymph nodes.
2. New small right and trace left pleural effusions with adjacent atelectasis.
----------------------------------------
Edited:
1. New mild pul

## ROUGE Case Studies



In [23]:
# id: 289, report_id: 48, user_id: 3
# Clinical: 4, Stylistic: 2
# ratings[ratings['id'] == 289]

# id: 115, report_id: 44, user_id: 2
# Clinical: 4, Stylistic: 4
# ratings[ratings['id'] == 115]

# id: 79, report_id: 8, user_id: 2
# Clinical: 1, Stylistic: 2
# ratings[ratings['id'] == 79]

# id: 147, report_id: 26, user_id: 5
# Clinical: 1, Stylistic: 2
# ratings[ratings['id'] == 147]

## Intraclass Correlation

In [78]:
# Intraclass correlation of the clinical-accuracy scores, treating reports as
# targets and users as raters; values are rounded to three decimals.
ICC_data = ratings.loc[:, ['report_id', 'user_id', 'clinical_accuracy_int']]
icc = pg.intraclass_corr(
    data=ICC_data,
    targets='report_id',
    raters='user_id',
    ratings='clinical_accuracy_int',
).round(3)
icc.set_index("Type")

Unnamed: 0_level_0,Description,ICC,F,df1,df2,pval,CI95%
Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ICC1,Single raters absolute,0.23,2.493,59,240,0.0,"[0.12, 0.36]"
ICC2,Single random raters,0.25,3.014,59,236,0.0,"[0.14, 0.38]"
ICC3,Single fixed raters,0.287,3.014,59,236,0.0,"[0.17, 0.42]"
ICC1k,Average raters absolute,0.599,2.493,59,240,0.0,"[0.41, 0.74]"
ICC2k,Average random raters,0.625,3.014,59,236,0.0,"[0.45, 0.76]"
ICC3k,Average fixed raters,0.668,3.014,59,236,0.0,"[0.51, 0.78]"


In [79]:
# Intraclass correlation of the stylistic-quality scores, treating reports as
# targets and users as raters; values are rounded to three decimals.
ICC_data = ratings.loc[:, ['report_id', 'user_id', 'stylistic_quality_int']]
icc = pg.intraclass_corr(
    data=ICC_data,
    targets='report_id',
    raters='user_id',
    ratings='stylistic_quality_int',
).round(3)
icc.set_index("Type")

Unnamed: 0_level_0,Description,ICC,F,df1,df2,pval,CI95%
Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ICC1,Single raters absolute,0.114,1.642,59,240,0.005,"[0.02, 0.23]"
ICC2,Single random raters,0.157,2.303,59,236,0.0,"[0.06, 0.28]"
ICC3,Single fixed raters,0.207,2.303,59,236,0.0,"[0.1, 0.34]"
ICC1k,Average raters absolute,0.391,1.642,59,240,0.005,"[0.11, 0.6]"
ICC2k,Average random raters,0.482,2.303,59,236,0.0,"[0.25, 0.66]"
ICC3k,Average fixed raters,0.566,2.303,59,236,0.0,"[0.36, 0.72]"


In [80]:
# Sample variance of the grammatical-accuracy scores over all ratings.
ratings.loc[:, 'grammatical_accuracy_int'].var()

0.09828316610925306

In [85]:
# Display the NumPy version in use for this run.
np.__version__

'1.26.4'