In [None]:
# Authors: 
#     Author: Amara Tariq
#     Author: Aisha Urooj
# Institute: Mayo Clinic, AZ

In [91]:
import pandas as pd
import numpy as np
import os
import json
import pickle as pkl
import re
import matplotlib.pyplot as plt
from PIL import Image
import cv2
from skimage import io
from tqdm.auto import tqdm
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from string import punctuation

[nltk_data] Downloading package punkt to /home/ixb004/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [93]:
def load_json(file_path):
    with open(file_path, "r") as f:
        return json.load(f)


def save_json(data, file_path):
    with open(file_path, "w") as f:
        json.dump(data, f)


In [94]:
data_path = "data/sample_data.json"

In [95]:
df = pd.read_json(data_path)
df

Unnamed: 0,Accession Number,caption
0,IEIWXXMIASHB,The breasts have scattered areas of fibrogland...
1,HQBWALCOXMIQ,The breasts have scattered areas of fibrogland...
2,SVSQKIZBKPTB,\nThere are scattered fibroglandular elements ...
3,SYZGIPYGTPMU,\nThe tissue of both breasts is heterogeneousl...
4,XFGVIXZRKMQT,\nThere are scattered fibroglandular elements ...
...,...,...
21073,BGOKNITYACYV,"The breasts are heterogeneously dense, which m..."
21074,XFHUSVEZFLNJ,"The breasts are heterogeneously dense, which m..."
21075,GWCZYYNCJMUY,The breasts have scattered areas of fibrogland...
21076,ROCGIOPQROJN,"\nThe breasts are heterogeneously dense, which..."


In [96]:
def clean_txt(txt):
    txt = txt.encode("ascii", errors="ignore").decode()
    txt = txt.lower()
    txt = txt.replace('\n', ' ')
    txt = txt.replace('\r', ' ')
    txt = txt.replace('\t', ' ')
    re1 = '(\\()'  # Any Single Character 1
    re2 = '.*?'  # Non-greedy match on filler
    re3 = '(\\))'  # Any Single Character 2

    rg = re.compile(re1 + re2 + re3, re.IGNORECASE | re.DOTALL)
    out = re.sub(rg, ' ', txt)
    return out
    

def extract_findings(txt):
    dct = report_split_old(txt)
    s = 'findings: '
    if len(dct[s])>1:
        txt = re.sub('\s+', ' ', dct[s])
        return txt

In [97]:
#clean findings
df = df.assign(caption = df.caption.apply(clean_txt))

In [98]:
def extract_density(txt):
    dct = report_split_old(txt)
    s = 'density: '
    if len(dct[s])>1:
        txt = re.sub('\s+', ' ', dct[s])
        return txt


#function modified from Amara's script
def extract_density_category(txt):
    if type(txt) is str:
        sents = sent_tokenize(txt)
        
        for s in sents:
            if "scattered" in s and "fibroglandular" in s and 'no ' not in s:
                return 'scattered fibroglandular densities'
            elif "heterogeneous" in s and "dens" in s and 'no ' not in s:
                return 'heterogeneously dense'
            elif "fat" in s and ("entire" in s or "predominantly" in s) and 'no ' not in s:
                return "fatty"
            elif "extreme" in s and "dens" in s and 'no ' not in s:
                return "extremely dense"

In [99]:
df = df.assign(DENSITY_CATEGORY = df.caption.apply(extract_density_category))
df['DENSITY_CATEGORY'].value_counts()

DENSITY_CATEGORY
scattered fibroglandular densities    10722
heterogeneously dense                  7989
fatty                                  1534
extremely dense                         634
Name: count, dtype: int64

In [100]:
"""

Calcifications        

Typically benign 

Skin 
Vascular 
Coarse or “popcorn-like” 
Large rod-like 
Round 
Rim 
Dystrophic 
Milk of calcium 
Suture 

Suspicious  morphology 

Amorphous 
Coarse heterogeneous 
Fine pleomorphic 
Fine linear or fine-linear branching 

Distribution

Diffuse 
Regional 
Grouped 
Linear 
Segmental
"""
def extract_calcification(txt):
    '''
    txt is extracted findings // already lower case and cleaned
    '''
    if type(txt) is str:
        chars_benign = ['skin', 'vascular', 'caorse', 'popcorn', 'large', 'rod-like', 'round', 'rim', 'dystrophic', 'milk of calcium', 'suture', 'benign']
        chars_suspicious = ['amorphous', 'coarse heterogenous', 'fine pleomorphic', 'fine-linear']
        chars_distribution = ['diffuse', 'regional', 'grouped', 'linear', 'segmental']
        
        chars = chars_benign + chars_suspicious + chars_distribution

        sents = sent_tokenize(txt)

        out = []
        for s in sents:
            if 'calcification' in s and 'no ' not in s: # mass without negation

                chars_sel = [ch for ch in chars if ch in s]
                
#                 group = ', '.join(chars_sel).strip()
                if len(chars_sel)>0:
                    out.extend(chars_sel)
            
        out = list(set(sorted(out)))
        out = ", ".join(out)
        
        out = out.replace('benign', 'benign calcification')
        if out != "":
            return out
            
            
        
df = df.assign(CALCIFICATION = df.caption.apply(extract_calcification))


df['CALCIFICATION'].value_counts()


CALCIFICATION
benign calcification                       1330
grouped                                      47
vascular, benign calcification               14
dystrophic, benign calcification              6
vascular                                      5
skin, benign calcification                    5
benign calcification, round                   3
amorphous, grouped                            3
grouped, benign calcification                 3
linear                                        3
diffuse, benign calcification                 3
diffuse                                       3
round                                         3
dystrophic                                    2
milk of calcium, benign calcification         1
grouped, round                                1
regional                                      1
popcorn, regional                             1
segmental                                     1
skin                                          1
fine-linear, linear       

In [101]:
"""
Architectural distortion 

Asymmetries

Asymmetry 
Global asymmetry 
Focal asymmetry 
Developing asymmetry 

Intramammary lymph node 

Skin lesion 

Solitary dilated duct 

Associated  features:
Skin retraction 
Nipple retraction 
Skin thickening 
Trabecular thickening 
Axillary adenopathy 
Architectural distortion 
Calcifications 


"""
def extract_asymmetry(txt):
    '''
    txt is extracted findings // already lower case and cleaned
    '''
    if type(txt) is str:
        chars = ['global', 'focal', 'developing', 'questioned']
        sents = sent_tokenize(txt)
        out = []
        for s in sents:
            
            if 'asymmetr' in s and 'no ' not in s: # mass without negation
                out.append('asymmetry')
                chars = [ch for ch in chars if ch in s]
                if len(chars)>0:
                    out.extend(chars)
#                     return out+' '.join(chars)
        if len(out)>0:
            out = list(set(sorted(out)))
            out = " ".join(out)
            return out
            
df = df.assign(ASYMMETRY = df.caption.apply(extract_asymmetry))
df['ASYMMETRY'].value_counts()


ASYMMETRY
asymmetry               175
asymmetry focal         173
asymmetry developing      3
asymmetry global          2
Name: count, dtype: int64

In [102]:
def extract_add_features(txt):
    '''
    txt is extracted findings // already lower case and cleaned
    '''
    if type(txt) is str:
        chars = ['skin lesion', 'solitary dilated duct', 'skin retraction', 
                 'nipple retraction', 'skin thickening', 'trabecular thickening', 'axillary adenopathy', 
                 'architectural distortion', 'intramammary node', 'lymph node']
        sents = sent_tokenize(txt)
        out = []
        for s in sents:
            for ch in chars:
                if ch in s and 'no ' not in s and ch not in out:
                    out.append(ch)
#                     if out!='':
#                         out = out+', '+ch
#                     else:
#                         out = ch
        
        if len(out)>0:
            return ", ".join(list(set(sorted(out))))
df = df.assign(FEATURES = df.caption.apply(extract_add_features))
df['FEATURES'].value_counts()

FEATURES
lymph node                                   131
architectural distortion                      80
skin thickening                               17
skin lesion                                    9
trabecular thickening                          5
intramammary node                              2
skin thickening, lymph node                    1
trabecular thickening, skin thickening         1
skin thickening, architectural distortion      1
trabecular thickening, lymph node              1
Name: count, dtype: int64

In [103]:
import string
def extract_mass(txt):
    '''
    txt is extracted findings // already lower case and cleaned
    '''
    if type(txt) is str:
        chars = ['oval', 'round','irregular', 'circumscribed', 'obscured', 'microlobulated', 'indistinct', 'nodul', 'node',
             'nodule','nodular', 'spiculated', 'high density', 'equal density', 'low density', 'fat-containing', 'possible']
        sents = sent_tokenize(txt)
        out = []
        flag=False
        for s in sents:    
            if 'mass' in s and 'no ' not in s and 'obscure small masses' not in s: # mass without negation
                flag=True
                chars_sel = [ch for ch in chars if ch in s and ch not in out]
                if len(chars_sel)>0:
                    out.extend(chars_sel)
#                     out =  out+' '+', '.join(chars_sel)
        if len(out)>0:
            out = list(set(sorted(out)))
            out = ", ".join(out)
        elif flag:
            out = 'mass'
        if flag:
            return out
df = df.assign(MASS = df.caption.apply(extract_mass))
df['MASS'].value_counts()


MASS
mass                                    917
obscured, oval, circumscribed, round    245
oval                                     37
round                                     8
oval, circumscribed, equal density        6
node                                      6
circumscribed                             5
obscured, oval                            3
obscured, oval, equal density             3
circumscribed, round                      3
irregular                                 3
obscured, round                           2
oval, round                               2
oval, circumscribed, round                2
oval, obscured, high density              2
oval, circumscribed                       2
obscured                                  2
indistinct, equal density, round          1
low density                               1
spiculated, round                         1
oval, equal density                       1
indistinct, irregular                     1
spiculated                 

In [104]:
df

Unnamed: 0,Accession Number,caption,DENSITY_CATEGORY,CALCIFICATION,ASYMMETRY,FEATURES,MASS
0,IEIWXXMIASHB,the breasts have scattered areas of fibrogland...,scattered fibroglandular densities,,asymmetry,,
1,HQBWALCOXMIQ,the breasts have scattered areas of fibrogland...,scattered fibroglandular densities,,,,
2,SVSQKIZBKPTB,there are scattered fibroglandular elements i...,scattered fibroglandular densities,,,,
3,SYZGIPYGTPMU,the tissue of both breasts is heterogeneously...,heterogeneously dense,benign calcification,,,
4,XFGVIXZRKMQT,there are scattered fibroglandular elements i...,scattered fibroglandular densities,benign calcification,,,
...,...,...,...,...,...,...,...
21073,BGOKNITYACYV,"the breasts are heterogeneously dense, which m...",heterogeneously dense,,asymmetry focal,,
21074,XFHUSVEZFLNJ,"the breasts are heterogeneously dense, which m...",heterogeneously dense,,,,
21075,GWCZYYNCJMUY,the breasts have scattered areas of fibrogland...,scattered fibroglandular densities,,,,
21076,ROCGIOPQROJN,"the breasts are heterogeneously dense, which ...",heterogeneously dense,,,,


In [105]:
df.iloc[20999]['caption']

' the breast is heterogeneously dense, which may obscure small masses.   there are typically benign calcifications in the right breast.  there are no significant masses, calcifications, or other findings.'

In [106]:
#retrieve reports with mass in findings
df_mass = df[df['MASS']=='mass']
df_mass.iloc[100]['caption']

'the breasts have scattered areas of fibroglandular density.  there is a stable benign mass in the left breast.   there are no significant masses, calcifications, or other findings'

In [107]:
df = df.reset_index()  # make sure indexes pair with number of rows

In [108]:
def extract_surgical_changes(txt):
    '''
    txt is extracted findings // already lower case and cleaned
    '''
    if type(txt) is str:
        chars = ['lumpectomy', 'reduction', 'implant', 'biopsy', 'clip']
        sents = sent_tokenize(txt)
        out = []
        for s in sents:

            if 'no ' not in s: # mass without negation
#                 print(s)
                chars_sel = [ch for ch in chars if ch in s and ch not in out]
                if len(chars_sel)>0:
                    out.extend(chars_sel)
                
        if len(out)>0:
            out = list(set(sorted(out)))
            return ' '.join(out)
        elif  'post operative finding' in txt:
            return 'post operative finding'
            
df = df.assign(SURGICAL_CHANGES = df.caption.apply(extract_surgical_changes))
df['SURGICAL_CHANGES'].value_counts()


SURGICAL_CHANGES
biopsy clip                                 1758
lumpectomy                                  1564
biopsy                                      1335
implant                                      596
reduction                                    542
lumpectomy biopsy clip                       422
lumpectomy biopsy                            150
lumpectomy reduction                          92
reduction biopsy clip                         77
implant biopsy clip                           39
implant lumpectomy                            36
reduction biopsy                              31
implant biopsy                                29
lumpectomy reduction biopsy clip              25
implant reduction                             19
lumpectomy clip                               15
clip                                          14
post operative finding                         6
implant biopsy clip lumpectomy                 6
implant clip                                   3
imp

In [109]:
#following code is to rename similar concepts to standardize the terms 
df.loc[df['SURGICAL_CHANGES']  == 'clip lumpectomy', 'SURGICAL_CHANGES'] = 'biopsy clip lumpectomy'
df['SURGICAL_CHANGES'].value_counts()

SURGICAL_CHANGES
biopsy clip                                 1758
lumpectomy                                  1564
biopsy                                      1335
implant                                      596
reduction                                    542
lumpectomy biopsy clip                       422
lumpectomy biopsy                            150
lumpectomy reduction                          92
reduction biopsy clip                         77
implant biopsy clip                           39
implant lumpectomy                            36
reduction biopsy                              31
implant biopsy                                29
lumpectomy reduction biopsy clip              25
implant reduction                             19
lumpectomy clip                               15
clip                                          14
post operative finding                         6
implant biopsy clip lumpectomy                 6
implant clip                                   3
imp

In [110]:
#following code is to rename similar concepts to standardize the terms 
df.loc[df['SURGICAL_CHANGES']  == 'biopsy lumpectomy', 'SURGICAL_CHANGES'] = 'biopsy clip lumpectomy'
df['SURGICAL_CHANGES'].value_counts()

SURGICAL_CHANGES
biopsy clip                                 1758
lumpectomy                                  1564
biopsy                                      1335
implant                                      596
reduction                                    542
lumpectomy biopsy clip                       422
lumpectomy biopsy                            150
lumpectomy reduction                          92
reduction biopsy clip                         77
implant biopsy clip                           39
implant lumpectomy                            36
reduction biopsy                              31
implant biopsy                                29
lumpectomy reduction biopsy clip              25
implant reduction                             19
lumpectomy clip                               15
clip                                          14
post operative finding                         6
implant biopsy clip lumpectomy                 6
implant clip                                   3
imp

In [111]:
#following code is to rename similar concepts to standardize the terms 
df.loc[df['SURGICAL_CHANGES']  == 'biopsy implant', 'SURGICAL_CHANGES'] = 'biopsy clip implant'
df.loc[df['SURGICAL_CHANGES']  == 'clip implant', 'SURGICAL_CHANGES'] = 'biopsy clip implant'
df['SURGICAL_CHANGES'].value_counts()

SURGICAL_CHANGES
biopsy clip                                 1758
lumpectomy                                  1564
biopsy                                      1335
implant                                      596
reduction                                    542
lumpectomy biopsy clip                       422
lumpectomy biopsy                            150
lumpectomy reduction                          92
reduction biopsy clip                         77
implant biopsy clip                           39
implant lumpectomy                            36
reduction biopsy                              31
implant biopsy                                29
lumpectomy reduction biopsy clip              25
implant reduction                             19
lumpectomy clip                               15
clip                                          14
post operative finding                         6
implant biopsy clip lumpectomy                 6
implant clip                                   3
imp

In [112]:
#following code is to rename similar concepts to standardize the terms 
df.loc[df['SURGICAL_CHANGES']  == 'biopsy', 'SURGICAL_CHANGES'] = 'biopsy clip'
df.loc[df['SURGICAL_CHANGES']  == 'clip', 'SURGICAL_CHANGES'] = 'biopsy clip'
df['SURGICAL_CHANGES'].value_counts()


SURGICAL_CHANGES
biopsy clip                                 3107
lumpectomy                                  1564
implant                                      596
reduction                                    542
lumpectomy biopsy clip                       422
lumpectomy biopsy                            150
lumpectomy reduction                          92
reduction biopsy clip                         77
implant biopsy clip                           39
implant lumpectomy                            36
reduction biopsy                              31
implant biopsy                                29
lumpectomy reduction biopsy clip              25
implant reduction                             19
lumpectomy clip                               15
post operative finding                         6
implant biopsy clip lumpectomy                 6
implant clip                                   3
implant biopsy lumpectomy                      3
reduction clip                                 3
imp

In [113]:
#create groups by merging extracted image descriptors from text reports
groups = []
for index, row in tqdm(df.iterrows()):

    group = [row['DENSITY_CATEGORY'], row['MASS'], row['CALCIFICATION'], row['ASYMMETRY'], 
             row['FEATURES'],row['SURGICAL_CHANGES']]
    group = [g.strip().strip(punctuation) for g in group if g not in ["", " ", ",", ", ", None] ]
    groups.append(tuple(set(sorted(group))))
    
df = df.assign(GROUP = groups)

21000it [00:03, 6052.23it/s]


In [114]:
#unique number of groups
print(len(set(groups)))

414


In [115]:
#remove samples with empty group, Note: should have cleaned it before starting grouping, 
# but in case if any empty groups appear, remove them
df = df[df["GROUP"]!=()]
len(df)

20886

In [117]:
from collections import Counter
Counter(groups).most_common()

[(('scattered fibroglandular densities',), 6174),
 (('heterogeneously dense',), 4496),
 (('biopsy clip', 'scattered fibroglandular densities'), 1216),
 (('heterogeneously dense', 'biopsy clip'), 1054),
 (('fatty',), 1030),
 (('lumpectomy', 'scattered fibroglandular densities'), 886),
 (('lumpectomy', 'heterogeneously dense'), 412),
 (('benign calcification', 'heterogeneously dense'), 362),
 (('extremely dense',), 328),
 (('reduction', 'scattered fibroglandular densities'), 316),
 (('benign calcification', 'scattered fibroglandular densities'), 286),
 (('mass', 'scattered fibroglandular densities'), 278),
 (('implant', 'scattered fibroglandular densities'), 257),
 (('implant', 'heterogeneously dense'), 232),
 (('mass', 'heterogeneously dense'), 215),
 (('lumpectomy biopsy clip', 'scattered fibroglandular densities'), 210),
 (('heterogeneously dense', 'lumpectomy biopsy clip'), 144),
 (('fatty', 'biopsy clip'), 143),
 (('heterogeneously dense', 'benign calcification', 'biopsy clip'), 122

In [118]:
#printing finding with extracted groups
for i in range(2000):
    if df.iloc[i]["caption"] is not None:
        print("FINDING: ")
        print(df.iloc[i]["caption"])
        print("GROUP: {group}".format(group = df.iloc[i]["GROUP"]))
        print("\n")

FINDING: 
the breasts have scattered areas of fibroglandular density.   left asymmetry: there is an asymmetry seen in the left breast on the cc view which is circumscribed and measures 6mm.   this is a newly visualized finding. this may have been too far posterior to be visualized on prior mammogram.  right there are no significant masses, calcifications, or other findings.  there are no other significant masses, calcifications, or other findings.
GROUP: ('asymmetry', 'scattered fibroglandular densities')


FINDING: 
the breasts have scattered areas of fibroglandular density.   there are no significant masses, calcifications, or other findings in either breast.
GROUP: ('scattered fibroglandular densities',)


FINDING: 
 there are scattered fibroglandular elements in both breasts that could obscure a lesion on mammography.   current study was also evaluated with a computer aided detection   system.   no significant masses, calcifications, or other findings are seen in either breast.   t

In [119]:
#save processed data
df.to_json("data/reports_with_groups.json")

In [120]:

#print top 200 most frequent groups
counter = Counter(groups)
del counter[()]
counter.most_common()[:200]

[(('scattered fibroglandular densities',), 6174),
 (('heterogeneously dense',), 4496),
 (('biopsy clip', 'scattered fibroglandular densities'), 1216),
 (('heterogeneously dense', 'biopsy clip'), 1054),
 (('fatty',), 1030),
 (('lumpectomy', 'scattered fibroglandular densities'), 886),
 (('lumpectomy', 'heterogeneously dense'), 412),
 (('benign calcification', 'heterogeneously dense'), 362),
 (('extremely dense',), 328),
 (('reduction', 'scattered fibroglandular densities'), 316),
 (('benign calcification', 'scattered fibroglandular densities'), 286),
 (('mass', 'scattered fibroglandular densities'), 278),
 (('implant', 'scattered fibroglandular densities'), 257),
 (('implant', 'heterogeneously dense'), 232),
 (('mass', 'heterogeneously dense'), 215),
 (('lumpectomy biopsy clip', 'scattered fibroglandular densities'), 210),
 (('heterogeneously dense', 'lumpectomy biopsy clip'), 144),
 (('fatty', 'biopsy clip'), 143),
 (('heterogeneously dense', 'benign calcification', 'biopsy clip'), 122