In [235]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
from matplotlib import style
import re
import difflib as dl
import nltk
from sklearn.model_selection import train_test_split

In [236]:
data = pd.read_csv("Mammogram Path Reports.csv")
data.columns = ["Path Report", "Label"]
numPatients = data.shape[0]
path_reports = pd.read_csv("Path Reports Complete.csv")
split_reps = pd.read_csv("Path Reports (By Specimens).csv").drop("Unnamed: 0", axis=1)

In [237]:
# natural language processing tools

# Negation Processing
def negate_sequence(tokens):
    """
    Detects negations and transforms negated words into "not_" form.
    """
    negation = False
    delims = "?.!:;-"
    result = []
    
    for token in tokens:
        if any(c == token for c in delims):
            negation = False
        
        negated = "not_" + token if negation else token
        result.append(negated)

        if any(neg == token.lower() for neg in ["not", "n't", "no", "without", "negative"]):
            negation = True

    return result

In [238]:
negate_sequence(nltk.word_tokenize("Left breast, biopsy: Breast tissue with a minute focus of apocrine metaplasia and no other significant pathologic abnormality - no lobular carcinoma seen."))

['Left',
 'breast',
 ',',
 'biopsy',
 ':',
 'Breast',
 'tissue',
 'with',
 'a',
 'minute',
 'focus',
 'of',
 'apocrine',
 'metaplasia',
 'and',
 'no',
 'not_other',
 'not_significant',
 'not_pathologic',
 'not_abnormality',
 '-',
 'no',
 'not_lobular',
 'not_carcinoma',
 'not_seen',
 '.']

In [239]:
split_reps[split_reps.Patient == 1256]

Unnamed: 0,Patient,Biopsy Description,Path Report,Rad Label,Laterality,Biopsy Source
2026,1256,"Left breast, outer central, biopsy","1. Lobular carcinoma in situ, extending to res...",Left Positive,left,breast
2027,1256,"Left breast, biopsy",Fibrocystic changes with microcalcifications a...,Left Positive,left,breast


In [240]:
re.compile(r'(\.|\;|\-|\,)').sub(' ', 'hello.my;name-is')

'hello my name is'

## Classify Each Biopsy according to Path Tree

In [241]:
pathRep = "A. Left breast, outer central, biopsy. 1. Lobular carcinoma in situ, extending to resection margin. See note. 2. Fibrocystic changes, including adenosis and microcalcifications. B. Left breast, biopsy: Fibrocystic changes with microcalcifications and focal pseudoangiomatous hyperplasia, see note."
re.split(re.compile(r"[0-9]\. "), pathRep)

['A. Left breast, outer central, biopsy. ',
 'Lobular carcinoma in situ, extending to resection margin. See note. ',
 'Fibrocystic changes, including adenosis and microcalcifications. B. Left breast, biopsy: Fibrocystic changes with microcalcifications and focal pseudoangiomatous hyperplasia, see note.']

In [242]:
def classify_regex(pathRep):
    pathRep = ' '.join(negate_sequence(nltk.word_tokenize(pathRep.lower())))
    print(pathRep)
    obs = re.split(re.compile(r"[0-9][ ]?\. "), pathRep)
    #if len(obs) > 1:
    #    obs = obs[1:]
    obs = [re.compile(r'(\.|\;|\-|\,)').sub(' ', ' ' + ob + ' ') for ob in obs] 
    
    print(obs)
    # pad with spaces to match markers that come at beginning of strings and remove dashes and commas
    classes = []
    for ob in obs:
        classes.append(classify_breast_ob(ob))
    return classes

atyp_markers = ["flat epithelial atypia", 
                "atypical ductal hyperplasia", 
                "atypical lobular hyperplasia"]

fibro_markers = ["fibroadenoma", "phyllodes"]

ben_markers = ["papilloma", "usual ductal hyperplasia", 
                "apocrine metaplasia", "radial scar",
                "sclerosing adenosis", 
                "pseudoangiomatous stromal hyperplasia",
                "cyst", "mastitis"]

def classify_breast_ob(ob):
    label = ["na", "na", "na"]

    # Lymphoma
    if re.search(" lymphoma", ob):
        label[0] = "lymphoma"
        return label
    
    

    # Breast Cancer and Metastases
    if re.search(" lcis", ob):
        label = ["breast cancer", "lobular", "in situ"]
        return label
    elif re.search(" dcis", ob):
        label = ["breast cancer", "ductal", "in situ"]
        return label
    elif re.search(" idc", ob):
        label = ["breast cancer", "ductal", "invasive"]
        return label
    elif re.search(" ilc", ob):
        label = ["breast cancer", "ductal", "invasive"]
        return label
    
    if re.search(" (adeno)?carcinoma| cancer| malign", ob):
        label[0] = "breast cancer"
        if re.search(" metastati(c|s)", ob):
            #if organ == "breast":
            
            label[1] = "metastasis"
            #else:
            #    label[0] = "metastasis from non-bc"
        else:
            if re.search(re.compile("[ -]invasive|[ -]infiltrating"), ob):
                label[1] = "invasive"
            elif re.search(re.compile("in[ -]situ"), ob):
                label[1] = "in situ"

            if re.search(" duct(al)?", ob):
                label[2] = "ductal"
            elif re.search("lobular", ob):
                label[2] = "lobular"
        return label
    
    

    # Atypical
    for marker in atyp_markers:
        if re.search(" " + marker + " ", ob):
            label[0] = "atypical"
            label[1] = marker
            return label
    
    # Fibroepithelial
    for marker in fibro_markers:
        if re.search(" " + marker + " ", ob):
            label[0] = "fibroepithelial"
            label[1] = marker
            return label
            
    # Benign
    for marker in ben_markers:
        if re.search(" " + marker + " ", ob):
            label[0] = "benign"
            label[1] = marker
            return label
            
    return label
            
    #lymph_marker = re.compile(" lymphoma")
    #mets_marker = re.compile(" metastasis")
    #inv_breast_marker = [re.compile(marker) for marker in []]
    

In [243]:
# TEsting
pathRep = "1.Residual invasive ductal carcinoma with treatment effect, 1.3 cm, present at inked margin, see comment. 2. Vascular fibroadipose tissue and skeletal muscle adjacent to benign breast tissue."
nltk.word_tokenize(pathRep.lower())
classify_regex(pathRep)

1.residual invasive ductal carcinoma with treatment effect , 1.3 cm , present at inked margin , see comment . 2. vascular fibroadipose tissue and skeletal muscle adjacent to benign breast tissue .
[' 1 residual invasive ductal carcinoma with treatment effect   1 3 cm   present at inked margin   see comment    ', ' vascular fibroadipose tissue and skeletal muscle adjacent to benign breast tissue   ']


[['breast cancer', 'invasive', 'ductal'], ['na', 'na', 'na']]

In [244]:
split_reps[["Biopsy Source", "Path Report"]].apply(lambda x: print(type(x)))

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


Biopsy Source    None
Path Report      None
dtype: object

In [245]:
labels = []
for i in range(split_reps.shape[0]):
    labels.append(classify_regex(split_reps.iloc[i]["Path Report"]))
split_reps["All Labels"] = labels
split_reps

1. invasive lobular carcinoma in a background of lobular carcinoma in situ , negative not_margins ; see comment . 2. ductal carcinoma in situ , grade 2 , negative not_margins ; see comment . 3. hyalinized fibroadenoma , radial scar and fibrocystic changes . 4. biopsy site changes . 5. nipple with no not_significant not_pathologic not_abnormality .
['  ', ' invasive lobular carcinoma in a background of lobular carcinoma in situ   negative not_margins   see comment    ', ' ductal carcinoma in situ   grade 2   negative not_margins   see comment    ', ' hyalinized fibroadenoma   radial scar and fibrocystic changes    ', ' biopsy site changes    ', ' nipple with no not_significant not_pathologic not_abnormality   ']
no not_tumor not_in not_one not_lymph not_node not_( not_0/1 not_) .
[' no not_tumor not_in not_one not_lymph not_node not_( not_0/1 not_)   ']
no not_tumor not_in not_one not_lymph not_node not_( not_0/1 not_) .
[' no not_tumor not_in not_one not_lymph not_node not_( not_0/1 no

['  ', ' infiltrating lobular carcinoma   classic type   ( 2 mm )   sbr grade 1   see comment    ', ' atypical lobular hyperplasia   ']
atypical lobular hyperplasia .
[' atypical lobular hyperplasia   ']
1. infiltrating lobular carcinoma , classic type , ( 2.5 cm diameter ) , sbr grade 1 ; see comment . 2. all margins free of carcinoma .
['  ', ' infiltrating lobular carcinoma   classic type   ( 2 5 cm diameter )   sbr grade 1   see comment    ', ' all margins free of carcinoma   ']
metastatic carcinoma ( 1/1 ) .
[' metastatic carcinoma ( 1/1 )   ']
metastatic carcinoma ( 1/1 ) .
[' metastatic carcinoma ( 1/1 )   ']
metastatic carcinoma ( 1/1 ) .
[' metastatic carcinoma ( 1/1 )   ']
metastatic carcinoma ( 1/1 ) .
[' metastatic carcinoma ( 1/1 )   ']
residual multifocal invasive lobular carcinoma with treatment effect , spanning 6.5 cm , approaching inferior/anterior margin ; see comment .
[' residual multifocal invasive lobular carcinoma with treatment effect   spanning 6 5 cm   approa

findings most consistent with mucocele like tumor . see comment .
[' findings most consistent with mucocele like tumor   see comment   ']
benign breast epithelium ; see comment .
[' benign breast epithelium   see comment   ']
1. skin and subcutaneous tissue with scar and biopsy-site changes . 2. no not_breast not_parenchyma not_identified . 3. no not_in not_situ not_or not_invasive not_carcinoma .
['  ', ' skin and subcutaneous tissue with scar and biopsy site changes    ', ' no not_breast not_parenchyma not_identified    ', ' no not_in not_situ not_or not_invasive not_carcinoma   ']
1. fibrous tissue and skeletal muscle with biopsy-site changes . 2. no not_breast not_parenchyma not_identified . 3. no not_in not_situ not_or not_invasive not_carcinoma .
['  ', ' fibrous tissue and skeletal muscle with biopsy site changes    ', ' no not_breast not_parenchyma not_identified    ', ' no not_in not_situ not_or not_invasive not_carcinoma   ']
benign spindle cell proliferation , no not_carcino

[' lymph node ( 1 ) with no not_tumor not_seen   ']
lymph node ( 1 ) with no not_tumor not_seen .
[' lymph node ( 1 ) with no not_tumor not_seen   ']
lymph nodes ( 3 ) with no not_tumor not_seen .
[' lymph nodes ( 3 ) with no not_tumor not_seen   ']
lymph nodes ( 17 ) with no not_tumor not_seen .
[' lymph nodes ( 17 ) with no not_tumor not_seen   ']
1 ) multiple foci of residual lobular carcinoma in situ , involving small ducts , extending to within 1 mm of superior margin . 2 ) fibrocystic changes with microcalcifications . 3 ) previous biopsy cavity with foreign body giant cell reaction . 4 ) no not_residual not_infiltrating not_carcinoma not_seen .
[' 1 ) multiple foci of residual lobular carcinoma in situ   involving small ducts   extending to within 1 mm of superior margin   2 ) fibrocystic changes with microcalcifications   3 ) previous biopsy cavity with foreign body giant cell reaction   4 ) no not_residual not_infiltrating not_carcinoma not_seen   ']
no not_tumor not_seen not_

infiltrating ductal carcinoma , sbr grade 1 ; see comment .
[' infiltrating ductal carcinoma   sbr grade 1   see comment   ']
simple cyst , see comment .
[' simple cyst   see comment   ']
1. paget 's disease of the nipple ; see comment . 2. ductal carcinoma in situ , high grade , solid and clinging pattern ; see comment . 3. scar , consistent with prior biopsy site .
['  ', " paget 's disease of the nipple   see comment    ", ' ductal carcinoma in situ   high grade   solid and clinging pattern   see comment    ', ' scar   consistent with prior biopsy site   ']
no not_significant not_pathologic not_abnormality .
[' no not_significant not_pathologic not_abnormality   ']
benign breast tissue with microcalcifications in benign ducts and fibrotic stroma ; see comment .
[' benign breast tissue with microcalcifications in benign ducts and fibrotic stroma   see comment   ']
1. silicone granuloma ; see comment . 2. proliferative fibrocystic change without not_atypia not_, not_microcalcification

['  ', ' surgical site changes including residual cavity   fibrosis   chronic inflammation and a foreign body reaction    ', ' no not_residual not_in not_situ not_or not_invasive not_carcinoma    ', ' multiple microscopic foci of atypical ductal hyperplasia    ', ' flat epithelial atypia    ', ' microcalcifications associated with flat epithelial atypia    ', ' apocrine metaplasia and microcysts   ']
1. no not_in not_situ not_or not_invasive not_carcinoma . 2. apocrine metaplasia and microcysts .
['  ', ' no not_in not_situ not_or not_invasive not_carcinoma    ', ' apocrine metaplasia and microcysts   ']
no not_carcinoma .
[' no not_carcinoma   ']
1. no not_carcinoma . 2. adenosis . 3. apocrine metaplasia and microcysts .
['  ', ' no not_carcinoma    ', ' adenosis    ', ' apocrine metaplasia and microcysts   ']
adenocarcinoma , estrogen receptor positive and progesterone receptor negative not_, not_see not_note .
[' adenocarcinoma   estrogen receptor positive and progesterone receptor 

1. ductal carcinoma in situ , high grade . 2. microcalcifications involving dcis . 3. cancerization of lobules present . 5. cancerization of sclerosing adenosis . 6. pseudoangiomatous stromal hyperplasia .
['  ', ' ductal carcinoma in situ   high grade    ', ' microcalcifications involving dcis    ', ' cancerization of lobules present    ', ' cancerization of sclerosing adenosis    ', ' pseudoangiomatous stromal hyperplasia   ']
1. fibrous capsule with giant cell reaction . 2. focally necrotic fibroadipose tissue .
['  ', ' fibrous capsule with giant cell reaction    ', ' focally necrotic fibroadipose tissue   ']
1. fibrous capsule with giant cell reaction . 2. skeletal muscle with reactive changes .
['  ', ' fibrous capsule with giant cell reaction    ', ' skeletal muscle with reactive changes   ']
no not_significant not_pathologic not_abnormality .
[' no not_significant not_pathologic not_abnormality   ']
ductal carcinoma . see comment .
[' ductal carcinoma   see comment   ']
1. atyp

['  ', ' dense fibrous tissue with synovial metaplasia consistent with implant capsule    ', ' benign skeletal muscle   ']
benign breast tissue and skin with no not_significant not_pathologic not_abnormality .
[' benign breast tissue and skin with no not_significant not_pathologic not_abnormality   ']
metastatic adenocarcinoma in one of ten lymph nodes ( 1/10 ) , see comment .
[' metastatic adenocarcinoma in one of ten lymph nodes ( 1/10 )   see comment   ']
two lymph nodes with no not_tumor not_identified not_( not_0/2 not_) .
[' two lymph nodes with no not_tumor not_identified not_( not_0/2 not_)   ']
benign breast tissue ; no not_evidence not_of not_carcinoma . see comment .
[' benign breast tissue   no not_evidence not_of not_carcinoma   see comment   ']
adenocarcinoma , see note .
[' adenocarcinoma   see note   ']
no not_tumor not_( not_0/1 not_) .
[' no not_tumor not_( not_0/1 not_)   ']
no not_tumor not_( not_0/1 not_) .
[' no not_tumor not_( not_0/1 not_)   ']
1. ductal carcino

metastatic adenocarcinoma ; see comment .
[' metastatic adenocarcinoma   see comment   ']
fibrocystic change with focal ductal hyperplasia without not_atypia ; no not_carcinoma not_seen .
[' fibrocystic change with focal ductal hyperplasia without not_atypia   no not_carcinoma not_seen   ']
1. prominent intralobular stromal sclerosis . see comment . 2. no not_evidence not_of not_invasive not_or not_in not_situ not_carcinoma .
['  ', ' prominent intralobular stromal sclerosis   see comment    ', ' no not_evidence not_of not_invasive not_or not_in not_situ not_carcinoma   ']
invasive ductal carcinoma with focal necrosis ; see comment .
[' invasive ductal carcinoma with focal necrosis   see comment   ']
fibroadenoma .
[' fibroadenoma   ']
fibroadenoma .
[' fibroadenoma   ']
1. prior surgical site with fibrosis and foreign body reaction . 2. sclerosing papillary proliferation with apocrine metaplasia . 3. usual ductal hyperplasia . 4. microglandular adenosis . 5. sclerosing adenosis . 6. c

1. ductal carcinoma in situ , cribriform and clinging patterns , high nuclear grade , identified in two microscopic foci ( larger 0.3 cm ) , extending to within 0.3 cm of `` new margin '' . 2. focal changes consistent with healing biopsy site . 3. no not_invasive not_carcinoma not_identified .
['  ', " ductal carcinoma in situ   cribriform and clinging patterns   high nuclear grade   identified in two microscopic foci ( larger 0 3 cm )   extending to within 0 3 cm of `` new margin ''    ", ' focal changes consistent with healing biopsy site    ', ' no not_invasive not_carcinoma not_identified   ']
1. breast parenchyma with no not_significant not_pathologic not_abnormality . 2. changes consistent with healing biopsy site . 3. no not_in not_situ not_or not_invasive not_carcinoma not_identified .
['  ', ' breast parenchyma with no not_significant not_pathologic not_abnormality    ', ' changes consistent with healing biopsy site    ', ' no not_in not_situ not_or not_invasive not_carcinoma 

1. proliferative fibrocystic change with atypia ( focal atypical lobular hyperplasia , intraductal hyperplasia , intraductal papilloma , sclerosing adenosis , dilated microcysts ) . 2. microcalcifications present in benign ducts and lobules . 3. diagnostic features of malignancy not not_observed .
['  ', ' proliferative fibrocystic change with atypia ( focal atypical lobular hyperplasia   intraductal hyperplasia   intraductal papilloma   sclerosing adenosis   dilated microcysts )    ', ' microcalcifications present in benign ducts and lobules    ', ' diagnostic features of malignancy not not_observed   ']
1. invasive ductal carcinoma , sbr grade 2 . 2. ductal carcinoma in situ , intermediate grade , cribriform and solid types .
['  ', ' invasive ductal carcinoma   sbr grade  ', '  ', ' ductal carcinoma in situ   intermediate grade   cribriform and solid types   ']
fibrocystic change with focal apocrine metaplasia . see comment .
[' fibrocystic change with focal apocrine metaplasia   se

fibroadenoma
[' fibroadenoma ']
fibrocystic change ; see comment .
[' fibrocystic change   see comment   ']
fibrocystic change ; see comment .
[' fibrocystic change   see comment   ']
-metastatic carcinoma , lobular pattern , moderately differentiated , present in one of twenty-eight nodes ( 1/28 ) , see comment .
['  metastatic carcinoma   lobular pattern   moderately differentiated   present in one of twenty eight nodes ( 1/28 )   see comment   ']
1. residual lobular carcinoma-in-situ ( margins clear ) , see comment . 2. repair reaction , consistent with prior biopsy site .
['  ', ' residual lobular carcinoma in situ ( margins clear )   see comment    ', ' repair reaction   consistent with prior biopsy site   ']
1. residual lobular carcinoma-in-situ , see comment . 2. repair reaction , consistent with prior biopsy site .
['  ', ' residual lobular carcinoma in situ   see comment    ', ' repair reaction   consistent with prior biopsy site   ']
-skin and breast tissue with no not_specif

['  ', ' invasive ductal carcinoma   sbr grade 3   1 4 cm   see comment    ', ' ductal carcinoma in situ   high nuclear grade   0 1 cm   see comment    ', ' surgical site changes   ']
benign fibrocystic change with lactational change . see comment .
[' benign fibrocystic change with lactational change   see comment   ']
benign breast tissue with stromal fibrosis ; see comment .
[' benign breast tissue with stromal fibrosis   see comment   ']
fibroadenoma , no not_hyperplasia not_or not_carcinoma not_identified .
[' fibroadenoma   no not_hyperplasia not_or not_carcinoma not_identified   ']
fat necrosis ; see comment .
[' fat necrosis   see comment   ']
cyst fluid ; see comment .
[' cyst fluid   see comment   ']
cyst fluid ; see comment .
[' cyst fluid   see comment   ']
dense fibrous capsule with pseudo-synovial metaplasia ; no not_tumor .
[' dense fibrous capsule with pseudo synovial metaplasia   no not_tumor   ']
dense fibrous capsule with pseudo-synovial metaplasia ; no not_tumor .
[

fat lobule . see comment .
[' fat lobule   see comment   ']
invasive ductal carcinoma , sbr grade 1 ; see comment .
[' invasive ductal carcinoma   sbr grade 1   see comment   ']
fibroadipose tissue ; see comment .
[' fibroadipose tissue   see comment   ']
fibroadenoma , see note .
[' fibroadenoma   see note   ']
no not_tumor not_in not_one not_lymph not_node not_( not_0/1 not_) .
[' no not_tumor not_in not_one not_lymph not_node not_( not_0/1 not_)   ']
no not_tumor not_in not_one not_lymph not_node not_( not_0/1 not_) .
[' no not_tumor not_in not_one not_lymph not_node not_( not_0/1 not_)   ']
1. invasive ductal carcinoma , sbr grade 2 , 1.5 cm , margins negative . 2. invasive lobular carcinoma , two microscopic foci , 0.2 cm , close to ( < 0.1 cm ) the deep margin . 3. ductal carcinoma in situ , low to intermediate grade , cribriform pattern . 4. lobular carcinoma in situ and atypical lobular hyperplasia . 5. flat epithelial atypia . 6. microcalcifications associated with ductal carc

duct ectasia ; see comment .
[' duct ectasia   see comment   ']
synovialized membranous fibrous tissue , consistent with implant capsule .
[' synovialized membranous fibrous tissue   consistent with implant capsule   ']
simple cyst
[' simple cyst ']
two of two lymph nodes with no not_carcinoma not_( not_0/2 not_) . see comment .
[' two of two lymph nodes with no not_carcinoma not_( not_0/2 not_)   see comment   ']
skeletal muscle with no not_significant not_pathologic not_abnormality . see comment .
[' skeletal muscle with no not_significant not_pathologic not_abnormality   see comment   ']
1. invasive ductal carcinoma , 1 cm , sbr grade 2 . 2. ductal carcinoma in situ , high grade . 3. twelve lymph nodes with no not_tumor not_identified not_( not_0/12 not_) . 4. skeletal muscle with no not_significant not_pathologic not_abnormality . 5. see comment .
['  ', ' invasive ductal carcinoma   1 cm   sbr grade  ', '  ', ' ductal carcinoma in situ   high grade    ', ' twelve lymph nodes with 

['  ', ' invasive ductal carcinoma   see comment    ', ' ductal carcinoma in situ   intermediate nuclear grade   solid and cribriform patterns   ']
1. ductal carcinoma in situ , high nuclear grade with necrosis , present at inferior margin . 2. calcifications in stroma .
['  ', ' ductal carcinoma in situ   high nuclear grade with necrosis   present at inferior margin    ', ' calcifications in stroma   ']
ductal carcinoma in situ , high nuclear grade with necrosis , negative not_margins .
[' ductal carcinoma in situ   high nuclear grade with necrosis   negative not_margins   ']
1. breast tissue with fibrosis and rare foci of malignant cells , see comment 2. microcalcifications in association with benign breast tissue .
['  ', ' breast tissue with fibrosis and rare foci of malignant cells   see comment  ', ' microcalcifications in association with benign breast tissue   ']
1. usual ductal hyperplasia ( udh ) . 2. columnar cell changes . 3. microcalcifications in benign breast tissue .
['

[' benign   c/w fibroadenoma   see comments ']
fibroconnective tissue with synovial metaplasia and foreign body giant cell reaction .
[' fibroconnective tissue with synovial metaplasia and foreign body giant cell reaction   ']
medical hardware ( gross diagnosis ) .
[' medical hardware ( gross diagnosis )   ']
no not_significant not_pathologic not_abnormality .
[' no not_significant not_pathologic not_abnormality   ']
1. ductal carcinoma in situ , intermediate grade , solid and papillary types , with extension into lobules ; see comment . 2. cystic dilatation of ducts and usual ductal hyperplasia . 3. no not_invasive not_carcinoma .
['  ', ' ductal carcinoma in situ   intermediate grade   solid and papillary types   with extension into lobules   see comment    ', ' cystic dilatation of ducts and usual ductal hyperplasia    ', ' no not_invasive not_carcinoma   ']
1. ductal carcinoma in situ , intermediate grade , solid pattern ; see comment . 2. atypical ductal hyperplasia . 3. mucinous 

no not_tumor not_identified not_in not_one not_lymph not_node not_( not_0/1 not_) ; see comment .
[' no not_tumor not_identified not_in not_one not_lymph not_node not_( not_0/1 not_)   see comment   ']
no not_tumor not_identified not_in not_one not_lymph not_node not_( not_0/1 not_) ; see comment .
[' no not_tumor not_identified not_in not_one not_lymph not_node not_( not_0/1 not_)   see comment   ']
1. atypical ductal hyperplasia , small focus ( c5 ) . 2. biopsy site changes . 3. microcalcifications associated with benign breast ducts and stroma . 3. no not_carcinoma not_identified .
['  ', ' atypical ductal hyperplasia   small focus ( c5 )    ', ' biopsy site changes    ', ' microcalcifications associated with benign breast ducts and stroma    ', ' no not_carcinoma not_identified   ']
1. invasive ductal carcinoma ; see comment . 2. focal ductal carcinoma in situ , intermediate nuclear grade , solid pattern ; see comment .
['  ', ' invasive ductal carcinoma   see comment    ', ' focal

[' no not_tumor not_in not_twelve not_lymph not_nodes not_( not_0/12 not_)   ']
no not_tumor not_in not_one not_lymph not_node not_( not_0/1 not_) .
[' no not_tumor not_in not_one not_lymph not_node not_( not_0/1 not_)   ']
benign skin and nipple tissue with biopsy site changes ; no not_residual not_tumor not_is not_identified not_in not_the not_entirely not_submitted not_specimen .
[' benign skin and nipple tissue with biopsy site changes   no not_residual not_tumor not_is not_identified not_in not_the not_entirely not_submitted not_specimen   ']
1. invasive ductal carcinoma with apparent treatment effect , multiple microscopic foci spanning 4.5 cm ; see comment . 2. microcalcifications involving benign ducts . 3. scar .
['  ', ' invasive ductal carcinoma with apparent treatment effect   multiple microscopic foci spanning 4 5 cm   see comment    ', ' microcalcifications involving benign ducts    ', ' scar   ']
metastatic carcinoma in six of eleven lymph nodes ( 6/11 ) ; see comment .


no not_carcinoma not_in not_three not_lymph not_nodes not_( not_0/3 not_) .
[' no not_carcinoma not_in not_three not_lymph not_nodes not_( not_0/3 not_)   ']
1. no not_residual not_invasive not_or not_in not_situ not_carcinoma . 2. sclerosing adenosis . 3. microcalcifications associated with benign ducts . 4. prior surgical site changes . 5. benign skeletal muscle .
['  ', ' no not_residual not_invasive not_or not_in not_situ not_carcinoma    ', ' sclerosing adenosis    ', ' microcalcifications associated with benign ducts    ', ' prior surgical site changes    ', ' benign skeletal muscle   ']
1. benign breast tissue with microcalcifications . 2. benign skeletal muscle .
['  ', ' benign breast tissue with microcalcifications    ', ' benign skeletal muscle   ']
medical hardware ( gross only diagnosis ) .
[' medical hardware ( gross only diagnosis )   ']
1. no not_residual not_tumor . 2. organizing granulation tissue and necrosis consistent with prior tumor bed , status post neoadjuvant 

Unnamed: 0,Patient,Biopsy Description,Path Report,Rad Label,Laterality,Biopsy Source,All Labels
0,0,"Breast, left, simple mastectomy",1. Invasive lobular carcinoma in a background ...,Left Positive,left,breast,"[[na, na, na], [breast cancer, invasive, lobul..."
1,0,"Lymph node, sentinel node #1, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node,"[[na, na, na]]"
2,0,"Lymph node, sentinel node #2, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node,"[[na, na, na]]"
3,0,"Lymph node, nonsentinel node #3, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node,"[[na, na, na]]"
4,0,"Breast, right, simple mastectomy",1. Sclerosing papilloma; see comment. 2. Radia...,Left Positive,right,breast,"[[na, na, na], [benign, papilloma, na], [benig..."
5,1,"Left breast, ""mass at 12 o'clock 3 cm from nip...",1. Invasive ductal carcinoma; see comment. 2. ...,Left Positive,left,breast,"[[na, na, na], [breast cancer, invasive, ducta..."
6,1,"Right breast, 10 o'clock 4 cm from nipple, nee...",Dense sclerotic fibrous tissue with scant beni...,Left Positive,right,breast,"[[na, na, na]]"
7,2,"Right axillary contents, excision",No carcinoma in 10 lymph nodes (0/10).,Negative,right,uterus,"[[na, na, na]]"
8,2,"Skin, right axilla, excision",Intradermal nevus.,Negative,right,skin,"[[na, na, na]]"
9,2,"Right nipple, excision",1. Skin of nipple with underlying lactiferous ...,Negative,right,breast,"[[na, na, na], [na, na, na], [na, na, na]]"


In [246]:
split_reps.iloc[5]["Path Report"]

'1. Invasive ductal carcinoma; see comment. 2. Focal ductal carcinoma in situ, intermediate nuclear grade, solid pattern; see comment.'

In [247]:
split_reps.iloc[5]["All Labels"]

[['na', 'na', 'na'],
 ['breast cancer', 'invasive', 'ductal'],
 ['breast cancer', 'in situ', 'ductal']]

In [248]:
def get_single_label(obs_labels):
    label = ["na", "na", "na"]
    first_level = [labels[0] for labels in obs_labels]
    second_level = [labels[1] for labels in obs_labels]
    
    try: 
        if "lymphoma" in first_level:
            label[0] = "lymphoma"
        #elif "metastasis from non-bc" in first_level:
        #    label[0] = "metastasis from non-bc"
        elif "breast cancer" in first_level:
            label[0] = "breast cancer"

            third_level = [labels[2] for labels in obs_labels]
            if "metastasis" in second_level:
                label[1] = "metastasis"
            else:
                if "invasive" in second_level:
                    label[1] = "invasive"
                    third_level = [third_level[ind] for ind in range(len(obs_labels)) 
                                   if second_level[ind] == "invasive"]
                elif "in situ" in second_level:
                    label[1] = "in situ"
                    third_level = [third_level[ind] for ind in range(len(obs_labels)) 
                                   if second_level[ind] == "in situ"]

                if "ductal" in third_level:
                    label[2] = "ductal"
                elif "lobular" in third_level:
                    label[2] = "lobular"
        elif "atypical" in first_level:
            label[0] = "atypical"
            for marker in atyp_markers:
                if marker in second_level:
                    label[1] = marker
        elif "fibroepithelial" in first_level:
            label[0] = "atypical"
            for marker in fibro_markers:
                if marker in second_level:
                    label[1] = marker
        elif "benign" in first_level:
            label[0] = "benign"
            for marker in ben_markers:
                if marker in second_level:
                    label[1] = marker
    except:
        print(obs_labels)
                
    return tuple(label)

In [249]:
split_reps["Single Label"] = split_reps["All Labels"].apply(get_single_label)
split_reps

Unnamed: 0,Patient,Biopsy Description,Path Report,Rad Label,Laterality,Biopsy Source,All Labels,Single Label
0,0,"Breast, left, simple mastectomy",1. Invasive lobular carcinoma in a background ...,Left Positive,left,breast,"[[na, na, na], [breast cancer, invasive, lobul...","(breast cancer, invasive, lobular)"
1,0,"Lymph node, sentinel node #1, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node,"[[na, na, na]]","(na, na, na)"
2,0,"Lymph node, sentinel node #2, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node,"[[na, na, na]]","(na, na, na)"
3,0,"Lymph node, nonsentinel node #3, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node,"[[na, na, na]]","(na, na, na)"
4,0,"Breast, right, simple mastectomy",1. Sclerosing papilloma; see comment. 2. Radia...,Left Positive,right,breast,"[[na, na, na], [benign, papilloma, na], [benig...","(benign, radial scar, na)"
5,1,"Left breast, ""mass at 12 o'clock 3 cm from nip...",1. Invasive ductal carcinoma; see comment. 2. ...,Left Positive,left,breast,"[[na, na, na], [breast cancer, invasive, ducta...","(breast cancer, invasive, ductal)"
6,1,"Right breast, 10 o'clock 4 cm from nipple, nee...",Dense sclerotic fibrous tissue with scant beni...,Left Positive,right,breast,"[[na, na, na]]","(na, na, na)"
7,2,"Right axillary contents, excision",No carcinoma in 10 lymph nodes (0/10).,Negative,right,uterus,"[[na, na, na]]","(na, na, na)"
8,2,"Skin, right axilla, excision",Intradermal nevus.,Negative,right,skin,"[[na, na, na]]","(na, na, na)"
9,2,"Right nipple, excision",1. Skin of nipple with underlying lactiferous ...,Negative,right,breast,"[[na, na, na], [na, na, na], [na, na, na]]","(na, na, na)"


In [250]:
labeled_split_reps = split_reps.drop("Rad Label", axis=1).rename(index=str, 
    columns={"Patient": "Patient ID", 
             "Biopsy Description": "Path Report I",
             "Path Report": "Path Report II",
             "Laterality": "Laterality [Derived]",
             "Biopsy Source": "Organ [Derived]",
             "All Labels": "All Labels [Derived]",
             "Single Label": "Single Label [Derived]"})

In [251]:
labeled_split_reps[labeled_split_reps["Patient ID"] == 10]

Unnamed: 0,Patient ID,Path Report I,Path Report II,Laterality [Derived],Organ [Derived],All Labels [Derived],Single Label [Derived]
30,10,"Right breast, needle core biopsy",1. Stromal fibrosis. 2. No evidence of in situ...,right,breast,"[[na, na, na], [na, na, na], [na, na, na]]","(na, na, na)"
31,10,"Left breast, needle core biopsy","1. Invasive ductal carcinoma, SBR grade 2. 2. ...",left,breast,"[[na, na, na], [breast cancer, invasive, ducta...","(breast cancer, invasive, ductal)"


In [252]:
def get_binary_label(patient_data):
    laterality = []
    for i, specimen in patient_data.iterrows():
        if specimen["Single Label [Derived]"][0] == "breast cancer" or specimen["Single Label [Derived]"][0] == "lymphoma":
            laterality.append(specimen["Laterality [Derived]"])
    if "right" in laterality and "left" in laterality:
        return "Bilateral Positive"
    elif "right" in laterality:
        return "Right Positive"
    elif "left" in laterality:
        return "Left Positive"
    elif laterality:
        return "Positive NOS"
    else:
        return "Negative"

def flatten_list(nestedl):
    [item for sublist in nestedl for item in sublist]

patient_labels, binary_labels = [], []
for patID in range(numPatients):
    reps = labeled_split_reps[(labeled_split_reps["Patient ID"] == patID)]
    labels = reps["Single Label [Derived]"].tolist()
    single_label = get_single_label(labels)
    patient_labels.append(single_label)
    binary_labels.append(get_binary_label(reps))
    
    

labeled_data = data
labeled_data["Single Label"] = patient_labels
labeled_data["Binary Label [Derived]"] = binary_labels
labeled_data

Unnamed: 0,Path Report,Label,Single Label,Binary Label [Derived]
0,"A. Breast, left, simple mastectomy: 1. Invasiv...",Left Positive,"(breast cancer, invasive, lobular)",Left Positive
1,"A. Left breast, ""mass at 12 o'clock 3 cm from ...",Left Positive,"(breast cancer, invasive, ductal)",Left Positive
2,"A. Right axillary contents, excision: No carci...",Negative,"(na, na, na)",Negative
3,"Right breast, excision of mammographic lesion:...",Right Positive,"(breast cancer, na, na)",Right Positive
4,"A. Sentinel lymph node #1, left axilla, biopsy...",Left Positive,"(breast cancer, invasive, ductal)",Left Positive
5,"A. Left breast, biopsy: 1. Infiltrating ductal...",Left Positive,"(breast cancer, invasive, ductal)",Left Positive
6,"Left breast, 9:30, needle core biopsy: Invasiv...",Left Positive,"(breast cancer, invasive, ductal)",Left Positive
7,"A. Left breast, mastectomy: 1. Ductal carcinom...",Left Positive,"(breast cancer, in situ, ductal)",Left Positive
8,"A. Lymph node, right axillary, sentinel node #...",Right Positive,"(breast cancer, in situ, ductal)",Right Positive
9,"Breast, left, ""12 o'clock,"" biopsy: Pleomorphi...",Left Positive,"(breast cancer, in situ, lobular)",Left Positive


In [253]:
# Testing
labeled_data[labeled_data["Label"] != labeled_data["Binary Label [Derived]"]]

Unnamed: 0,Path Report,Label,Single Label,Binary Label [Derived]
28,"A. Breast, needle-localization excision biopsy...",Right Positive,"(breast cancer, invasive, ductal)",Positive NOS
344,"Right Breast, 10:00 o'clock, Fine Needle Aspir...",Negative,"(breast cancer, na, na)",Right Positive
669,CONSULT SLIDE FROM WESTERN PATHOLOGY CONSULTAN...,Negative,"(breast cancer, invasive, na)",Left Positive
716,"Left breast, needle localization biopsy: 1. Pr...",Negative,"(breast cancer, na, na)",Left Positive
863,"1. Breast, left, excisional biopsy: Highly aty...",Left Positive,"(breast cancer, in situ, ductal)",Positive NOS
984,"A. Node, site not further specified, sentinel ...",Negative,"(breast cancer, na, na)",Left Positive
1160,"BREAST, RIGHT, FINE NEEDLE ASPIRATION: Monoton...",Right Positive,"(na, na, na)",Negative
1225,"Left breast, segmental resection: 1. Infiltrat...",Left Positive,"(benign, cyst, na)",Negative
1310,"Right breast, core biopsy: 1. Proliferative fi...",Negative,"(breast cancer, na, na)",Right Positive
1596,"Left breast, needle localization biopsy: 1. Pr...",Negative,"(breast cancer, na, na)",Left Positive


In [254]:
# Testing
ob = labeled_data.iloc[3243]["Path Report"]

classify_breast_ob(ob)

['fibroepithelial', 'phyllodes', 'na']

In [255]:
labeled_split_reps.to_csv("Labeled Path Reports (Specimen).csv")
labeled_data.to_csv("Labeled Path Reports (Entire Report).csv")

In [256]:
# Testing
labeled_data[labeled_data["Label"] != labeled_data["Binary Label [Derived]"]]

Unnamed: 0,Path Report,Label,Single Label,Binary Label [Derived]
28,"A. Breast, needle-localization excision biopsy...",Right Positive,"(breast cancer, invasive, ductal)",Positive NOS
344,"Right Breast, 10:00 o'clock, Fine Needle Aspir...",Negative,"(breast cancer, na, na)",Right Positive
669,CONSULT SLIDE FROM WESTERN PATHOLOGY CONSULTAN...,Negative,"(breast cancer, invasive, na)",Left Positive
716,"Left breast, needle localization biopsy: 1. Pr...",Negative,"(breast cancer, na, na)",Left Positive
863,"1. Breast, left, excisional biopsy: Highly aty...",Left Positive,"(breast cancer, in situ, ductal)",Positive NOS
984,"A. Node, site not further specified, sentinel ...",Negative,"(breast cancer, na, na)",Left Positive
1160,"BREAST, RIGHT, FINE NEEDLE ASPIRATION: Monoton...",Right Positive,"(na, na, na)",Negative
1225,"Left breast, segmental resection: 1. Infiltrat...",Left Positive,"(benign, cyst, na)",Negative
1310,"Right breast, core biopsy: 1. Proliferative fi...",Negative,"(breast cancer, na, na)",Right Positive
1596,"Left breast, needle localization biopsy: 1. Pr...",Negative,"(breast cancer, na, na)",Left Positive


In [257]:
split_reps.groupby("Single Label").agg(['count'])

Unnamed: 0_level_0,Patient,Biopsy Description,Path Report,Rad Label,Laterality,Biopsy Source,All Labels
Unnamed: 0_level_1,count,count,count,count,count,count,count
Single Label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
"(atypical, atypical ductal hyperplasia, na)",75,75,75,75,75,75,75
"(atypical, atypical lobular hyperplasia, na)",89,89,89,89,89,89,89
"(atypical, fibroadenoma, na)",319,319,319,319,319,319,319
"(atypical, flat epithelial atypia, na)",35,35,35,35,35,35,35
"(atypical, phyllodes, na)",10,10,10,10,10,10,10
"(benign, apocrine metaplasia, na)",166,166,166,166,166,166,166
"(benign, cyst, na)",237,237,237,237,237,237,237
"(benign, mastitis, na)",4,4,4,4,4,4,4
"(benign, papilloma, na)",50,50,50,50,50,50,50
"(benign, pseudoangiomatous stromal hyperplasia, na)",35,35,35,35,35,35,35


In [258]:
def print_full(x):
    pd.set_option('display.max_rows', len(x))
    display(x)
    pd.reset_option('display.max_rows')
split_reps[split_reps["Single Label"] == ("benign", "cyst", "na")]#["Path Report"][4023]

Unnamed: 0,Patient,Biopsy Description,Path Report,Rad Label,Laterality,Biopsy Source,All Labels,Single Label
141,70,"Skin, right breast, 1st incision, excision",Skin with scar and epidermal inclusion cyst. N...,Negative,right,breast,"[[benign, cyst, na]]","(benign, cyst, na)"
173,96,"Left Breast, 6:00, 3 cm from nipple, Fine Need...",Benign breast cyst; see comment.,Negative,left,breast,"[[benign, cyst, na]]","(benign, cyst, na)"
198,113,"Left breast #1, fine needle aspiration",Benign cyst fluid; see comment.,Negative,left,breast,"[[benign, cyst, na]]","(benign, cyst, na)"
199,113,"Left breast #2, fine needle aspiration",Benign cyst fluid; see comment.,Negative,left,breast,"[[benign, cyst, na]]","(benign, cyst, na)"
200,113,"Left breast #3, fine needle aspiration",Benign cyst fluid; see comment.,Negative,left,breast,"[[benign, cyst, na]]","(benign, cyst, na)"
201,113,"Left breast #4, fine needle aspiration",Benign cyst fluid; see comment.,Negative,left,breast,"[[benign, cyst, na]]","(benign, cyst, na)"
270,161,"Right Breast, Fine Needle Aspiration",Simple cyst and fibrocystic change; see comment.,Negative,right,breast,"[[benign, cyst, na]]","(benign, cyst, na)"
285,172,"Right Breast #2, Fine Needle Aspiration",Benign simple cyst.,Negative,right,breast,"[[benign, cyst, na]]","(benign, cyst, na)"
317,193,"Left breast, fine needle aspiration",Benign cyst fluid; see comment.,Negative,left,breast,"[[benign, cyst, na]]","(benign, cyst, na)"
363,222,"BREAST, RIGHT 10:00, FINE NEEDLE ASPIRATION",Benign cyst with acute inflammation; see comment.,Negative,right,breast,"[[benign, cyst, na]]","(benign, cyst, na)"


In [259]:
obs = re.split(re.compile(r"[0-9]\."), split_reps[split_reps["Single Label"] == ("na", "na", "na")]["Path Report"][21].lower())

KeyError: 21

In [None]:
obs = [(' ' + ob).replace('-', ' ').replace(',', ' ') for ob in obs] 
classify_breast_ob(obs[0], "breast")

In [None]:
data