In [52]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
from matplotlib import style
import re
import difflib as dl
import nltk
from sklearn.model_selection import train_test_split

In [53]:
data = pd.read_csv("Mammogram Path Reports.csv")
data.columns = ["Path Report", "Label"]
numPatients = data.shape[0]

path1 = data["Path Report"][0]

In [54]:
letters = "A[.:] | B[.:] | C[.:] | D[.:] | E[.:] | F[.:] | G[.:] | H[.:] | I[.:] | J[.:] | K[.:] "

In [55]:
splitReps = []
for pathRep in data["Path Report"]:
    splitRep = re.split(letters, pathRep)
    if len(splitRep) > 1:
        splitRep = splitRep[1:]
    splitReps.append(splitRep)

In [56]:
splitReps

[['Breast, left, simple mastectomy: 1. Invasive lobular carcinoma in a background of lobular carcinoma in situ, negative margins; see comment. 2. Ductal carcinoma in situ, grade 2, negative margins; see comment. 3. Hyalinized fibroadenoma, radial scar and fibrocystic changes. 4. Biopsy site changes. 5. Nipple with no significant pathologic abnormality.',
  'Lymph node, sentinel node #1, biopsy: No tumor in one lymph node (0/1).',
  'Lymph node, sentinel node #2, biopsy: No tumor in one lymph node (0/1).',
  'Lymph node, nonsentinel node #3, biopsy: No tumor in one lymph node (0/1).',
  'Breast, right, simple mastectomy: 1. Sclerosing papilloma; see comment. 2. Radial scar. 3. Fibrocystic changes.'],
 ['Left breast, "mass at 12 o\'clock 3 cm from nipple", needle core biopsy: 1. Invasive ductal carcinoma; see comment. 2. Focal ductal carcinoma in situ, intermediate nuclear grade, solid pattern; see comment.',
  "Right breast, 10 o'clock 4 cm from nipple, needle core biopsy: Dense sclerot

In [57]:
data["Split Path Report"] = splitReps
data

Unnamed: 0,Path Report,Label,Split Path Report
0,"A. Breast, left, simple mastectomy: 1. Invasiv...",Left Positive,"[Breast, left, simple mastectomy: 1. Invasive ..."
1,"A. Left breast, ""mass at 12 o'clock 3 cm from ...",Left Positive,"[Left breast, ""mass at 12 o'clock 3 cm from ni..."
2,"A. Right axillary contents, excision: No carci...",Negative,"[Right axillary contents, excision: No carcino..."
3,"Right breast, excision of mammographic lesion:...",Right Positive,"[Right breast, excision of mammographic lesion..."
4,"A. Sentinel lymph node #1, left axilla, biopsy...",Left Positive,"[Sentinel lymph node #1, left axilla, biopsy: ..."
5,"A. Left breast, biopsy: 1. Infiltrating ductal...",Left Positive,"[Left breast, biopsy: 1. Infiltrating ductal c..."
6,"Left breast, 9:30, needle core biopsy: Invasiv...",Left Positive,"[Left breast, 9:30, needle core biopsy: Invasi..."
7,"A. Left breast, mastectomy: 1. Ductal carcinom...",Left Positive,"[Left breast, mastectomy: 1. Ductal carcinoma ..."
8,"A. Lymph node, right axillary, sentinel node #...",Right Positive,"[Lymph node, right axillary, sentinel node #1,..."
9,"Breast, left, ""12 o'clock,"" biopsy: Pleomorphi...",Left Positive,"[Breast, left, ""12 o'clock,"" biopsy: Pleomorph..."


In [58]:
biopData = pd.DataFrame(columns = ["Patient", "Biopsy Description", "Path Report", "Rad Label", "Laterality"])

In [59]:
patIds = range(data.shape[0])
bioType, pathRep, patients, labels = [], [], [], []

for patId in patIds:
    patient = data.iloc[patId]
    for rep in patient[2]:
        #split report into biopsy description and path report
        splitRep = rep.split(": ")
        
        if len(splitRep) > 1:
            bioType.append(splitRep[0])
            pathRep.append(' '.join(splitRep[1:]))
            patients.append(patId)
            labels.append(patient["Label"])
            
biopData["Patient"] = patients
biopData["Biopsy Description"] = bioType
biopData["Path Report"] = pathRep
biopData["Rad Label"] = labels

In [60]:
# Extracting laterality, biopsy source, and labels
lats, organs = [], []
for biop in biopData["Biopsy Description"]:
    biop = nltk.word_tokenize(biop.lower())

    if len(dl.get_close_matches("left", biop)) > 0:
        lats.append("left")
    elif len(dl.get_close_matches("right", biop)) > 0:
        lats.append("right")
    else:
        lats.append("na")

    if len(dl.get_close_matches("breast", biop)) > 0:
        organs.append("breast")
    elif len(dl.get_close_matches("nipple", biop)) > 0:
        organs.append("breast")
    elif len(dl.get_close_matches("lymph", biop)) > 0:
        organs.append("lymph node")
    elif len(dl.get_close_matches("skin", biop)) > 0:
        organs.append("skin")
    elif len([word for wordList in [dl.get_close_matches(br, biop) 
            for br in ["axilla", "uterus", "fallopian", "ovary", "adnexa"]] 
              for word in wordList]) > 0:
        organs.append("uterus")
    else:
        organs.append("na")
biopData["Laterality"] = lats
biopData["Biopsy Source"] = organs

numSamples = biopData.shape[0]

biopData

Unnamed: 0,Patient,Biopsy Description,Path Report,Rad Label,Laterality,Biopsy Source
0,0,"Breast, left, simple mastectomy",1. Invasive lobular carcinoma in a background ...,Left Positive,left,breast
1,0,"Lymph node, sentinel node #1, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node
2,0,"Lymph node, sentinel node #2, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node
3,0,"Lymph node, nonsentinel node #3, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node
4,0,"Breast, right, simple mastectomy",1. Sclerosing papilloma; see comment. 2. Radia...,Left Positive,right,breast
5,1,"Left breast, ""mass at 12 o'clock 3 cm from nip...",1. Invasive ductal carcinoma; see comment. 2. ...,Left Positive,left,breast
6,1,"Right breast, 10 o'clock 4 cm from nipple, nee...",Dense sclerotic fibrous tissue with scant beni...,Left Positive,right,breast
7,2,"Right axillary contents, excision",No carcinoma in 10 lymph nodes (0/10).,Negative,right,uterus
8,2,"Skin, right axilla, excision",Intradermal nevus.,Negative,right,skin
9,2,"Right nipple, excision",1. Skin of nipple with underlying lactiferous ...,Negative,right,breast


In [61]:
biopData.iloc[range(20)]

Unnamed: 0,Patient,Biopsy Description,Path Report,Rad Label,Laterality,Biopsy Source
0,0,"Breast, left, simple mastectomy",1. Invasive lobular carcinoma in a background ...,Left Positive,left,breast
1,0,"Lymph node, sentinel node #1, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node
2,0,"Lymph node, sentinel node #2, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node
3,0,"Lymph node, nonsentinel node #3, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node
4,0,"Breast, right, simple mastectomy",1. Sclerosing papilloma; see comment. 2. Radia...,Left Positive,right,breast
5,1,"Left breast, ""mass at 12 o'clock 3 cm from nip...",1. Invasive ductal carcinoma; see comment. 2. ...,Left Positive,left,breast
6,1,"Right breast, 10 o'clock 4 cm from nipple, nee...",Dense sclerotic fibrous tissue with scant beni...,Left Positive,right,breast
7,2,"Right axillary contents, excision",No carcinoma in 10 lymph nodes (0/10).,Negative,right,uterus
8,2,"Skin, right axilla, excision",Intradermal nevus.,Negative,right,skin
9,2,"Right nipple, excision",1. Skin of nipple with underlying lactiferous ...,Negative,right,breast


In [62]:
biopData[biopData.Patient == 10]

Unnamed: 0,Patient,Biopsy Description,Path Report,Rad Label,Laterality,Biopsy Source
30,10,"Right breast, needle core biopsy",1. Stromal fibrosis. 2. No evidence of in situ...,Left Positive,right,breast
31,10,"Left breast, needle core biopsy","1. Invasive ductal carcinoma, SBR grade 2. 2. ...",Left Positive,left,breast


In [63]:
data.to_csv("Path Reports Complete.csv")
biopData.to_csv("Path Reports (By Specimens).csv")

## Labeling Path Reports 
Labels: Normal, Fibroadenoma, Ductal Carcinoma In Situ, Invasive Ductal Carcinoma, Invasive Carcinoma In Situ

In [25]:
# Negation Processing
def negate_sequence(tokens):
    """
    Detects negations and transforms negated words into "not_" form.
    """
    negation = False
    delims = "?.,!:;"
    result = []
#Here rather then applying split, we can directly feed our extracted symptoms list
    #words = text.split()
    prev = None
    pprev = None
    for token in tokens:
        # stripped = word.strip(delchars)
        #stripped = word.strip(delims).lower()
        if any(c == token for c in delims):
            negation = False
        
        negated = "not_" + token if negation else token
        result.append(negated)

        if any(neg == token.lower() for neg in ["not", "n't", "no"]):
            negation = not negation   

    return result

In [26]:
#biopData["Path Report Negated"] = biopData["Path Report"].apply(
#    lambda s: ' '.join(negate_sequence(nltk.word_tokenize(s))))
cancer_types = ["invasive lobular carcinoma", "invasive carcinoma in situ", 
                "invasive ductal carcinoma", "ductal carcinoma in situ", "carcinoma", 
                "adenocarcinoma", "fibroadenoma"]

cancer_re = ["(invasive)? \s lobular \s carcinoma (\s in \s situ)?", 
             "invasive \s carcinoma \s in [ -] situ", 
             "(invasive | infiltrating) \s duct(al)? \s carcinoma", 
             "duct(al)? \s carcinoma (\s in [ -] situ)?", 
             "(invasive | infiltrating | differen[\w]*) \s carcinoma (\s with \s ductal \s and \s lobular \s features)?",
             "adenocarcinoma", 
             "fibroadenoma"]

cancer_re = [re.compile(reg, re.VERBOSE) for reg in cancer_re]

In [27]:
biopData["Path Report"][0]

'1. Invasive lobular carcinoma in a background of lobular carcinoma in situ, negative margins; see comment. 2. Ductal carcinoma in situ, grade 2, negative margins; see comment. 3. Hyalinized fibroadenoma, radial scar and fibrocystic changes. 4. Biopsy site changes. 5. Nipple with no significant pathologic abnormality.'

In [28]:
re.search(cancer_re[3], 'Infiltrating ductal carcinoma, Grade I, extending to the inked surgical margin. See comment.'
)#.group(0) #biopData["Path Report"][0].lower()

<_sre.SRE_Match object; span=(13, 29), match='ductal carcinoma'>

In [29]:
def fuzzySubstrMatch(string, query, cutoff=0.7):
    s = dl.SequenceMatcher(None, string, query)
    matchRatio = sum(n for i,j,n in s.get_matching_blocks()) / float(len(query))
    return matchRatio > cutoff

def getCancerType(rep):
    rep = ' '.join(negate_sequence(nltk.word_tokenize(rep))).lower()
    
    for i in range(len(cancer_types)): #cancer in cancer_types:
        if re.search(cancer_re[i], rep): #cancer in rep: #fuzzySubstrMatch(rep, cancer, 0.95):
            return cancer_types[i]
    return "normal"

In [30]:
biopData["Cancer Type"] = biopData["Path Report"].apply(getCancerType)

In [31]:
bad_cancers = cancer_types[:-1]
def radLabel(patID):
    biops = biopData[biopData["Patient"] == patID]
    leftPos, rightPos = False, False
    for (i, biop) in biops.iterrows():
        if biop["Laterality"] == "left" and biop["Cancer Type"] in bad_cancers:
            leftPos = True
        if biop["Laterality"] == "right" and biop["Cancer Type"] in bad_cancers:
            rightPos = True  
    
    if leftPos and rightPos:
        return "Bilateral Positive"
    elif leftPos:
        return "Left Positive"
    elif rightPos:
        return "Right Positive"
    else:
        return "Negative"

In [32]:
labels = []
for patID in range(numPatients):
    labels.append(radLabel(patID))
data["Extracted Label"] = labels

In [33]:
data[["Positive" in label for label in data["Label"]]]
#data

Unnamed: 0,Path Report,Label,Split Path Report,Extracted Label
0,"A. Breast, left, simple mastectomy: 1. Invasiv...",Left Positive,"[Breast, left, simple mastectomy: 1. Invasive ...",Left Positive
1,"A. Left breast, ""mass at 12 o'clock 3 cm from ...",Left Positive,"[Left breast, ""mass at 12 o'clock 3 cm from ni...",Left Positive
3,"Right breast, excision of mammographic lesion:...",Right Positive,"[Right breast, excision of mammographic lesion...",Right Positive
4,"A. Sentinel lymph node #1, left axilla, biopsy...",Left Positive,"[Sentinel lymph node #1, left axilla, biopsy: ...",Left Positive
5,"A. Left breast, biopsy: 1. Infiltrating ductal...",Left Positive,"[Left breast, biopsy: 1. Infiltrating ductal c...",Left Positive
6,"Left breast, 9:30, needle core biopsy: Invasiv...",Left Positive,"[Left breast, 9:30, needle core biopsy: Invasi...",Left Positive
7,"A. Left breast, mastectomy: 1. Ductal carcinom...",Left Positive,"[Left breast, mastectomy: 1. Ductal carcinoma ...",Left Positive
8,"A. Lymph node, right axillary, sentinel node #...",Right Positive,"[Lymph node, right axillary, sentinel node #1,...",Right Positive
9,"Breast, left, ""12 o'clock,"" biopsy: Pleomorphi...",Left Positive,"[Breast, left, ""12 o'clock,"" biopsy: Pleomorph...",Left Positive
10,"A. Right breast, needle core biopsy: 1. Stroma...",Left Positive,"[Right breast, needle core biopsy: 1. Stromal ...",Left Positive


In [34]:
def accuracy(data):
    return sum(data["Label"] == data["Extracted Label"]) / data.shape[0]

def precision(data):
    """Of the samples that we identified as positive, the proportion of samples that were actually positive 
        [tP/number classified as positive]
        Param: - results is a list of labels returned by the classifier
              - reference is a list of correct labels of test set"""
    class_pos = data[["Positive" in label for label in data["Extracted Label"]]]
    
    TP = sum(["Positive" in label for label in class_pos["Label"]])

    return TP / class_pos.shape[0]

def recall(data):
    """Proportion of positive samples that we caught [tP/number of actual positives]
        Param: - results is a list of labels returned by the classifier
              - reference is a list of correct labels of test set"""
    
    actual_pos = data[["Positive" in label for label in data["Label"]]]
    
    TP = sum(["Positive" in label for label in actual_pos["Extracted Label"]])
    
    return TP / actual_pos.shape[0]

(accuracy(data), precision(data), recall(data))

(0.96849538294405213, 0.991672975018925, 0.9290780141843972)

In [35]:
data.iloc[3455]["Path Report"]
#re.search(cancer_re[4], "invasive carcinoma with mucinous features")#data.iloc[410]["Path Report"].lower())
#cancer_re[4] #Invasive carcinoma with ductal and lobular features

'A. "Right breast lesion," surgical biopsy: Poorly differentiated carcinoma, 0.9 cm greatest dimension, present at margins; see comment. B. Right breast capsule, capsulectomy: Consistent with capsule; no carcinoma identified. C. Left breast capsule, capsulectomy: Consistent with capsule; no carcinoma identified.'

## Naive Bayes Classifier

Use Naive Bayes to classify the path reports that regex can't correctly identify (labels = ilc, idc, fibro, etc.)
a.k.a. classify misSamples

In [36]:
# Reports that weren't classified correctly
misPatIDs = data[data["Label"] != data["Extracted Label"]].index.values
misSamples = biopData.iloc[[row for row in range(numSamples) if biopData.iloc[row]["Patient"] in misPatIDs]]
misSamples

Unnamed: 0,Patient,Biopsy Description,Path Report,Rad Label,Laterality,Biopsy Source,Cancer Type
50,23,"Right breast, total skin-sparing mastectomy",1. Metaplastic carcinoma with chondroid differ...,Right Positive,right,breast,normal
51,23,"Sentinel lymph node #1, biopsy","Adipose and fibrovascular tissue, no lymph nod...",Right Positive,na,lymph node,normal
56,28,"Breast, needle-localization excision biopsy","Infiltrating ductal carcinoma, Grade I, extend...",Right Positive,na,breast,invasive ductal carcinoma
57,28,"Thigh, right, excision of mass",Lipoma.,Right Positive,right,na,normal
96,45,"Left Breast, Ultrasound Guided Fine Needle Asp...",Recurrent/recidual Papillary carcinoma in situ...,Left Positive,left,breast,normal
541,343,"Right breast, outer quadrant, lumpectomy","1. LCIS, pleomorphic type, 0.9 cm in maximum d...",Right Positive,right,breast,normal
553,348,"Left nipple, re-excision",Benign breast tissue with surgical site change...,Left Positive,left,breast,normal
554,348,"Left axillary contents, dissection",Metastatic carcinoma in one of eleven lymph no...,Left Positive,left,uterus,normal
566,355,"Right breast, 9:30, biopsy","DCIS, intermediate nuclear grade, solid and cr...",Right Positive,right,breast,normal
708,433,"Mass, right breast, excision","Infiltrating ductulo-lobular carcinoma, SBR gr...",Right Positive,right,breast,normal


In [37]:
# corrSamples = Correctly labeled samples
corrPatIDs = data[data["Label"] == data["Extracted Label"]].index.values
corrSamples = biopData.iloc[[row for row in range(numSamples) if biopData.iloc[row]["Patient"] in corrPatIDs]]
corrSamples = corrSamples.reset_index().drop("index", 1)
corrSamples

Unnamed: 0,Patient,Biopsy Description,Path Report,Rad Label,Laterality,Biopsy Source,Cancer Type
0,0,"Breast, left, simple mastectomy",1. Invasive lobular carcinoma in a background ...,Left Positive,left,breast,invasive lobular carcinoma
1,0,"Lymph node, sentinel node #1, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node,normal
2,0,"Lymph node, sentinel node #2, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node,normal
3,0,"Lymph node, nonsentinel node #3, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node,normal
4,0,"Breast, right, simple mastectomy",1. Sclerosing papilloma; see comment. 2. Radia...,Left Positive,right,breast,normal
5,1,"Left breast, ""mass at 12 o'clock 3 cm from nip...",1. Invasive ductal carcinoma; see comment. 2. ...,Left Positive,left,breast,invasive ductal carcinoma
6,1,"Right breast, 10 o'clock 4 cm from nipple, nee...",Dense sclerotic fibrous tissue with scant beni...,Left Positive,right,breast,normal
7,2,"Right axillary contents, excision",No carcinoma in 10 lymph nodes (0/10).,Negative,right,uterus,normal
8,2,"Skin, right axilla, excision",Intradermal nevus.,Negative,right,skin,normal
9,2,"Right nipple, excision",1. Skin of nipple with underlying lactiferous ...,Negative,right,breast,normal


# Featurize the path reports
- Develop a featurization of the correct reports
- Instead of just counting frequencies of all tokens, we will use n-grams, and identify the most useful words


## Attempt 1: Use entire vocabulary to determine feature set

In [38]:
def accuracy(classified, reference):
    assert len(classified) == len(reference)
    numData = len(classified)
    numClassifiedCorrect = sum([classified[i] == reference[i] for _ in range(len(reference))])
    return numClasifiedCorrect

def precision(classified, reference):
    """Of the samples that we identified as a x, the proportion of samples that were actually x 
        [tP/number classified as x]
    Args: lists of classified labels and reference labels"""
    labels = set(reference)
    
    precs = {}
    for label in labels:
        classifiedAsLabel = [i for i in range(len(classified)) if classified[i] == label]
        TP = sum([reference[i] == label for i in classifiedAsLabel])
        precs[label] = TP / len(classifiedAsLabel)
    return precs

def recall(classified, reference):
    """Of the samples that were x, the proportion of samples that we classified as x
        [tP/number classified as x]
    Args: lists of classified labels and reference labels"""
    labels = set(reference)
    
    recs = {}
    for label in labels:
        labeled = [i for i in range(len(reference)) if reference[i] == label]
        TP = sum([classified[i] == label for i in labeled])
        recs[label] = TP / len(labeled)
    return recs

def getPRBoW(pathRep, vocab):
    bow = {}
    for word in vocab:
        bow[word] = pathRep.count(' '.join(word))
    return bow

def test_feature_set(features):
    corrSamples["Path Report BoW"] = [getPRBoW(pathRep, features) for pathRep in corrSamples["Path Report"]]

    prFeatureSet = [(corrSamples.iloc[i]["Path Report BoW"], corrSamples.iloc[i]["Cancer Type"]) 
                for i in range(corrSamples.shape[0])]

    train, test = train_test_split(prFeatureSet, test_size=0.2)
    classifier = nltk.NaiveBayesClassifier.train(train)
    testSet = classifier.classify_many([rep[0] for rep in test])
    referenceSet = [rep[1] for rep in test]

    classifier.show_most_informative_features(30)
    return nltk.classify.accuracy(classifier, test), precision(testSet, referenceSet), recall(testSet, referenceSet)


In [39]:
test_feature_set(vocab)

NameError: name 'vocab' is not defined

## Attempt 2: Take the top k most common tokens

In [None]:
from collections import Counter

k = 2000
k_most_common_tokens = [token[0] for token in Counter(compiledReps).most_common(k)]

test_feature_set(k_most_common_tokens)

## Attempt 2: Take the top k most common tokens from each class

In [None]:
k = 500

features = []

corrSamples["Reports Tokenized"] = tokPathReps
for cancer in cancer_types:
    w_cancer = corrSamples[corrSamples["Cancer Type"] == cancer]
    cancer_reps_tok = w_cancer["Reports Tokenized"]
    cancer_tokens = [token for pathRep in cancer_reps_tok for token in pathRep]
    cancer_tokens_freq = Counter(cancer_tokens)
    features.extend([elem[0] for elem in cancer_tokens_freq.most_common(k)])
features = set(features)
features

In [None]:
corrSamples["Path Report BoW"] = [getPRBoW(pathRep, features) for pathRep in corrSamples["Path Report"]]

prFeatureSet = [(corrSamples.iloc[i]["Path Report BoW"], corrSamples.iloc[i]["Cancer Type"]) 
            for i in range(corrSamples.shape[0])]

train, test = train_test_split(prFeatureSet, test_size=0.2)
classifier = nltk.NaiveBayesClassifier.train(train)
testSet = classifier.classify_many([rep[0] for rep in test])
referenceSet = [rep[1] for rep in test]

classifier.show_most_informative_features(30)

In [None]:
nltk.classify.accuracy(classifier, test), precision(testSet, referenceSet), recall(testSet, referenceSet)

## Attempt 3: Featurize using 2-grams and 3-grams (take top k most common)

In [None]:
k = 500
from nltk.util import ngrams

tokPathReps = corrSamples["Path Report"].apply(lambda s: negate_sequence(nltk.word_tokenize(s)))
two_grams = [bigram for tokPathRep in tokPathReps for bigram in list(ngrams(tokPathRep, 2))]
three_grams = [trigram for tokPathRep in tokPathReps for trigram in list(ngrams(tokPathRep, 3))]
two_grams_set = set(two_grams)
three_grams_set = set(three_grams)

vocab_ngrams = vocab.union(two_grams_set, three_grams_set)

In [None]:
test_feature_set(vocab_ngrams)

## Attempt 4: finding most useful tokens via tf-idf

### Modify getCancerLabel to utilize NaiveBayesClassifier to catch weirdly phrased path reports

In [None]:
backupClassifier = nltk.NaiveBayesClassifier.train(prFeatureSet)

def getCancerTypeBayes(rep):
    rep = ' '.join(negate_sequence(nltk.word_tokenize(rep))).lower()
    
    for i in range(len(cancer_types)): #cancer in cancer_types:
        if re.search(cancer_re[i], rep): #cancer in rep: #fuzzySubstrMatch(rep, cancer, 0.95):
            return cancer_types[i]
        
    return backupClassifier.classify(getPRBoW(rep))

In [None]:
biopData["Cancer Type Bayes"] = biopData["Path Report"].apply(getCancerTypeBayes)

In [None]:
biopData

In [None]:
labels = []
for patID in range(numPatients):
    labels.append(radLabel(patID))
data["Extracted Label Bayes"] = labels

## Old: First Attempt and Using Naive Bayes on Everything

In [None]:
biopData


def getCancerLabel(biop):
    #patID = biop["Patient"]
    #label = data.iloc[patID]["Label"]
    label = biop["Patient Label"]
    laterality = biop["Laterality"]
    
    # To see possible labels run data.groupby("Label").count()
    
    if label == "Negative":
        return "Negative"
    if laterality == "left":
        if label == "Bilateral Positive" or label == "Left Positive":
            return "Positive"
        return "Negative"
    elif laterality == "right":
        if label == "Bilateral Positive" or label == "Right Positive":
            return "Positive"
        return "Negative"
    else:
        return None

biopData["Biopsy Label"] = [getCancerLabel(biopData.iloc[i]) for i in range(numSamples)]

In [None]:
biopData

In [None]:
biopData[biopData["Patient"] == 7]

In [None]:
nltk.word_tokenize("This will be a complex sentence; compound f.t.w.")

In [None]:
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

In [None]:
# Negating 
def negate_sequence(tokens):
    """
    Detects negations and transforms negated words into "not_" form.
    """
    negation = False
    delims = "?.,!:;"
    result = []
#Here rather then applying split, we can directly feed our extracted symptoms list
    #words = text.split()
    prev = None
    pprev = None
    for token in tokens:
        # stripped = word.strip(delchars)
        #stripped = word.strip(delims).lower()
        if any(c == token for c in delims):
            negation = False
        
        negated = "not_" + token if negation else token
        result.append(negated)

        if any(neg == token.lower() for neg in ["not", "n't", "no"]):
            negation = not negation   

    return result

In [None]:
biopData["Path Report Tokenized"] = biopData["Path Report"].apply(lambda s: negate_sequence(nltk.word_tokenize(s)))
# Can try double negation flip, set parameter double_neg_flip = True

In [None]:
# Constructing feature representation of Path Reports. The feature set is a dictionary where
# the keys are words, and the values are frequencies

compiledReps = []
for i in range(numSamples):
    compiledReps += biopData.iloc[i]["Path Report Tokenized"]

vocab = set(compiledReps)

def getPRBoW(pathRep):
    bow = {}
    for word in vocab:
        bow[word] = pathRep.count(word)
    return bow

biopData["Path Report BoW"] = biopData["Path Report Tokenized"].apply(getPRBoW)

In [None]:
breastData = biopData[biopData["Biopsy Source"] == "breast"].dropna()
num_brsamples = breastData.shape[0]

brfeatureset = [(breastData.iloc[i]["Path Report BoW"], breastData.iloc[i]["Biopsy Label"]) 
            for i in range(num_brsamples)]

In [None]:
breastData.reset_index()#.iloc[10]["Path Report"]

In [None]:
s = breastData.reset_index().iloc[5]["Path Report"]
s

In [None]:
"Hello".lower()

In [None]:
negate_sequence(nltk.word_tokenize(breastData.reset_index().iloc[14]["Path Report"]))

## Regex Matching/Preprocessing

In [None]:
numPos = sum(biopData["Biopsy Label"] == "Positive")
numNeg = sum(biopData["Biopsy Label"] == "Negative")
numPosBr = sum(breastData["Biopsy Label"] == "Positive")
numNegBr = sum(breastData["Biopsy Label"] == "Negative")
(numPos, numNeg, numPosBr, numNegBr)

In [None]:
# Simply just classify path reps with 
biopData[["carcinoma" in rep for rep in biopData["Path Report"]]]["Patient"]#.tolist().count("Positive")

## Naive Bayes Classifier

In [None]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(brfeatureset, test_size=0.2)

In [None]:
classifier = nltk.NaiveBayesClassifier.train(train)

In [None]:
testSet = classifier.classify_many([rep[0] for rep in test])

In [None]:
referenceSet = [rep[1] for rep in test]

In [None]:
(testSet.count("Positive"), referenceSet.count("Positive"))

In [None]:
print(nltk.classify.accuracy(classifier, test))

In [None]:
classifier.show_most_informative_features(30)

In [None]:
def precision(results, reference):
    """Proportion of samples that we identified as positive that were actually positive tP/total positive
        Param: - results is a list of labels returned by the classifier
              - reference is a list of correct labels of test set"""
    TP = sum([(results[i] == "Positive") and (reference[i] == "Positive") for i in range(len(reference))])
    FP = sum([(results[i] == "Positive") and (reference[i] == "Negative") for i in range(len(reference))])
    return TP / (TP + FP)

def recall(results, reference):
    """Proportion of positive samples that we caught tP/total actual positive
        Param: - results is a list of labels returned by the classifier
              - reference is a list of correct labels of test set"""
    TP = sum([(results[i] == "Positive") and (reference[i] == "Positive") for i in range(len(reference))])
    FN = sum([(results[i] == "Negative") and (reference[i] == "Positive") for i in range(len(reference))])
    return TP / (TP + FN)

In [None]:
(precision(testSet, referenceSet), recall(testSet, referenceSet))

In [None]:
num_brsamples = breastData.shape[0]
num_brsamples

In [None]:
def cross_validate(reps):
    precs, recs = [], []
    for _ in range(reps):
        train, test = train_test_split(brfeatureset, test_size=0.2)
        classifier = nltk.NaiveBayesClassifier.train(train)
        results = classifier.classify_many([rep[0] for rep in test])
        reference = [rep[1] for rep in test]
        precs.append(precision(results, reference))
        recs.append(recall(results, reference))
    plt.plot(range(1, reps + 1), precs)
    plt.plot(range(1, reps + 1), recs)
    plt.legend(['precision', 'recall'])
    plt.show()

In [None]:
cross_validate(3)

In [None]:
np.random.choice([1, 2, 3], 5, replacement = False)

In [None]:
[1, 2, 3].index(3)