In [3]:
import pandas as pd

# Deidentifying notes with python

### In this group project, you're going to get a dataset where the Protected Health Information (PHI) elements are labeled. The goal is to write Python logic that captures the maximum number of PHI elements in the note while minimizing the number of non-PHI elements captured. You can then see how your solution generalizes to a second note. I hope this exercise is useful for:
* exposing you to a data science use case
* challenging you to think about how to approach multi-step programming problems
* expanding your ability in Python via loops, functions, string and list methods, etc.
* getting you to think about how to evaluate success

## note

In [6]:
# load csv file of note samples into dataframe - we'll be using the pandas package a lot more
# (the path is relative, so the csv must sit in the notebook's working directory)
notes = pd.read_csv('phi_labeled_notes.csv')
# a bare last expression is rendered as the cell's output
notes

Unnamed: 0,note,note_id
0,\n\n977146916\nHLGMC\n2878891\n022690\n01/27/1...,641
1,\n\n123547445\nFIH\n7111426\n47933/f911\n55734...,640


In [93]:
# choose which note to work with (640 is one of the note_id values shown in `notes`)
note_id = 640
# boolean Series that is True only on the rows whose note_id matches
mask = notes.note_id == note_id
# select the `note` column of the matching rows and keep the first value as a plain string
first_note = notes.loc[mask, 'note'].values[0]
# display the whole note as a raw string, so line breaks show up as \n escapes
first_note

'\n\n123547445\nFIH\n7111426\n47933/f911\n557344\n11/19/1994 12:00:00 AM\nDischarge Summary\nUnsigned\nDIS\nReport Status :\nUnsigned\nADMISSION DATE :\n11/19/94\nDISCHARGE DATE :\n11/28/94\nADMISSION DIAGNOSIS :\nAspiration pneumonia , esophageal laceration .\nHISTORY OF PRESENT ILLNESS :\nMr. Blind is a 79-year-old white white male with a history of diabetes mellitus , inferior myocardial infarction , who underwent open repair of his increased diverticulum November 13th at Sephsandpot Center .\nThe patient developed hematemesis November 15th and was intubated for respiratory distress .\nHe was transferred to the Valtawnprinceel Community Memorial Hospital for endoscopy and esophagoscopy on the 16th of November which showed a 2 cm linear tear of the esophagus at 30 to 32 cm .\nThe patient \'s hematocrit was stable and he was given no further intervention .\nThe patient attempted a gastrografin swallow on the 21st , but was unable to cooperate with probable aspiration .\nThe patient al

In [94]:
# printing the string renders each \n as a real line break, unlike the raw display above
print(first_note)



123547445
FIH
7111426
47933/f911
557344
11/19/1994 12:00:00 AM
Discharge Summary
Unsigned
DIS
Report Status :
Unsigned
ADMISSION DATE :
11/19/94
DISCHARGE DATE :
11/28/94
ADMISSION DIAGNOSIS :
Aspiration pneumonia , esophageal laceration .
HISTORY OF PRESENT ILLNESS :
Mr. Blind is a 79-year-old white white male with a history of diabetes mellitus , inferior myocardial infarction , who underwent open repair of his increased diverticulum November 13th at Sephsandpot Center .
The patient developed hematemesis November 15th and was intubated for respiratory distress .
He was transferred to the Valtawnprinceel Community Memorial Hospital for endoscopy and esophagoscopy on the 16th of November which showed a 2 cm linear tear of the esophagus at 30 to 32 cm .
The patient 's hematocrit was stable and he was given no further intervention .
The patient attempted a gastrografin swallow on the 21st , but was unable to cooperate with probable aspiration .
The patient also had been receiving gener

## examples of phi in the note

In [3]:
# load answer key into dataframe
labels = pd.read_csv('phi_labeled_note_answer_key_note.csv')
# preview three random rows; random_state pins the draw so re-running the cell shows the same rows
labels.sample(3, random_state=0)

Unnamed: 0,PHI,type,note_id
35,January 31,DATE,641
21,ZIE M. ZONE,DOCTOR,640
39,AZEL USANNE WALL,DOCTOR,641


In [7]:
# keep only the answer-key rows that belong to the note selected by note_id
mask = labels['note_id'] == note_id
labels.loc[mask]

Unnamed: 0,PHI,type,note_id
0,11/08,DATE,640
1,11/05,DATE,640
2,31st,DATE,640
3,26th,DATE,640
4,26th,DATE,640
5,26th,DATE,640
6,26th,DATE,640
7,25th,DATE,640
8,25th,DATE,640
9,25th,DATE,640


## resources

In [21]:
# one way to find the indices of each occurrence of a term in a list
# (lower() makes the comparison case-insensitive)
[i for i, x in enumerate(first_note.split()) if x.lower() == "hospital"]

[92, 214, 801]

In [36]:
# enumerate() yields (index, token) pairs; print each whitespace-separated token on its own line
for i, x in enumerate(first_note.split()):
    print(x)

123547445
FIH
7111426
47933/f911
557344
11/19/1994
12:00:00
AM
Discharge
Summary
Unsigned
DIS
Report
Status
:
Unsigned
ADMISSION
DATE
:
11/19/94
DISCHARGE
DATE
:
11/28/94
ADMISSION
DIAGNOSIS
:
Aspiration
pneumonia
,
esophageal
laceration
.
HISTORY
OF
PRESENT
ILLNESS
:
Mr.
Blind
is
a
79-year-old
white
white
male
with
a
history
of
diabetes
mellitus
,
inferior
myocardial
infarction
,
who
underwent
open
repair
of
his
increased
diverticulum
November
13th
at
Sephsandpot
Center
.
The
patient
developed
hematemesis
November
15th
and
was
intubated
for
respiratory
distress
.
He
was
transferred
to
the
Valtawnprinceel
Community
Memorial
Hospital
for
endoscopy
and
esophagoscopy
on
the
16th
of
November
which
showed
a
2
cm
linear
tear
of
the
esophagus
at
30
to
32
cm
.
The
patient
's
hematocrit
was
stable
and
he
was
given
no
further
intervention
.
The
patient
attempted
a
gastrografin
swallow
on
the
21st
,
but
was
unable
to
cooperate
with
probable
aspiration
.
The
patient
also
had
been
receiving
generou

In [90]:
def find_hospitals(note, terms=('Hospital', 'Center'), max_lookback=19):
    """Return candidate hospital names found in a note.

    Two heuristics are combined:

    * abbreviations - any whitespace-separated token that is exactly three
      characters long, all uppercase, and ends in 'H' (e.g. 'FIH');
    * full names - every occurrence of a term in ``terms`` that is directly
      preceded by one or more title-cased tokens, joined with those tokens
      (e.g. 'Sephsandpot Center').

    Parameters
    ----------
    note : str
        Text of the note; it is split on whitespace.
    terms : iterable of str, optional
        Tokens that end a hospital name. Defaults to 'Hospital' and 'Center'.
    max_lookback : int, optional
        Maximum number of preceding tokens that may be prepended to a term.

    Returns
    -------
    list of str
        Deduplicated matches: full names first, then abbreviations, each in
        order of first appearance.
    """
    # get tokens
    tokens = note.split()

    # first check - length 3, uppercase, last character is an H
    abbreviations = [
        token for token in tokens
        if len(token) == 3 and token.isupper() and token[-1] == 'H'
    ]

    phi_list = []

    # look for instances of each search term
    for term in terms:

        # get indices for the search term
        indices = [i for i, token in enumerate(tokens) if token == term]

        for term_index in indices:

            token_name = term

            # walk backwards over the tokens that precede the term
            for offset in range(1, max_lookback + 1):

                previous_index = term_index - offset

                # stop at the start of the note: a negative index would wrap
                # around and read tokens from the END of the list
                if previous_index < 0:
                    break

                previous_token = tokens[previous_index]

                # the name ends at the first token that is not title-cased
                if not previous_token.istitle():
                    break

                # concatenate previous token with the name built so far
                token_name = previous_token + ' ' + token_name

            # a bare term with no title-cased token in front is not a name
            if token_name != term:
                phi_list.append(token_name)

    # concatenate lists
    phi_list += abbreviations

    # deduplicate while keeping first-seen order, so the result is deterministic
    return list(dict.fromkeys(phi_list))


# run the hospital finder on the first note; the returned list is shown as the cell output
find_hospitals(first_note) 

['Sephsandpot Center',
 'NPH',
 'OLH',
 'Valtawnprinceel Community Memorial Hospital',
 'LDH',
 'FIH',
 'Em Nysonken Medical Center']

In [102]:
# split the note into lines (one list entry per newline-separated line)
tokens = first_note.split("\n")
print(tokens)
# marker line whose following line we want to look at
weirdterm = "TR :"
# positions of every line that is exactly the marker
weirdterm_index = [position for position, line in enumerate(tokens) if line == weirdterm]
# index of the line right after the first marker occurrence
new_var = weirdterm_index[0] + 1
print(weirdterm_index)
print(tokens[new_var])

['', '', '123547445', 'FIH', '7111426', '47933/f911', '557344', '11/19/1994 12:00:00 AM', 'Discharge Summary', 'Unsigned', 'DIS', 'Report Status :', 'Unsigned', 'ADMISSION DATE :', '11/19/94', 'DISCHARGE DATE :', '11/28/94', 'ADMISSION DIAGNOSIS :', 'Aspiration pneumonia , esophageal laceration .', 'HISTORY OF PRESENT ILLNESS :', 'Mr. Blind is a 79-year-old white white male with a history of diabetes mellitus , inferior myocardial infarction , who underwent open repair of his increased diverticulum November 13th at Sephsandpot Center .', 'The patient developed hematemesis November 15th and was intubated for respiratory distress .', 'He was transferred to the Valtawnprinceel Community Memorial Hospital for endoscopy and esophagoscopy on the 16th of November which showed a 2 cm linear tear of the esophagus at 30 to 32 cm .', "The patient 's hematocrit was stable and he was given no further intervention .", 'The patient attempted a gastrografin swallow on the 21st , but was unable to coop

In [None]:
def find_doctor(note):
    """Return candidate doctor-related PHI strings found in a note.

    Two heuristics are combined:

    * the non-empty line that immediately follows each line consisting only
      of the marker 'TR :';
    * the run of tokens that directly precedes each 'M.D.' token on the same
      line, where every token in the run starts with an uppercase letter
      (a single ',' right before 'M.D.' is skipped), e.g.
      'ZIE M. ZONE , M.D.' -> 'ZIE M. ZONE'.

    NOTE(review): both rules are inferred from the markers this function was
    already set up to search for ('TR :' and 'M.D.'); confirm them against
    the answer key before relying on the results.

    Parameters
    ----------
    note : str
        Text of the note; it is split into lines on '\\n'.

    Returns
    -------
    list of str
        Deduplicated matches in order of first appearance.
    """
    # set phi_list
    phi_list = []

    # get lines
    lines = note.split("\n")

    # set terms
    weirdterm = 'TR :'
    doctorterm = 'M.D.'

    for line_index, line in enumerate(lines):

        # rule 1: capture the line right after the marker, if there is one
        if line.strip() == weirdterm:
            next_index = line_index + 1
            if next_index < len(lines) and lines[next_index].strip():
                phi_list.append(lines[next_index].strip())

        # rule 2: collect the name that precedes each 'M.D.' on this line
        tokens = line.split()
        for token_index, token in enumerate(tokens):

            if token != doctorterm:
                continue

            name_tokens = []

            # walk backwards from the token just before 'M.D.'
            for previous_token in reversed(tokens[:token_index]):

                # skip the comma that separates the name from 'M.D.'
                if previous_token == ',' and not name_tokens:
                    continue

                # the name ends at the first token that does not start uppercase
                if not previous_token[0].isupper():
                    break

                name_tokens.insert(0, previous_token)

            if name_tokens:
                phi_list.append(' '.join(name_tokens))

    # deduplicate while keeping first-seen order
    return list(dict.fromkeys(phi_list))


# NOTE(review): this cell defines find_doctor but calls find_hospitals - confirm which call was intended
find_hospitals(first_note) 

In [54]:
# scratch checks on a sample abbreviation: all uppercase, third character is "H", and its length
HospitalName = "AYH"
HospitalName.isupper()
HospitalName[2] == "H"
len(HospitalName)
# the bare expressions above are evaluated but not shown; the cell output contains only the loop's prints
# enumerate pairs each character with its position in the string
for count,x in enumerate("hello"):
    print (count,x)

0 h
1 e
2 l
3 l
4 o


In [None]:
# split may be a helpful string method
# with no arguments it splits on any run of whitespace; the slice previews tokens 87 through 96
first_note.split()[87:97]

## importance of iterating 

### create a hypothesis to approach a problem
#### all the ID values start with digits, so we filter for tokens that start with digits; this returns the IDs but also some non-PHI values

In [None]:
# keep every token whose first five characters (or the whole token, if shorter) are all digits
[token for token in first_note.split() if token[:5].isdigit()]

# the list comprehension above is the same as
#new_list = []
#for token in first_note.split():
#    if token[:5].isdigit():
#        new_list.append(token)

#### so we refine the logic to fit the PHI values but not other values

In [None]:

# keep tokens whose leading (up to five) characters are all digits AND whose numeric value is above 1000
[token for token in first_note.split() if token[:5].isdigit() and float(token[:5]) > 1000]

## assignment

### regardless of your ability level with Python, write pseudocode for a plan to find each type of PHI. Then try to code your plan, capturing one type of PHI at a time. If you are successful with the first note, see if your solution generalizes to the second note. If not, use what you did or didn't capture to revise your previous attempt.