In [3]:
import sqlite3
import pandas as pd
import numpy as np

import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [4]:
db = sqlite3.connect(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\nlp_annotator-dev\app\db.sqlite3")

## Tables of Use

### Data to Feed into Model
1. gui_alwaysregex to get all the always patterns
2. gui_sentenceannotation to get labels
3. gui_sentence to get sentences
4. gui_sentence_always_regex to get all sentences that matched with always patterns (gives extra 1284 sentences)

### EDA Purposes
1. gui_sentencealwaysregex to see which always expressions get flagged most often
2. gui_seedregex to get list of matched phrases
2. gui_sentenceseedregex to see which expressions in general get flagged most often

## Extracting Data for Model

In [5]:
always_patterns = pd.read_sql_query("Select * from gui_alwaysregex", db)
always_patterns

Unnamed: 0,id,Pattern,Annotation,CreatedBy_id
0,2,MOCA.*22/30,Y,3
1,3,has memory difficulties,Y,3
2,4,poor.*working memory,Y,3
3,5,MOCA.*28/30,Y,3
4,6,Reports.*short.term.*memory,Y,3
5,7,IMPRESSION.*Memory loss,Y,3
6,8,cognitive deficits,Y,3
7,10,MOCA.*13/30,Y,3
8,12,No\s*memory\s*concerns,N,3
9,13,signs.*suspicious.*dementia,Y,3


In [6]:
labels = pd.read_sql_query("Select * from gui_sentenceannotation", db)
#labels = labels.sort_values(["Sentence_id"])
labels

Unnamed: 0,id,Label,Annotator_id,Sentence_id
0,1,NTR,3,2990
1,2,YES,3,3003
2,3,YES,3,3009
3,4,NTR,3,3013
4,5,NTR,3,3035
5,6,NTR,3,3045
6,7,NTR,3,3046
7,8,NTR,3,3051
8,9,NTR,3,3054
9,10,NTR,3,3055


In [7]:
sentences = pd.read_sql_query("Select * from gui_sentence", db)
sentences

Unnamed: 0,id,Contents,Note_id
0,1,"""ession alone in the meta-analysis. We discuss...",1
1,2,""" ------- sis over her R bra strap, which was ...",2
2,3,""" ------- l obstruction ASSOCIATED DIAGNOSES S...",3
3,4,""" ------- (156 lb) 09/19/17 71.4 kg (157 lb 6...",4
4,5,"""ut answer. Optho notes from recent outpatient...",5
...,...,...,...
279219,279220,"""''she took a break' and wants to discuss with...",274442
279220,279221,"""'all of her medications, including aspirin. D...",274443
279221,279222,"""'Patient, Chart, daughter Olga Mode of contac...",274444
279222,279223,"""'Patient, Chart, daughter Olga Mode of contac...",274445


In [8]:
sentences["len"] = sentences["Contents"].str.len()

In [45]:
sentences[sentences['len'] < 400]

Unnamed: 0,id,Contents,Note_id,len
41,42,"""---------------------------------------------...",42,364
146,147,"""---------------------------------------------...",147,350
169,170,"""---------------------------------------------...",170,323
243,244,"""---------------------------------------------...",244,361
272,273,"""---------------------------------------------...",273,383
...,...,...,...,...
279219,279220,"""''she took a break' and wants to discuss with...",274442,198
279220,279221,"""'all of her medications, including aspirin. D...",274443,202
279221,279222,"""'Patient, Chart, daughter Olga Mode of contac...",274444,197
279222,279223,"""'Patient, Chart, daughter Olga Mode of contac...",274445,197


In [9]:
always_pattern_sentences = pd.read_sql_query("Select * from gui_sentencealwaysregex", db)
always_pattern_sentences

Unnamed: 0,id,AlwaysRegex_id,Sentence_id
0,53,2,377
1,54,2,20965
2,55,2,532
3,56,2,29481
4,57,2,2909
...,...,...,...
8628,9618,37,176918
8629,9619,37,184583
8630,9620,37,248788
8631,9621,38,3086


## Constructing Model Matrix

In [10]:
note = pd.read_sql_query("Select * from gui_note", db)
note

Unnamed: 0,id,PatientEncounterID,NoteID,Date,PatientID
0,1,3242118020.0,2370049295,2019-03-14,Z6352398
1,2,3181499472.0,1916263837,2018-03-28,Z6352922
2,3,3018789480.0,267681869,2015-01-09,Z6353136
3,4,3172504002.0,1682366001,2017-10-03,Z6353461
4,5,3165982636.0,1853759390,2018-02-07,Z6353764
...,...,...,...,...,...
274441,274442,3285425453.0,3960511043,2020-04-28,Z16320629
274442,274443,3305416153.0,4585859437,2020-09-22,Z16320629
274443,274444,3327429844.0,5203067949,2021-01-25,Z16320629
274444,274445,3349754670.0,5673786544,2021-04-20,Z16320629


In [11]:
print(note.loc[note['id'] == 8481]['PatientID'])

8480    Z6448542
Name: PatientID, dtype: object


In [52]:
def clean_sequence(seq):
    #getting rid of special characters
    seq_no_special_chars = ""
    for character in seq:
        if character.isalnum():
            seq_no_special_chars += character
        elif character == " ":
            seq_no_special_chars += character
            
    # getting rid of numbers
    seq_no_nums = ""
    for character in seq_no_special_chars:
        if not character.isdigit():
            seq_no_nums += character
            
    #having only 1 space between words
    n = 1
    seq_no_spaces = (' '*n).join(seq_no_nums.split())
    
    #Stemming words
#     stemmer = PorterStemmer()
#     tokens = word_tokenize(seq_no_spaces)
#     sequences_clean = []
#     for word in tokens:
#         if word != " ":
#             stem_word = stemmer.stem(word)  # stemming word
#         sequences_clean.append(stem_word)
    
#     final = " "
#     final = final.join(sequences_clean)
    
    return seq_no_spaces.lower()

In [53]:
print(sentences["Contents"][1234], "\n")
print(clean_sequence(sentences["Contents"][1234]))

"eted. Social History reviewed: Living situation: Lives with Family Diet: Low Sodium Diet Exercise: Moderate Intensity 3-4 days a week ADL: No issues with: dressing, bathing, walking, shopping, housekeeping and financial management Services: none Healthcare proxy: No Advance Care/End of Life Planning Discussed: Yes Depression Screening PHQ-2 Score: 0 Cognition Negative: no evidence of cognitive decline noted by patient or family; no memory problems causing dysfunction in daily activities Falls risk Time to rise from, walk 10 feet, turn and return and sit: 5 seconds STRIDE Falls Risk Falls Risk 3/1/2017 Have you fallen and hurt yourself in the past year? N Have you fallen 2 or more times in the past year? N Are you afraid that you might fall because of balance or walking problems? N Ho" 

eted social history reviewed living situation lives with family diet low sodium diet exercise moderate intensity days a week adl no issues with dressing bathing walking shopping housekeeping and financ

In [54]:
#getting sentence ids
sentence_id = labels["Sentence_id"].to_list()
#print(len(sentence_id))
sentence_id += always_pattern_sentences["Sentence_id"].to_list()
#print(len(sentence_id))

# defining model matrix and setting dimensions
model_matrix = pd.DataFrame(columns = ["patient_id", "sequence", "label"])
model_matrix["patient_id"] = [0] * int(len(sentence_id))
model_matrix["sequence"] = [" "] * int(len(sentence_id))
model_matrix["label"] = [" "] * int(len(sentence_id))

In [55]:
#extracting necessary data
idx = 0
for i in range(int(len(labels["Sentence_id"].to_list()))):
    #if (i < len(labels["Sentence_id"].to_list())):
    # getting all info from sentences that were manually annotated
    curr_note_id = sentences[sentences['id'] == sentence_id[i]]['Note_id'].values
    curr_patient_id = note.loc[note['id'] == curr_note_id[0]]['PatientID'].values
    curr_seq = sentences[sentences['Note_id'] == sentence_id[i]]['Contents'].values
    curr_label = labels[labels["Sentence_id"] == sentence_id[i]]['Label'].values
    
    curr_seq[0] = clean_sequence(curr_seq[0])
    
    model_matrix.iloc[i] = (curr_patient_id[0], curr_seq[0], curr_label[0])
    
    idx += 1

#print(idx) 

j = 1    
while (idx < len(sentence_id)): 
    #getting info from sentences that got auto annotated through always pattern matches
    #curr_sentence_id = always_pattern_sentences[always_pattern_sentences['id'] == j]['Sentence_id'].values
    curr_seq = sentences[sentences['id'] == sentence_id[idx]]['Contents'].values
    #print(curr_seq[0])
    curr_note_id = sentences[sentences['id'] == sentence_id[idx]]['Note_id'].values
    curr_patient_id = note.loc[note['id'] == curr_note_id[0]]['PatientID'].values
    #print(curr_patient_id)
    curr_always_regex_id = always_pattern_sentences[always_pattern_sentences['Sentence_id'] == sentence_id[idx]]['AlwaysRegex_id'].values
    #print(curr_always_regex_id)
    curr_label = always_patterns[always_patterns['id'] == curr_always_regex_id[0]]['Annotation'].values
    
    curr_seq[0] = clean_sequence(curr_seq[0])
    
    model_matrix.iloc[idx] = (curr_patient_id[0], curr_seq[0], curr_label[0])

    j += 1
    idx += 1
        
# yes = model_matrix[model_matrix['label'] == 3]
# neither = model_matrix[model_matrix['label'] == 2]
# no = model_matrix[model_matrix['label'] == 1]

# yes = yes.to_csv("yes.csv")
# neither = neither.to_csv("neither.csv")
# no = no.to_csv("no.csv"

In [56]:
#converting yes,no,ntr labels to numerical equivalents
mappings = {"NO" : 0, "NTR" : 1, "YES" : 2, "N": 0, "T": 1, "Y": 2}
model_matrix.label = [mappings[item] for item in model_matrix.label]

model_matrix = model_matrix.sample(frac=1).reset_index(drop=True) #shuffling model_matrix

In [58]:
model_matrix

Unnamed: 0,patient_id,sequence,label
0,Z8170308,medications allergies she is allergic to oxyco...,1
1,Z9021665,follow commands follows step commands follow c...,0
2,Z8485823,cancer brother testicular cancer u coronary ar...,1
3,Z10380050,are progressing faster he is very motivated to...,2
4,Z6985118,relation age of onset u cirrhosis father cirrh...,1
...,...,...,...
8651,Z7869100,story bilateral knee arthroscopy u vasectomy f...,1
8652,Z8631790,female who was evaluated today due to language...,2
8653,Z15536786,list items addressed this visit mild cognitive...,2
8654,Z7156572,ors or hesitation language was fluent without ...,0


In [57]:
model_matrix.to_csv(r"../Modeling/input.csv")

## Verification of Results

In [32]:
clean_seq = clean_sequence(str(sentences[sentences['Note_id'] == sentence_id[0]]['Contents'].values[0]))
model_matrix[model_matrix['sequence'] == clean_seq]

Unnamed: 0,patient_id,sequence,label,len
2324,Z12144959,phenomenology of sx and pt preference she woul...,1,752


In [31]:
s_id = sentences[sentences['Contents'] == str(sentences[sentences['Note_id'] == sentence_id[0]]['Contents'].values[0])]['id'].values[0]
labels[labels['Sentence_id'] == s_id]['Label'].values[0]

'NTR'

In [62]:
#exporting data to csv's
model_matrix.to_csv("../Modeling/Data/input.csv")

## EDA

In [42]:
regex_phrases = pd.read_sql_query("Select * from gui_seedregex", db)
phrase_matches = pd.read_sql_query("Select * from gui_sentenceseedregex", db)
freq = phrase_matches['SeedRegex_id'].value_counts().to_dict()
counter = 1
for key, value in freq.items():
    print("Match #", counter)
    print("Id: ", key)
    print("Pattern:", regex_phrases[regex_phrases['id'] == key]['Pattern'].values)
    print("Count: ", value, "\n")
    counter += 1

Match # 1
Id:  3
Pattern: ['dementia']
Count:  4135 

Match # 2
Id:  5
Pattern: ['Dementia']
Count:  3521 

Match # 3
Id:  7
Pattern: ["Alzheimer's disease"]
Count:  3311 

Match # 4
Id:  15
Pattern: ['cognitive impairment']
Count:  1907 

Match # 5
Id:  11
Pattern: ['memory loss']
Count:  1662 

Match # 6
Id:  13
Pattern: ['cognitive decline']
Count:  1211 

Match # 7
Id:  2
Pattern: ['memory problem']
Count:  906 

Match # 8
Id:  63
Pattern: ['Memory loss']
Count:  577 

Match # 9
Id:  26
Pattern: ['cognitive deficits']
Count:  543 

Match # 10
Id:  16
Pattern: ['Mild cognitive impairment']
Count:  499 

Match # 11
Id:  24
Pattern: ['memory impairment']
Count:  494 

Match # 12
Id:  121
Pattern: ['neurocognitive disorder']
Count:  400 

Match # 13
Id:  38
Pattern: ["other Alzheimer's disease"]
Count:  385 

Match # 14
Id:  12
Pattern: ['vascular dementia']
Count:  378 

Match # 15
Id:  19
Pattern: ["Alzheimer's dementia"]
Count:  372 

Match # 16
Id:  22
Pattern: ['memory deficit']
C

Id:  39
Pattern: ['Memory disorder']
Count:  9 

Match # 148
Id:  103
Pattern: ['Impaired insight']
Count:  9 

Match # 149
Id:  242
Pattern: ['Minor neurocognitive disorder']
Count:  9 

Match # 150
Id:  211
Pattern: ['ALZHEIMERS']
Count:  8 

Match # 151
Id:  186
Pattern: ['MEMORY DEFICIT']
Count:  8 

Match # 152
Id:  162
Pattern: ['impaired orientation']
Count:  8 

Match # 153
Id:  204
Pattern: ["early onset alzheimer's dementia"]
Count:  8 

Match # 154
Id:  230
Pattern: ['COGNITIVE DECLINE']
Count:  8 

Match # 155
Id:  198
Pattern: ['Mixed dementia']
Count:  8 

Match # 156
Id:  182
Pattern: ['primary progressive aphasia']
Count:  8 

Match # 157
Id:  165
Pattern: ['History of Dementia']
Count:  8 

Match # 158
Id:  35
Pattern: ['Dementia with Lewy Bodies']
Count:  8 

Match # 159
Id:  179
Pattern: ['Impaired memory']
Count:  7 

Match # 160
Id:  205
Pattern: ['Advanced Dementia']
Count:  7 

Match # 161
Id:  222
Pattern: ['Alzheimers disease']
Count:  7 

Match # 162
Id:  223


In [44]:
# Printing out frequencies of always pattern matches 
always_pattern_matches = pd.read_sql_query("Select * from gui_sentencealwaysregex", db)
freq = always_pattern_matches['AlwaysRegex_id'].value_counts().to_dict()
counter = 1
for key, value in freq.items():
    print("Match #", counter)
    print("Id: ", key)
    print("Pattern:", always_patterns[always_patterns['id'] == key]['Pattern'].values)
    print("Count: ", value, "\n")
    counter += 1

Match # 1
Id:  2
Pattern: ['Dementia Mother']
Count:  643 

Match # 2
Id:  4
Pattern: ['on Aricept']
Count:  41 

Match # 3
Id:  13
Pattern: ['Global CDR was 0.5']
Count:  29 

Match # 4
Id:  10
Pattern: ['Neurontin.*cause of.*cognitive dysfunction']
Count:  28 

Match # 5
Id:  9
Pattern: ['DIAGNOSES.*Memory loss']
Count:  27 

Match # 6
Id:  16
Pattern: ['patient([^\\.]*)diagnosis([^\\.]*)• Frontotemporal dementia']
Count:  27 

Match # 7
Id:  17
Pattern: ['contributing([^\\.]*)memory([^\\.]*)impairment']
Count:  27 

Match # 8
Id:  12
Pattern: ['cognitive\\s*impairment\\s*only,\\s*resolved']
Count:  26 

Match # 9
Id:  3
Pattern: ['challenges with cognitive deficits']
Count:  10 

Match # 10
Id:  7
Pattern: []
Count:  9 

Match # 11
Id:  14
Pattern: ['(he|she|they) (is) (co-followed(\\s*)) (by the (cognitive disorders|cognitive disorder|cognitive impairment) clinic)']
Count:  4 

Match # 12
Id:  15
Pattern: ['continue([^\\.]*)difficulty([^\\.]*)memory']
Count:  2 

Match # 13
Id:  1
