In [138]:
import sqlite3
import pandas as pd
import numpy as np
from tqdm import tqdm
import regex as re

import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import warnings
warnings.filterwarnings('ignore')

In [139]:
db = sqlite3.connect(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\SLAT\app\db - Copy.sqlite3")

## Tables of Use

### Data to Feed into Model
1. gui_alwaysregex to get all the always patterns
2. gui_sentenceannotation to get labels
3. gui_sentence to get sentences
4. gui_sentence_always_regex to get all sentences that matched with always patterns (gives extra 1284 sentences)

### EDA Purposes
1. gui_sentencealwaysregex to see which always expressions get flagged most often
2. gui_seedregex to get list of matched phrases
2. gui_sentenceseedregex to see which expressions in general get flagged most often

## Extracting Data for Model

In [140]:
always_patterns = pd.read_sql_query("Select * from gui_alwaysregex", db)
always_patterns

Unnamed: 0,id,Pattern,Annotation,CreatedBy_id
0,2,MOCA.*22/30,Y,3
1,3,has memory difficulties,Y,3
2,4,poor.*working memory,Y,3
3,5,MOCA.*28/30,Y,3
4,6,Reports.*short.term.*memory,Y,3
5,7,IMPRESSION.*Memory loss,Y,3
6,8,cognitive deficits,Y,3
7,10,MOCA.*13/30,Y,3
8,12,No\s*memory\s*concerns,N,3
9,13,signs.*suspicious.*dementia,Y,3


In [26]:
always_patterns.to_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\EDA\Data\always_patterns_8_1.csv", index = False)

In [141]:
labels = pd.read_sql_query("Select * from gui_sentenceannotation", db)
#labels = labels.sort_values(["Sentence_id"])
labels

Unnamed: 0,id,Label,Annotator_id,Sentence_id
0,1,NTR,3,2990
1,2,YES,3,3003
2,3,YES,3,3009
3,4,NTR,3,3013
4,5,NTR,3,3035
5,6,NTR,3,3045
6,7,NTR,3,3046
7,8,NTR,3,3051
8,9,NTR,3,3054
9,10,NTR,3,3055


In [142]:
sentences = pd.read_sql_query("Select * from gui_sentence", db)
sentences

Unnamed: 0,id,Contents,Note_id
0,1,"""ession alone in the meta-analysis. We discuss...",1
1,2,""" ------- sis over her R bra strap, which was ...",2
2,3,""" ------- l obstruction ASSOCIATED DIAGNOSES S...",3
3,4,""" ------- (156 lb) 09/19/17 71.4 kg (157 lb 6...",4
4,5,"""ut answer. Optho notes from recent outpatient...",5
...,...,...,...
279219,279220,"""------- se \""she took a break\"" and wants to ...",274442
279220,279221,"""------- all of her medications, including as...",274443
279221,279222,"""------- ? Patient, Chart, daughter Olga Mode ...",274444
279222,279223,"""------- ? Patient, Chart, daughter Olga Mode ...",274445


In [143]:
sentences[sentences['Contents'].str.len() < 700]

Unnamed: 0,id,Contents,Note_id
24,25,"""---------------------------------------------...",25
41,42,"""---------------------------------------------...",42
57,58,"""---------------------------------------------...",58
60,61,"""---------------------------------------------...",61
71,72,"""---------------------------------------------...",72
...,...,...,...
238814,238815,"""---------------------------------------------...",234037
238824,238825,"""---------------------------------------------...",234047
238826,238827,"""---------------------------------------------...",234049
238867,238868,"""---------------------------------------------...",234090


In [144]:
always_pattern_sentences = pd.read_sql_query("Select * from gui_sentencealwaysregex", db)
always_pattern_sentences

Unnamed: 0,id,AlwaysRegex_id,Sentence_id
0,53,2,377
1,54,2,20965
2,55,2,532
3,56,2,29481
4,57,2,2909
...,...,...,...
9497,11041,39,277900
9498,11042,39,277901
9499,11043,39,277905
9500,11044,39,278236


## Constructing Model Matrix

In [145]:
note = pd.read_sql_query("Select * from gui_note", db)
note

Unnamed: 0,id,PatientEncounterID,NoteID,Date,PatientID
0,1,3242118020.0,2370049295,2019-03-14,Z6352398
1,2,3181499472.0,1916263837,2018-03-28,Z6352922
2,3,3018789480.0,267681869,2015-01-09,Z6353136
3,4,3172504002.0,1682366001,2017-10-03,Z6353461
4,5,3165982636.0,1853759390,2018-02-07,Z6353764
...,...,...,...,...,...
274441,274442,3285425453.0,3960511043,2020-04-28,Z16320629
274442,274443,3305416153.0,4585859437,2020-09-22,Z16320629
274443,274444,3327429844.0,5203067949,2021-01-25,Z16320629
274444,274445,3349754670.0,5673786544,2021-04-20,Z16320629


In [146]:
def clean_sequence(seq):
    #getting rid of special characters
    specials = '/' #etc
    seq_no_special_chars = seq.translate(str.maketrans(specials, ' '*len(specials)))
            
    #having only 1 space between words
    n = 1
    seq_no_spaces = (' '*n).join(seq_no_special_chars.split())
    
    return seq_no_spaces.lower()

In [147]:
#getting sentence ids
sentence_id = labels["Sentence_id"].to_list()
#print(len(sentence_id))
sentence_id += always_pattern_sentences["Sentence_id"].to_list()
#print(len(sentence_id))

# defining model matrix and setting dimensions
model_matrix = pd.DataFrame(columns = ["patient_id", "sequence", "original", "label"])
model_matrix["patient_id"] = [0] * int(len(sentence_id))
#model_matrix["note_id"] = [0] * int(len(sentence_id))
model_matrix["sequence"] = [" "] * int(len(sentence_id))
model_matrix["original"] = [" "] * int(len(sentence_id))
model_matrix["label"] = [" "] * int(len(sentence_id))
model_matrix["sentence_id"] = [" "] * int(len(sentence_id))

In [148]:
#extracting necessary data
idx = 0
for i in range(int(len(labels["Sentence_id"].to_list()))):
    #if (i < len(labels["Sentence_id"].to_list())):
    # getting all info from sentences that were manually annotated
    curr_note_id = sentences[sentences['id'] == sentence_id[i]]['Note_id'].values
    curr_patient_id = note.loc[note['id'] == curr_note_id[0]]['PatientID'].values
    curr_seq = sentences[sentences['Note_id'] == sentence_id[i]]['Contents'].values
    curr_label = labels[labels["Sentence_id"] == sentence_id[i]]['Label'].values
    original = curr_seq[0]
    curr_seq[0] = clean_sequence(curr_seq[0])
    # print(labels[labels["Sentence_id"] == sentence_id[i]]["Sentence_id"])
    model_matrix.iloc[i] = (curr_patient_id[0], curr_seq[0], original, curr_label[0], sentence_id)

    idx += 1

#print(idx) 

j = 1    
while (idx < len(sentence_id)): 
    #getting info from sentences that got auto annotated through always pattern matches
    #curr_sentence_id = always_pattern_sentences[always_pattern_sentences['id'] == j]['Sentence_id'].values
    curr_seq = sentences[sentences['id'] == sentence_id[idx]]['Contents'].values
    #print(curr_seq[0])
    curr_note_id = sentences[sentences['id'] == sentence_id[idx]]['Note_id'].values
    curr_patient_id = note.loc[note['id'] == curr_note_id[0]]['PatientID'].values
    #print(curr_patient_id)
    curr_always_regex_id = always_pattern_sentences[always_pattern_sentences['Sentence_id'] == sentence_id[idx]]['AlwaysRegex_id'].values
    #print(curr_always_regex_id)
    curr_label = always_patterns[always_patterns['id'] == curr_always_regex_id[0]]['Annotation'].values
    original = curr_seq[0]
    curr_seq[0] = clean_sequence(curr_seq[0])
    sentence_id = always_pattern_sentences[always_pattern_sentences['Sentence_id'] == sentence_id[idx]]["Sentence_id"]
    model_matrix.iloc[idx] = (curr_patient_id[0], curr_seq[0], original, curr_label[0], sentence_id)

    j += 1
    idx += 1

# yes = model_matrix[model_matrix['label'] == 3]
# neither = model_matrix[model_matrix['label'] == 2]
# no = model_matrix[model_matrix['label'] == 1]

# yes = yes.to_csv("yes.csv")
# neither = neither.to_csv("neither.csv")
# no = no.to_csv("no.csv"

In [150]:
model_matrix.at[0, "original"]

'" ------- phenomenology of sx (and pt preference) she would very unllikely benefit from medication. Best plan going forward will be to work on alliance, refer to SW in her area for support, and encourage treatment of medical issues, particularly weight loss and obesity. Will also need to r/o neurocognitive and other organic contributors to sx - needs a psychosis workup to rule out non-psych etiologies (p ------- ganic cause). Plan: - will pursue referral to local social services - RTC in 6 weeks - will perform MoCA, labs, suggest imaging at that time - attg of record Dr Felicia Smith Outpatient Attending Note Pt seen and examined with Dr. King. I agree with his history, evaluation, assessment and plan in detail. The patient has what appears to be a long history of paranoia--Dr. King is attemp"'

In [151]:
model_matrix.at[0, "sequence"]

'" ------- phenomenology of sx (and pt preference) she would very unllikely benefit from medication. best plan going forward will be to work on alliance, refer to sw in her area for support, and encourage treatment of medical issues, particularly weight loss and obesity. will also need to r o neurocognitive and other organic contributors to sx - needs a psychosis workup to rule out non-psych etiologies (p ------- ganic cause). plan: - will pursue referral to local social services - rtc in 6 weeks - will perform moca, labs, suggest imaging at that time - attg of record dr felicia smith outpatient attending note pt seen and examined with dr. king. i agree with his history, evaluation, assessment and plan in detail. the patient has what appears to be a long history of paranoia--dr. king is attemp"'

In [152]:
model_matrix.at[0, "label"]

'NTR'

In [153]:
always_patterns

Unnamed: 0,id,Pattern,Annotation,CreatedBy_id
0,2,MOCA.*22/30,Y,3
1,3,has memory difficulties,Y,3
2,4,poor.*working memory,Y,3
3,5,MOCA.*28/30,Y,3
4,6,Reports.*short.term.*memory,Y,3
5,7,IMPRESSION.*Memory loss,Y,3
6,8,cognitive deficits,Y,3
7,10,MOCA.*13/30,Y,3
8,12,No\s*memory\s*concerns,N,3
9,13,signs.*suspicious.*dementia,Y,3


In [36]:
#converting yes,no,ntr labels to numerical equivalents
mappings = {"NO" : 0, "NTR" : 1, "YES" : 2, "N": 0, "T": 1, "Y": 2}
model_matrix.label = [mappings[item] for item in model_matrix.label]

model_matrix = model_matrix.sample(frac=1).reset_index(drop=True) #shuffling model_matrix

In [130]:
model_matrix[model_matrix["sequence"].str.len() < 500]

Unnamed: 0,patient_id,sequence,original,label,sentence_id
1,0,,,,
2,0,,,,
3,0,,,,
4,0,,,,
5,0,,,,
...,...,...,...,...,...
9522,0,,,,
9523,0,,,,
9524,0,,,,
9525,0,,,,


In [20]:
# model_matrix.to_csv(r"../Modeling/Storage/Data/model_matrix.csv", index = False)

In [63]:
def find_always_pattern_matches(df, col, always_pattern_regex):
    a = []
    for seq in (df[col]):
        curr = []
        for regex_pattern in always_pattern_regex:
            match = regex_pattern.search(seq)
            if (match is not None):
                curr.append(match.group())
        a.append(curr)
    
    df["always_pattern_match"] = a
    
    return df

In [87]:
always_pattern_regex = pd.read_csv(r"Data/always_patterns_8_1.csv")
always_pattern_regex = always_pattern_regex["Pattern"].to_list()
#for i in range(len(always_pattern_regex)):
#    always_pattern_regex[i] = re.compile(always_pattern_regex[i], re.IGNORECASE)
#    model_matrix["original"].str.findall(always_pattern_regex[i])

In [66]:
model_matrix = find_always_pattern_matches(model_matrix, "original", always_pattern_regex)

In [88]:
always_pattern_regex[0]

'MOCA.*22/30'

In [120]:
# cpy = model_matrix.copy()
for i in range(len(always_pattern_regex)):
    matches = cpy["original"].str.findall(always_pattern_regex[i], re.IGNORECASE)
    matches = matches[matches.str.len() != 0]
    #print(matches.index)
    cpy = cpy.drop(list(matches.index))
    cpy.reset_index(drop = True)
print(len(cpy))

22


In [154]:
cpy.at[272, "original"]

'"operative care. She has past medical history of severe mitral regurgitation She was admitted to MGH on 6/27/16 and discharged on 7/4/16. On 6/27/16 she underwent mechanical MVR. Patient has not been readmitted to a hospital within 30 days of surgical date or within 30 days of discharge date. Today in clinic, she states she has been at rehab, but feels that she can return home. There is an Alzheimer\'s patient who has been disrupting her at night at rehab, and she cannot sleep well. She is using a walker today, but is able to walk well on her own. She has not had CP. No problems with her sternal incision. She has been lightheaded with position changes and SBP at rehab has been high 70s to low 100s. She has not had LE edema. She is on very low dose metoprolol of 6.25 mg daily, and lasi"'

In [155]:
cpy.at[272, "sequence"]

'"operative care. she has past medical history of severe mitral regurgitation she was admitted to mgh on 6 27 16 and discharged on 7 4 16. on 6 27 16 she underwent mechanical mvr. patient has not been readmitted to a hospital within 30 days of surgical date or within 30 days of discharge date. today in clinic, she states she has been at rehab, but feels that she can return home. there is an alzheimer\'s patient who has been disrupting her at night at rehab, and she cannot sleep well. she is using a walker today, but is able to walk well on her own. she has not had cp. no problems with her sternal incision. she has been lightheaded with position changes and sbp at rehab has been high 70s to low 100s. she has not had le edema. she is on very low dose metoprolol of 6.25 mg daily, and lasi"'

In [156]:
cpy.at[272, "label"]

1

In [129]:
model_matrix

Unnamed: 0,patient_id,sequence,original,label,sentence_id
0,Z12144959,""" ------- phenomenology of sx (and pt preferen...",""" ------- phenomenology of sx (and pt preferen...",NTR,id Label Annotator_id Sentence_id 0 1 ...
1,0,,,,
2,0,,,,
3,0,,,,
4,0,,,,
...,...,...,...,...,...
9522,0,,,,
9523,0,,,,
9524,0,,,,
9525,0,,,,


In [121]:
count = 0
for i in range(len(labels)):
    seq = sentences.at[labels.at[i, "Sentence_id"], "Contents"]
    if (seq not in cpy["original"].to_list()):
        count += 1

" -------  normal associations\u00fd Thought content:? appropriate to subjects discussed; no psychosis Perceptions: ?normal \u00fd Cognitive exam:? Alert and attentive with apparent good memory of recent events MOCA v7.1 22/30 (4/5 exec fxcn, 1/2 digit span, 0/1 A test, 1/3 serial 7s, 3/5 delayed recall) \u00fd\u00fd \u00fdSuici ------- ration/attention and decreased executive function. These subjective complaints are substantiated by MOCA and reveal difficulties in the frontal executive domain which may be related to chemotherapy, prior ------- sider systemic inflammatory process based on her elevated rheumatoid factor which might also impact cognition. Patient does not appear to meet full criteria for major depression at this time as her decreased engagement in activities appears to be more related to physical then"
"-------------------------------------------------------------------------------------------------------------Hi, very intense and moving visit with MS. S and her husband

In [118]:
count

20

In [68]:
hand_label = model_matrix[model_matrix["always_pattern_match"].str.len() == 0]

In [69]:
x = hand_label.index.to_list()

In [58]:
y = labels["Sentence_id"].to_list()

In [83]:
sentences.at[y[1],"Contents"]

'"-------------------------------------------------------------------------------------------------------------Hi, very intense and moving visit with MS. S and her husband. Agree with concerns re: cogn decline and considering MCI v dementia. Husb feels fxn intact but sign\'t deficits on MoCA raises ?. So thrilled they scheduled appt wit you sarah. Complex psych social dynamics here, ? etoh abuse, past trauma, ? Paranoia life long amplified by progressive cog dysfunction. Pt got very defensive about etoh and other counseling but agreed to try PT and see you. Trying to go slowly and confirmed husband has support from his PC team. Best, laura ---------------------------------------------------------------------------------------------------------"'

In [84]:
for i in tqdm(range(len(x))):
    mm = model_matrix.at[x[i], "original"]
    found_match = False
    for j in tqdm(range(len(y))):
        seq = sentences.at[y[j], "Contents"]
        if (mm == seq):
            found_match = True
            y.remove(y[j])
    if (found_match is False):
        print(mm)

  0%|                                                                                           | 0/22 [00:00<?, ?it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 24/24 [00:00<00:00, 24122.52it/s][A

 96%|███████████████████████████████████████████████████████████████████████████▋   | 23/24 [00:00<00:00, 23106.35it/s][A
  5%|███▊                                                                               | 1/22 [00:00<00:00, 55.80it/s]

"operative care. She has past medical history of severe mitral regurgitation She was admitted to MGH on 6/27/16 and discharged on 7/4/16. On 6/27/16 she underwent mechanical MVR. Patient has not been readmitted to a hospital within 30 days of surgical date or within 30 days of discharge date. Today in clinic, she states she has been at rehab, but feels that she can return home. There is an Alzheimer's patient who has been disrupting her at night at rehab, and she cannot sleep well. She is using a walker today, but is able to walk well on her own. She has not had CP. No problems with her sternal incision. She has been lightheaded with position changes and SBP at rehab has been high 70s to low 100s. She has not had LE edema. She is on very low dose metoprolol of 6.25 mg daily, and lasi"





IndexError: list index out of range

## Verification of Results

In [32]:
clean_seq = clean_sequence(str(sentences[sentences['Note_id'] == sentence_id[0]]['Contents'].values[0]))
model_matrix[model_matrix['sequence'] == clean_seq]

Unnamed: 0,patient_id,sequence,label,len
2324,Z12144959,phenomenology of sx and pt preference she woul...,1,752


In [31]:
s_id = sentences[sentences['Contents'] == str(sentences[sentences['Note_id'] == sentence_id[0]]['Contents'].values[0])]['id'].values[0]
labels[labels['Sentence_id'] == s_id]['Label'].values[0]

'NTR'

In [62]:
#exporting data to csv's
model_matrix.to_csv("../Modeling/Data/input.csv")

## EDA

In [None]:
regex_phrases = pd.read_sql_query("Select * from gui_seedregex", db)
phrase_matches = pd.read_sql_query("Select * from gui_sentenceseedregex", db)
freq = phrase_matches['SeedRegex_id'].value_counts().to_dict()
counter = 1
for key, value in freq.items():
    print("Match #", counter)
    print("Id: ", key)
    print("Pattern:", regex_phrases[regex_phrases['id'] == key]['Pattern'].values)
    print("Count: ", value, "\n")
    counter += 1

In [None]:
# Printing out frequencies of always pattern matches 
always_pattern_matches = pd.read_sql_query("Select * from gui_sentencealwaysregex", db)
freq = always_pattern_matches['AlwaysRegex_id'].value_counts().to_dict()
counter = 1
for key, value in freq.items():
    print("Match #", counter)
    print("Id: ", key)
    print("Pattern:", always_patterns[always_patterns['id'] == key]['Pattern'].values)
    print("Count: ", value, "\n")
    counter += 1