In [None]:
#  
#  Author: Tanish Tyagi
#  

import sqlite3
import pandas as pd
import numpy as np
from tqdm import tqdm
import regex as re

import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Location of the SQLite file that every query in this notebook reads from.
DB_PATH = r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\SLAT\app\db - Copy.sqlite3"

# Single connection shared by all pd.read_sql_query calls below.
db = sqlite3.connect(DB_PATH)

## Tables of Use

### Data to Feed into Model
1. gui_alwaysregex to get all the always patterns
2. gui_sentenceannotation to get labels
3. gui_sentence to get sentences
4. gui_sentencealwaysregex to get all sentences that matched with always patterns (gives extra 1284 sentences)

### EDA Purposes
1. gui_sentencealwaysregex to see which always expressions get flagged most often
2. gui_seedregex to get list of matched phrases
3. gui_sentenceseedregex to see which expressions in general get flagged most often

## Extracting Data for Model

In [None]:
# Pull every row of gui_alwaysregex (the always patterns) and display the frame.
always_patterns = pd.read_sql_query(sql="Select * from gui_alwaysregex", con=db)
always_patterns

In [None]:
# Snapshot the always patterns to CSV, without the DataFrame index.
always_patterns.to_csv(
    path_or_buf=r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\EDA\Data\always_patterns_8_1.csv",
    index=False,
)

In [None]:
# Pull every row of gui_sentenceannotation (the sentence labels) and display it.
labels = pd.read_sql_query(sql="Select * from gui_sentenceannotation", con=db)
labels

In [None]:
# Pull every row of gui_sentence (the sentence text) and display it.
sentences = pd.read_sql_query(sql="Select * from gui_sentence", con=db)
sentences

In [None]:
# Show only the sentences whose text is shorter than 700 characters.
sentences.loc[sentences['Contents'].str.len().lt(700)]

In [None]:
# Pull every row of gui_sentencealwaysregex (sentence <-> always-pattern matches) and display it.
always_pattern_sentences = pd.read_sql_query(sql="Select * from gui_sentencealwaysregex", con=db)
always_pattern_sentences

## Constructing Model Matrix

In [None]:
# Pull every row of gui_note; used below to map a sentence's Note_id to a PatientID.
note = pd.read_sql_query(sql="Select * from gui_note", con=db)
note

In [None]:
def clean_sequence(seq):
    """Normalise a sentence for modelling.

    Every character listed in ``specials`` (currently only ``/``) is replaced
    by a space, all runs of whitespace are collapsed to a single space
    (leading/trailing whitespace is removed), and the result is lower-cased.

    Parameters
    ----------
    seq : str
        Raw sentence text.

    Returns
    -------
    str
        The cleaned, lower-cased sentence.
    """
    specials = '/'  # characters treated as word separators

    spaced = seq
    for special_char in specials:
        spaced = spaced.replace(special_char, ' ')

    # str.split() with no argument splits on any whitespace run and drops empties.
    words = spaced.split()
    return ' '.join(words).lower()

In [None]:
# Sentence ids for every model-matrix row: the annotated sentences from
# `labels` first, followed by the sentences that matched an always pattern.
sentence_id = labels["Sentence_id"].to_list() + always_pattern_sentences["Sentence_id"].to_list()

# Pre-allocate one placeholder row per sentence id; the extraction cell
# below overwrites these placeholders with real values.
n_rows = len(sentence_id)
model_matrix = pd.DataFrame({
    "patient_id": [0] * n_rows,
    "sequence": [" "] * n_rows,
    "original": [" "] * n_rows,
    "label": [" "] * n_rows,
    "sentence_id": [" "] * n_rows,
})

In [None]:
# Build the model matrix: one row per entry of `sentence_id`.
#
# Fixes relative to the previous version of this cell:
#   * the sentence text is looked up by sentences['id'] (it was filtered on
#     'Note_id', which returns the text of a different sentence or nothing);
#   * each row stores its own sentence id (the whole `sentence_id` list was
#     being written into the row);
#   * `sentence_id` is no longer rebound inside the loop (it was replaced by a
#     Series after the first always-pattern row, which broke the loop bound
#     and every later lookup).

def _first_match(frame, key_col, key, value_col):
    """Return the first `value_col` entry among rows of `frame` where `key_col == key`.

    Raises LookupError (with the offending key) when no row matches.
    """
    hits = frame.loc[frame[key_col] == key, value_col].values
    if len(hits) == 0:
        raise LookupError(f"no row with {key_col} == {key!r}")
    return hits[0]


def _sentence_fields(sid):
    """Return (patient id, cleaned text, raw text) for the sentence whose id is `sid`."""
    original = _first_match(sentences, "id", sid, "Contents")
    note_id = _first_match(sentences, "id", sid, "Note_id")
    patient_id = _first_match(note, "id", note_id, "PatientID")
    return patient_id, clean_sequence(original), original


# The first len(labels) entries of `sentence_id` come from `labels`;
# the remainder come from `always_pattern_sentences`.
n_manual = len(labels)
records = []

# Manually annotated sentences: the label comes from gui_sentenceannotation.
for sid in sentence_id[:n_manual]:
    patient_id, cleaned, original = _sentence_fields(sid)
    label = _first_match(labels, "Sentence_id", sid, "Label")
    records.append((patient_id, cleaned, original, label, sid))

# Auto-annotated sentences: the label is the Annotation of the first always
# pattern recorded as matching the sentence.
for sid in sentence_id[n_manual:]:
    patient_id, cleaned, original = _sentence_fields(sid)
    regex_id = _first_match(always_pattern_sentences, "Sentence_id", sid, "AlwaysRegex_id")
    label = _first_match(always_patterns, "id", regex_id, "Annotation")
    records.append((patient_id, cleaned, original, label, sid))

# Build the frame once from the collected rows instead of assigning row by row.
model_matrix = pd.DataFrame(
    records,
    columns=["patient_id", "sequence", "original", "label", "sentence_id"],
)

In [None]:
# Spot check: raw (uncleaned) text stored in the row labelled 0.
model_matrix.at[0, "original"]

In [None]:
# Spot check: cleaned text stored in the row labelled 0.
model_matrix.at[0, "sequence"]

In [None]:
# Spot check: label stored in the row labelled 0.
model_matrix.at[0, "label"]

In [None]:
# Re-display the always patterns loaded from gui_alwaysregex.
always_patterns

In [None]:
# Convert the textual labels to integer classes. Both the long ("NO"/"NTR"/"YES")
# and the short ("N"/"T"/"Y") spellings map onto the same three classes.
mappings = {"NO" : 0, "NTR" : 1, "YES" : 2, "N": 0, "T": 1, "Y": 2}

# Fail with the offending values spelled out rather than a bare KeyError on
# the first unexpected label.
unknown_labels = set(model_matrix["label"]) - set(mappings)
if unknown_labels:
    raise KeyError(f"labels without a numeric mapping: {sorted(map(str, unknown_labels))}")
model_matrix["label"] = model_matrix["label"].map(mappings)

# Shuffle the rows. The seed is fixed so the row order (and therefore any
# later lookup by row label) is the same on every run.
SHUFFLE_SEED = 0
model_matrix = model_matrix.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [None]:
# Show only the rows whose cleaned sequence is shorter than 500 characters.
model_matrix.loc[model_matrix["sequence"].str.len().lt(500)]

In [None]:
# model_matrix.to_csv(r"../Modeling/Storage/Data/model_matrix.csv", index = False)

In [None]:
def find_always_pattern_matches(df, col, always_pattern_regex):
    """Record, for every row, the text matched by each always pattern.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate. It is modified in place.
    col : str
        Name of the column holding the text to search.
    always_pattern_regex : iterable
        Patterns to search for. Each item may be a pattern string (compiled
        here, case-insensitively) or an already compiled pattern object,
        which is used as given.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with a new/overwritten column
        ``always_pattern_match`` holding, per row, the list of first-match
        strings (one per pattern that matched, in pattern order; empty list
        when nothing matched).
    """
    # Compile once up front: plain strings have no .search method, and
    # compiling inside the row loop would repeat the work for every row.
    compiled_patterns = [
        re.compile(pattern, re.IGNORECASE) if isinstance(pattern, str) else pattern
        for pattern in always_pattern_regex
    ]

    matches_per_row = []
    for seq in df[col]:
        row_matches = []
        for compiled in compiled_patterns:
            match = compiled.search(seq)
            if match is not None:
                row_matches.append(match.group())
        matches_per_row.append(row_matches)

    df["always_pattern_match"] = matches_per_row

    return df

In [None]:
# Read the exported always patterns back from CSV and keep only the raw
# pattern strings from the "Pattern" column (they are not compiled here).
always_pattern_regex = pd.read_csv(r"Data/always_patterns_8_1.csv")["Pattern"].to_list()

In [None]:
# Add the always_pattern_match column by searching the raw sentence text.
model_matrix = find_always_pattern_matches(
    df=model_matrix,
    col="original",
    always_pattern_regex=always_pattern_regex,
)

In [None]:
# Spot check: the first always pattern in the list.
always_pattern_regex[0]

In [None]:
# Work on a copy so model_matrix itself is untouched. The copy has to be
# created here: without it `cpy` is undefined on a fresh kernel.
cpy = model_matrix.copy()

# Drop every row whose raw text matches at least one always pattern
# (case-insensitive). Row labels are deliberately left as they were in
# model_matrix, so a label in `cpy` still identifies the same row there.
for pattern in always_pattern_regex:
    matches = cpy["original"].str.findall(pattern, re.IGNORECASE)
    matched_rows = matches[matches.str.len() != 0].index
    cpy = cpy.drop(index=matched_rows)

print(len(cpy))

In [None]:
# Spot check: raw text of the row labelled 272 in cpy (272 is a hard-coded row label).
cpy.at[272, "original"]

In [None]:
# Spot check: cleaned text of the row labelled 272 in cpy (272 is a hard-coded row label).
cpy.at[272, "sequence"]

In [None]:
# Spot check: label of the row labelled 272 in cpy (272 is a hard-coded row label).
cpy.at[272, "label"]

In [None]:
# Display the model matrix in its current state.
model_matrix

In [None]:
# Count the annotated sentences (rows of `labels`) whose text does not appear
# in cpy["original"].
#
# Sentences are looked up by their `id` column, as in the extraction cell;
# `sentences.at[<Sentence_id>, ...]` would treat the database id as a
# DataFrame index label and can return a different sentence.
contents_by_id = sentences.set_index("id")["Contents"]

# Build the membership set once instead of rebuilding a list per iteration.
remaining_text = set(cpy["original"])

count = sum(
    contents_by_id.at[sid] not in remaining_text
    for sid in labels["Sentence_id"]
)

In [None]:
# Display the number of annotated sentences not found in cpy.
count

In [None]:
# Rows for which no always pattern matched (their match list is empty).
hand_label = model_matrix.loc[model_matrix["always_pattern_match"].str.len().eq(0)]

In [None]:
# Row labels (in model_matrix) of the rows without an always-pattern match.
x = hand_label.index.to_list()

In [None]:
# Sentence ids of all annotated sentences, as a plain list.
y = labels["Sentence_id"].to_list()

In [None]:
# Spot check: text of the sentence whose id is y[1]. The row is selected via
# the `id` column (y holds sentence ids, not DataFrame index labels).
sentences.loc[sentences["id"] == y[1], "Contents"].iloc[0]

In [None]:
# For every row without an always-pattern match, look for an annotated
# sentence with identical raw text; print the text of rows that have none.
#
# Fixes relative to the previous version of this cell:
#   * ids were removed from `y` while indexing it over a range computed
#     before the removal, which runs past the end of the shortened list;
#   * sentences are looked up by their `id` column rather than by DataFrame
#     index label;
#   * the inner progress bar (one per outer iteration) is gone.
contents_by_id = sentences.set_index("id")["Contents"]

for row_label in tqdm(x):
    mm = model_matrix.at[row_label, "original"]

    # Position in `y` of the first still-unpaired sentence with the same text.
    match_pos = next(
        (pos for pos, sid in enumerate(y) if contents_by_id.at[sid] == mm),
        None,
    )

    if match_pos is None:
        print(mm)
    else:
        # Consume the id so it is not paired with another row.
        # NOTE: this mutates `y`; re-create `y` before re-running this cell.
        del y[match_pos]

## Verification of Results

In [None]:
# Verification: the cleaned text of the first annotated sentence should
# appear in the model matrix.
#
# The sentence is selected by sentences['id'] (it was filtered on 'Note_id',
# which picks a sentence belonging to the note with that number instead), and
# the id is read straight from `labels` rather than from `sentence_id`.
first_sentence_id = labels["Sentence_id"].iloc[0]
first_contents = sentences.loc[sentences["id"] == first_sentence_id, "Contents"].values[0]

clean_seq = clean_sequence(str(first_contents))
model_matrix[model_matrix['sequence'] == clean_seq]

In [None]:
# Verification: label recorded for the first annotated sentence.
#
# The sentence is selected by sentences['id'] (it was filtered on 'Note_id'),
# and the id is read straight from `labels` rather than from `sentence_id`.
first_sentence_id = labels["Sentence_id"].iloc[0]
first_contents = str(sentences.loc[sentences["id"] == first_sentence_id, "Contents"].values[0])

# Id of the first sentence carrying exactly that text, then its first label.
s_id = sentences.loc[sentences["Contents"] == first_contents, "id"].values[0]
labels[labels['Sentence_id'] == s_id]['Label'].values[0]

In [None]:
# Write the model matrix to CSV for the modelling step. No `index` argument
# is given, so the DataFrame index is written as the first column.
model_matrix.to_csv(path_or_buf="../Modeling/Data/input.csv")

## EDA

In [None]:
# Seed-regex definitions and the sentence <-> seed-regex match table.
regex_phrases = pd.read_sql_query(sql="Select * from gui_seedregex", con=db)
phrase_matches = pd.read_sql_query(sql="Select * from gui_sentenceseedregex", con=db)

# SeedRegex_id -> number of rows in the match table, most frequent first.
freq = phrase_matches['SeedRegex_id'].value_counts().to_dict()

# Print a ranked report: rank, regex id, its pattern text, and its count.
counter = 1
for key, value in freq.items():
    pattern = regex_phrases.loc[regex_phrases['id'] == key, 'Pattern'].values
    print("Match #", counter)
    print("Id: ", key)
    print("Pattern:", pattern)
    print("Count: ", value, "\n")
    counter += 1

In [None]:
# Sentence <-> always-pattern match table.
always_pattern_matches = pd.read_sql_query(sql="Select * from gui_sentencealwaysregex", con=db)

# AlwaysRegex_id -> number of rows in the match table, most frequent first.
freq = always_pattern_matches['AlwaysRegex_id'].value_counts().to_dict()

# Print a ranked report: rank, pattern id, its pattern text, and its count.
counter = 1
for key, value in freq.items():
    pattern = always_patterns.loc[always_patterns['id'] == key, 'Pattern'].values
    print("Match #", counter)
    print("Id: ", key)
    print("Pattern:", pattern)
    print("Count: ", value, "\n")
    counter += 1