In [None]:
## NOTE: Before running this notebook, make sure the Avante instance is turned on.

In [None]:
## Select the values below to match your experiment.
client = 'avante'
start_date = '2021-01-01'
end_date = '2021-11-23'

# Progress-note types kept for the analysis; every other type is filtered out.
# Each entry needs its own trailing comma: two adjacent string literals are
# silently concatenated by Python into a single (never-matching) name.
progressnote_types = [
    '* Skilled Nursing Note',
    'X Social Service Interview',
    '* Incident/Accident Note',
    'Avante Daily Skilled Note',
    '* AVANTE Admission Plan of Care',
    'Alert Note',
    '* Behavior',
    'MDS Note',
    'eMar - Shift Level Administration Note',
    '* Social Services Admission / Readmission',
    '* Family/MPOA/Responsible Party Contact',
    '* Activity Note',
    'Speech Therapy Screen',
    '* Education(Family/Resident)',
    '* Dietary RD/DTR Progress Note',
    '* Dietary CDM Progress Note',
    '* Social Services Note',
]

In [None]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns',None)
import json
import os
import boto3
from eliot import log_message
from sqlalchemy import create_engine
from sqlalchemy.engine.url import URL
from sqlalchemy import text

##### Add any extra preceding or following negation words here.
##### Example: to ignore "history of yelling", add 'history' to the preceding negations.

In [None]:
from negspacy.negation import Negex
from negspacy.termsets import en_clinical
import re
import string
# Aliases of the term lists stored inside negspacy's `en_clinical` termset.
# They are extended in place further down, so the termset's own lists grow too.
preceding_negations = en_clinical['preceding_negations']
following_negations = en_clinical['following_negations']

# Project-specific cues that should negate a keyword when they appear BEFORE it.
custom_preceding_negations = [
    '0', 'assessment', 'consent', 'decrease', 'decreased', 'decreasing',
    'deny', 'family', 'mild', 'mildly', 'minimal', 'n/c', 'neg', 'negative',
    'neither', 'no s/s', 'no sign', 'no signs', 'nor', 'perform',
    'precaution', 'precautions', 'prevent', 'prevention', 'history',
    'quarantine', 'recovered', 'regulations', 'restriction', 'restrictions',
    'retesting', 'screen', 'slight', 'swabbed', 'vac', 'w/o sign',
    'w/o signs', 'w/o', 'worsening', 'zero', 'free from sx/sx',
]

# Project-specific cues that should negate a keyword when they appear AFTER it.
custom_following_negations = [
    'assessment', 'care', 'cares', 'consent', 'crisis', 'decrease',
    'decreased', 'decreasing', 'family', 'guidelines', 'mild', 'mildly',
    'minimal', 'neg', 'negative', 'none', 'note', 'outbreak', 'pandemic',
    'perform', 'precaution', 'precautions', 'prevent', 'prevention',
    'protocol', 'quarantine', 'recovered', 'regulations', 'restriction',
    'restrictions', 'retesting', 'screen', 'swabbed', 'test', 'testing',
    'vac',
]

# In-place extension (same effect as `+=` on a list).
# NOTE(review): re-running this cell appends the custom terms again.
preceding_negations.extend(custom_preceding_negations)
following_negations.extend(custom_following_negations)

##### Connecting to the database.

In [None]:
class DbEngine(object):
    """
    Build SQLAlchemy engines from credentials kept in AWS Secrets Manager.

    One Secrets Manager client is created per instance and reused for every
    secret lookup.
    """

    def __init__(self, region_name='us-east-1'):
        """
        :param region_name: AWS region used for the Secrets Manager client
        """
        self.session = boto3.session.Session()
        self.secrets_client = self.session.client(
            service_name='secretsmanager',
            region_name=region_name
        )

    @staticmethod
    def _make_url(**url_parts):
        """
        Build a SQLAlchemy URL from keyword parts.

        `URL.create()` is the supported constructor from SQLAlchemy 1.4 onwards
        (calling `URL(...)` directly is deprecated there); older releases only
        offer the plain constructor, so fall back to it when `create` is absent.
        """
        url_factory = getattr(URL, 'create', URL)
        return url_factory(**url_parts)

    def get_secrets(self, secret_name):
        """
        Fetch one secret and decode its JSON payload.

        :param secret_name: id of the secret in AWS Secrets Manager
        :return: dict parsed from the secret's `SecretString`
        """
        log_message(message_type='info', action_type='get_secrets', secret_name=secret_name)
        secret = self.secrets_client.get_secret_value(SecretId=secret_name)
        return json.loads(secret['SecretString'])

    def get_postgresdb_engine(self):
        """
        Create an engine for the Postgres database described by the
        `prod-saivadb` secret.

        :return: SQLAlchemy engine (driver `postgresql`)
        """
        log_message(message_type='info', action_type='connect_to_postgresdb', client='SaivaDB')
        # Fetch credentials from AWS Secrets Manager
        postgresdb_info = self.get_secrets(secret_name='prod-saivadb')
        saivadb_url = self._make_url(
            drivername='postgresql',
            username=postgresdb_info['username'],
            password=postgresdb_info['password'],
            host=postgresdb_info['host'],
            port=postgresdb_info['port'],
            database=postgresdb_info['dbname'],
        )
        return create_engine(saivadb_url, echo=False)

    def get_sqldb_engine(self, clientdb_name):
        """
        Create an engine for a client SQL Server database.

        'avante' uses its own secret (`avantedb`), including the database name
        stored in it. Every other client shares the `prod-sqlserver` secret and
        the database name is replaced with `clientdb_name`.

        :param clientdb_name: client / database name
        :return: SQLAlchemy engine (driver `mssql+pyodbc`)
        """
        log_message(message_type='info', action_type='connect_to_sqldb', client=clientdb_name)
        # Fetch credentials from AWS Secrets Manager
        if clientdb_name == 'avante':
            sqldb_info = self.get_secrets(secret_name='avantedb')
        else:
            sqldb_info = self.get_secrets(secret_name='prod-sqlserver')
            sqldb_info['dbname'] = clientdb_name

        client_sqldb_url = self._make_url(
            drivername='mssql+pyodbc',
            username=sqldb_info['username'],
            password=sqldb_info['password'],
            host=sqldb_info['host'],
            port=sqldb_info['port'],
            database=sqldb_info['dbname'],
            query={'driver': 'ODBC Driver 17 for SQL Server'},
        )
        return create_engine(client_sqldb_url, echo=False)

    def verify_connectivity(self, engine):
        """
        Run a trivial query to prove the engine can reach its database.

        The query runs on an explicit connection with a `text()` statement:
        `Engine.execute()` with a raw string is gone in SQLAlchemy 2.0. It is
        also executed outside of any `assert`, so it still runs under
        `python -O`. Any connection or driver error propagates to the caller.

        :param engine: SQLAlchemy engine to check
        """
        with engine.connect() as connection:
            connection.execute(text('select 1')).fetchall()



In [None]:
# Query for every progress-note row created inside the configured date window.
sql_query = (
    "\n"
    "select * from view_ods_progress_note\n"
    f"where createddate between '{start_date}' and '{end_date}'\n"
)

* Read the data and join the progress-note fragments in the correct order.
* Keep only the selected progress-note types.

In [None]:
# Connect to the database of the client selected in the configuration cell
# (previously the name was hard-coded here, so changing `client` had no effect).
engine = DbEngine()
client_engine = engine.get_sqldb_engine(client)
df = pd.read_sql(sql_query, con=client_engine)
df.columns = df.columns.str.lower()

# Keep rows that actually carry text and belong to one of the selected note types.
df = df[df['notetext'].notna() & df['progressnotetype'].isin(progressnote_types)]

# Put the fragments of every note in reading order before they are joined.
# A new frame is assigned instead of sorting the filtered slice in place.
df = df.sort_values(by=['facilityid', 'patientid', 'createddate', 'progressnoteid',
                        'progressnotetype', 'section', 'sectionsequence', 'notetextorder'])

# One output row per note section: its fragments are concatenated in the order above.
# NOTE(review): groupby drops rows whose key columns are null by default -
# confirm none of the grouping columns (e.g. `section`) can be null.
grp_cols = ['facilityid', 'patientid', 'createddate', 'progressnoteid', 'progressnotetype', 'section']
grouped_df = df.groupby(grp_cols).agg({'notetext': ''.join}).reset_index()

grouped_df.head()

In [None]:
# Amount of data: (rows, columns) of the grouped notes frame.
grouped_df.shape

In [None]:
# Narrow the frame to the columns used downstream and order the notes by creation date.
grouped_df = grouped_df[
    ['facilityid', 'patientid', 'createddate', 'progressnotetype', 'notetext']
].sort_values(by=['createddate'])

In [None]:
# Turn every keyword into a spaCy token pattern for the entity ruler.
from keywords import KEYWORDS_LIST
KEYWORDS_LIST = [keyword.replace('\n', '') for keyword in KEYWORDS_LIST]

# Split a keyword into word tokens and single punctuation characters.
# - raw string: `\w` in a normal string literal is an invalid escape sequence.
# - re.escape: inside a character class the raw punctuation string contains
#   `\]`, which is read as an escaped `]`, so the backslash itself was lost.
token_regex = re.compile(rf"[\w']+|[{re.escape(string.punctuation)}]")

pattern_label_list = []
for keyword in KEYWORDS_LIST:
    # `LOWER` is compared with the lower-cased token text, so the pattern
    # value has to be lower case as well or it can never match.
    token_patterns = [{'LOWER': token.lower()} for token in token_regex.findall(keyword)]
    if not token_patterns:
        # Blank keyword (e.g. an empty line in the keyword file): nothing to match.
        continue
    pattern_label_list.append({'label': 'ENTITY', 'pattern': token_patterns})

# Show a summary and a small sample instead of dumping the whole list.
print(f'{len(pattern_label_list)} patterns built from {len(KEYWORDS_LIST)} keywords')
pattern_label_list[:5]

In [None]:
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
# English spaCy pipeline object; the cells below add the sentencizer,
# entity ruler and negex components to it.
nlp = English()

In [None]:
# Rule-based sentence splitter.
sentencizer = nlp.add_pipe('sentencizer')
# Treat a line break as a sentence boundary too. `set.union()` returns a new
# set and leaves the component's own set untouched, so the set has to be
# mutated with `add()` for the newline to take effect.
sentencizer.punct_chars.add('\n')

# Add EntityRuler - adding labels and patterns
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(pattern_label_list)

In [None]:
# negspacy already knows most negation cues; a few more are listed here explicitly.
negation_words = [
    "Zero", "Deny", "Denies", "Denied", "No", "Not", "None", "No one",
    "Nobody", "Nothing", "Neither", "Nowhere", "Never", "Hardly", "Scarcely",
    "Barely", "Doesn't", "Isn't", "Wasn't", "Shouldn't", "Wouldn't",
    "Couldn't", "Won't", "Can't", "Don't", "0", "quarantine", "no", "history",
]
negation_words.extend(custom_preceding_negations + custom_following_negations)

# NOTE(review): the combined word list is passed as `chunk_prefix`, not as
# `neg_termset` - confirm this is the intended negex setting.
nlp.add_pipe(
    "negex",
    config={"chunk_prefix": negation_words + preceding_negations + following_negations},
)

In [None]:
# adds span tags around the keywords.

def add_tags(clean_note, pos_index_list):
    """
    Wrap character spans of a note in highlighting <span> tags.

    :param clean_note: note text to annotate
    :param pos_index_list: sequence of (start, end) character offsets into
        `clean_note`; entries whose start is None are skipped
    :return: the note with an opening tag inserted at every start offset and a
        closing tag at every end offset

    The spans are processed from last to first, so inserting tags for a later
    span does not shift the offsets of the spans still to be processed.
    """
    open_tag = "<span class='yellow'>"
    close_tag = '</span>'
    for span in reversed(pos_index_list):
        start = span[0]
        if start is None:
            continue
        end = span[1]
        # The closing tag goes in first: it sits behind `start`, so the start
        # offset is still valid for the second insertion.
        clean_note = f'{clean_note[:end]}{close_tag}{clean_note[end:]}'
        clean_note = f'{clean_note[:start]}{open_tag}{clean_note[start:]}'
    return clean_note

In [None]:
# Cleans the note text.
# Extracts the keywords that occur in positive (non-negated) form.
# Drops keywords that occur in both positive and negative form.
# Removes duplicate keywords.
# Keeps only the sentences that contain a highlighted keyword.
def extract_entities(row, verbose=False):
    """
    Find the positively mentioned keywords of one note and highlight them.

    :param row: mapping / Series with a 'notetext' entry
    :param verbose: when True, print the clipped note and its keywords for
        every row that has at least one positive keyword (off by default so
        that note text is not dumped into the notebook output)
    :return: the same `row` with four entries set:
        - 'clean_note': whitespace-normalised note, with span tags when
          keywords were found
        - 'index_list': (start, end) offsets of the first occurrence of every
          kept keyword, or None when nothing was found
        - 'word_list': the kept keywords (lower case), or None
        - 'clipped_note': the sentences of the tagged note that contain a tag,
          concatenated, or None

    All four entries are written for every row, so the result always has the
    same columns even when no row contains a positive keyword.
    """
    # Collapse every run of spaces; an exact two-space replace leaves runs of
    # three or more spaces partly in place.
    clean_note = re.sub(' {2,}', ' ', row['notetext'])
    doc = nlp(clean_note)

    positive_hits = []      # (lower-cased keyword, (start_char, end_char)) in document order
    negated_words = set()   # lower-cased keywords seen in negated form
    for ent in doc.ents:
        keyword = ent.text.lower()
        # 'non-compliant' is always treated as a positive finding. The check
        # is an equality test on purpose: `text in 'non-compliant'` would be a
        # substring test and also accept fragments such as 'non'.
        if not ent._.negex or keyword == 'non-compliant':
            positive_hits.append((keyword, (ent.start_char, ent.end_char)))
        else:
            negated_words.add(keyword)

    # Neutral words occur in both positive and negative form
    # ("patient tested for edema. No edema found." -> edema is neutral).
    # Every positive occurrence of a neutral word is dropped, and of the
    # remaining keywords only the first occurrence is kept.
    word_list, index_list = [], []
    for keyword, span in positive_hits:
        if keyword in negated_words or keyword in word_list:
            continue
        word_list.append(keyword)
        index_list.append(span)

    if index_list:
        tagged_note = add_tags(clean_note, index_list)
        # Re-split the tagged note and keep the sentences that carry a tag.
        clipped_note = ''.join(
            sent.text
            for sent in nlp(tagged_note).sents
            if '<span' in sent.text or 'span>' in sent.text
        )
    else:
        tagged_note, index_list, word_list, clipped_note = clean_note, None, None, None

    # Each entry is assigned exactly once and in the same order for every row.
    row['clean_note'] = tagged_note
    row['index_list'] = index_list
    row['word_list'] = word_list
    row['clipped_note'] = clipped_note

    if verbose and index_list:
        print(clipped_note)
        print(word_list)
        print('====================')
    return row

# Run the extraction once per note (row-wise apply); the columns written by
# extract_entities are added to the frame.
grouped_df = grouped_df.apply(extract_entities, axis=1)

In [None]:
# Keep only the notes that have an index list (i.e. at least one kept keyword)
# and leave the fully tagged note out of the result.
grouped_df = grouped_df[grouped_df['index_list'].notna()].drop(columns=['clean_note'])
grouped_df.head(10)

In [None]:
# Write the remaining notes to a CSV file (relative path, without the index column).
# NOTE(review): the file contains note text - confirm where it may be stored and shared.
grouped_df.to_csv('highrisk_patients.csv',index=False)