In [1]:
import sys
import os
import pandas as pd
sys.path.insert(0, '/src')
from eliot import to_file

import json

import boto3
from eliot import log_message
from sqlalchemy import create_engine
from sqlalchemy.engine.url import URL

import numpy as np
from s3fs import S3FileSystem
# Send eliot log messages to stdout so they appear in the cell output.
to_file(sys.stdout)

# Notebook-wide s3fs filesystem object; used further down to open S3 objects.
S3FS = S3FileSystem()

In [2]:

# Remove pandas' display limits so comparison frames are shown untruncated.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
try:
    # None means "do not truncate cell contents". The former -1 sentinel was
    # deprecated in pandas 1.0 and is rejected by pandas 2.x.
    pd.set_option('display.max_colwidth', None)
except ValueError:
    # pandas < 1.0 only understands the -1 sentinel.
    pd.set_option('display.max_colwidth', -1)


In [3]:

# Run configuration: client, date window, facilities and deployment environment.
CLIENT = 'trio'
TRAIN_START_DATE, TEST_END_DATE = '2021-01-15', '2021-10-15'
FACILITYIDS = (1, 3, 4, 5, 7, 13, 16, 19, 20, 21, 23, 26, 29, 31, 32, 33, 36)
ENABLE_ALL_FACILITIES = "true"
# SAIVA_ENV selects the environment; it falls back to 'dev' when unset.
env = os.getenv('SAIVA_ENV', 'dev')
date_range = f'{TRAIN_START_DATE} to {TEST_END_DATE}'

# Echo the effective settings into the cell output.
print(CLIENT, env, sep='\n')
print(TRAIN_START_DATE, TEST_END_DATE)
print(FACILITYIDS)

trio
dev
2021-01-15 2021-10-15
(1, 3, 4, 5, 7, 13, 16, 19, 20, 21, 23, 26, 29, 31, 32, 33, 36)


In [4]:
class DbEngine(object):
    """
    Builds SQLAlchemy engines from credentials stored in AWS Secrets Manager.

    Secret names other than ``avantedb`` are prefixed with the notebook-level
    ``env`` value, so ``env`` must be defined before an engine is requested.
    """

    def __init__(self, region_name='us-east-1'):
        """
        Create the boto3 session and the Secrets Manager client used for lookups.
        :param region_name: AWS region passed to the Secrets Manager client
        """
        self.session = boto3.session.Session()
        self.secrets_client = self.session.client(
            service_name='secretsmanager',
            region_name=region_name
        )

    @staticmethod
    def _build_url(**url_fields):
        """
        Build a SQLAlchemy URL on both old and new SQLAlchemy releases.
        :param url_fields: drivername, username, password, host, port, database, query
        :return: SQLAlchemy URL object
        """
        # SQLAlchemy 1.4 deprecated calling URL(...) directly in favour of
        # URL.create(...); releases before 1.4 only offer the constructor.
        factory = getattr(URL, 'create', URL)
        return factory(**url_fields)

    def get_secrets(self, secret_name):
        """
        Fetch one secret and decode its JSON payload.
        :param secret_name: id of the secret in AWS Secrets Manager
        :return: dict parsed from the secret's ``SecretString``
        """
        log_message(message_type='info', action_type='get_secrets', secret_name=secret_name)
        db_info = json.loads(
            self.secrets_client.get_secret_value(SecretId=secret_name)[
                'SecretString'
            ]
        )
        return db_info

    def get_postgresdb_engine(self):
        """
        Connect to the Saivadb Postgres database of the current environment.
        Credentials come from the ``{env}-saivadb`` secret.
        :return: Saivadb Postgres engine
        """
        log_message(message_type='info', action_type='connect_to_postgresdb', client='SaivaDB')
        # Fetch credentials from AWS Secrets Manager
        postgresdb_info = self.get_secrets(secret_name=f'{env}-saivadb')
        # Create DB URL
        saivadb_url = self._build_url(
            drivername='postgresql',
            username=postgresdb_info['username'],
            password=postgresdb_info['password'],
            host=postgresdb_info['host'],
            port=postgresdb_info['port'],
            database=postgresdb_info['dbname'],
        )
        # Return Postgres Engine
        return create_engine(saivadb_url, echo=False)

    def get_sqldb_engine(self, clientdb_name):
        """
        Connect to a client SQL Server database.
        Avante db is in client VPN hence we use different credentials.
        :param clientdb_name: client database name; ``'avante'`` reads the
            dedicated ``avantedb`` secret, any other name reads
            ``{env}-sqlserver`` and is used as the database name
        :return: Client SQL engine
        """
        log_message(message_type='info', action_type='connect_to_sqldb', client=clientdb_name)
        # Fetch credentials from AWS Secrets Manager
        if clientdb_name == 'avante':
            sqldb_info = self.get_secrets(secret_name='avantedb')
        else:
            sqldb_info = self.get_secrets(secret_name=f'{env}-sqlserver')
            # The shared secret carries no per-client database name.
            sqldb_info['dbname'] = clientdb_name

        # Create DB URL
        client_sqldb_url = self._build_url(
            drivername='mssql+pyodbc',
            username=sqldb_info['username'],
            password=sqldb_info['password'],
            host=sqldb_info['host'],
            port=sqldb_info['port'],
            database=sqldb_info['dbname'],
            query={'driver': 'ODBC Driver 17 for SQL Server'},
        )
        # Return Sql Engine
        return create_engine(client_sqldb_url, echo=False)

    def verify_connectivity(self, engine):
        """
        Run ``select 1`` against the engine to prove a connection can be opened.
        :param engine: SQLAlchemy engine to check
        :raises AssertionError: if the probe query returns no rows
        """
        # Local import: `text` is not among the notebook's top-level imports.
        from sqlalchemy import text

        # Engine.execute() was removed in SQLAlchemy 2.0; an explicit connection
        # works on old and new releases and is closed by the context manager.
        with engine.connect() as connection:
            rows = connection.execute(text('select 1')).fetchall()
        assert rows, 'connectivity probe returned no rows'



In [5]:
# One factory, three engines: Saiva Postgres plus the two client SQL Server
# databases compared throughout this notebook.
engine = DbEngine()
saiva_engine = engine.get_postgresdb_engine()
pcc_db_engine, scrape_db_engine = (
    engine.get_sqldb_engine(clientdb_name=db_name)
    for db_name in ('avante', 'avante_scrape')
)

{"action_type": "connect_to_postgresdb", "client": "SaivaDB", "timestamp": 1647313227.5807683, "task_uuid": "9ab4a311-5dee-434d-899f-aeb1f34004c7", "task_level": [1], "message_type": "info"}
{"action_type": "get_secrets", "secret_name": "dev-saivadb", "timestamp": 1647313227.581505, "task_uuid": "460fe05f-d65d-4b70-9c0e-b3f6cff2fea5", "task_level": [1], "message_type": "info"}
{"action_type": "connect_to_sqldb", "client": "avante", "timestamp": 1647313227.7864904, "task_uuid": "aee25189-0a48-4e16-bacf-3adf7bdf3e62", "task_level": [1], "message_type": "info"}
{"action_type": "get_secrets", "secret_name": "avantedb", "timestamp": 1647313227.7871437, "task_uuid": "c3fa1e87-3fd5-4e65-b0dd-93550241d494", "task_level": [1], "message_type": "info"}
{"action_type": "connect_to_sqldb", "client": "avante_scrape", "timestamp": 1647313227.856264, "task_uuid": "461ecfe3-57b0-4e56-8ec1-6bb383aafc59", "task_level": [1], "message_type": "info"}
{"action_type": "get_secrets", "secret_name": "dev-sqlser

In [None]:
# Census for facility 21 on a single day, read from the PCC database.
# The same `query` is reused by the next cell against the scrape database.
query = """select clientid as patientid
                        from view_ods_daily_census_v2 where censusdate = '2022-02-24'
                        and facilityid = 21                   
"""
df1 = pd.read_sql(sql=query, con=pcc_db_engine)
df1.shape

In [None]:
# Same census query (defined in the previous cell), run on the scrape database.
df2 = pd.read_sql(sql=query, con=scrape_db_engine)
df2.shape

In [None]:
# Patients present in both census extracts. Later cells interpolate
# `patientids` straight into SQL as `IN {patientids}`, so its repr must be a
# valid SQL list.
df = pd.merge(df1, df2, how='inner', on=['patientid'])
common_ids = df['patientid'].drop_duplicates().tolist()
if not common_ids:
    # An empty tuple renders as "()", which the database would reject later.
    raise ValueError('No patient is present in both census extracts; nothing to compare.')
if len(common_ids) == 1:
    # A 1-tuple renders as "(x,)", which is invalid SQL; "(x, x)" is equivalent.
    common_ids = common_ids * 2
patientids = tuple(common_ids)

In [9]:
# Rows found in exactly one census extract. Each frame is de-duplicated first:
# with keep=False a row repeated inside one frame would otherwise be dropped
# even when the other frame does not contain it.
pd.concat([df1.drop_duplicates(), df2.drop_duplicates()]).drop_duplicates(keep=False)

Unnamed: 0,patientid
38,4406446
73,4405564
14,1894244
31,3958121
98,388833


### Compare Admissions

In [None]:
# Admissions for facility 21, read from the PCC database.
# The next cell reuses `query` against the scrape database.
query = """SELECT PatientId, DateOfAdmission from view_ods_hospital_transfers_admission_log 
where DateOfAdmission BETWEEN '2021-01-01' and '2022-02-14' and FacilityID=21
"""
df1 = pd.read_sql(sql=query, con=pcc_db_engine)
df1.shape

In [None]:
# Same admissions query (from the previous cell), run on the scrape database.
df2 = pd.read_sql(sql=query, con=scrape_db_engine)
df2.shape

In [None]:
# Admissions found in exactly one source. De-duplicate each frame first so a
# row repeated within one source is not mistaken for a match (keep=False
# drops every copy of a repeated row).
pd.concat([df1.drop_duplicates(), df2.drop_duplicates()]).drop_duplicates(keep=False)

### Compare Transfers

In [None]:
# Hospital transfers for facility 21, read from the PCC database.
# The next cell reuses `query` against the scrape database.
query = """SELECT PatientId, DateOfTransfer from view_ods_hospital_transfers_transfer_log_v2 
where DateOfTransfer BETWEEN '2021-01-01' and '2022-02-14' and FacilityID=21
"""
df1 = pd.read_sql(sql=query, con=pcc_db_engine)
df1.shape

In [None]:
# Same transfers query (from the previous cell), run on the scrape database.
df2 = pd.read_sql(sql=query, con=scrape_db_engine)
df2.shape

In [None]:
# Transfers found in exactly one source. De-duplicate each frame first so a
# row repeated within one source is not mistaken for a match (keep=False
# drops every copy of a repeated row).
pd.concat([df1.drop_duplicates(), df2.drop_duplicates()]).drop_duplicates(keep=False)

### Compare Diagnoses

In [None]:
# Active (not struck out, not deleted) diagnoses for the common patients,
# read from the PCC database. `patientids` comes from the census cells above.
query = f"""
select  onsetdate, diagnosiscode,resolveddate, patientid
                        from view_ods_patient_diagnosis where patientid in {patientids}
                        and (StruckOut is NULL or StruckOut = 'N') 
                        and (Deleted is NULL or Deleted = 'N')
						and OnSetDate between '2021-01-01' and '2022-02-14'
                        order by diagnosiscode desc"""
df1 = pd.read_sql(sql=query, con=pcc_db_engine)
df1.shape

In [None]:
# Same diagnoses query (from the previous cell), run on the scrape database.
df2 = pd.read_sql(sql=query, con=scrape_db_engine)
df2.shape

In [None]:
# Diagnoses found in exactly one source. De-duplicate each frame first so a
# row repeated within one source is not mistaken for a match (keep=False
# drops every copy of a repeated row).
sym_diff = pd.concat([df1.drop_duplicates(), df2.drop_duplicates()]).drop_duplicates(keep=False)
print(sym_diff.sort_values(ascending=False, by=['patientid', 'onsetdate']))
print('==============================')
print(sym_diff.shape)

### Compare Vitals

In [None]:
# Vitals for the common patients, one row per distinct reading and calendar
# day, read from the PCC database. The next cell reuses `query`.
query = f"""
select cast(date As Date), vitalsdescription, value, diastolicvalue, clientid
                            from view_ods_Patient_weights_vitals
                            where clientid IN {patientids}
                            and date between '2021-01-01' and '2022-02-14'
                            order by date, vitalsdescription
"""
df1 = pd.read_sql(sql=query, con=pcc_db_engine).drop_duplicates()
df1.shape

In [None]:
# Same vitals query (from the previous cell), run on the scrape database.
df2 = pd.read_sql(sql=query, con=scrape_db_engine).drop_duplicates()
df2.shape

In [None]:
# Vitals rows found in exactly one source, then the mismatch count per patient.
stacked = pd.concat([df1, df2])
sym_diff = stacked.drop_duplicates(keep=False)
print(sym_diff.shape, '==============================', sep='\n')
rows_per_client = sym_diff.groupby(['clientid']).size()
print(rows_per_client.sort_values(ascending=False))

### Compare lab_results

In [None]:
# Lab results for the common patients at facility 21, read from the PCC
# database. The `latest_labs` CTE picks the highest VersionNumber per
# patient / facility / master report / profile; the outer query joins back to
# it so only rows of that version are returned.
query = f"""
with latest_labs as (
select c.patientid, c.FacilityID, b.MasterLabReportID, a.ProfileDescription, max(b.VersionNumber) as VersionNumber
						from view_ods_result_lab_report_detail a
                        left join view_ods_result_lab_report b on a.LabReportID = b.LabReportID
                        left join view_ods_result_order_source c on b.ResultOrderSourceID = c.ResultOrderSourceID
                        WHERE a.resultdate BETWEEN '2021-01-01' and '2022-02-14' 
                        and c.patientid IN {patientids} and c.FacilityID = 21
						group by c.patientid, c.FacilityID,  b.MasterLabReportID, a.profiledescription
)
select distinct c.patientid , c.ResultOrderSourceID as ResultOrderSourceID, a.ProfileDescription,
                        a.result
						from view_ods_result_lab_report_detail a
                        left join view_ods_result_lab_report b on a.LabReportID = b.LabReportID
                        left join view_ods_result_order_source c on b.ResultOrderSourceID = c.ResultOrderSourceID
						inner join latest_labs ll 
						on b.MasterLabReportID = ll.MasterLabReportID
						and c.PatientID = ll.PatientID
						and c.FacilityID = ll.FacilityID
						and a.ProfileDescription = ll.ProfileDescription
						and b.VersionNumber = ll.VersionNumber
                
                        WHERE a.resultdate BETWEEN '2021-01-01' and '2022-02-14' 
                        AND c.facilityid =21
						and c.patientid IN {patientids} order by c.patientid, c.ResultOrderSourceID
"""
df1 = pd.read_sql(sql=query, con=pcc_db_engine).drop_duplicates()
df1.shape

In [None]:
# Lab results from the scrape database. Unlike the PCC query, this one reads
# every column from view_ods_result_lab_report alone and filters on ReportedDate.
query2 = f"""
select distinct b.patientid , b.ResultOrderSourceId as ResultOrderSourceID, b.ProfileDescription, 
                        b.result
                        from view_ods_result_lab_report b 
                        WHERE b.ReportedDate BETWEEN '2021-01-01' and '2022-02-14' 
                        AND b.facilityid =21
						and b.patientid IN {patientids}
                        order by b.patientid, b.ResultOrderSourceId
"""
df2 = pd.read_sql(sql=query2, con=scrape_db_engine).drop_duplicates()
df2.shape

In [None]:
# Lab rows found in exactly one source, then the mismatch count per patient.
stacked = pd.concat([df1, df2])
sym_diff = stacked.drop_duplicates(keep=False)
print(sym_diff.shape)
rows_per_patient = sym_diff.groupby(['patientid']).size()
print(rows_per_patient.sort_values(ascending=False))

### Compare orders

In [18]:
# Diagnostic, dietary, enteral and laboratory orders for the common patients,
# read from the PCC database. The next cell reuses `query`.
# The cast expression is aliased so the resulting DataFrame column has a name
# (it was previously nameless, showing up as "Unnamed: 3" in saved output).
query = f"""
select distinct patientid, PhysicianOrderID, cast(orderdate as date) as orderdate
                        from view_ods_physician_order_list_v2
                        where patientid in {patientids}
                        and ordercategory in ('Diagnostic', 'Enteral - Feeding', 'Dietary - Diet', 'Dietary - Supplements','Laboratory')
                        and orderdate BETWEEN '2021-01-18' and '2022-02-14'
                        order by patientid, PhysicianOrderID
"""
df1 = pd.read_sql(query, con=pcc_db_engine).drop_duplicates()
df1.shape

(2032, 3)

In [19]:
# Same orders query (from the previous cell), run on the scrape database.
df2 = pd.read_sql(sql=query, con=scrape_db_engine).drop_duplicates()
df2.shape

(2025, 3)

In [20]:

# Order rows found in exactly one source, then the mismatch count per patient.
stacked = pd.concat([df1, df2])
sym_diff = stacked.drop_duplicates(keep=False)
print(sym_diff.shape)
rows_per_patient = sym_diff.groupby(['patientid']).size()
print(rows_per_patient.sort_values(ascending=False))

(63, 3)
patientid
365093     6
1151885    6
4193968    5
1026758    4
4231437    4
4225640    4
3227256    4
4264745    3
4060852    3
4349529    3
3076407    2
520036     2
547386     2
727292     2
208929     2
1097672    2
1201988    2
1411594    2
3704806    2
4157630    2
4352007    1
dtype: int64


In [17]:
# Drill into the mismatching order rows of one patient.
sym_diff.loc[sym_diff['patientid'] == 316763]

Unnamed: 0,patientid,PhysicianOrderID,Unnamed: 3
152,316763,7546512,2021-01-05
153,316763,7546587,2021-01-06
154,316763,7548141,2021-01-06
155,316763,7548200,2021-01-06
156,316763,7548263,2021-01-06
157,316763,7548273,2021-01-06
158,316763,7552908,2021-01-07
159,316763,7552967,2021-01-07
160,316763,7553075,2021-01-07
161,316763,7560774,2021-01-11


### Compare meds

In [25]:
# Medication orders for the common patients at facility 21, read from the
# PCC database, where the two views are joined on the physician order id.
query = f"""
select distinct patientid, gpiclass, 
                        gpiclassdescription, gpisubclassdescription, orderdescription
                        from view_ods_physician_order_list_v2 a inner join view_ods_physician_order_list_med b
                        on a.PhysicianOrderID = b.PhysiciansOrderID
                        where patientid in {patientids}
                        and facilityid = 21
                        and orderdate BETWEEN '2021-01-01' and '2022-02-14'
"""
df1 = pd.read_sql(sql=query, con=pcc_db_engine).drop_duplicates()
df1.shape

(5373, 5)

In [26]:
# Medication orders from the scrape database. The join key differs from the
# PCC query: here the two views are joined on med_id.
query2 = f"""
select distinct patientid, gpiclass, 
                        gpiclassdescription, gpisubclassdescription, orderdescription
                        from view_ods_physician_order_list_v2 a inner join view_ods_physician_order_list_med b
                        on a.med_id = b.med_id
                        where patientid in {patientids}
                        and facilityid = 21
                        and orderdate BETWEEN '2021-01-01' and '2022-02-14'
"""
df2 = pd.read_sql(sql=query2, con=scrape_db_engine).drop_duplicates()
df2.shape

(3185, 5)

In [27]:

# Medication rows found in exactly one source, then the mismatch count per patient.
stacked = pd.concat([df1, df2])
sym_diff = stacked.drop_duplicates(keep=False)
print(sym_diff.shape)
rows_per_patient = sym_diff.groupby(['patientid']).size()
print(rows_per_patient.sort_values(ascending=False))

(2298, 5)
patientid
727292     86
3704806    85
1695224    77
316763     63
2818652    62
2393602    61
1072404    59
617267     54
560461     52
1576667    50
540452     48
3717121    48
2530658    48
2382771    47
1945345    46
3227256    46
1201988    46
3880691    45
554672     44
1097672    41
298159     38
2500968    36
539682     35
1811540    35
531685     32
1151885    31
3081878    30
547386     28
1759245    27
3235517    27
774508     27
494603     26
520036     26
304612     25
3234439    24
1007676    24
2434264    24
545123     23
3262877    23
3950272    23
480595     22
732111     22
514522     22
3518224    21
3766590    20
1102194    20
304737     19
3076407    19
1411594    19
2577343    19
853235     19
1065735    17
219059     17
550493     15
4291183    14
4349529    14
435727     14
541313     14
1101643    14
1026758    13
510316     13
1251202    13
4352835    13
684844     12
3719958    12
208929     12
4231437    11
365093     11
3970907    11
4360824    11


### Compare Progress Notes

In [None]:
# Progress note ids for the common patients, read from the PCC database.
# The next cell reuses `query` against the scrape database.
query = f"""
select distinct patientid, progressnoteid from view_ods_progress_note
                        where patientid in {patientids} and
                        createddate BETWEEN '2021-01-01' and '2022-02-14'
"""
df1 = pd.read_sql(sql=query, con=pcc_db_engine).drop_duplicates()
df1.shape


In [None]:
# Same progress-note query (from the previous cell), run on the scrape database.
df2 = pd.read_sql(sql=query, con=scrape_db_engine).drop_duplicates()
df2.shape

In [None]:

# Progress-note rows found in exactly one source.
stacked = pd.concat([df1, df2])
sym_diff = stacked.drop_duplicates(keep=False)
print(sym_diff.shape)

## Compare reports generated from scrape and pcc data relay 

In [None]:
!pip install fuzzywuzzy==0.18.0
!pip install python-Levenshtein==0.12.0

In [None]:
from fuzzywuzzy import fuzz

In [None]:
# Risk reports for facility 21 on the same day, one generated per data source.
pcc_path = 's3://saiva-dev-data-bucket/data/avante/2022-02-16/21/risk_report_2022-02-16.json'
scrape_path = 's3://saiva-dev-data-bucket/data/avante_scrape/2022-02-16/21/risk_report_2022-02-16.json'
with S3FS.open(pcc_path, 'rb') as f:
    pcc_report = json.loads(f.read())
with S3FS.open(scrape_path, 'rb') as f:
    scrape_report = json.loads(f.read())

In [None]:
# Show a compact summary instead of echoing both reports in full: the raw JSON
# carries per-patient detail that would otherwise be stored in the notebook
# output. Collections are shown by length, scalar fields by value.
{
    source: {
        key: len(value) if isinstance(value, (list, dict)) else value
        for key, value in report.items()
    }
    for source, report in (('pcc', pcc_report), ('scrape', scrape_report))
}

In [None]:
# Master patient ids listed in each report, and the ids listed in both.
pcc_reported = [patient['masterpatientid'] for patient in pcc_report['patient_list']]
scrape_reported = [patient['masterpatientid'] for patient in scrape_report['patient_list']]
common_reported = list(set(pcc_reported).intersection(set(scrape_reported)))
len(common_reported)

In [None]:
def intersection(lst1, lst2, threshold=90, scorer=None):
    """
    Count fuzzy one-to-one matches between two lists of strings.

    Each item of ``lst1`` is paired with the first not-yet-used item of
    ``lst2`` whose similarity score reaches ``threshold``. Because an item of
    ``lst2`` is used at most once, the result never exceeds
    ``min(len(lst1), len(lst2))``, which keeps ``2 * matches / total`` at or
    below 100%.

    :param lst1: strings to look up
    :param lst2: strings to match against
    :param threshold: minimum score (inclusive) that counts as a match
    :param scorer: callable ``(a, b) -> score``; defaults to ``fuzz.ratio``
    :return: number of matched pairs
    """
    if scorer is None:
        # Resolved lazily so callers can supply their own scorer.
        scorer = fuzz.ratio
    unmatched = list(lst2)  # copy: the caller's list is left untouched
    matches = 0
    for x in lst1:
        for position, y in enumerate(unmatched):
            if scorer(x, y) >= threshold:
                matches += 1
                # Consume the partner so it cannot be matched a second time.
                del unmatched[position]
                break
    return matches

In [None]:
# Section-by-section overlap of the two reports, over patients listed in both.
# For every section the score is 2 * shared / (items in pcc + items in scrape).
#
# (section name, field compared, fuzzy match?)
# Highlights are plain strings (field None); other sections are lists of
# dicts compared on a single field.
REPORT_SECTIONS = (
    ('highlights', None, True),
    ('vitals', 'value', False),
    ('conditions', 'name', False),
    ('diagnoses', 'name', False),
    ('meds', 'name', False),
    ('orders', 'name', True),
)


def _index_details(report):
    """Map masterpatientid -> first matching entry of report['patient_detail_list']."""
    index = {}
    for detail in report['patient_detail_list']:
        # setdefault keeps the first entry per id, as the former next(...) scan did.
        index.setdefault(detail['masterpatientid'], detail)
    return index


def _section_values(detail, section, field):
    """Return the values compared for one section of a patient detail entry."""
    if field is None:
        # Strip carriage returns so line-ending differences do not count.
        return [text.replace('\r', '') for text in detail[section]]
    return [entry[field] for entry in detail[section]]


# Index once instead of rescanning the detail list for every patient.
pcc_details = _index_details(pcc_report)
scrape_details = _index_details(scrape_report)

common = {section: 0 for section, _, _ in REPORT_SECTIONS}
total = {section: 0 for section, _, _ in REPORT_SECTIONS}
missing_details = []

for m_id in common_reported:
    pcc_r = pcc_details.get(m_id)
    scrape_r = scrape_details.get(m_id)
    if pcc_r is None or scrape_r is None:
        # Listed in patient_list but without a detail entry in one report.
        missing_details.append(m_id)
        continue

    for section, field, fuzzy in REPORT_SECTIONS:
        pcc_values = _section_values(pcc_r, section, field)
        scrape_values = _section_values(scrape_r, section, field)
        if fuzzy:
            # Cap at the shorter list so the score cannot exceed 100%.
            shared = min(
                intersection(pcc_values, scrape_values),
                len(pcc_values),
                len(scrape_values),
            )
        else:
            # Compare as sets on both sides of the ratio; mixing set overlap
            # with list lengths under-reports identical reports that contain
            # repeated values.
            pcc_values, scrape_values = set(pcc_values), set(scrape_values)
            shared = len(pcc_values & scrape_values)
        total[section] += len(pcc_values) + len(scrape_values)
        common[section] += 2 * shared

if missing_details:
    print(f'Skipped {len(missing_details)} patient(s) without a detail entry in both reports: {missing_details}')

# Keep the original flat variable names available for any later cell.
common_highlights, total_highlights = common['highlights'], total['highlights']
common_vitals, total_vitals = common['vitals'], total['vitals']
common_conditions, total_conditions = common['conditions'], total['conditions']
common_diagnoses, total_diagnoses = common['diagnoses'], total['diagnoses']
common_meds, total_meds = common['meds'], total['meds']
common_orders, total_orders = common['orders'], total['orders']

# One line per section, in REPORT_SECTIONS order.
for section, _, _ in REPORT_SECTIONS:
    if total[section]:
        print("{0:.0%}".format(common[section] / total[section]))
    else:
        # Nothing to compare in this section; avoid dividing by zero.
        print('n/a')




In [None]:
# Daily census for facility 42 over the whole window, excluding the listed
# census action codes. Note the direction: in this section df1 is read from
# the scrape database and df2 from the PCC database. The two queries differ
# only in the id column they select (clientid vs. patientid as clientid).
query1 = """select censusdate, clientid 
            from view_ods_daily_census_v2 
            where censusdate between '2021-01-01' and '2022-02-18'
            and facilityid = 42
            and censusactioncode not in ('DAMA', 'DD', 'DE', 'DH', 'E', 'HU', 'L', 'LV', 'MO', 'TO', 'TP', 'TU')
"""
query2 = """select censusdate, patientid as clientid 
            from view_ods_daily_census_v2 
            where censusdate between '2021-01-01' and '2022-02-18'
            and facilityid = 42
            and censusactioncode not in ('DAMA', 'DD', 'DE', 'DH', 'E', 'HU', 'L', 'LV', 'MO', 'TO', 'TP', 'TU')
"""
df1 = pd.read_sql(sql=query1, con=scrape_db_engine)
df2 = pd.read_sql(sql=query2, con=pcc_db_engine)
(df1.shape, df2.shape)

In [None]:
# Census rows found in exactly one source. De-duplicate each frame first so a
# row repeated within one source is not mistaken for a match (keep=False
# drops every copy of a repeated row).
sym_diff = pd.concat([df1.drop_duplicates(), df2.drop_duplicates()]).drop_duplicates(keep=False)
# Number of mismatching rows per census date.
diff_series = sym_diff.groupby(['censusdate']).size()
print(diff_series)
# Number of distinct dates with at least one mismatch.
print(diff_series.size)
# NOTE(review): DataFrame.size is rows x columns, not the row count; use
# len(sym_diff) if the number of mismatching rows was intended.
print(sym_diff.size)

In [None]:
# Drill into the mismatching census rows of one day.
sym_diff.loc[sym_diff['censusdate'] == '2021-09-02']