In [13]:
import boto3
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.engine.url import URL
import json
from pathlib import Path

In [11]:
# Name and region of the AWS Secrets Manager entry that stores the database
# credentials, so nothing sensitive is hardcoded in the notebook.
secret_name = "cust_db_credentials"
region_name = "us-east-1"

# Create a Secrets Manager client
session = boto3.session.Session()
client = session.client(service_name="secretsmanager", region_name=region_name)

get_secret_value_response = client.get_secret_value(SecretId=secret_name)

# The secret's SecretString is parsed as JSON; the "username", "password"
# and "host" keys are read from it below.
db_info = json.loads(get_secret_value_response["SecretString"])

# SQLAlchemy 1.4 deprecated calling URL(...) directly in favour of
# URL.create(), and 2.0 no longer supports the direct call. Use URL.create
# when it exists and fall back to the constructor on older releases.
_build_url = getattr(URL, "create", URL)

avan_connect_url = _build_url(
    drivername="mssql+pyodbc",
    username=db_info["username"],
    password=db_info["password"],
    host=db_info["host"],
    database="AVAN",
    query={'driver': 'ODBC Driver 17 for SQL Server'}
)

# Engine reused by later cells for every query against the AVAN database.
avan_engine = create_engine(avan_connect_url)

In [12]:
# Select every progress-note text fragment created during 2018.
# The date filter is a half-open range (>= Jan 1, < Jan 1 of the next year)
# so that rows timestamped later than midnight on 2018-12-31 are included if
# createddate carries a time component; for a pure DATE column the result is
# the same as an inclusive BETWEEN.
# NOTE(review): the column type is not visible from this notebook - confirm.
query = '''
select progressnoteid, patientid, facilityid, progressnotetype, createddate, sectionsequence, section, notetextorder, notetext
from view_ods_progress_note
where createddate >= '2018-01-01' and createddate < '2019-01-01'
order by createddate, progressnoteid, notetextorder
'''

# Run the query through the SQLAlchemy engine and load the result as a DataFrame.
patient_progress_notes = pd.read_sql(query, con=avan_engine)
# patient_progress_notes = patient_progress_notes.merge(master_patient_lookup, on=['patientid','facilityid'])

In [14]:
# Output directory for the parquet files written by later cells.
data_path = Path('/code/data/raw')

# Create the directory along with any missing parents; a directory that
# already exists is left untouched.
data_path.mkdir(exist_ok=True, parents=True)

In [15]:
# Persist the queried notes as parquet inside data_path, then confirm on stdout.
patient_progress_notes.to_parquet(
    data_path.joinpath('avante_patient_progress_notes_2018.parquet')
)
print('Note data saved.')

Note data saved.


In [20]:
# Total character count of all note fragments, divided into units of 100,
# reduced by 50000 units and scaled by 0.0001.
# NOTE(review): this looks like a usage-cost estimate (allowance of 50000
# units, 0.0001 per unit), but the constants are undocumented - confirm.
(
    sum(map(len, patient_progress_notes['notetext'])) / 100
    - 50000
) * 0.0001

350.882255

In [23]:
def join_with_space(x):
    """Concatenate the strings in ``x`` into one string, separated by single spaces.

    ``x`` may be any iterable of strings; an empty iterable yields ``''``.
    """
    separator = ' '
    return separator.join(x)

In [84]:
# Columns used as the grouping key when note fragments are combined.
grp_columns = [
    'progressnoteid',
    'patientid',
    'facilityid',
    'progressnotetype',
    'createddate',
    'sectionsequence',
    'section',
]

In [93]:
# Sort the fragments by note, section and fragment order, then join the text
# of each (note, section) group with single spaces. The result is a
# one-column DataFrame ('notetext') indexed by the grp_columns values.
full_notes = pd.DataFrame(
    patient_progress_notes
    .sort_values(['progressnoteid', 'sectionsequence', 'notetextorder'])
    .groupby(grp_columns)['notetext']
    .agg(join_with_space)
)

In [101]:
# boto3 client for the Comprehend Medical service in us-east-1; used by the
# entity-detection helper defined in a later cell.
comprehend_client = boto3.client('comprehendmedical', region_name='us-east-1')

In [106]:
def comprehend_medical_parse(note_text):
    """Run Comprehend Medical entity detection on ``note_text``.

    Calls ``detect_entities`` on the module-level ``comprehend_client`` and
    returns the whole response serialized as a JSON string.

    NOTE(review): AWS documents DetectEntities as deprecated in favour of
    DetectEntitiesV2 - confirm whether this call should be migrated.
    """
    return json.dumps(comprehend_client.detect_entities(Text=note_text))

In [100]:
# Draw ten notes for a trial run. The fixed random_state makes the same rows
# come back on every Restart & Run All, so downstream results are reproducible.
full_notes_sample = full_notes.sample(10, random_state=42)

In [107]:
# Call Comprehend Medical once per sampled note and store each JSON response
# string alongside the note text.
full_notes_sample['comprehend_result'] = [
    comprehend_medical_parse(text) for text in full_notes_sample['notetext']
]

In [108]:
# Write the sampled notes, including their Comprehend Medical results, to parquet.
full_notes_sample.to_parquet(data_path.joinpath('full_notes_sample.parquet'))

In [109]:
# Read the saved sample back from disk into a new DataFrame.
reload_sample = pd.read_parquet(data_path.joinpath('full_notes_sample.parquet'))