## Use psycopg2 engine to extract charts events for patient data

This script connects to the patient database 'extumate' and extracts chart events for the labelled patients identified by the field, hadm_id, in the table 'sample_vents'.

The script utilizes the pandas chunksize argument in order to avoid memory issues.

Finally, the data is stored with using `pd.DataFrame.to_feather` so it can be stored for future processing.

In [2]:
import sys
sys.path.append("../extumate")

from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import psutil
import os

from extumate.config import data_dir, extumate_engine_url

#### Set user-defined variables

In [3]:
data_dir

'../data/feathered/'

In [4]:
export_name = "drugod"
#feather_folder = "../data/feathered/"
export_path = data_dir+export_name
export_path


'../data/feathered/drugod'

#### Write sql query (example for chart events in markdown below)

sql_query = """


SELECT 
  chartevents.*,
  
  sample_vents.endtime, sample_vents.re_intub_class,sample_vents.time_on_vent
  
FROM

  chartevents
  
  INNER JOIN sample_vents ON chartevents.hadm_id = sample_vents.hadm_id
  
WHERE

  (
    (chartevents.itemid = 224369) OR
    
    (chartevents.itemid = 224370) OR 
    
    (chartevents.itemid = 224372) OR 
    
    (chartevents.itemid = 224373) 
    
  );
  
"""

In [5]:
sql_query = """
SELECT 
  diagnoses_icd.*,
  sample_vents.endtime, sample_vents.re_intub_class,sample_vents.time_on_vent
FROM
  diagnoses_icd
  INNER JOIN sample_vents ON diagnoses_icd.hadm_id = sample_vents.hadm_id
WHERE
  (
    (diagnoses_icd.icd_code LIKE 'J84%') OR
    (diagnoses_icd.icd_code LIKE 'I27%') OR
    (diagnoses_icd.icd_code LIKE 'E84%') OR
    (diagnoses_icd.icd_code LIKE '516.31%') OR
    (diagnoses_icd.icd_code LIKE '416.0%') OR
    (diagnoses_icd.icd_code LIKE '277%') 
  );
"""

#### print virtual memory available

In [6]:
svmem = psutil.virtual_memory()
print (svmem.available) #in bytes 

3111374848


#### print size of database we're pulling from

In [7]:
os.path.getsize('../data/raw/diagnoses_icd.csv') 

144134230

#### figure out chunk size for pandas dataframe reading

In [7]:
df_sample = pd.read_csv('../data/raw/diagnoses_icd.csv', nrows=10)
df_sample_size = df_sample.memory_usage(index=True).sum()
my_chunk = (2000000000 / df_sample_size)/10
my_chunk = int(my_chunk//1) # we get the integer part
print (my_chunk)

378787


In [8]:
# Define a database name (we're using a dataset on births, so we'll call it birth_db)

# Set your postgres username/password, and connection specifics

username = 'postgres'

password = 'password'    # change this

host     = 'localhost'

port     = '5432'            # default port that postgres listens on

db_name  = 'extumate'

#db_name  = 'birth_db'

In [1]:
## 'engine' is a connection to a database
## Here, we're using postgres, but sqlalchemy can connect to other things too.
#engine = create_engine( 'postgresql://{}:{}@{}:{}/{}'.format(username, password, host, port, db_name) )
engine = create_engine( extumate_engine_url.format(username, password, host, port, db_name) )
print(engine.url)

NameError: name 'create_engine' is not defined

#### Check engine is working by checking for 'sample_vents' table

In [10]:
engine.has_table('sample_vents')

True

#### Connect using psycopg2 connection and query the database. 

Joining 'chartevents' with the 'sample_vents' table on the field hadm_id (so only pulling from patients who were ventilated), before selecting the type of event using the chartevents.itemid speeds up extraction of this data.

In [11]:
# Connect to make queries using psycopg2
con = None
con = psycopg2.connect(database = db_name, user = username, host=host,password=password)

df_result = pd.read_sql_query(sql_query,con,chunksize=my_chunk)
df_result

<generator object SQLiteDatabase._query_iterator at 0x7f19fe36f890>

In [12]:
concat_df = pd.concat(
    [chunk
    for chunk in df_result])

In [13]:
concat_df

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,endtime,re_intub_class,time_on_vent
0,14172652,22126865,11,E8497,9,2171-01-11 08:15:00,0,13.250000
1,17169563,21748215,10,E8499,9,2172-10-10 14:57:00,0,201.216660
2,11642909,23341616,6,E8497,9,2122-11-05 09:00:00,0,79.966670
3,16513166,28957782,22,E8497,9,2185-02-10 18:17:00,1,258.800000
4,16998152,26661707,11,E8497,9,2146-09-19 08:02:00,0,38.116665
...,...,...,...,...,...,...,...,...
2116,14029699,21629146,16,J8489,10,2143-01-07 11:58:00,0,162.633330
2117,17427545,22473775,8,I2720,10,2126-05-22 14:00:00,0,71.916664
2118,10302157,22665336,19,I2720,10,2189-06-28 17:26:00,0,46.750000
2119,13840732,26949782,27,I2720,10,2147-01-18 15:30:00,0,328.650000


In [14]:
##concat_df.to_sql('pulseox', engine, if_exists='replace',chunksize=my_chunk) ### very, very slow!

In [15]:
concat_df[concat_df['re_intub_class']==1]

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,endtime,re_intub_class,time_on_vent
3,16513166,28957782,22,E8497,9,2185-02-10 18:17:00,1,258.800000
6,17620129,26712888,15,E8497,9,2168-09-30 06:44:00,1,16.183332
7,17620129,26712888,11,E8498,9,2168-09-30 06:44:00,1,16.183332
11,18656560,28807703,16,E8497,9,2156-06-17 09:45:00,1,200.500000
22,14411859,24976204,26,E8497,9,2163-04-02 13:27:00,1,2254.450000
...,...,...,...,...,...,...,...,...
2059,19673450,26742984,9,I2721,10,2145-03-28 09:22:00,1,34.616665
2067,16123073,27484036,6,E849,10,2159-08-25 12:35:00,1,149.883330
2085,14340432,22952897,25,I2729,10,2119-06-02 22:21:00,1,48.850000
2097,12190919,27923841,26,I2720,10,2121-11-13 15:18:00,1,13.750000


#### Feather dataframe for future processing

In [16]:
concat_df=concat_df[['hadm_id','seq_num','icd_code']]

In [17]:
#concat_df.reset_index(inplace=True)
#concat_df.drop('index',axis = 1,inplace=True)

In [18]:
concat_df.to_feather(export_path)

In [19]:
concat_df

Unnamed: 0,hadm_id,seq_num,icd_code
0,22126865,11,E8497
1,21748215,10,E8499
2,23341616,6,E8497
3,28957782,22,E8497
4,26661707,11,E8497
...,...,...,...
2116,21629146,16,J8489
2117,22473775,8,I2720
2118,22665336,19,I2720
2119,26949782,27,I2720
