## Use psycopg2 engine to extract charts events for patient data

This script connects to the patient database 'extumate' and extracts chart events for the labelled patients identified by the field, hadm_id, in the table 'sample_vents'.

The script utilizes the pandas chunksize argument in order to avoid memory issues.

Finally, the data is stored with using `pd.DataFrame.to_feather` so it can be stored for future processing.

In [1]:
import sys
sys.path.append("../extumate")

from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import psutil
import os

from extumate.config import data_dir, extumate_engine_url

#### Set user-defined variables

In [2]:
data_dir

'../data/feathered/'

In [3]:
export_name = "height"
#feather_folder = "../data/feathered/"
export_path = data_dir+export_name
export_path


'../data/feathered/height'

#### Write sql query

In [4]:
sql_query = """
SELECT 
  chartevents.*,
  sample_vents.endtime, sample_vents.re_intub_class,sample_vents.time_on_vent
FROM
  chartevents
  INNER JOIN sample_vents ON chartevents.hadm_id = sample_vents.hadm_id
WHERE
  (
    (chartevents.itemid = 226730) OR
    (chartevents.itemid = 226707)
  );
"""

#### print virtual memory available

In [5]:
svmem = psutil.virtual_memory()
print (svmem.available) #in bytes 

3880484864


#### print size of database we're pulling from

In [6]:
os.path.getsize('../data/raw/chartevents.csv') 

29184776616

#### figure out chunk size for pandas dataframe reading

In [7]:
df_sample = pd.read_csv('../data/raw/chartevents.csv', nrows=10)
df_sample_size = df_sample.memory_usage(index=True).sum()
my_chunk = (2000000000 / df_sample_size)/10
my_chunk = int(my_chunk//1) # we get the integer part
print (my_chunk)

215517


In [8]:
# Define a database name (we're using a dataset on births, so we'll call it birth_db)

# Set your postgres username/password, and connection specifics

username = 'postgres'

password = 'password'    # change this

host     = 'localhost'

port     = '5432'            # default port that postgres listens on

db_name  = 'extumate'

#db_name  = 'birth_db'

In [9]:
## 'engine' is a connection to a database
## Here, we're using postgres, but sqlalchemy can connect to other things too.
engine = create_engine( 'postgresql://{}:{}@{}:{}/{}'.format(username, password, host, port, db_name) )
print(engine.url)

postgresql://postgres:password@localhost:5432/extumate


#### Check engine is working by checking for 'sample_vents' table

In [10]:
engine.has_table('sample_vents')

True

#### Connect using psycopg2 connection and query the database. 

Joining 'chartevents' with the 'sample_vents' table on the field hadm_id (so only pulling from patients who were ventilated), before selecting the type of event using the chartevents.itemid speeds up extraction of this data.

In [11]:
# Connect to make queries using psycopg2
con = None
con = psycopg2.connect(database = db_name, user = username, host=host,password=password)

df_result = pd.read_sql_query(sql_query,con,chunksize=my_chunk)
df_result

<generator object SQLiteDatabase._query_iterator at 0x7f9974e7bc80>

In [12]:
concat_df = pd.concat(
    [chunk
    for chunk in df_result])

In [13]:
concat_df

Unnamed: 0,subject_id,hadm_id,stay_id,charttime,storetime,itemid,value,valuenum,valueuom,warning,endtime,re_intub_class,time_on_vent
0,10074556,20846673,32640247,2126-02-12 01:43:00,2126-02-12 14:30:00,226707,70,70,Inch,0,2126-02-13 10:51:00,0,19.683332
1,10004235,24181354,30276431,2196-02-24 14:39:00,2196-02-28 18:02:00,226707,72,72,Inch,0,2196-02-27 16:28:00,0,71.600000
2,10004235,24181354,30276431,2196-02-24 14:39:00,2196-02-28 18:02:00,226730,183,183,cm,0,2196-02-27 16:28:00,0,71.600000
3,10005348,25239799,31523640,2130-10-27 12:06:00,2130-10-27 16:58:00,226707,73,73,Inch,0,2130-10-28 04:35:00,0,11.833333
4,10005348,25239799,31523640,2130-10-27 12:06:00,2130-10-27 16:58:00,226730,185,185,cm,0,2130-10-28 04:35:00,0,11.833333
...,...,...,...,...,...,...,...,...,...,...,...,...,...
24893,19054240,21472555,38651833,2164-08-09 18:57:00,2164-08-09 21:11:00,226730,175,175,cm,0,2164-08-10 11:20:00,0,14.250000
24894,19060629,24896717,37873734,2189-01-04 20:35:00,2189-01-31 11:24:00,226707,64,64,Inch,0,2189-01-20 17:57:00,0,377.450000
24895,19060629,24896717,37873734,2189-01-04 20:35:00,2189-01-31 11:24:00,226730,162.56,162.56,cm,0,2189-01-20 17:57:00,0,377.450000
24896,19062760,22605930,30132996,2197-03-17 16:56:00,2197-03-19 10:54:00,226707,62,62,Inch,0,2197-03-17 23:17:00,0,6.283333


In [14]:
##concat_df.to_sql('pulseox', engine, if_exists='replace',chunksize=my_chunk) ### very, very slow!

#### Feather dataframe for future processing

In [15]:
concat_df.reset_index(inplace=True)

In [16]:
concat_df.to_feather(export_path)