In [1]:
# Load the Colab Drive helper and mount Google Drive at /content/drive so that
# later cells can use paths under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Work from the project's notebook folder on Drive; the relative '../data' path
# used when saving the CSV at the end of this notebook is resolved from here.
# NOTE(review): the output saved with this cell shows
# .../MLA4H/project_final/notebook rather than .../MLA4H/project/notebook —
# confirm which folder is current before re-running.
%cd /content/drive/My Drive/MLA4H/project/notebook

/content/drive/My Drive/MLA4H/project_final/notebook


In [6]:
# Import libraries
import os
import pandas_gbq
import pandas as pd

# Access data using Google BigQuery.
from google.colab import auth
from google.cloud import bigquery

# Authenticate the current Colab user. This must run before any query cell.
# NOTE(review): presumably this supplies the credentials pandas_gbq uses for
# the BigQuery reads below — confirm.
auth.authenticate_user()

In [8]:
# Set up environment variables
# Google Cloud project id; it is also the default `project_id` argument of
# run_query, captured when that function is defined.
project_id = 'mimic-433109'

# Export the same project id through the process environment.
# NOTE(review): presumably so Google Cloud client libraries pick up this
# project by default — confirm it is still required.
os.environ["GOOGLE_CLOUD_PROJECT"] = project_id

# Read data from BigQuery into pandas dataframes.
def run_query(query, project_id=project_id):
    """Run a SQL statement through pandas-gbq and return its result.

    Args:
        query: SQL text, forwarded to ``pandas_gbq.read_gbq`` unchanged.
        project_id: Project id forwarded to ``read_gbq``. Defaults to the
            value the module-level ``project_id`` had when this function
            was defined.

    Returns:
        Whatever ``pandas_gbq.read_gbq`` returns for the query.
    """
    # Every query in this notebook uses the standard SQL dialect.
    read_options = {'project_id': project_id, 'dialect': 'standard'}
    return pandas_gbq.read_gbq(query, **read_options)

# NOTE(review): `dataset` is not referenced by any cell visible in this
# notebook; the queries read `physionet-data.mimiciv_hosp.*` directly.
# Confirm whether it is still needed.
dataset = 'mimic_demo'

# 1. Check the lab item IDs (`itemid`)

In [9]:
%%time

# Look up the candidate lab itemids in the d_labitems dictionary table to
# confirm what each one measures before they are used in the cohort query.
query = """
SELECT *
FROM `physionet-data.mimiciv_hosp.d_labitems`
WHERE itemid in (51106, 51082, 52000, 51077, 50965)
"""

# The filter lists five itemids, so tail(20) is enough to show every match.
run_query(query).tail(20)

Downloading: 100%|[32m██████████[0m|
CPU times: user 21.8 ms, sys: 4.17 ms, total: 26 ms
Wall time: 743 ms


Unnamed: 0,itemid,label,fluid,category
0,50965,Parathyroid Hormone,Blood,Chemistry
1,51077,"Calcium, Urine",Urine,Chemistry
2,51082,"Creatinine, Urine",Urine,Chemistry
3,51106,Urine Creatinine,Urine,Chemistry
4,52000,Urine Creatinine,Urine,Chemistry


# 2. Check the ICD codes (`icd_code`, `icd_version`)

In [10]:
%%time

query = """
SELECT *
FROM `physionet-data.mimiciv_hosp.d_icd_diagnoses`
WHERE (long_title LIKE '%Osteoporosis%')
OR (long_title LIKE '%Hyperparathyroidism%')
"""

run_query(query).tail(10)

Downloading: 100%|[32m██████████[0m|
CPU times: user 19.5 ms, sys: 1.68 ms, total: 21.2 ms
Wall time: 660 ms


Unnamed: 0,icd_code,icd_version,long_title
0,25200,9,"Hyperparathyroidism, unspecified"
1,73300,9,"Osteoporosis, unspecified"
2,E21,10,Hyperparathyroidism and other disorders of par...
3,E213,10,"Hyperparathyroidism, unspecified"
4,M80,10,Osteoporosis with current pathological fracture
5,M81,10,Osteoporosis without current pathological frac...


# 3. Data Collection

In [11]:
%%time

query = f"""
WITH OpPatients AS (
    SELECT DISTINCT subject_id
    FROM `physionet-data.mimiciv_hosp.diagnoses_icd`
    WHERE (icd_code = '73300' AND icd_version = 9)
    OR (icd_code = 'M80' AND icd_version = 10)
    OR (icd_code = 'M81' AND icd_version = 10)
),
AllPatients AS (
    SELECT DISTINCT subject_id
    FROM `physionet-data.mimiciv_hosp.diagnoses_icd`
),
LabEventsItem AS (
    SELECT subject_id, itemid, valuenum,
    FROM `physionet-data.mimiciv_hosp.labevents`
    WHERE (itemid IN (51106, 51082, 52000, 51077, 50965))
    AND (subject_id IN (SELECT subject_id FROM AllPatients))
),
LabEventsByHadmItem AS(
    SELECT
        subject_id,

        -- Columns for itemid 51106
        AVG(CASE WHEN itemid = 51106 THEN valuenum ELSE NULL END) AS avg_UrineCreatinine1,
        -- Columns for itemid 51082
        AVG(CASE WHEN itemid = 51082 THEN valuenum ELSE NULL END) AS avg_UrineCreatinine2,
        -- Columns for itemid 52000
        AVG(CASE WHEN itemid = 52000 THEN valuenum ELSE NULL END) AS avg_UrineCreatinine3,

        -- Columns for itemid 51077
        AVG(CASE WHEN itemid = 51077 THEN valuenum ELSE NULL END) AS avg_UrineCalcium,

        -- Columns for itemid 50965
        AVG(CASE WHEN itemid = 50965 THEN valuenum ELSE NULL END) AS avg_ParathyroidHormone,

    FROM LabEventsItem
    GROUP BY subject_id
    HAVING NOT (
        avg_UrineCreatinine1 IS NULL AND avg_UrineCalcium IS NULL AND avg_ParathyroidHormone IS NULL
        AND avg_UrineCreatinine2 IS NULL AND avg_UrineCreatinine3 IS NULL
    )
),
LabEventsPatient AS(
    SELECT
        t1.*, t2.gender, t2.anchor_age
    FROM LabEventsByHadmItem AS t1
    JOIN `physionet-data.mimiciv_hosp.patients` AS t2
    ON t1.subject_id = t2.subject_id
),
LabeledLabEvents AS (
    SELECT
        t1.*,
        CASE WHEN t2.subject_id IS NOT NULL THEN 1 ELSE 0 END AS Osteoporosis
    FROM LabEventsPatient AS t1
    LEFT JOIN OpPatients AS t2
    ON t1.subject_id = t2.subject_id
)

SELECT * FROM LabeledLabEvents
"""

df = run_query(query)
df.tail(10)

Downloading: 100%|[32m██████████[0m|
CPU times: user 3.54 s, sys: 273 ms, total: 3.82 s
Wall time: 10.5 s


Unnamed: 0,subject_id,avg_UrineCreatinine1,avg_UrineCreatinine2,avg_UrineCreatinine3,avg_UrineCalcium,avg_ParathyroidHormone,gender,anchor_age,Osteoporosis
49737,11973788,,81.666667,,,,F,71,0
49738,12579591,,47.0,,,,F,72,0
49739,15731508,,41.0,,,59.0,M,72,0
49740,18802748,,45.666667,,,,M,46,0
49741,13132968,,146.0,,,38.0,F,50,0
49742,19824731,,45.0,,12.7,58.428571,F,66,0
49743,13543245,,173.0,,,,F,91,0
49744,13774741,,124.666667,,,,F,91,0
49745,14476240,,35.666667,,,,F,64,0
49746,18539655,,133.0,,,,M,69,0


In [12]:
# Save DataFrame to CSV
# The path is relative to the notebook's current working directory. Create the
# target folder first, because to_csv fails if the directory does not exist.
output_path = '../data/raw_Ost_LabEvents.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# index=False: the row index carries no information, so it is not written.
df.to_csv(output_path, index=False)