#### SQL Prompts to create 'master' table for EDA. Contains all hospital events, EKGs, and labels

In [None]:
from google.colab import auth
auth.authenticate_user()
print('Authenticated')

In [None]:
from google.colab import userdata
projectid=userdata.get('project_ID')

In [None]:
from google.cloud import bigquery
import pandas as pd
import numpy as np
import os
import db_dtypes

In [16]:
#filtering data to adults with non-missing age
%%bigquery --project={projectid}
CREATE OR REPLACE TABLE `{projectid}.ekg_labels.icd_labels` AS
SELECT *
FROM `ekg-classification.ekg_labels.records_w_diag_icd10`
WHERE age > 17 AND age IS NOT NULL
ORDER BY study_id ASC, subject_id ASC, ecg_time ASC;

Executing query with job ID: 98428ff4-e95b-4465-80d9-02ad8b06ec66
Query executing: 0.23s


ERROR:
 400 Invalid project ID '{projectid}'. Project IDs must contain 6-63 lowercase letters, digits, or dashes. Some project IDs also include domain name separated by a colon. IDs must start with a letter and may not end with a dash.; reason: invalid, location: {projectid}.ekg_labels.icd_labels, message: Invalid project ID '{projectid}'. Project IDs must contain 6-63 lowercase letters, digits, or dashes. Some project IDs also include domain name separated by a colon. IDs must start with a letter and may not end with a dash.

Location: US
Job ID: 98428ff4-e95b-4465-80d9-02ad8b06ec66



In [None]:
#Creates dataset with all possible ED visits. Induces duplicates and does not provide subject_id for admission + ICU patients. Separate join and re-run
%%bigquery --project={projectid}
CREATE OR REPLACE TABLE `{projectid}.ekg_data.all_events` AS
SELECT
  a.*,
  b.* EXCEPT(subject_id, hadm_id, race),
  c.* EXCEPT(subject_id, hadm_id, stay_id, intime, outtime),
  d.* EXCEPT(subject_id)
from `physionet-data.mimiciv_ed.edstays` a
full outer join
`physionet-data.mimiciv_3_1_hosp.admissions` b on (a.subject_id=b.subject_id)
left join
`physionet-data.mimiciv_3_1_icu.icustays` c on (b.subject_id=c.subject_id AND b.hadm_id=c.hadm_id)
left join
`physionet-data.mimiciv_ecg.record_list` d on (a.subject_id=d.subject_id);

In [None]:
#Missing values would be subjects that were in hosp_admissions (and ICU) tables but not ED
%%bigquery --project={projectid}
SELECT COUNT(*) AS missing_ids
FROM `{projectid}.ekg_data.all_events`
WHERE subject_id IS NULL;

In [None]:
#Updated join to fix missing subject_Ids
%%bigquery --project={projectid}
CREATE OR REPLACE TABLE `{projectid}.ekg_data.all_events` AS
WITH ed_hosp AS (
  SELECT
    COALESCE(a.subject_id, b.subject_id) AS subject_id1,
    COALESCE(a.hadm_id,   b.hadm_id)    AS hadm_id1,
    a.*,
    b.* EXCEPT(subject_id, hadm_id, race)
  FROM `physionet-data.mimiciv_ed.edstays` AS a
  FULL OUTER JOIN `physionet-data.mimiciv_3_1_hosp.admissions` AS b
    ON (a.subject_id = b.subject_id
   AND a.hadm_id = b.hadm_id)
)

SELECT
  v.subject_id1,
  v.hadm_id1,
  v.* EXCEPT(subject_id1, hadm_id1),
  c.* EXCEPT(subject_id, hadm_id, stay_id, intime, outtime),
  d.* EXCEPT(subject_id, file_name, study_id, ecg_time),
  e.* EXCEPT(subject_id, file_name, study_id, gender),
  d.file_name AS ecg_file,
  e.file_name AS label_file,
  d.study_id AS ecg_id,
  e.study_id AS label_id
FROM ed_hosp AS v
LEFT JOIN `physionet-data.mimiciv_3_1_icu.icustays` AS c
  ON (v.subject_id1 = c.subject_id
  AND v.hadm_id1 = c.hadm_id)
LEFT JOIN `physionet-data.mimiciv_ecg.record_list` AS d
  ON v.subject_id1 = d.subject_id
LEFT JOIN `ekg-classification.ekg_labels.icd_labels` AS e
  ON v.subject_id1 = e.subject_id;

In [None]:
#Check missingness
%%bigquery --project={projectid}
SELECT COUNT(*) AS missing_ids
FROM `{projectid}.ekg_data.all_events`
WHERE subject_id1 IS NULL;

#### *Updated join fixed missing subject_Ids when subject was in either ED OR HOSP table, but not both*

###### **Note: these prompts will nopt run with {projectid} in the sql line creating/replacing a table because SQL does not recognize {}. This is intentionally hidden for pushing this code to Github.**