## Extract Data from Mimic III
Leveraging the variable list developed in "idClinicalVariables", I begin to extract the variables.

For each group of variables for which I am comparing similarity, I export a csv file. Each csv filename represents the category, subcategory, and datatype.  When generating the similarity matrices, I will read in each csv file, parse the type and generate 

In [18]:
import pandas_gbq
from google.oauth2 import service_account
import json
import pandas as pd
from functools import reduce

In [3]:
# apply credentials
credentials = service_account.Credentials.from_service_account_file('../Patient-Similarity-credentials.json')
pandas_gbq.context.credentials = credentials
pandas_gbq.context.project = "patient-similarity"

In [54]:
# load the identified clinical variables
with open('../data/clinical variables.txt', 'r') as f:
    clinical_variables = json.load(f)
    
clinical_variables

{'lab tests': {'relevant_vars': ['Alanine Aminotransferase (ALT)',
   'Asparate Aminotransferase (AST)',
   'Fibrinogen, Functional',
   'Platelet Count',
   'INR(PT)',
   'PT',
   'PTT',
   'Albumin',
   'Albumin, Ascites',
   'Bilirubin, Total',
   'Bilirubin, Total, Ascites',
   'Bilirubin, Direct',
   'Bilirubin, Indirect',
   'Alkaline Phosphatase',
   'Phosphate',
   'Lactate Dehydrogenase (LD)',
   'Lactate Dehydrogenase, Ascites',
   'Lactate',
   'Gamma Glutamyltransferase',
   'Protein',
   'Protein, Total',
   'Total Protein, Urine',
   'Total Protein, Ascites',
   'Ammonia',
   'Hepatitis B Surface Antibody',
   'Hepatitis B Surface Antigen',
   'Hepatitis C Virus Antibody',
   'Hepatitis B Virus Core Antibody',
   'Alpha-Fetoprotein',
   'Iron',
   'Iron Binding Capacity, Total',
   'Transferrin',
   'Ferritin',
   'Anti-Mitochondrial Antibody',
   'Anti-Nuclear Antibody, Titer',
   'Anti-Nuclear Antibody',
   'Anti-Smooth Muscle Antibody',
   'Acetaminophen',
   'Creatini

## Extract Lab Events
Groups: enyzmes, proteins, blood information, hepatitis, antibodies, breakdown products, and other

In [29]:
#General query to explore values
"""SELECT A.SUBJECT_ID , A.HADM_ID , A.ITEMID , B.LABEL, A.CHARTTIME, A.VALUENUM
FROM `patient-similarity.mimic.labevents` as A
left join `patient-similarity.mimic.d_labitems` as B
on A.ITEMID = B.ITEMID
where SUBJECT_ID in (select subject_id from `patient-similarity.mimic.liver_pts`)
and LABEL in ('Protein', 'Protein, Total', 'Total Protein, Urine',
'Total Protein, Ascites', 'Alpha-Fetoprotein')
and A.HADM_ID is not null and VALUENum is not null
order by subject_id , A.HADM_ID,  B.LABEL ,  A.CHARTTIME """

"SELECT A.SUBJECT_ID , A.HADM_ID , A.ITEMID , B.LABEL, A.CHARTTIME, A.VALUENUM\nFROM `patient-similarity.mimic.labevents` as A\nleft join `patient-similarity.mimic.d_labitems` as B\non A.ITEMID = B.ITEMID\nwhere SUBJECT_ID in (select subject_id from `patient-similarity.mimic.liver_pts`)\nand LABEL in ('Protein', 'Protein, Total', 'Total Protein, Urine',\n'Total Protein, Ascites', 'Alpha-Fetoprotein')\nand A.HADM_ID is not null and VALUENum is not null\norder by subject_id , A.HADM_ID,  B.LABEL ,  A.CHARTTIME "

### Enzymes

In [5]:
enzymes = clinical_variables['lab tests']['enzymes']
enzymes

['Alanine Aminotransferase (ALT)',
 'Asparate Aminotransferase (AST)',
 'Alkaline Phosphatase',
 'Lactate Dehydrogenase (LD)',
 'Lactate Dehydrogenase, Ascites',
 'Lactate']

In [7]:
# we extract the average, min, and max of these enzymes for each patient
q = """select A.SUBJECT_ID, 
min(case when B.LABEL = 'Alanine Aminotransferase (ALT)' then A.Valuenum else null end) as min_ALT,
max(case when B.LABEL = 'Alanine Aminotransferase (ALT)' then A.Valuenum else null end) as max_ALT,
avg(case when B.LABEL = 'Alanine Aminotransferase (ALT)' then A.Valuenum else null end) as avg_ALT,

min(case when B.LABEL =  "Asparate Aminotransferase (AST)" then A.Valuenum else null end) as min_ast,
max(case when B.LABEL =  "Asparate Aminotransferase (AST)" then A.Valuenum else null end) as max_ast,
avg(case when B.LABEL =  "Asparate Aminotransferase (AST)" then A.Valuenum else null end) as avg_ast,

min(case when B.LABEL =  'Alkaline Phosphatase' then A.Valuenum else null end) as min_alkph,
max(case when B.LABEL =  'Alkaline Phosphatase' then A.Valuenum else null end) as max_alkph,
avg(case when B.LABEL =  'Alkaline Phosphatase' then A.Valuenum else null end) as avg_alkph,

min(case when B.LABEL =  'Lactate Dehydrogenase (LD)' then A.Valuenum else null end) as min_ld,
max(case when B.LABEL =  'Lactate Dehydrogenase (LD)' then A.Valuenum else null end) as max_ld,
avg(case when B.LABEL =  'Lactate Dehydrogenase (LD)' then A.Valuenum else null end) as avg_ld,

min(case when B.LABEL =  'Lactate Dehydrogenase, Ascites' then A.Valuenum else null end) as min_ld_ascites,
max(case when B.LABEL = 'Lactate Dehydrogenase, Ascites' then A.Valuenum else null end) as max_ld_ascites,
avg(case when B.LABEL =  'Lactate Dehydrogenase, Ascites' then A.Valuenum else null end) as avg_ld_ascites,

min(case when B.LABEL =  "Lactate" then A.Valuenum else null end) as min_lactate,
max(case when B.LABEL =  "Lactate" then A.Valuenum else null end) as max_lactate,
avg(case when B.LABEL =  "Lactate" then A.Valuenum else null end) as avg_lactate,

from `patient-similarity.mimic.labevents` as A
left join `patient-similarity.mimic.d_labitems` as B
using(ITEMID)
where subject_id in (select subject_id from `patient-similarity.mimic.liver_pts`)
and HADM_ID is not null and valuenum is not null
group by subject_id
order by subject_id"""

enzymes = pandas_gbq.read_gbq(q)
enzymes.head()

Unnamed: 0,SUBJECT_ID,min_ALT,max_ALT,avg_ALT,min_ast,max_ast,avg_ast,min_alkph,max_alkph,avg_alkph,min_ld,max_ld,avg_ld,min_ld_ascites,max_ld_ascites,avg_ld_ascites,min_lactate,max_lactate,avg_lactate
0,4,24.0,28.0,26.0,64.0,69.0,66.5,837.0,994.0,915.5,330.0,330.0,330.0,,,,2.1,2.1,2.1
1,52,22.0,30.0,25.636364,43.0,58.0,49.363636,55.0,100.0,77.363636,229.0,343.0,285.888889,,,,,,
2,78,52.0,52.0,52.0,101.0,101.0,101.0,129.0,129.0,129.0,,,,,,,,,
3,117,46.0,2442.0,192.869565,83.0,9295.0,615.913043,103.0,364.0,164.434783,196.0,6570.0,577.473684,,,,1.2,21.9,8.90625
4,140,,,,,,,,,,,,,,,,,,


In [8]:
enzymes.to_csv("../data/patientData/lab tests_cross sect_enzymes.csv")

### Proteins

In [9]:
proteins = clinical_variables['lab tests']['proteins']
proteins

['Protein',
 'Protein, Total',
 'Total Protein, Urine',
 'Total Protein, Ascites',
 'Alpha-Fetoprotein']

In [10]:
# min, max, and average for protein, but then just average for the rest
q = """select A.SUBJECT_ID, 
min(case when B.LABEL = 'Protein' then A.Valuenum else null end) as min_protein,
max(case when B.LABEL = 'Protein' then A.Valuenum else null end) as max_protein,
avg(case when B.LABEL = 'Protein' then A.Valuenum else null end) as avg_protein,

avg(case when B.LABEL =  'Protein, Total' then A.Valuenum else null end) as avg_tot_protein,

avg(case when B.LABEL =  'Total Protein, Urine' then A.Valuenum else null end) as avg_urine_prot,

avg(case when B.LABEL =  'Total Protein, Ascites' then A.Valuenum else null end) as avg_ascite_prot,

avg(case when B.LABEL =  'Alpha-Fetoprotein' then A.Valuenum else null end) as avg_fetoprotein,

from `patient-similarity.mimic.labevents` as A
left join `patient-similarity.mimic.d_labitems` as B
using(ITEMID)
where subject_id in (select subject_id from `patient-similarity.mimic.liver_pts`)
and HADM_ID is not null and valuenum is not null
group by subject_id
order by subject_id"""
proteins = pandas_gbq.read_gbq(q)
proteins.head()

Unnamed: 0,SUBJECT_ID,min_protein,max_protein,avg_protein,avg_tot_protein,avg_urine_prot,avg_ascite_prot,avg_fetoprotein
0,4,30.0,30.0,30.0,,,,
1,52,,,,,18.0,,
2,78,,,,,,,
3,117,500.0,500.0,500.0,5.05,145.0,,
4,140,,,,,,,


In [11]:
proteins.to_csv("../data/patientData/lab tests_cross sect_proteins.csv")

### Blood information

In [12]:
blood = clinical_variables['lab tests']['blood info']
blood

['Fibrinogen, Functional',
 'Platelet Count',
 'INR(PT)',
 'PT',
 'PTT',
 'Iron',
 'Iron Binding Capacity, Total',
 'Transferrin',
 'Ferritin']

In [14]:
# min, max, average for fibrinoden, platelet, int, pt, ptt
q = """
select A.SUBJECT_ID, 
min(case when B.LABEL = 'Fibrinogen, Functional' then A.Valuenum else null end) as min_fibrinogen,
max(case when B.LABEL = 'Fibrinogen, Functional' then A.Valuenum else null end) as max_fibrinogen,
avg(case when B.LABEL = 'Fibrinogen, Functional' then A.Valuenum else null end) as avg_fibrinogen,

min(case when B.LABEL = 'Platelet Count' then A.Valuenum else null end) as min_platelet,
max(case when B.LABEL = 'Platelet Count' then A.Valuenum else null end) as max_platelet,
avg(case when B.LABEL = 'Platelet Count' then A.Valuenum else null end) as avg_platelet,

min(case when B.LABEL = 'INR(PT)' then A.Valuenum else null end) as min_inr,
max(case when B.LABEL = 'INR(PT)' then A.Valuenum else null end) as max_inr,
avg(case when B.LABEL = 'INR(PT)' then A.Valuenum else null end) as avg_inr,

min(case when B.LABEL = 'PT' then A.Valuenum else null end) as min_pt,
max(case when B.LABEL = 'PT' then A.Valuenum else null end) as max_pt,
avg(case when B.LABEL = 'PT' then A.Valuenum else null end) as avg_pt,

min(case when B.LABEL = 'PTT' then A.Valuenum else null end) as min_ptt,
max(case when B.LABEL = 'PTT' then A.Valuenum else null end) as max_ptt,
avg(case when B.LABEL = 'PTT' then A.Valuenum else null end) as avg_ptt,

from `patient-similarity.mimic.labevents` as A
left join `patient-similarity.mimic.d_labitems` as B
using(ITEMID)
where subject_id in (select subject_id from `patient-similarity.mimic.liver_pts`)
and HADM_ID is not null and valuenum is not null
group by subject_id
order by subject_id"""

blood_desc = pandas_gbq.read_gbq(q)
blood_desc.head()


Unnamed: 0,SUBJECT_ID,min_fibrinogen,max_fibrinogen,avg_fibrinogen,min_platelet,max_platelet,avg_platelet,min_inr,max_inr,avg_inr,min_pt,max_pt,avg_pt,min_ptt,max_ptt,avg_ptt
0,4,,,,201.0,388.0,286.428571,1.0,1.1,1.05,12.3,12.8,12.55,31.3,33.2,32.25
1,52,,,,47.0,115.0,72.583333,1.7,2.7,2.241667,18.4,26.5,22.633333,37.3,48.4,42.633333
2,78,,,,45.0,53.0,47.75,1.2,1.2,1.2,13.2,13.7,13.45,37.7,39.0,38.35
3,117,79.0,253.0,158.25,24.0,98.0,52.565217,1.5,3.9,2.14375,15.0,24.7,18.25625,34.8,150.0,56.434091
4,140,,,,65.0,86.0,72.666667,1.0,1.3,1.15,11.8,14.5,13.15,28.9,31.5,30.2


In [21]:
# first and last for iron, iron binding, transferrin, ferritin (they occur once or twice per visit)
q = """select SUBJECT_ID, 
/* manually transposing the data */
max(case when LABEL = "Ferritin" then ValueNum else null end) as first_ferritin,
max(case when LABEL = 'Iron' then ValueNum else null end) as first_iron,
max(case when LABEL = 'Iron Binding Capacity, Total' then ValueNum else null end) as first_iron_bind,
max(case when LABEL = 'Transferrin' then ValueNum else null end) as first_transferrin
from (
  SELECT A.SUBJECT_ID , C.LABEL, A.ValueNum, A.charttime
  from `patient-similarity.mimic.labevents` as A
  /* inner join just the min or max date to only include those value */
  inner join (select
     subject_id, itemID, min(CHARTTIME) as charttime
    FROM `patient-similarity.mimic.labevents` group by subject_id, ITEMID) as B
    using(subject_id, itemid, charttime)
    
  /* typical get the label and clean the data */
  left join `patient-similarity.mimic.d_labitems` as C
  on A.ITEMID = C.ITEMID
  where SUBJECT_ID in (select subject_id from `patient-similarity.mimic.liver_pts`)
  and LABEL in ('Iron','Iron Binding Capacity, Total', 'Transferrin', 'Ferritin')
  and A.HADM_ID is not null and VALUENum is not null
  order by subject_id,  C.LABEL 
) group by subject_id 
order by subject_id"""

blood_first = pandas_gbq.read_gbq(q)
blood_first.head()

Unnamed: 0,SUBJECT_ID,first_ferritin,first_iron,first_iron_bind,first_transferrin
0,52,640.0,90.0,185.0,142.0
1,188,107.0,38.0,173.0,133.0
2,252,816.0,84.0,130.0,100.0
3,267,348.0,8.0,112.0,86.0
4,279,768.0,156.0,237.0,182.0


In [22]:
q = """select SUBJECT_ID, 
/* manually transposing the data */
max(case when LABEL = "Ferritin" then ValueNum else null end) as last_ferritin,
max(case when LABEL = 'Iron' then ValueNum else null end) as last_iron,
max(case when LABEL = 'Iron Binding Capacity, Total' then ValueNum else null end) as last_iron_bind,
max(case when LABEL = 'Transferrin' then ValueNum else null end) as last_transferrin
from (
  SELECT A.SUBJECT_ID , C.LABEL, A.ValueNum, A.charttime
  from `patient-similarity.mimic.labevents` as A
  /* inner join just the min or max date to only include those value */
  inner join (select
     subject_id, itemID, max(CHARTTIME) as charttime
    FROM `patient-similarity.mimic.labevents` group by subject_id, ITEMID) as B
    using(subject_id, itemid, charttime)
    
  /* typical get the label and clean the data */
  left join `patient-similarity.mimic.d_labitems` as C
  on A.ITEMID = C.ITEMID
  where SUBJECT_ID in (select subject_id from `patient-similarity.mimic.liver_pts`)
  and LABEL in ('Iron','Iron Binding Capacity, Total', 'Transferrin', 'Ferritin')
  and A.HADM_ID is not null and VALUENum is not null
  order by subject_id,  C.LABEL 
) group by subject_id 
order by subject_id"""

blood_last = pandas_gbq.read_gbq(q)
blood_last.head()

Unnamed: 0,SUBJECT_ID,last_ferritin,last_iron,last_iron_bind,last_transferrin
0,188,515.0,142.0,159.0,122.0
1,236,541.0,24.0,204.0,157.0
2,252,,84.0,130.0,100.0
3,267,348.0,8.0,112.0,86.0
4,279,768.0,156.0,237.0,182.0


In [24]:
# now merge and export
blood =  reduce(lambda left,right: pd.merge(left,right,on=['SUBJECT_ID'],
                                            how='left'), [blood_desc, blood_first, blood_last])
blood.describe()

Unnamed: 0,SUBJECT_ID,min_fibrinogen,max_fibrinogen,avg_fibrinogen,min_platelet,max_platelet,avg_platelet,min_inr,max_inr,avg_inr,...,max_ptt,avg_ptt,first_ferritin,first_iron,first_iron_bind,first_transferrin,last_ferritin,last_iron,last_iron_bind,last_transferrin
count,2880.0,1597.0,1597.0,1597.0,2878.0,2878.0,2878.0,2828.0,2828.0,2828.0,...,2823.0,2823.0,916.0,1011.0,992.0,991.0,897.0,977.0,959.0,959.0
mean,42156.676736,205.816844,326.929618,258.417616,101.73975,265.646282,164.22966,1.277542,2.907822,1.692092,...,70.452781,40.767836,758.977183,70.663699,205.184476,157.812311,755.858974,66.222108,198.993743,153.042753
std,29862.281358,144.740984,177.426445,140.79819,82.36037,184.785503,108.768297,0.388529,4.714244,0.94086,...,44.273897,13.960098,1986.133543,57.436316,87.362458,67.236016,1702.962718,54.794445,87.295935,67.155424
min,4.0,35.0,43.0,43.0,5.0,24.0,15.717647,0.0,0.9,0.85,...,19.6,19.6,3.9,6.0,18.0,14.0,4.1,5.0,17.0,13.0
25%,16030.0,105.0,199.0,161.0,44.0,142.0,87.145833,1.0,1.4,1.233333,...,34.8,30.8325,146.75,28.0,144.0,111.0,170.0,27.0,138.0,106.0
50%,31490.0,163.0,281.0,222.285714,77.0,219.0,136.324561,1.2,1.9,1.475,...,51.0,37.1,373.0,52.0,191.0,147.0,393.0,49.0,185.0,142.0
75%,67943.25,259.0,409.0,316.0,137.0,334.0,215.994565,1.4,3.0,1.895982,...,100.35,47.548182,805.75,96.0,259.0,199.0,846.0,88.0,250.5,192.5
max,99923.0,1246.0,1246.0,1246.0,806.0,2315.0,1058.037037,7.2,88.6,28.333333,...,150.0,150.0,35657.0,367.0,564.0,434.0,35657.0,367.0,564.0,434.0


In [25]:
blood.head()

Unnamed: 0,SUBJECT_ID,min_fibrinogen,max_fibrinogen,avg_fibrinogen,min_platelet,max_platelet,avg_platelet,min_inr,max_inr,avg_inr,...,max_ptt,avg_ptt,first_ferritin,first_iron,first_iron_bind,first_transferrin,last_ferritin,last_iron,last_iron_bind,last_transferrin
0,4,,,,201.0,388.0,286.428571,1.0,1.1,1.05,...,33.2,32.25,,,,,,,,
1,52,,,,47.0,115.0,72.583333,1.7,2.7,2.241667,...,48.4,42.633333,640.0,90.0,185.0,142.0,,,,
2,78,,,,45.0,53.0,47.75,1.2,1.2,1.2,...,39.0,38.35,,,,,,,,
3,117,79.0,253.0,158.25,24.0,98.0,52.565217,1.5,3.9,2.14375,...,150.0,56.434091,,,,,,,,
4,140,,,,65.0,86.0,72.666667,1.0,1.3,1.15,...,31.5,30.2,,,,,,,,


In [26]:
blood.to_csv("../data/patientData/lab tests_cross sect_blood.csv")

### Hepatitis

In [30]:
hepatitis = clinical_variables['lab tests']['hepatitis']
hepatitis

['Hepatitis B Surface Antibody',
 'Hepatitis B Surface Antigen',
 'Hepatitis C Virus Antibody',
 'Hepatitis B Virus Core Antibody']

In [37]:
# load in our variable mappings 
from variableMaps.hepatitisMap import *
hep_b_antigen

{'NEGATIVE': 0, 'POSITIVE': 0}

In [41]:
# we extract the latest hepatitis status
q = """select SUBJECT_ID, 
/* manually transposing the data */
max(case when LABEL = 'Hepatitis B Surface Antibody' then Value else null end) as hep_b_surf_antibody,
max(case when LABEL = 'Hepatitis B Surface Antigen' then Value else null end) as hep_b_surf_antigen,
max(case when LABEL = 'Hepatitis C Virus Antibody' then Value else null end) as hep_c_antibody,
max(case when LABEL = 'Hepatitis B Virus Core Antibody' then Value else null end) as hep_b_virus_antibody
from (
  SELECT A.SUBJECT_ID , C.LABEL, A.Value, A.charttime
  from `patient-similarity.mimic.labevents` as A
  /* inner join just the min or max date to only include those value */
  inner join (select
     subject_id, itemID, max(CHARTTIME) as charttime
    FROM `patient-similarity.mimic.labevents` group by subject_id, ITEMID) as B
    using(subject_id, itemid, charttime)
    
  /* typical get the label and clean the data */
  left join `patient-similarity.mimic.d_labitems` as C
  on A.ITEMID = C.ITEMID
  where SUBJECT_ID in (select subject_id from `patient-similarity.mimic.liver_pts`)
  and LABEL in ('Hepatitis B Surface Antibody','Hepatitis B Surface Antigen',
 'Hepatitis C Virus Antibody', 'Hepatitis B Virus Core Antibody')
  and A.HADM_ID is not null and VALUE is not null
  order by subject_id,  C.LABEL 
) group by subject_id 
order by subject_id"""
hepatitis = pandas_gbq.read_gbq(q)
hepatitis.head()

Unnamed: 0,SUBJECT_ID,hep_b_surf_antibody,hep_b_surf_antigen,hep_c_antibody,hep_b_virus_antibody
0,226,,NEGATIVE,,NEGATIVE
1,252,POSITIVE,NEGATIVE,NEGATIVE,NEGATIVE
2,279,NEGATIVE,NEGATIVE,NEGATIVE,NEGATIVE
3,314,NEGATIVE,NEGATIVE,POSITIVE,POSITIVE
4,357,POSITIVE,NEGATIVE,NEGATIVE,NEGATIVE


In [42]:
# now we map
hepatitis = hepatitis.replace({
    "hep_b_surf_antibody":hep_b_antibody,
    "hep_b_surf_antigen": hep_b_antigen,
    "hep_c_antibody": Hep_c_antibody,
    "hep_b_virus_antibody":hep_b_virus_antibody
}).fillna(-1)
hepatitis.head()

Unnamed: 0,SUBJECT_ID,hep_b_surf_antibody,hep_b_surf_antigen,hep_c_antibody,hep_b_virus_antibody
0,226,-1,0.0,-1.0,0.0
1,252,4,0.0,0.0,0.0
2,279,0,0.0,0.0,0.0
3,314,0,0.0,2.0,3.0
4,357,4,0.0,0.0,0.0


In [43]:
hepatitis.to_csv("../data/patientData/lab tests_cross sect_hepatitis.csv")

## Antibodies

In [46]:
antibodies = clinical_variables['lab tests']['antibodies (other)']
antibodies

['Anti-Mitochondrial Antibody',
 'Anti-Nuclear Antibody, Titer',
 'Anti-Nuclear Antibody',
 'Anti-Smooth Muscle Antibody']

In [47]:
from variableMaps.antibodyMap import *
nuclear

{'NEGATIVE': 0, 'POSITIVE': 1}

In [48]:
# we extract the latest status
q = """select SUBJECT_ID, 
/* manually transposing the data */
max(case when LABEL = 'Anti-Mitochondrial Antibody' then Value else null end) as anti_mitochondrial,
max(case when LABEL = 'Anti-Nuclear Antibody, Titer' then Value else null end) as anti_nuclear_titer,
max(case when LABEL = 'Anti-Nuclear Antibody' then Value else null end) as anti_nuclear,
max(case when LABEL = 'Anti-Smooth Muscle Antibody' then Value else null end) as anti_smooth_muscle
from (
  SELECT A.SUBJECT_ID , C.LABEL, A.Value, A.charttime
  from `patient-similarity.mimic.labevents` as A
  /* inner join just the min or max date to only include those value */
  inner join (select
     subject_id, itemID, max(CHARTTIME) as charttime
    FROM `patient-similarity.mimic.labevents` group by subject_id, ITEMID) as B
    using(subject_id, itemid, charttime)
    
  /* typical get the label and clean the data */
  left join `patient-similarity.mimic.d_labitems` as C
  on A.ITEMID = C.ITEMID
  where SUBJECT_ID in (select subject_id from `patient-similarity.mimic.liver_pts`)
  and LABEL in ('Anti-Mitochondrial Antibody','Anti-Nuclear Antibody, Titer',
 'Anti-Nuclear Antibody', 'Anti-Smooth Muscle Antibody')
  and A.HADM_ID is not null and VALUE is not null
  order by subject_id,  C.LABEL 
) group by subject_id 
order by subject_id"""

antibodies = pandas_gbq.read_gbq(q)

In [49]:
antibodies = antibodies.replace({
    "anti_mitochondrial":mitochondrial,
    "anti_nuclear_titer": nuclear_titer,
    "anti_nuclear": nuclear,
    "anti_smooth_muscle":muscle
}).fillna(-1)
antibodies.head()

Unnamed: 0,SUBJECT_ID,anti_mitochondrial,anti_nuclear_titer,anti_nuclear,anti_smooth_muscle
0,252,-1.0,1.0,1.0,-1.0
1,279,0.0,-1.0,0.0,-1.0
2,357,-1.0,1.0,1.0,-1.0
3,433,-1.0,3.0,1.0,-1.0
4,634,1.0,-1.0,0.0,-1.0


In [50]:
antibodies.to_csv("../data/patientData/lab tests_cross sect_antibodies.csv")

## Breakdown products
i.e., products broken down through enzymatic processes such as creatinine@

In [55]:
breakdown = clinical_variables['lab tests']['breakdown products']
breakdown

['Albumin',
 'Albumin, Ascites',
 'Bilirubin, Total',
 'Bilirubin, Total, Ascites',
 'Bilirubin, Direct',
 'Bilirubin, Indirect',
 'Creatinine',
 'Creatine Kinase (CK)',
 'Creatine Kinase, MB Isoenzyme',
 'Creatinine, Urine']

In [56]:
# max, min, average for albumin, bilirubin, total, and creatinine
# average for the rest 
q = """select A.SUBJECT_ID, 
/* Albumin */
min(case when B.LABEL = 'Albumin' then A.Valuenum else null end) as min_albumin,
max(case when B.LABEL = 'Albumin' then A.Valuenum else null end) as max_albumin,
avg(case when B.LABEL = 'Albumin' then A.Valuenum else null end) as avg_albumin,

avg(case when B.LABEL = 'Albumin, Ascites' then A.Valuenum else null end) as avg_alb_ascites,

/* Bilirubin */
min(case when B.LABEL = 'Bilirubin, Total' then A.Valuenum else null end) as min_bilirubin,
max(case when B.LABEL = 'Bilirubin, Total' then A.Valuenum else null end) as max_bilirubin,
avg(case when B.LABEL = 'Bilirubin, Total' then A.Valuenum else null end) as avg_bilirubin,

avg(case when B.LABEL = 'Bilirubin, Total, Ascites' then A.Valuenum else null end) as avg_bili_ascites,

avg(case when B.LABEL = 'Bilirubin, Direct' then A.Valuenum else null end) as avg_bili_direct,

avg(case when B.LABEL = 'Bilirubin, Indirect' then A.Valuenum else null end) as avg_bili_indirect,

/* Creatinine */
min(case when B.LABEL = 'Creatinine' then A.Valuenum else null end) as min_creatinine,
max(case when B.LABEL = 'Creatinine' then A.Valuenum else null end) as max_creatinine,
avg(case when B.LABEL = 'Creatinine' then A.Valuenum else null end) as avg_creatinine,

avg(case when B.LABEL = 'Creatine Kinase (CK)' then A.Valuenum else null end) as avg_ck,

avg(case when B.LABEL = 'Creatine Kinase, MB Isoenzyme' then A.Valuenum else null end) as avg_ck_iso,

avg(case when B.LABEL = 'Creatinine, Urine' then A.Valuenum else null end) as avg_creat_urine,

from `patient-similarity.mimic.labevents` as A
left join `patient-similarity.mimic.d_labitems` as B
using(ITEMID)
where subject_id in (select subject_id from `patient-similarity.mimic.liver_pts`)
and HADM_ID is not null and valuenum is not null
group by subject_id
order by subject_id"""
breakdown = pandas_gbq.read_gbq(q)
breakdown.head()

Unnamed: 0,SUBJECT_ID,min_albumin,max_albumin,avg_albumin,avg_alb_ascites,min_bilirubin,max_bilirubin,avg_bilirubin,avg_bili_ascites,avg_bili_direct,avg_bili_indirect,min_creatinine,max_creatinine,avg_creatinine,avg_ck,avg_ck_iso,avg_creat_urine
0,4,2.8,2.8,2.8,,1.9,2.2,2.05,,1.5,0.55,0.4,0.5,0.45,,,
1,52,3.0,3.2,3.1,,8.2,15.7,11.363636,,,,0.8,3.5,1.6,,,125.5
2,78,2.7,3.1,2.866667,,0.8,0.8,0.8,,,,0.5,0.8,0.6,,,
3,117,1.7,3.8,2.733333,,3.1,54.0,30.846154,,14.728571,9.885714,0.4,4.1,1.836735,45.0,,137.0
4,140,,,,,,,,,,,0.7,0.8,0.766667,65.0,4.0,


In [57]:
breakdown.to_csv("../data/patientData/lab tests_cross sect_breakdown products.csv")

## Other Lab Tests

In [58]:
clinical_variables['lab tests']['other']

['Phosphate',
 'Gamma Glutamyltransferase',
 'Ammonia',
 'Acetaminophen',
 'Sodium',
 'Potassium']