# Additional EDA
Miscellaneous exploration snippets moved out of the other notebooks to keep those notebooks clean while still preserving the exploratory work.

In [2]:
import pandas_gbq
from google.oauth2 import service_account
import json
import pandas as pd
from functools import reduce

In [3]:
# Authenticate pandas_gbq: load the service-account key, then register the
# credentials and the default project on the shared pandas_gbq context.
key_path = '../Patient-Similarity-credentials.json'
credentials = service_account.Credentials.from_service_account_file(key_path)
gbq_context = pandas_gbq.context
gbq_context.credentials = credentials
gbq_context.project = "patient-similarity"

## Extract Data 

In [4]:
# Load the identified clinical variables (a JSON document despite the .txt
# extension). The encoding is pinned to UTF-8 so parsing does not depend on
# the platform's locale default.
with open('../data/clinical variables.txt', 'r', encoding='utf-8') as f:
    clinical_variables = json.load(f)

In [5]:
hepatitis = clinical_variables['lab tests']['hepatitis']
# Hepatitis results need to be cleaned and converted to numeric.
# First we list every distinct raw value recorded for each hepatitis test.
#
# The test label is bound as a named query parameter (@test_label) instead of
# being pasted into the SQL text, so a label containing quotes or backslashes
# cannot break or alter the statement. The SQL itself does not change between
# iterations, so it is built once outside the loop.
q = """SELECT distinct B.Label, A.Value
        FROM `patient-similarity.mimic.labevents` as A
        left join `patient-similarity.mimic.d_labitems` as B
        on A.ITEMID = B.ITEMID
        where SUBJECT_ID in (select subject_id from `patient-similarity.mimic.liver_pts`)
        and LABEL = @test_label
        and A.HADM_ID is not null and A.Value is not null"""
for test in hepatitis:
    query_config = {
        'query': {
            'parameterMode': 'NAMED',
            'queryParameters': [
                {
                    'name': 'test_label',
                    'parameterType': {'type': 'STRING'},
                    'parameterValue': {'value': test},
                },
            ],
        }
    }
    hepatitis_values = pandas_gbq.read_gbq(q, configuration=query_config)
    display(hepatitis_values)
    # The rendered table shortens long strings, so also print the full values.
    print(list(hepatitis_values.Value))

Unnamed: 0,Label,Value
0,Hepatitis B Surface Antibody,POSITIVE
1,Hepatitis B Surface Antibody,"POSITIVE,TITER IS > 450MIU/ML"
2,Hepatitis B Surface Antibody,NEGATIVE
3,Hepatitis B Surface Antibody,BORDERLINE POSITIVE -- C/W TITER OF ROUGHLY 10...
4,Hepatitis B Surface Antibody,"POSITIVE,TITER IS GREATER THAN 450 MIU/ML"
5,Hepatitis B Surface Antibody,"POSITIVE, TITER IS GREATER THAN 450 MIU/ML"
6,Hepatitis B Surface Antibody,"POSITIVE,TITER IS BETWEEN 100 AND 450 MIU/ML"
7,Hepatitis B Surface Antibody,BORDERLINE POSITIVE
8,Hepatitis B Surface Antibody,"POSITIVE,TITER IS GREATER THAN 450/MIU/ML"
9,Hepatitis B Surface Antibody,"POSITIVE,TITER IS > 450 MIU/ML"


['POSITIVE', 'POSITIVE,TITER IS > 450MIU/ML', 'NEGATIVE', 'BORDERLINE POSITIVE -- C/W TITER OF ROUGHLY 10 MIU/ML', 'POSITIVE,TITER IS GREATER THAN 450 MIU/ML', 'POSITIVE, TITER IS GREATER THAN 450 MIU/ML', 'POSITIVE,TITER IS BETWEEN 100 AND 450 MIU/ML', 'BORDERLINE POSITIVE', 'POSITIVE,TITER IS GREATER THAN 450/MIU/ML', 'POSITIVE,TITER IS > 450 MIU/ML', 'POSITIVE,TITER IS >450 MIU/ML', 'POSITIVE TITER IS > 450 MIU/ML', 'POSITIVE -- C/W TITER OF MORE THAN 100 MIU/ML', 'POSITIVE,TITER IS BETWEEN 150 AND 450 MIU/ML', 'POSITIVE,TITER IS BETWEEN 10 AND 100 MIU/ML', 'POSITIVE TITER IS GREATER THAN 450', 'POSITIVE, TITER IS >450 MIU/ML', 'POSITIVE TITER IS GREATER THAN 450 MIU/ML', 'POSITIVE, TITER IS BETWEEN 100 AND 450 MIU/ML', 'POSITIVE TITER IS >450 MIU/ML', 'BORDERLINE, CONSISTENT WITH A TITER OF 10 MIU/ML', 'POSITIVE - TITER IS GREATER THAN 450 MIU/ML', 'POSITIVE TITER IS GREATER THAN450MIU/ML', 'POSITIVE, TITER IS >450/MIU/ML', 'POSITIVE, TITER GREATER THAN 450MIU/ML', 'NEGATIVE - Leve

Unnamed: 0,Label,Value
0,Hepatitis B Surface Antigen,NEGATIVE
1,Hepatitis B Surface Antigen,POSITIVE


['NEGATIVE', 'POSITIVE']


Unnamed: 0,Label,Value
0,Hepatitis C Virus Antibody,POSITIVE
1,Hepatitis C Virus Antibody,NEGATIVE
2,Hepatitis C Virus Antibody,INDETERMINATE
3,Hepatitis C Virus Antibody,INDETERMINATE RESULT
4,Hepatitis C Virus Antibody,EQUIVOCAL


['POSITIVE', 'NEGATIVE', 'INDETERMINATE', 'INDETERMINATE RESULT', 'EQUIVOCAL']


Unnamed: 0,Label,Value
0,Hepatitis B Virus Core Antibody,NEGATIVE
1,Hepatitis B Virus Core Antibody,POSITIVE
2,Hepatitis B Virus Core Antibody,BORDERLINE POSITIVE
3,Hepatitis B Virus Core Antibody,EQUIVOCAL
4,Hepatitis B Virus Core Antibody,INDETERMINATE RESULT


['NEGATIVE', 'POSITIVE', 'BORDERLINE POSITIVE', 'EQUIVOCAL', 'INDETERMINATE RESULT']


In [6]:
# List every distinct raw value recorded for each of the other antibody tests.
#
# The test label is bound as a named query parameter (@test_label) instead of
# being pasted into the SQL text, so a label containing quotes or backslashes
# cannot break or alter the statement. The SQL itself does not change between
# iterations, so it is built once outside the loop.
antibodies = clinical_variables['lab tests']['antibodies (other)']
q = """SELECT distinct B.Label, A.Value
        FROM `patient-similarity.mimic.labevents` as A
        left join `patient-similarity.mimic.d_labitems` as B
        on A.ITEMID = B.ITEMID
        where SUBJECT_ID in (select subject_id from `patient-similarity.mimic.liver_pts`)
        and LABEL = @test_label
        and A.HADM_ID is not null and A.Value is not null"""
for test in antibodies:
    query_config = {
        'query': {
            'parameterMode': 'NAMED',
            'queryParameters': [
                {
                    'name': 'test_label',
                    'parameterType': {'type': 'STRING'},
                    'parameterValue': {'value': test},
                },
            ],
        }
    }
    antibody_values = pandas_gbq.read_gbq(q, configuration=query_config)
    display(antibody_values)
    # The rendered table shortens long strings, so also print the full values.
    print(list(antibody_values.Value))

Unnamed: 0,Label,Value
0,Anti-Mitochondrial Antibody,NEGATIVE
1,Anti-Mitochondrial Antibody,POSITIVE


['NEGATIVE', 'POSITIVE']


Unnamed: 0,Label,Value
0,"Anti-Nuclear Antibody, Titer",1:1280
1,"Anti-Nuclear Antibody, Titer",1:40 PATTERN-SPECKLED
2,"Anti-Nuclear Antibody, Titer",1:40
3,"Anti-Nuclear Antibody, Titer",1:640
4,"Anti-Nuclear Antibody, Titer",1:160
5,"Anti-Nuclear Antibody, Titer",1:80
6,"Anti-Nuclear Antibody, Titer",1:80 PATTERN-DIFFUSE
7,"Anti-Nuclear Antibody, Titer",1:40 PATTERN-DIFFUSE
8,"Anti-Nuclear Antibody, Titer",1:640 PATTERN-DIFFUSE
9,"Anti-Nuclear Antibody, Titer",1:40 PATTERN-NUCLEOLAR


['1:1280', '1:40 PATTERN-SPECKLED', '1:40', '1:640', '1:160', '1:80', '1:80 PATTERN-DIFFUSE', '1:40 PATTERN-DIFFUSE', '1:640 PATTERN-DIFFUSE', '1:40 PATTERN-NUCLEOLAR', '1:320 PATTERN-DIFFUSE', 'GREATER THAN 1:1280', 'LESS THAN 1:40', '1:160 PATTERN-SPECKLED', '1:320', '1:320 PATTERN-NUCLEOLAR', '1:160 PATTERN-DIFFUSE']


Unnamed: 0,Label,Value
0,Anti-Nuclear Antibody,NEGATIVE
1,Anti-Nuclear Antibody,POSITIVE


['NEGATIVE', 'POSITIVE']


Unnamed: 0,Label,Value
0,Anti-Smooth Muscle Antibody,POSITIVE
1,Anti-Smooth Muscle Antibody,NEGATIVE
2,Anti-Smooth Muscle Antibody,POSITIVE AT A TITER OF 1:20
3,Anti-Smooth Muscle Antibody,POSITIVE -- C/W TITER OF MORE THAN 100 MIU/ML
4,Anti-Smooth Muscle Antibody,POSITIVE AT A TITER OF 1:320
5,Anti-Smooth Muscle Antibody,POSITIVE AT A TITER OF 1:40


['POSITIVE', 'NEGATIVE', 'POSITIVE AT A TITER OF 1:20', 'POSITIVE -- C/W TITER OF MORE THAN 100 MIU/ML', 'POSITIVE AT A TITER OF 1:320', 'POSITIVE AT A TITER OF 1:40']


In [5]:
# Pull the pain-related item labels out of the chart-events section of the
# clinical variables and show them.
chart_event_groups = clinical_variables['chart events']
pain = chart_event_groups['pain']
pain

['Pain Present', 'Pain Location', 'Pain Type', 'Pain Level', 'Pain Cause']

In [6]:
# First step toward an ordinal encoding of pain level: list every distinct
# raw 'Pain Level' value charted for the liver-patient cohort.
q = """
SELECT distinct B.LABEL,  A.VALUE
FROM `patient-similarity.mimic.chartevents` as A
left join  `patient-similarity.mimic.d_items` as B
on a.itemid=b.itemid
where subject_id in (select subject_id from `patient-similarity.mimic.liver_pts`)
and LABEL in ('Pain Level')
and A.HADM_ID is not null and A.VALUE is not null
order by LABEL"""
pain_level_rows = pandas_gbq.read_gbq(q)
list(pain_level_rows['VALUE'])

['Worst',
 '10-Worst',
 'Mild to Moderate.',
 'Moderate to Severe',
 'Severe',
 '4-Mild to Mod',
 '0-None',
 'Mild to Moderate',
 'Severe to Worse',
 'Moderate to Severe.',
 '9-Severe-Worst',
 '7-Mod to Severe',
 '3-Mild to Mod',
 '2-Mild',
 'Mild ',
 'None to Mild',
 '5-Moderate',
 'None',
 'Unable to Score',
 'Moderate',
 'Unable to score',
 '6-Mod to Severe',
 '8-Severe',
 '1-None to Mild']