# Extract Chart Events from MIMIC III
Leveraging the variable list developed in "idClinicalVariables", I begin to extract the variables.

For each group of variables for which I am comparing similarity, I export a csv file. Each csv filename represents the category, subcategory, and datatype. When generating the similarity matrices, I will read in each csv file, parse the datatype from its filename, and generate the corresponding similarity matrix.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas_gbq
from google.oauth2 import service_account
import json
import pandas as pd
from functools import reduce

In [3]:
# authenticate pandas_gbq with the service-account key and set the default project
credentials = service_account.Credentials.from_service_account_file(
    '../Patient-Similarity-credentials.json'
)
pandas_gbq.context.credentials = credentials
pandas_gbq.context.project = "patient-similarity"

In [4]:
# load the identified clinical variables
# the encoding is pinned so the JSON is decoded the same way on every platform
with open('../data/clinical variables.txt', 'r', encoding='utf-8') as f:
    clinical_variables = json.load(f)

# display the chart-event variable groups used throughout this notebook
clinical_variables['chart events']

{'physical assessment': ['Abdominal Assessment',
  'Skin Color',
  'Skin Condition',
  'Speech',
  'Gag Reflex',
  'Cough Reflex',
  'Oral Cavity',
  'Bowel Sounds',
  'Braden Moisture'],
 'activity': ['Activity',
  'Braden Mobility',
  'Braden Activity',
  'Activity Tolerance'],
 'pain': ['Pain Present',
  'Pain Location',
  'Pain Type',
  'Pain Level',
  'Pain Cause'],
 'diet': ['Braden Nutrition',
  'Diet Type',
  'Daily Weight',
  'Previous WeightF',
  'Admit Wt',
  'Previous Weight',
  'Appetite',
  'Weight Change',
  'Special diet'],
 'demographics': ['Marital Status', 'Family Communication', 'Gender', 'Race'],
 'heart': ['Heart Rate', 'Heart Rhythm'],
 'lung': ['Respiratory Rate',
  'RUL Lung Sounds',
  'LLL Lung Sounds',
  'RLL Lung Sounds',
  'LUL Lung Sounds',
  'Respiratory Pattern',
  'Respiratory Effort'],
 'medical history': ['Past medical history',
  'CV - past medical history',
  'Mental status',
  'Recreational drug use']}

In [84]:
# import our variable mapping
from variableMaps.chartEventsMap import *
chartMap

{'Occ. Moist': 'Occasionally Moist',
 'Consist. Moist': 'Consistently Moist',
 'Intubated/trach': 'Intubated/trached',
 'Walks Occasional': 'Walks Occasionally',
 'Comp. Immobile': 'Completely Immobile',
 'Cough/DeepBreath': 'Cough/Deep Breath',
 'Prob. Inadequate': 'Probably Inadequate',
 'Clear liquid': 'Clear Liquid',
 'House - Regular': 'House',
 'NAS/Low Cholest': 'NAS/Low Cholesterol',
 'Fam Talked to RN': 'Family Talked to RN',
 'Fam Talked to MD': 'Family Talked to MD',
 'Family Conferenc': 'Family Conference',
 'SocServ Involved': 'Social Service Involved',
 'S': 'Single',
 'M': 'Married',
 'W': 'Widowed',
 'Partner': 'Married',
 'D': 'Divorced',
 's': 'Single',
 'White - Other European': 'White',
 'White - Russian': 'White',
 'White - Brazilian': 'White',
 'Portuguese': 'White',
 'American Indian / Alaska Native - Federally Recognized Tribe': 'American Indian / Alaska Native',
 'Hispanic / Latino - Salvadoran': 'Hispanic or Latino',
 'Hispanic / Latino - Central American (oth

In [6]:
# since I have to fix some of the data issues in python, I need to get frequencies in python
def extractFrequencies(df):
    """Re-aggregate cleaned counts and compute per-patient value frequencies.

    After raw values have been remapped, several rows can share the same
    (subject_id, label, value), so the counts are summed again before the
    frequencies are derived.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``subject_id``, ``label``, ``value`` and
        ``num_occurences``.

    Returns
    -------
    pandas.DataFrame
        One row per (``subject_id``, ``label``, ``value``) with the columns
        ``subject_id``, ``label``, ``value``, ``num_occurences``,
        ``total_counts`` (sum of ``num_occurences`` within each
        (``subject_id``, ``label``) group) and ``frequency``
        (``num_occurences / total_counts``).
    """
    # first group the newly cleaned data
    df = df.groupby(['subject_id', 'label', 'value'], as_index=False)['num_occurences'].sum()

    # now compute frequencies
    # Only the count column is summed: summing the whole frame would also
    # aggregate the string `value` column (concatenating strings on recent
    # pandas), which breaks the fixed three-column layout expected here.
    total_counts = (
        df.groupby(['subject_id', 'label'], as_index=False)['num_occurences']
        .sum()
        .rename(columns={'num_occurences': 'total_counts'})
    )
    df = pd.merge(df, total_counts, on=['subject_id', 'label'], how='left')
    df['frequency'] = df['num_occurences'] / df['total_counts']
    return df

## Cross Sectional Data
Extract data for each group: Physical assessment, activity, pain, diet, demographics, heart, lung, and patient medical history.

We use multiple correspondence analysis (MCA) to convert the categorical variables to numeric ones.

### Physical Assessment

In [4]:
phys_assess = clinical_variables['chart events']['physical assessment']
phys_assess

['Abdominal Assessment',
 'Skin Color',
 'Skin Condition',
 'Speech',
 'Gag Reflex',
 'Cough Reflex',
 'Oral Cavity',
 'Bowel Sounds',
 'Braden Moisture']

In [20]:
# Get the raw count of each label and value for each patient
# (the frequencies themselves are computed afterwards in python).
# The query keeps only patients listed in `liver_pts`, rows with a non-null
# HADM_ID and VALUE, and drops any value containing "other" (case-insensitive).
# NOTE: LABEL comes from d_items, so filtering on it in the WHERE clause removes
# chartevents rows without a matching item - the left join acts like an inner join.
q = """SELECT distinct A.subject_id, B.label, A.value, count(A.Value) as num_occurences
FROM `patient-similarity.mimic.chartevents` as A
left join  `patient-similarity.mimic.d_items` as B
on a.itemid=b.itemid
where subject_id in (select subject_id from `patient-similarity.mimic.liver_pts`)
and LABEL in ('Abdominal Assessment','Skin Color', 'Skin Condition', 'Speech',
 'Gag Reflex', 'Cough Reflex', 'Oral Cavity', 'Bowel Sounds', 'Braden Moisture')
and A.HADM_ID is not null and A.VALUE is not null
and lower(VALUE) not like "%other%"
group by Subject_ID, LABEL, VALUE
order by Subject_ID, LABEL, VALUE, num_occurences
"""

# run the query on BigQuery; this overwrites the label list previously stored in `phys_assess`
phys_assess = pandas_gbq.read_gbq(q)
phys_assess.head()

Unnamed: 0,subject_id,label,value,num_occurences
0,4,Abdominal Assessment,Soft,5
1,4,Bowel Sounds,Present,5
2,4,Braden Moisture,Occ. Moist,2
3,4,Braden Moisture,Rarely Moist,4
4,4,Oral Cavity,Teeth/Tissue WNL,9


In [25]:
# clean our data and extract frequencies
phys_assess = phys_assess.replace({"value":chartMap})
phys_assess = extractFrequencies(phys_assess)
phys_assess.head()

Unnamed: 0,subject_id,label,value,num_occurences,total_counts,frequency
0,4,Abdominal Assessment,Soft,5,5,1.0
1,4,Bowel Sounds,Present,5,5,1.0
2,4,Braden Moisture,Occasionally Moist,2,6,0.333333
3,4,Braden Moisture,Rarely Moist,4,6,0.666667
4,4,Oral Cavity,Teeth/Tissue WNL,9,9,1.0


In [26]:
phys_assess.to_csv("../data/patientData/chart events_categorical_physical assessment.csv", index = False)

## Activity

In [7]:
activity = clinical_variables['chart events']['activity']
activity

['Activity', 'Braden Mobility', 'Braden Activity', 'Activity Tolerance']

In [8]:
# Count how often each value was charted per patient and activity label.
# Only patients in `liver_pts` are kept, rows need a non-null HADM_ID and VALUE,
# and values containing "other" (case-insensitive) are excluded.
q = """
SELECT distinct A.subject_id, B.label, A.value, count(A.Value) as num_occurences
FROM `patient-similarity.mimic.chartevents` as A
left join  `patient-similarity.mimic.d_items` as B
on a.itemid=b.itemid
where subject_id in (select subject_id from `patient-similarity.mimic.liver_pts`)
and LABEL in ('Activity', 'Braden Mobility', 'Braden Activity', 'Activity Tolerance')
and A.HADM_ID is not null and A.VALUE is not null
and lower(VALUE) not like "%other%"
group by Subject_ID, LABEL, VALUE
order by Subject_ID, LABEL, VALUE, num_occurences"""
# run the query on BigQuery; this overwrites the label list previously stored in `activity`
activity = pandas_gbq.read_gbq(q)
activity.head()

Unnamed: 0,subject_id,label,value,num_occurences
0,4,Activity,Ambulate,1
1,4,Activity,Bedrest,5
2,4,Activity,Commode,4
3,4,Activity Tolerance,Good,3
4,4,Activity Tolerance,Tolerated Well,7


In [10]:
activity = activity.replace({"value":chartMap})
activity = extractFrequencies(activity)
activity.head()

Unnamed: 0,subject_id,label,value,num_occurences,total_counts,frequency
0,4,Activity,Ambulate,1,10,0.1
1,4,Activity,Bedrest,5,10,0.5
2,4,Activity,Commode,4,10,0.4
3,4,Activity Tolerance,Good,3,10,0.3
4,4,Activity Tolerance,Tolerated Well,7,10,0.7


In [11]:
activity.to_csv("../data/patientData/chart events_categorical_activity.csv", index = False)

## Pain

In [12]:
pain = clinical_variables['chart events']['pain']
pain

['Pain Present', 'Pain Location', 'Pain Type', 'Pain Level', 'Pain Cause']

In [16]:
# we are going to handle pain a little differently
# cross-sectional of pain descriptions (location, cause, present, and type);
# 'Pain Level' is deliberately left out of this query and extracted separately as a numeric variable
# Counts are per patient, label and value, restricted to patients in `liver_pts`,
# rows with a non-null HADM_ID and VALUE, and values not containing "other".
q = """SELECT distinct A.subject_id, B.label, A.value, count(A.Value) as num_occurences
FROM `patient-similarity.mimic.chartevents` as A
left join  `patient-similarity.mimic.d_items` as B
on a.itemid=b.itemid
where subject_id in (select subject_id from `patient-similarity.mimic.liver_pts`)
and LABEL in ('Pain Present', 'Pain Location', 'Pain Type', 'Pain Cause')
and A.HADM_ID is not null and A.VALUE is not null
and lower(VALUE) not like "%other%"
group by Subject_ID, LABEL, VALUE
order by Subject_ID, LABEL, VALUE, num_occurences"""
# run the query on BigQuery; this overwrites the label list previously stored in `pain`
pain = pandas_gbq.read_gbq(q)
pain.head()

Unnamed: 0,subject_id,label,value,num_occurences
0,4,Pain Location,Not Indicated,8
1,4,Pain Present,No,11
2,52,Pain Cause,At Rest,2
3,52,Pain Location,Abdominal,1
4,52,Pain Location,Headache,5


In [18]:
pain = pain.replace({"value":chartMap})
pain = extractFrequencies(pain)
pain.head()

Unnamed: 0,subject_id,label,value,num_occurences,total_counts,frequency
0,4,Pain Location,Not Indicated,8,8,1.0
1,4,Pain Present,No,11,11,1.0
2,52,Pain Cause,At Rest,2,2,1.0
3,52,Pain Location,Abdominal,1,9,0.111111
4,52,Pain Location,Headache,5,9,0.555556


In [19]:
pain.to_csv("../data/patientData/chart events_categorical_pain.csv", index = False)

In [27]:
# now we separate pain level.  we make the columns the location of the pain, and we average out the pain for that location
# The query returns one row per patient and chart time: the conditional max() picks the
# 'Pain Location' and 'Pain Level' values charted at that time, and HAVING keeps only the
# times where both are present. Values containing "other" or equal to "unable to score"
# (case-insensitive) are excluded.
# NOTE(review): max() compares the VALUE strings, so if a label has several values at the same
# charttime the lexicographically largest one is kept - confirm that this cannot happen or is acceptable.
q = """SELECT A.subject_id, A.CHARTTIME as date, 
max(case when B.LABEL = "Pain Location" then A.VALUE else null end) as location,
max(case when B.LABEL = "Pain Level" then A.VALUE else null end) as level
FROM `patient-similarity.mimic.chartevents` as A
left join  `patient-similarity.mimic.d_items` as B
on a.itemid=b.itemid
where subject_id in (select subject_id from `patient-similarity.mimic.liver_pts`)
and LABEL in ('Pain Location', 'Pain Level')
and A.HADM_ID is not null and A.VALUE is not null
and lower(VALUE) not like "%other%" and lower(VALUE) != "unable to score"
group by subject_id, charttime
having location is not null and level is not null
order by subject_id, A.CHARTTIME """
pain_level = pandas_gbq.read_gbq(q)
pain_level.head()

Unnamed: 0,subject_id,date,location,level
0,52,2191-01-10 04:00:00,Headache,10-Worst
1,52,2191-01-10 08:00:00,Headache,10-Worst
2,52,2191-01-10 16:00:00,Headache,5-Moderate
3,52,2191-01-10 20:00:00,Abdominal,2-Mild
4,117,2133-11-18 23:00:00,Back,6-Mod to Severe


In [28]:
# convert to ordinal
pain_level = pain_level.replace({"level":painLevelMap})
pain_level.head()

Unnamed: 0,subject_id,date,location,level
0,52,2191-01-10 04:00:00,Headache,10
1,52,2191-01-10 08:00:00,Headache,10
2,52,2191-01-10 16:00:00,Headache,5
3,52,2191-01-10 20:00:00,Abdominal,2
4,117,2133-11-18 23:00:00,Back,6


In [30]:
# now we take the average by location and transpose
# `level` is made numeric first: after the ordinal mapping the column can still be
# object dtype, and any label missing from the map should fail loudly here rather than
# be averaged incorrectly or silently dropped
pain_level['level'] = pd.to_numeric(pain_level['level'])
# average only `level` (not the `date` column) per patient and location,
# then spread the locations into one column each, indexed by subject_id
pain_level = pain_level.groupby(['subject_id', 'location'], as_index=False)['level'].mean()\
                    .pivot(index='subject_id', columns='location', values='level')

In [31]:
# export the per-location pain levels (`pain_level`), not the categorical `pain` frame;
# subject_id is the pivot index, so it is moved back to a column before the index is dropped
pain_level.reset_index().to_csv("../data/patientData/chart events_numeric_pain level.csv", index = False)

## Diet and Nutrition

In [32]:
diet = clinical_variables['chart events']['diet']
diet

['Braden Nutrition',
 'Diet Type',
 'Daily Weight',
 'Previous WeightF',
 'Admit Wt',
 'Previous Weight',
 'Appetite',
 'Weight Change',
 'Special diet']

In [33]:
# categorical and numeric.  for numeric just doing max weight, min weight, and change
# This query covers the categorical diet labels only; the weight labels are extracted
# by a separate numeric query.
# Counts are per patient, label and value, restricted to patients in `liver_pts`,
# rows with a non-null HADM_ID and VALUE, and values not containing "other".
q = """
SELECT distinct A.subject_id, B.label, A.value, count(A.Value) as num_occurences
FROM `patient-similarity.mimic.chartevents` as A
left join  `patient-similarity.mimic.d_items` as B
on a.itemid=b.itemid
where subject_id in (select subject_id from `patient-similarity.mimic.liver_pts`)
and LABEL in ('Braden Nutrition','Diet Type', 'Appetite','Special diet')
and A.HADM_ID is not null and A.VALUE is not null
and lower(VALUE) not like "%other%"
group by Subject_ID, LABEL, VALUE
order by Subject_ID, LABEL, VALUE, num_occurences"""
# run the query on BigQuery; this overwrites the label list previously stored in `diet`
diet = pandas_gbq.read_gbq(q)
diet.head()

Unnamed: 0,subject_id,label,value,num_occurences
0,4,Braden Nutrition,Excellent,3
1,4,Braden Nutrition,Prob. Inadequate,3
2,4,Diet Type,Diabetic,8
3,52,Braden Nutrition,Prob. Inadequate,7
4,52,Braden Nutrition,Very Poor,1


In [35]:
diet = diet.replace({"value":chartMap})
diet = extractFrequencies(diet)
diet.head()

Unnamed: 0,subject_id,label,value,num_occurences,total_counts,frequency
0,4,Braden Nutrition,Excellent,3,6,0.5
1,4,Braden Nutrition,Probably Inadequate,3,6,0.5
2,4,Diet Type,Diabetic,8,8,1.0
3,52,Braden Nutrition,Probably Inadequate,7,8,0.875
4,52,Braden Nutrition,Very Poor,1,8,0.125


In [36]:
diet.to_csv("../data/patientData/chart events_categorical_diet.csv", index = False)

In [37]:
# now for the numeric
# One row per patient: the largest and smallest positive weight across the four weight labels,
# their difference (wgt_change, always >= 0 so it carries no direction), and that difference
# divided by the maximum weight (loss_perc).
# NOTE(review): the four labels are pooled into one min/max - presumably they share the same
# unit; confirm, otherwise wgt_change mixes units.
q = """SELECT A.subject_id, max(A.ValueNum) as max_wgt, min(A.ValueNum) as min_wgt, 
  max(A.ValueNum) - min(A.ValueNum) as wgt_change, 
  (max(A.ValueNum) - min(A.ValueNum))/max(A.ValueNum) as loss_perc
FROM `patient-similarity.mimic.chartevents` as A
left join  `patient-similarity.mimic.d_items` as B
on a.itemid=b.itemid
where subject_id in (select subject_id from `patient-similarity.mimic.liver_pts`)
and LABEL in ('Daily Weight','Previous WeightF','Admit Wt','Previous Weight')
and A.HADM_ID is not null and A.VALUE is not null
and A.ValueNum > 0
group by subject_id
order by subject_id """
weight = pandas_gbq.read_gbq(q)
weight.head()

Unnamed: 0,subject_id,max_wgt,min_wgt,wgt_change,loss_perc
0,4,53.599998,53.599998,0.0,0.0
1,52,81.300003,81.300003,0.0,0.0
2,78,72.400002,72.400002,0.0,0.0
3,117,108.099998,79.5,28.599998,0.26457
4,143,130.5,113.0,17.5,0.1341


In [38]:
weight.to_csv("../data/patientData/chart events_numeric_weight.csv", index = False)

## Demographics

In [39]:
demographics = clinical_variables['chart events']['demographics']
demographics

['Marital Status', 'Family Communication', 'Gender', 'Race']

In [48]:
# I need to merge categorical data from chart events and patients
# I then treat age as a numeric variable and it is excluded from the MCA
# This query only covers the chart-event labels (marital status, family communication, race);
# gender and age come from the patients table in a separate query.
# Counts are per patient, label and value, restricted to patients in `liver_pts`,
# rows with a non-null HADM_ID and VALUE, and values not containing "other".
q = """SELECT distinct A.subject_id, B.label, A.value, count(A.Value) as num_occurences
FROM `patient-similarity.mimic.chartevents` as A
left join  `patient-similarity.mimic.d_items` as B
on a.itemid=b.itemid
where subject_id in (select subject_id from `patient-similarity.mimic.liver_pts`)
and LABEL in ('Marital Status', 'Family Communication', 'Race')
and A.HADM_ID is not null and A.VALUE is not null
and lower(VALUE) not like "%other%"
group by Subject_ID, LABEL, VALUE
order by Subject_ID, LABEL, VALUE, num_occurences"""
# run the query on BigQuery; this overwrites the label list previously stored in `demographics`
demographics = pandas_gbq.read_gbq(q)
demographics.head()

Unnamed: 0,subject_id,label,value,num_occurences
0,4,Family Communication,Family Called,1
1,4,Family Communication,Family Visited,2
2,4,Marital Status,S,1
3,52,Family Communication,Fam Talked to MD,1
4,52,Family Communication,Family Visited,4


In [49]:
demographics = demographics.replace({"value":chartMap})
demographics = extractFrequencies(demographics)
demographics.head()

Unnamed: 0,subject_id,label,value,num_occurences,total_counts,frequency
0,4,Family Communication,Family Called,1,3,0.333333
1,4,Family Communication,Family Visited,2,3,0.666667
2,4,Marital Status,Single,1,1,1.0
3,52,Family Communication,Family Talked to MD,1,5,0.2
4,52,Family Communication,Family Visited,4,5,0.8


In [58]:
# Now we extract age and gender from the patient table
# Age is the date_diff in YEAR between the date of birth and the patient's latest discharge
# time (max dischtime over all of their admissions), for patients in `liver_pts`.
# NOTE(review): date_diff(..., YEAR) presumably counts calendar-year boundaries rather than
# completed years, so ages can be off by one - confirm against the BigQuery documentation.
# NOTE(review): MIMIC-III is understood to shift dob for patients older than 89, which would
# produce ages around 300 - confirm whether such values occur here and need capping.
q = """
select A.subject_id, A.gender, date_diff(Date(B.last_date), Date(A.dob), YEAR) as age, 
from `patient-similarity.mimic.patients` as A
left join (
  SELECT subject_id, max(dischtime) as last_date FROM `patient-similarity.mimic.admissions` 
  group by subject_id
) as B
using(subject_id)
where subject_id in (select subject_id from `patient-similarity.mimic.liver_pts`)
order by subject_id"""
age_gender = pandas_gbq.read_gbq(q)
age_gender.head()

Unnamed: 0,subject_id,gender,age
0,4,F,48
1,52,M,39
2,78,M,49
3,117,F,50
4,140,M,53


In [59]:
# reshape to the long (subject_id, label, value) layout used by the categorical frames;
# every patient has exactly one gender and one age, so count, total and frequency are all 1
age_gender = (
    age_gender
    .melt(id_vars="subject_id", var_name="label", value_name="value")
    .assign(num_occurences=1, total_counts=1, frequency=1)
)
age_gender.head()

Unnamed: 0,subject_id,label,value,num_occurences,total_counts,frequency
0,4,gender,F,1,1,1
1,52,gender,M,1,1,1
2,78,gender,M,1,1,1
3,117,gender,F,1,1,1
4,140,gender,M,1,1,1


In [60]:
demographics = pd.concat([demographics, age_gender], axis = 0, sort = False).sort_values("subject_id")
demographics.head()

Unnamed: 0,subject_id,label,value,num_occurences,total_counts,frequency
0,4,Family Communication,Family Called,1,3,0.333333
1,4,Family Communication,Family Visited,2,3,0.666667
2,4,Marital Status,Single,1,1,1.0
2884,4,age,48,1,1,1.0
0,4,gender,F,1,1,1.0


In [96]:
demographics.to_csv("../data/patientData/chart events_categorical_demographics.csv", index = False)

## Heart and Lung
Combining heart and lung measurements and following the same format as diet. Numeric columns are separated and will be added back later 

In [65]:
heart = clinical_variables['chart events']['heart']
lung = clinical_variables['chart events']['lung']
heart_lung = heart + lung
heart_lung

['Heart Rate',
 'Heart Rhythm',
 'Respiratory Rate',
 'RUL Lung Sounds',
 'LLL Lung Sounds',
 'RLL Lung Sounds',
 'LUL Lung Sounds',
 'Respiratory Pattern',
 'Respiratory Effort']

In [93]:
# first we extract the categorical columns
# ('Heart Rate' and 'Respiratory Rate' are numeric and are left out of this query)
# Counts are per patient, label and value, restricted to patients in `liver_pts`,
# rows with a non-null HADM_ID and VALUE, and values not containing "other".
q = """SELECT distinct A.subject_id, B.label, A.value, count(A.Value) as num_occurences
FROM `patient-similarity.mimic.chartevents` as A
left join  `patient-similarity.mimic.d_items` as B
on a.itemid=b.itemid
where subject_id in (select subject_id from `patient-similarity.mimic.liver_pts`)
and LABEL in ('Heart Rhythm', 'RUL Lung Sounds', 'LLL Lung Sounds', 'RLL Lung Sounds',
 'LUL Lung Sounds', 'Respiratory Pattern', 'Respiratory Effort')
and A.HADM_ID is not null and A.VALUE is not null
and lower(VALUE) not like "%other%"
group by Subject_ID, LABEL, VALUE
order by Subject_ID, LABEL, VALUE, num_occurences"""
# run the query on BigQuery; this overwrites the label list previously stored in `heart_lung`
heart_lung = pandas_gbq.read_gbq(q)
heart_lung.head()

Unnamed: 0,subject_id,label,value,num_occurences
0,4,Heart Rhythm,Normal Sinus,26
1,4,Heart Rhythm,Sinus Tachy,4
2,4,LLL Lung Sounds,Clear,1
3,4,LLL Lung Sounds,Crackles,4
4,4,LLL Lung Sounds,Diminished,4


In [94]:
heart_lung = heart_lung.replace({"value":chartMap})
heart_lung = extractFrequencies(heart_lung)
heart_lung.head()

Unnamed: 0,subject_id,label,value,num_occurences,total_counts,frequency
0,4,Heart Rhythm,Normal Sinus,26,30,0.866667
1,4,Heart Rhythm,ST (Sinus Tachycardia),4,30,0.133333
2,4,LLL Lung Sounds,Clear,1,9,0.111111
3,4,LLL Lung Sounds,Crackles,4,9,0.444444
4,4,LLL Lung Sounds,Diminished,4,9,0.444444


In [97]:
heart_lung.to_csv("../data/patientData/chart events_categorical_heart lung.csv", index = False)

In [99]:
# Now for numeric - min, max and average for heart rate and respiratory rate
# (no difference column is computed by this query)
# One row per patient in `liver_pts`; each conditional aggregate only sees the rows of its
# own label, and only positive ValueNum readings with a non-null HADM_ID and VALUE are used.
q = """SELECT A.subject_id, 
max(case when B.LABEL = "Heart Rate" then A.ValueNum else null end) as max_heart_rate, 
min(case when B.LABEL = "Heart Rate" then A.ValueNum else null end) as min_heart_rate, 
avg(case when B.LABEL = "Heart Rate" then A.ValueNum else null end) as avg_heart_rate, 

max(case when B.LABEL = 'Respiratory Rate' then A.ValueNum else null end) as max_resp_rate, 
min(case when B.LABEL = 'Respiratory Rate' then A.ValueNum else null end) as min_resp_rate, 
avg(case when B.LABEL = 'Respiratory Rate' then A.ValueNum else null end) as avg_resp_rate, 


FROM `patient-similarity.mimic.chartevents` as A
left join  `patient-similarity.mimic.d_items` as B
on a.itemid=b.itemid
where subject_id in (select subject_id from `patient-similarity.mimic.liver_pts`)
and LABEL in ('Heart Rate','Respiratory Rate')
and A.HADM_ID is not null and A.VALUE is not null
and A.ValueNum > 0
group by subject_id
order by subject_id """
heart_lung_rate = pandas_gbq.read_gbq(q)
heart_lung_rate.head()

Unnamed: 0,subject_id,max_heart_rate,min_heart_rate,avg_heart_rate,max_resp_rate,min_resp_rate,avg_resp_rate
0,4,111.0,74.0,90.354839,32.0,18.0,25.333333
1,52,106.0,85.0,94.24,27.0,11.0,17.607843
2,78,86.0,56.0,64.681818,24.0,11.0,17.0
3,117,219.0,56.0,83.629191,34.0,9.0,21.513889
4,140,117.0,65.0,84.533333,28.0,8.0,18.911111


In [100]:
heart_lung_rate.to_csv("../data/patientData/chart events_numeric_heart lung rate.csv", index = False)

## Medical History

In [101]:
med_history = clinical_variables['chart events']['medical history']
med_history

['Past medical history',
 'CV - past medical history',
 'Mental status',
 'Recreational drug use']

In [102]:
# For the medical-history labels, each charted value becomes its own label and the value is a 0/1 indicator.
# I do it with two queries and concatenate the results.
# The indicator matrix built from this data then needs fillna(0).
# This first query handles 'Mental status' and 'Recreational drug use' in the usual
# (subject_id, label, value, count) layout, restricted to patients in `liver_pts`,
# rows with a non-null HADM_ID and VALUE, and values not containing "other".
q = """
SELECT distinct A.subject_id, B.label, A.value, count(A.Value) as num_occurences
FROM `patient-similarity.mimic.chartevents` as A
left join  `patient-similarity.mimic.d_items` as B
on a.itemid=b.itemid
where subject_id in (select subject_id from `patient-similarity.mimic.liver_pts`)
and LABEL in ('Mental status', 'Recreational drug use')
and A.HADM_ID is not null and A.VALUE is not null
and lower(VALUE) not like "%other%"
group by Subject_ID, LABEL, VALUE
order by Subject_ID, LABEL, VALUE, num_occurences"""
mental_drug = pandas_gbq.read_gbq(q)
mental_drug.head()

Unnamed: 0,subject_id,label,value,num_occurences
0,188,Mental status,Forgets limitations,45
1,188,Mental status,Oriented to own ability,3
2,188,Recreational drug use,0,11
3,236,Mental status,Oriented to own ability,8
4,236,Recreational drug use,0,1


In [110]:
# Medical-history indicators: every distinct value charted under the two history labels
# becomes a label of its own, with a constant value of 1 and a constant count of 1
# (one row per patient and condition, however often it was charted).
# Restricted to patients in `liver_pts`, rows with a non-null HADM_ID and VALUE,
# and values not containing "other".
q = """
SELECT distinct A.subject_id, A.value as label, 1 as values, 1 as num_occurences
FROM `patient-similarity.mimic.chartevents` as A
left join  `patient-similarity.mimic.d_items` as B
on a.itemid=b.itemid
where subject_id in (select subject_id from `patient-similarity.mimic.liver_pts`)
and LABEL in ('Past medical history', 'CV - past medical history')
and A.HADM_ID is not null and A.VALUE is not null
and lower(VALUE) not like "%other%"
group by Subject_ID, VALUE
order by Subject_ID, VALUE, num_occurences"""
# run the query on BigQuery; this overwrites the label list previously stored in `med_history`
med_history = pandas_gbq.read_gbq(q)
# the query aliases the indicator column as `values`; rename to the shared column layout
med_history.columns = ['subject_id', 'label', 'value', 'num_occurences']
med_history.head()

Unnamed: 0,subject_id,label,value,num_occurences
0,188,Diabetes - Insulin,1,1
1,188,Diabetes - Oral Agent,1,1
2,188,GI Bleed,1,1
3,188,HEMO or PD,1,1
4,188,Hepatitis,1,1


In [111]:
med_history = pd.concat([med_history, mental_drug], axis = 0).sort_values("subject_id")
med_history = extractFrequencies(med_history)
med_history.head()

Unnamed: 0,subject_id,label,value,num_occurences,total_counts,frequency
0,188,Diabetes - Insulin,1,1,1,1.0
1,188,Diabetes - Oral Agent,1,1,1,1.0
2,188,GI Bleed,1,1,1,1.0
3,188,HEMO or PD,1,1,1,1.0
4,188,Hepatitis,1,1,1,1.0


In [112]:
med_history.to_csv("../data/patientData/chart events_categorical_medical history.csv", index = False)