## Getting age and gender data

Pulling the age and gender data is fairly simple. Grab the list of ventilated patients ('sample_vents.csv') as well as the patient dataset ('patients.csv'), and join the tables on subject id.

#### import libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

#### get feature dataset

In [2]:
# Feature table for the ventilated cohort; its `subject_id` column is the join key used below.
df = pd.read_csv(filepath_or_buffer='../data/processed/sample_vents.csv')

#### load patient dataset containing age and sex

In [3]:
# Raw patient table; supplies `gender` and `anchor_age` keyed by `subject_id`.
patients = pd.read_csv(filepath_or_buffer='../data/raw/patients.csv')

#### keep only the patients who have been ventilated from the patient dataset

In [4]:
# Subject ids present in the ventilation feature table (repeats are fine for `isin`).
ids = df.loc[:, 'subject_id']
# Boolean-mask the patient table down to subjects that appear in `ids`.
vented_patients = patients.loc[patients['subject_id'].isin(ids)]

In [5]:
# Inspect the filtered patient rows (the rendered output is truncated by pandas).
vented_patients

Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
7677,10004235,M,47,2196,2014 - 2016,
7684,10005348,M,76,2128,2011 - 2013,
7731,10019003,F,65,2148,2011 - 2013,
7796,10035631,M,63,2112,2011 - 2013,
7800,10035747,M,91,2126,2008 - 2010,
...,...,...,...,...,...,...
383051,19962250,M,58,2125,2014 - 2016,
383066,19965144,M,77,2184,2008 - 2010,
383073,19965610,F,77,2125,2014 - 2016,
383086,19970491,M,55,2129,2008 - 2010,


In [6]:
# Number of rows per subject in the ventilation table, most frequent first.
# Note: despite the name, this counts `subject_id` occurrences, not `hadm_id`.
dup_hadms = df.loc[:, 'subject_id'].value_counts()

In [7]:
# Show the per-subject row counts computed above (indexed by subject_id).
dup_hadms

12606543    13
12067437    10
17949897    10
11281568    10
15131736    10
            ..
10706853     1
18615099     1
18219834     1
19653430     1
16750595     1
Name: subject_id, Length: 14900, dtype: int64

In [8]:
# Display every ventilation row for subjects that occur more than 10 times in `df`.
df.loc[df['subject_id'].isin(dup_hadms[dup_hadms > 10].index)]

Unnamed: 0,hadm_id,endtime,time_on_vent,re_intub_class,subject_id,admittime,dischtime,deathtime,admission_type,admission_location,discharge_location,insurance,language,marital_status,ethnicity,edregtime,edouttime,hospital_expire_flag,hours_to_death
6171,27235018,2154-01-24 17:16:00,232.766667,0,12606543,2154-01-14 23:09:00,2154-01-24 16:30:00,,EW EMER.,EMERGENCY ROOM,CHRONIC/LONG TERM ACUTE CARE,Medicaid,ENGLISH,SINGLE,BLACK/AFRICAN AMERICAN,2154-01-14 20:35:00,2154-01-15 00:22:00,0,
6176,25854102,2154-01-29 05:20:00,61.333333,0,12606543,2154-01-26 13:50:00,2154-01-29 11:55:00,,EW EMER.,EMERGENCY ROOM,CHRONIC/LONG TERM ACUTE CARE,Medicaid,ENGLISH,SINGLE,BLACK/AFRICAN AMERICAN,2154-01-26 09:12:00,2154-01-26 14:30:00,0,
6235,24846597,2154-05-25 13:02:00,35.483333,0,12606543,2154-05-20 20:10:00,2154-05-29 14:41:00,,EW EMER.,EMERGENCY ROOM,CHRONIC/LONG TERM ACUTE CARE,Medicaid,ENGLISH,SINGLE,BLACK/AFRICAN AMERICAN,2154-05-20 16:10:00,2154-05-20 21:32:00,0,
6248,20377647,2154-06-18 22:01:00,47.516667,0,12606543,2154-06-16 20:26:00,2154-06-20 16:45:00,,EW EMER.,EMERGENCY ROOM,HOME HEALTH CARE,Medicaid,ENGLISH,SINGLE,BLACK/AFRICAN AMERICAN,2154-06-16 13:22:00,2154-06-16 22:17:00,0,
6254,27823598,2154-07-02 18:06:00,49.733333,0,12606543,2154-06-30 14:37:00,2154-07-02 18:00:00,,EW EMER.,EMERGENCY ROOM,CHRONIC/LONG TERM ACUTE CARE,Medicaid,ENGLISH,SINGLE,BLACK/AFRICAN AMERICAN,2154-06-30 13:07:00,2154-06-30 16:13:00,0,
6350,20171304,2155-01-21 16:16:00,25.05,0,12606543,2155-01-20 03:48:00,2155-01-27 22:18:00,,EW EMER.,EMERGENCY ROOM,CHRONIC/LONG TERM ACUTE CARE,Medicaid,ENGLISH,SINGLE,BLACK/AFRICAN AMERICAN,2155-01-20 02:09:00,2155-01-20 06:26:00,0,
6493,26428238,2155-12-24 11:52:00,19.55,0,12606543,2155-12-23 14:34:00,2155-12-24 17:26:00,,EW EMER.,EMERGENCY ROOM,HOME HEALTH CARE,Medicaid,ENGLISH,SINGLE,BLACK/AFRICAN AMERICAN,2155-12-23 12:32:00,2155-12-23 16:12:00,0,
6495,23041018,2156-01-07 17:23:00,232.8,0,12606543,2155-12-27 17:44:00,2156-01-07 17:00:00,,EW EMER.,EMERGENCY ROOM,HOME HEALTH CARE,Medicaid,ENGLISH,SINGLE,BLACK/AFRICAN AMERICAN,2155-12-27 15:15:00,2155-12-27 18:55:00,0,
6583,25922225,2156-07-02 06:30:00,7.0,0,12606543,2156-07-01 20:18:00,2156-07-02 17:45:00,,EW EMER.,EMERGENCY ROOM,HOME HEALTH CARE,Medicaid,ENGLISH,SINGLE,BLACK/AFRICAN AMERICAN,2156-07-01 12:48:00,2156-07-01 22:01:00,0,
6637,21200268,2156-10-20 16:54:00,19.1,0,12606543,2156-10-19 17:59:00,2156-10-20 16:53:00,,EW EMER.,EMERGENCY ROOM,HOME HEALTH CARE,Medicaid,ENGLISH,SINGLE,BLACK/AFRICAN AMERICAN,2156-10-19 13:05:00,2156-10-19 19:48:00,0,


#### join age and gender to main feature table and save to separate feather files

In [9]:
# Attach each subject's patient-level columns (gender, anchor_age, ...) to every
# ventilation row. `how='left'` keeps all rows of `df`, including any subject
# that has no match in `vented_patients`.
#
# `validate='many_to_one'` makes the merge fail loudly if `vented_patients`
# ever contains a repeated `subject_id`; without it such duplicates would
# silently multiply the ventilation rows.
#
# NOTE(review): both inputs display an 'Unnamed: 0' column, so the result will
# carry 'Unnamed: 0_x' / 'Unnamed: 0_y' — confirm whether they should be dropped.
age = pd.merge(
    left=df,
    right=vented_patients,
    how='left',
    on='subject_id',
    validate='many_to_one',
)

In [10]:
# Persist the merged table once per feature name.
# NOTE(review): both files receive the identical full `age` frame rather than a
# single-column extract — confirm this is what downstream readers expect.
for feather_path in ('../data/processed/anchor_age', '../data/processed/gender'):
    age.to_feather(feather_path)

In [11]:
# NOTE(review): disabled CSV export left over from an earlier draft. `new_df` is not
# defined anywhere in the visible notebook, so re-enabling this line as-is would fail.
#new_df.to_csv('all_feature_table.csv',index= False)