In [1]:
# Common imports
import numpy as np
import numpy.random as rnd
import os
import pandas as pd

# Fix the global NumPy seed so any random draws repeat from run to run
np.random.seed(42)

# Never truncate the column list when a dataframe is displayed
pd.options.display.max_columns = None

import datetime
from dateutil.relativedelta import relativedelta

In [2]:
# Patients table: read the CSV and convert every cell to its str() form.
# NOTE(review): DataFrame.applymap is deprecated from pandas 2.1 in favour of
# DataFrame.map; left as-is because the pandas version in use is not shown.
csv_path = 'MIMIC II/PATIENTS.csv'
patients = pd.read_csv(csv_path).applymap(str)

# Admissions table: missing values are replaced by the literal 'NoData'
# before the same cell-by-cell str() conversion.
csv_path = 'MIMIC II/ADMISSIONS.csv'
admissions = pd.read_csv(csv_path).fillna('NoData').applymap(str)

In [3]:
# Remove the patient columns that the rest of the notebook does not use
patients = patients.drop(
    columns=['ROW_ID', 'DOD', 'DOD_HOSP', 'DOD_SSN', 'GENDER', 'EXPIRE_FLAG']
)

# Narrow admissions to three columns; .copy() makes the result an independent frame
admissions = admissions.loc[:, ['SUBJECT_ID', 'ADMITTIME', 'HADM_ID']].copy()

In [4]:
# Join the patient columns onto every admission row that shares the same
# SUBJECT_ID (merge defaults to an inner join), then preview the first rows
data = admissions.merge(patients, on='SUBJECT_ID')
data.head()

Unnamed: 0,SUBJECT_ID,ADMITTIME,HADM_ID,DOB
0,22,2196-04-09 12:26:00,165315,2131-05-07 00:00:00
1,23,2153-09-03 07:15:00,152223,2082-07-17 00:00:00
2,23,2157-10-18 19:34:00,124321,2082-07-17 00:00:00
3,24,2139-06-06 16:14:00,161859,2100-05-31 00:00:00
4,25,2160-11-02 02:06:00,129635,2101-11-21 00:00:00


In [5]:
# Handles on the two timestamp columns that the age computation reads
DoB, admDate = data['DOB'], data['ADMITTIME']

In [6]:
def _parse_timestamp(text):
    """Convert a 'YYYY-MM-DD' or 'YYYY-MM-DD HH:MM:SS' string to a datetime."""
    # Dashes and colons become spaces so a single whitespace split yields
    # [year, month, day, hour, minute, second]; split() with no argument also
    # tolerates leading, trailing or repeated spaces.
    fields = [int(part) for part in text.replace('-', ' ').replace(':', ' ').split()]
    # Only the first six fields are used. datetime() defaults any omitted
    # hour/minute/second to 0, so a date-only string parses as midnight.
    return datetime.datetime(*fields[:6])


def get_age(birth, current):
    """Return the whole-year difference between two timestamp strings.

    Parameters
    ----------
    birth : str
        Earlier timestamp, formatted 'YYYY-MM-DD HH:MM:SS'; the time part
        may be omitted.
    current : str
        Later timestamp in the same format.

    Returns
    -------
    int
        The ``years`` component of ``relativedelta(current, birth)``.

    Raises
    ------
    ValueError
        If a field is not an integer (for example the text 'nan' or
        fractional seconds) or the fields do not form a valid date/time.
    TypeError
        If fewer than three fields (year, month, day) are present.
    """
    born = _parse_timestamp(birth)
    now = _parse_timestamp(current)
    return relativedelta(now, born).years

In [7]:
def get_age_category(age):
    """Map an age in whole years to its age-bracket label.

    Brackets use inclusive lower bounds so each label matches the ages it
    names: 89 and over -> '89+', 60-88 -> 'Elderly', 50-59 -> '50s',
    40-49 -> '40s', 30-39 -> '30s', 18-29 -> '20s', under 18 -> 'Child'.

    Parameters
    ----------
    age : int
        Age in whole years.

    Returns
    -------
    str
        One of '89+', 'Elderly', '50s', '40s', '30s', '20s', 'Child'.
    """
    # (minimum age, label) pairs, checked from the oldest bracket downwards.
    # The decade bounds were previously exclusive (> 60, > 50, > 40, > 30),
    # which put ages 30, 40, 50 and 60 in the bracket below the one their
    # label describes.
    brackets = (
        (89, '89+'),
        (60, 'Elderly'),
        (50, '50s'),
        (40, '40s'),
        (30, '30s'),
        (18, '20s'),
    )
    for minimum_age, label in brackets:
        if age >= minimum_age:
            return label
    return 'Child'

In [16]:
# Age at each admission. zip() walks the two columns by position, so the
# result does not depend on the series carrying a 0..n-1 index (the earlier
# DoB[i] lookup was label-based).
ages = [get_age(birth, admitted) for birth, admitted in zip(DoB, admDate)]
age_categories = [get_age_category(age) for age in ages]

# Single-column frames, still defined in case other cells reference them
ages_df = pd.DataFrame({'AGE' : ages})
ages_cat_df = pd.DataFrame({'AGE_CAT' : age_categories})

# assign() adds AGE and AGE_CAT, or overwrites them if they already exist,
# so re-running this cell no longer appends duplicate columns. It raises if
# the lists and the frame differ in length.
data = data.assign(AGE=ages, AGE_CAT=age_categories)

data.head()

Unnamed: 0,SUBJECT_ID,ADMITTIME,HADM_ID,DOB,AGE,AGE_CAT
0,22,2196-04-09 12:26:00,165315,2131-05-07 00:00:00,64,Elderly
1,23,2153-09-03 07:15:00,152223,2082-07-17 00:00:00,71,Elderly
2,23,2157-10-18 19:34:00,124321,2082-07-17 00:00:00,75,Elderly
3,24,2139-06-06 16:14:00,161859,2100-05-31 00:00:00,39,30s
4,25,2160-11-02 02:06:00,129635,2101-11-21 00:00:00,58,50s
