In [1]:
# Common imports
import numpy as np
import numpy.random as rnd
import os
import pandas as pd

# To make this notebook's output stable across runs
rnd.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Show all columns when displaying dataframes
pd.set_option('display.max_columns', None)

In [2]:
csv_path = 'MIMIC II/DIAGNOSES_ICD.csv'
diagnoses = pd.read_csv(csv_path)

csv_path = 'MIMIC II/D_ICD_DIAGNOSES.csv'
dir_diagnoses = pd.read_csv(csv_path)

csv_path = 'MIMIC II/PROCEDURES_ICD.csv'
procedures = pd.read_csv(csv_path)

csv_path = 'MIMIC II/D_ICD_PROCEDURES.csv'
dir_procedures = pd.read_csv(csv_path)

csv_path = 'MIMIC II/ADMISSIONS.csv'
admissions = pd.read_csv(csv_path)

#csv_path = 'MIMIC II/NOTEEVENTS.csv'
#noteevents = pd.read_csv(csv_path)

In [3]:
diagnoses = diagnoses.drop(['ROW_ID', 'SEQ_NUM'], axis = 1)
dir_diagnoses = dir_diagnoses.drop(['ROW_ID', 'LONG_TITLE'], axis = 1)
procedures = procedures.drop(['ROW_ID', 'SEQ_NUM'], axis = 1)
dir_procedures = dir_procedures.drop(['ROW_ID', 'LONG_TITLE'], axis = 1)
admissions = admissions.drop(['ROW_ID', 'DEATHTIME', 'ADMISSION_TYPE', 'DISCHARGE_LOCATION', 'INSURANCE', 'LANGUAGE', 'RELIGION', 'MARITAL_STATUS', 'ETHNICITY', 'HAS_CHARTEVENTS_DATA'], axis = 1)

In [4]:
diagnoses_new = pd.merge(diagnoses, dir_diagnoses, on = 'ICD9_CODE')
diagnoses_new = diagnoses_new.drop('ICD9_CODE', axis = 1)
diagnoses_new.columns = ['SUBJECT_ID', 'HADM_ID', 'DIAGNOSES_SHORT_TITLE']
print(diagnoses_new.head())
diagnoses_new.info()

   SUBJECT_ID  HADM_ID   DIAGNOSES_SHORT_TITLE
0         109   172335  Mal hyp kid w cr kid V
1         109   173633  Mal hyp kid w cr kid V
2         109   131345  Mal hyp kid w cr kid V
3         109   131376  Mal hyp kid w cr kid V
4         109   135923  Mal hyp kid w cr kid V
<class 'pandas.core.frame.DataFrame'>
Int64Index: 634709 entries, 0 to 634708
Data columns (total 3 columns):
SUBJECT_ID               634709 non-null int64
HADM_ID                  634709 non-null int64
DIAGNOSES_SHORT_TITLE    634709 non-null object
dtypes: int64(2), object(1)
memory usage: 19.4+ MB


In [5]:
procedures_new = pd.merge(procedures, dir_procedures, on = 'ICD9_CODE')
procedures_new = procedures_new.drop('ICD9_CODE', axis = 1)
procedures_new.columns = ['SUBJECT_ID', 'HADM_ID', 'PROCEDURES_SHORT_TITLE']
print(procedures_new.head())
procedures_new.info()

   SUBJECT_ID  HADM_ID   PROCEDURES_SHORT_TITLE
0       62641   154460  Insert intercostal cath
1       11143   101985  Insert intercostal cath
2        9736   160259  Insert intercostal cath
3        7546   147476  Insert intercostal cath
4       65535   178280  Insert intercostal cath
<class 'pandas.core.frame.DataFrame'>
Int64Index: 246178 entries, 0 to 246177
Data columns (total 3 columns):
SUBJECT_ID                246178 non-null int64
HADM_ID                   246178 non-null int64
PROCEDURES_SHORT_TITLE    246178 non-null object
dtypes: int64(2), object(1)
memory usage: 7.5+ MB


In [6]:
admissions.fillna('NoData', inplace = True)
admissions = admissions.applymap(str)
#word_list = ['EMERGENCY ROOM ADMIT']
#admissions = admissions[admissions['ADMISSION_LOCATION'].str.contains('|'.join(word_list))]
admissions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58976 entries, 0 to 58975
Data columns (total 9 columns):
SUBJECT_ID              58976 non-null object
HADM_ID                 58976 non-null object
ADMITTIME               58976 non-null object
DISCHTIME               58976 non-null object
ADMISSION_LOCATION      58976 non-null object
EDREGTIME               58976 non-null object
EDOUTTIME               58976 non-null object
DIAGNOSIS               58976 non-null object
HOSPITAL_EXPIRE_FLAG    58976 non-null object
dtypes: object(9)
memory usage: 4.0+ MB


In [7]:
# Search through dataframe and return rows with specific values
word_list = ['ACUTE MYOCARDIAL INFARCTION', 'ACUTE MI']
STEMI_diagnosis = admissions[admissions['DIAGNOSIS'].str.contains('|'.join(word_list))]
STEMI_diagnosis.info()
    #351 STEMI (91 emergency room admit)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 351 entries, 19 to 58070
Data columns (total 9 columns):
SUBJECT_ID              351 non-null object
HADM_ID                 351 non-null object
ADMITTIME               351 non-null object
DISCHTIME               351 non-null object
ADMISSION_LOCATION      351 non-null object
EDREGTIME               351 non-null object
EDOUTTIME               351 non-null object
DIAGNOSIS               351 non-null object
HOSPITAL_EXPIRE_FLAG    351 non-null object
dtypes: object(9)
memory usage: 27.4+ KB


In [8]:
word_list = ['STROKE', 'CVA', 'CEREBROVASCULAR ACCIDENT']
STROKE_diagnosis = admissions[admissions['DIAGNOSIS'].str.contains('|'.join(word_list))]
    #836 TOTAL STROKE
word_list = ['TIA', 'TRANSIENT ISCHEMIC ATTACK']
TIA_diagnosis = STROKE_diagnosis[STROKE_diagnosis['DIAGNOSIS'].str.contains('|'.join(word_list))]
    #560 TOTAL TIA
word_list = TIA_diagnosis['SUBJECT_ID'].tolist()
STROKE_diagnosis = STROKE_diagnosis.set_index(STROKE_diagnosis['SUBJECT_ID'])
ISCHEMIC_STROKE_diagnosis = STROKE_diagnosis.drop(word_list)
    #458 TOTAL ISCHEMIC STROKE (309 emergency room admit)
ISCHEMIC_STROKE_diagnosis.info()

<class 'pandas.core.frame.DataFrame'>
Index: 458 entries, 360 to 94329
Data columns (total 9 columns):
SUBJECT_ID              458 non-null object
HADM_ID                 458 non-null object
ADMITTIME               458 non-null object
DISCHTIME               458 non-null object
ADMISSION_LOCATION      458 non-null object
EDREGTIME               458 non-null object
EDOUTTIME               458 non-null object
DIAGNOSIS               458 non-null object
HOSPITAL_EXPIRE_FLAG    458 non-null object
dtypes: object(9)
memory usage: 35.8+ KB


In [9]:
word_list = ['PULMONARY EMBOLISM']
PULM_EMB_diagnosis = admissions[admissions['DIAGNOSIS'].str.contains('|'.join(word_list))]
PULM_EMB_diagnosis.info()
    #89 TOTAL PULMONARY EMBOLISM (45 emergency room admit)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 89 entries, 3746 to 58721
Data columns (total 9 columns):
SUBJECT_ID              89 non-null object
HADM_ID                 89 non-null object
ADMITTIME               89 non-null object
DISCHTIME               89 non-null object
ADMISSION_LOCATION      89 non-null object
EDREGTIME               89 non-null object
EDOUTTIME               89 non-null object
DIAGNOSIS               89 non-null object
HOSPITAL_EXPIRE_FLAG    89 non-null object
dtypes: object(9)
memory usage: 7.0+ KB


In [10]:
word_list = ['ARTERY DISEASE']
ARTERY_D_diagnosis = admissions[admissions['DIAGNOSIS'].str.contains('|'.join(word_list))]
ARTERY_D_diagnosis.info()
    #3364 ARTERY DISEASE (62 emergency room admit)
word_list = ['ARTERY DISEASE', 'CATH']
ARTERY_CATH_diagnosis = admissions[admissions['DIAGNOSIS'].str.contains('|'.join(word_list))]
ARTERY_CATH_diagnosis.info()
    #5255 ARTERY DISEASE / CATH (169 emergency room admit)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3364 entries, 1 to 58954
Data columns (total 9 columns):
SUBJECT_ID              3364 non-null object
HADM_ID                 3364 non-null object
ADMITTIME               3364 non-null object
DISCHTIME               3364 non-null object
ADMISSION_LOCATION      3364 non-null object
EDREGTIME               3364 non-null object
EDOUTTIME               3364 non-null object
DIAGNOSIS               3364 non-null object
HOSPITAL_EXPIRE_FLAG    3364 non-null object
dtypes: object(9)
memory usage: 262.8+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5255 entries, 1 to 58954
Data columns (total 9 columns):
SUBJECT_ID              5255 non-null object
HADM_ID                 5255 non-null object
ADMITTIME               5255 non-null object
DISCHTIME               5255 non-null object
ADMISSION_LOCATION      5255 non-null object
EDREGTIME               5255 non-null object
EDOUTTIME               5255 non-null object
DIAGNOSIS               

In [11]:
word_list = ['VEIN THROMBOSIS', 'DVT']
DVT_diagnosis = admissions[admissions['DIAGNOSIS'].str.contains('|'.join(word_list))]
DVT_diagnosis.info()
    #71 DVT (26 emergency room admit)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71 entries, 31 to 58716
Data columns (total 9 columns):
SUBJECT_ID              71 non-null object
HADM_ID                 71 non-null object
ADMITTIME               71 non-null object
DISCHTIME               71 non-null object
ADMISSION_LOCATION      71 non-null object
EDREGTIME               71 non-null object
EDOUTTIME               71 non-null object
DIAGNOSIS               71 non-null object
HOSPITAL_EXPIRE_FLAG    71 non-null object
dtypes: object(9)
memory usage: 5.5+ KB
