In [1]:
# Common imports
import numpy as np
import numpy.random as rnd
import os
import pandas as pd

# To make this notebook's output stable across runs
rnd.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Show all columns when displaying dataframes
pd.set_option('display.max_columns', None)

In [2]:
# Read the four CSV files in a fixed order; the order of the paths matches the
# order of the names they are unpacked into below.
loaded_frames = []
for csv_path in (
    'MIMIC II/DIAGNOSES_ICD.csv',
    'MIMIC II/D_ICD_DIAGNOSES.csv',
    'MIMIC II/PROCEDURES_ICD.csv',
    'MIMIC II/D_ICD_PROCEDURES.csv',
):
    loaded_frames.append(pd.read_csv(csv_path))

diagnoses, dir_diagnoses, procedures, dir_procedures = loaded_frames
del loaded_frames  # scratch list only; the four names above hold the data

In [3]:
# Remove the columns that the merges in the following cells do not use.
# NOTE: every name is rebound to its own trimmed copy, so running this cell a
# second time without re-reading the CSVs fails (the labels are already gone).
event_drop_cols = ['ROW_ID', 'SEQ_NUM']
lookup_drop_cols = ['ROW_ID', 'LONG_TITLE']

diagnoses = diagnoses.drop(event_drop_cols, axis=1)
dir_diagnoses = dir_diagnoses.drop(lookup_drop_cols, axis=1)
procedures = procedures.drop(event_drop_cols, axis=1)
dir_procedures = dir_procedures.drop(lookup_drop_cols, axis=1)

In [4]:
# Inner-join the diagnosis rows with their code lookup on ICD9_CODE, then
# discard the code column itself so only the lookup's title remains.
diagnoses_new = (
    diagnoses
    .merge(dir_diagnoses, on='ICD9_CODE')
    .drop('ICD9_CODE', axis=1)
)
# Names are assigned by position: this relies on the joined frame having
# exactly these three columns, in this order.
diagnoses_new.columns = ['SUBJECT_ID', 'HADM_ID', 'DIAGNOSES_SHORT_TITLE']

# Quick sanity check: first rows, then dtypes / non-null counts
print(diagnoses_new.head())
diagnoses_new.info()

   SUBJECT_ID  HADM_ID   DIAGNOSES_SHORT_TITLE
0         109   172335  Mal hyp kid w cr kid V
1         109   173633  Mal hyp kid w cr kid V
2         109   131345  Mal hyp kid w cr kid V
3         109   131376  Mal hyp kid w cr kid V
4         109   135923  Mal hyp kid w cr kid V
<class 'pandas.core.frame.DataFrame'>
Int64Index: 634709 entries, 0 to 634708
Data columns (total 3 columns):
SUBJECT_ID               634709 non-null int64
HADM_ID                  634709 non-null int64
DIAGNOSES_SHORT_TITLE    634709 non-null object
dtypes: int64(2), object(1)
memory usage: 19.4+ MB


In [5]:
# Inner-join the procedure rows with their code lookup on ICD9_CODE, then
# discard the code column itself so only the lookup's title remains.
procedures_new = (
    procedures
    .merge(dir_procedures, on='ICD9_CODE')
    .drop('ICD9_CODE', axis=1)
)
# Names are assigned by position: this relies on the joined frame having
# exactly these three columns, in this order.
procedures_new.columns = ['SUBJECT_ID', 'HADM_ID', 'PROCEDURES_SHORT_TITLE']

# Quick sanity check: first rows, then dtypes / non-null counts
print(procedures_new.head())
procedures_new.info()

   SUBJECT_ID  HADM_ID   PROCEDURES_SHORT_TITLE
0       62641   154460  Insert intercostal cath
1       11143   101985  Insert intercostal cath
2        9736   160259  Insert intercostal cath
3        7546   147476  Insert intercostal cath
4       65535   178280  Insert intercostal cath
<class 'pandas.core.frame.DataFrame'>
Int64Index: 246178 entries, 0 to 246177
Data columns (total 3 columns):
SUBJECT_ID                246178 non-null int64
HADM_ID                   246178 non-null int64
PROCEDURES_SHORT_TITLE    246178 non-null object
dtypes: int64(2), object(1)
memory usage: 7.5+ MB


In [6]:
# Inner join on (SUBJECT_ID, HADM_ID): each procedure row is paired with every
# diagnosis row that carries the same two key values, so the result has one
# row per procedure/diagnosis combination.
diagnoses_and_procedures = procedures_new.merge(
    diagnoses_new,
    on=['SUBJECT_ID', 'HADM_ID'],
)

# Quick sanity check: first rows, then dtypes / memory footprint
print(diagnoses_and_procedures.head())
diagnoses_and_procedures.info()

   SUBJECT_ID  HADM_ID   PROCEDURES_SHORT_TITLE     DIAGNOSES_SHORT_TITLE
0       62641   154460  Insert intercostal cath          Surg compl-heart
1       62641   154460  Insert intercostal cath       Atrial fibrillation
2       62641   154460  Insert intercostal cath   Iatrogenic pneumothorax
3       62641   154460  Insert intercostal cath   Mal neo upper lobe lung
4       62641   154460  Insert intercostal cath  Hx of bladder malignancy
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3167189 entries, 0 to 3167188
Data columns (total 4 columns):
SUBJECT_ID                int64
HADM_ID                   int64
PROCEDURES_SHORT_TITLE    object
DIAGNOSES_SHORT_TITLE     object
dtypes: int64(2), object(2)
memory usage: 120.8+ MB


In [7]:
# (Disabled) Build a dictionary mapping each diagnosis to the list of procedures it is paired with, then convert it into a dataframe
#mydict = {k: g["PROCEDURES_SHORT_TITLE"].tolist() for k,g in diagnoses_and_procedures.groupby("DIAGNOSES_SHORT_TITLE")}
#diagnoses_vs_procedures = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in mydict.items() ]))
#diagnoses_vs_procedures = diagnoses_vs_procedures.fillna('No_Data')
#print(diagnoses_vs_procedures.info())
#diagnoses_vs_procedures.head()

In [8]:
# I don't think anything I just did matters, but I'll leave it.

In [9]:
# Preview the merged frame; the next cells reshape it into a better dataframe
# for machine learning
diagnoses_and_procedures.head(5)

Unnamed: 0,SUBJECT_ID,HADM_ID,PROCEDURES_SHORT_TITLE,DIAGNOSES_SHORT_TITLE
0,62641,154460,Insert intercostal cath,Surg compl-heart
1,62641,154460,Insert intercostal cath,Atrial fibrillation
2,62641,154460,Insert intercostal cath,Iatrogenic pneumothorax
3,62641,154460,Insert intercostal cath,Mal neo upper lobe lung
4,62641,154460,Insert intercostal cath,Hx of bladder malignancy


In [10]:
# Keep only the two title columns; the SUBJECT_ID / HADM_ID join keys are not
# needed once procedures and diagnoses have been paired.
# Selecting the wanted columns (instead of dropping the unwanted ones) yields
# the same frame on the first run and keeps this cell safe to re-run, because
# it no longer fails when the key columns are already gone.
diagnoses_and_procedures = diagnoses_and_procedures[
    ['PROCEDURES_SHORT_TITLE', 'DIAGNOSES_SHORT_TITLE']
].copy()  # explicit copy so the result is an independent frame
diagnoses_and_procedures.head()

Unnamed: 0,PROCEDURES_SHORT_TITLE,DIAGNOSES_SHORT_TITLE
0,Insert intercostal cath,Surg compl-heart
1,Insert intercostal cath,Atrial fibrillation
2,Insert intercostal cath,Iatrogenic pneumothorax
3,Insert intercostal cath,Mal neo upper lobe lung
4,Insert intercostal cath,Hx of bladder malignancy


In [11]:
# Split the paired frame into two frames by removing one title column from each.
# NOTE: this rebinds `diagnoses` and `procedures`, which earlier in the
# notebook referred to the tables read from CSV.
procedures = diagnoses_and_procedures.drop(labels=['DIAGNOSES_SHORT_TITLE'], axis=1)
diagnoses = diagnoses_and_procedures.drop(labels=['PROCEDURES_SHORT_TITLE'], axis=1)

In [None]:
# One-hot encode the diagnosis titles: one indicator column per distinct
# DIAGNOSES_SHORT_TITLE value, one row per row of `diagnoses`.
running_total_columns = []  # not used by any cell visible here
# Encode from a separate copy so `diagnoses` itself is left untouched
diagnoses_enc = diagnoses.copy()

# NOTE(review): the merged frame reported ~3.2M rows above; a dense indicator
# matrix over every distinct title may be very large. If memory becomes a
# problem, consider pd.get_dummies(..., sparse=True) - confirm downstream
# code accepts sparse columns first.
diagnoses_1Hot = pd.get_dummies(diagnoses_enc['DIAGNOSES_SHORT_TITLE'])

# Summarise the frame built in this cell. info() prints the summary itself and
# returns None, so it is called directly rather than wrapped in print().
diagnoses_1Hot.info()
diagnoses_1Hot.head()

In [None]:
# One-hot encode the procedure titles: one indicator column per distinct
# PROCEDURES_SHORT_TITLE value, one row per row of `procedures`.
running_total_columns = []  # not used by any cell visible here
# Encode from a separate copy so `procedures` itself is left untouched
procedures_enc = procedures.copy()

# NOTE(review): the merged frame reported ~3.2M rows above; a dense indicator
# matrix over every distinct title may be very large. If memory becomes a
# problem, consider pd.get_dummies(..., sparse=True) - confirm downstream
# code accepts sparse columns first.
procedures_1Hot = pd.get_dummies(procedures_enc['PROCEDURES_SHORT_TITLE'])

# info() prints the summary itself and returns None, so it is called directly
# rather than wrapped in print() (which would emit a stray "None" line).
procedures_1Hot.info()
procedures_1Hot.head()