Skip to content

Commit

Permalink
Python3 support
Browse files (browse the repository at this point in the history)
  • Branch information:
jgc128 committed Oct 31, 2017
1 parent 0ce15bb commit d1345ff
Show file tree
Hide file tree
Showing 12 changed files with 116 additions and 93 deletions.
6 changes: 3 additions & 3 deletions mimic3benchmark/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
import mimic3csv
import subject
import preprocessing
import mimic3benchmark.mimic3csv
import mimic3benchmark.subject
import mimic3benchmark.preprocessing
50 changes: 27 additions & 23 deletions mimic3benchmark/mimic3csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,17 +134,21 @@ def read_events_table_and_break_up_by_subject(mimic3_path, table, output_path, i
if subjects_to_keep is not None:
subjects_to_keep = set([ str(s) for s in subjects_to_keep ])

class nonlocal: pass
nonlocal.curr_subject_id = ''
nonlocal.last_write_no = 0
nonlocal.last_write_nb_rows = 0
nonlocal.last_write_subject_id = ''
nonlocal.curr_obs = []
class DataStats(object):
def __init__(self):
self.curr_subject_id = ''
self.last_write_no = 0
self.last_write_nb_rows = 0
self.last_write_subject_id = ''
self.curr_obs = []

data_stats = DataStats()

def write_current_observations():
nonlocal.last_write_no += 1
nonlocal.last_write_nb_rows = len(nonlocal.curr_obs)
nonlocal.last_write_subject_id = nonlocal.curr_subject_id
dn = os.path.join(output_path, str(nonlocal.curr_subject_id))
data_stats.last_write_no += 1
data_stats.last_write_nb_rows = len(data_stats.curr_obs)
data_stats.last_write_subject_id = data_stats.curr_subject_id
dn = os.path.join(output_path, str(data_stats.curr_subject_id))
try:
os.makedirs(dn)
except:
Expand All @@ -155,17 +159,17 @@ def write_current_observations():
f.write(','.join(obs_header) + '\n')
f.close()
w = csv.DictWriter(open(fn, 'a'), fieldnames=obs_header, quoting=csv.QUOTE_MINIMAL)
w.writerows(nonlocal.curr_obs)
nonlocal.curr_obs = []
w.writerows(data_stats.curr_obs)
data_stats.curr_obs = []

for row, row_no, nb_rows in read_events_table_by_row(mimic3_path, table):
if verbose and (row_no % 100000 == 0):
if nonlocal.last_write_no != '':
if data_stats.last_write_no != '':
sys.stdout.write('\rprocessing {0}: ROW {1} of {2}...last write '
'({3}) {4} rows for subject {5}'.format(table, row_no, nb_rows,
nonlocal.last_write_no,
nonlocal.last_write_nb_rows,
nonlocal.last_write_subject_id))
data_stats.last_write_no,
data_stats.last_write_nb_rows,
data_stats.last_write_subject_id))
else:
sys.stdout.write('\rprocessing {0}: ROW {1} of {2}...'.format(table, row_no, nb_rows))

Expand All @@ -181,17 +185,17 @@ def write_current_observations():
'ITEMID': row['ITEMID'],
'VALUE': row['VALUE'],
'VALUEUOM': row['VALUEUOM'] }
if nonlocal.curr_subject_id != '' and nonlocal.curr_subject_id != row['SUBJECT_ID']:
if data_stats.curr_subject_id != '' and data_stats.curr_subject_id != row['SUBJECT_ID']:
write_current_observations()
nonlocal.curr_obs.append(row_out)
nonlocal.curr_subject_id = row['SUBJECT_ID']
data_stats.curr_obs.append(row_out)
data_stats.curr_subject_id = row['SUBJECT_ID']

if nonlocal.curr_subject_id != '':
if data_stats.curr_subject_id != '':
write_current_observations()

if verbose and (row_no % 100000 == 0):
sys.stdout.write('\rprocessing {0}: ROW {1} of {2}...last write '
'({3}) {4} rows for subject {5}...DONE!\n'.format(table, row_no, nb_rows,
nonlocal.last_write_no,
nonlocal.last_write_nb_rows,
nonlocal.last_write_subject_id))
data_stats.last_write_no,
data_stats.last_write_nb_rows,
data_stats.last_write_subject_id))
17 changes: 11 additions & 6 deletions mimic3benchmark/preprocessing.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import print_function

import numpy as np
import re

Expand Down Expand Up @@ -77,6 +79,9 @@ def make_phenotype_label_matrix(phenotypes, stays=None):

def read_itemid_to_variable_map(fn, variable_column='LEVEL2'):
var_map = DataFrame.from_csv(fn, index_col=None).fillna('').astype(str)

var_map.COUNT = var_map.COUNT.astype(int)

var_map = var_map.ix[(var_map[variable_column] != '') & (var_map.COUNT>0)]
var_map = var_map.ix[(var_map.STATUS == 'ready')]
var_map.ITEMID = var_map.ITEMID.astype(int)
Expand Down Expand Up @@ -138,7 +143,7 @@ def clean_crr(df):
# FIO2: many 0s, some 0<x<0.2 or 1<x<20
def clean_fio2(df):
v = df.VALUE.astype(float)
idx = df.VALUEUOM.fillna('').apply(lambda s: 'torr' not in s.lower()) & (df.VALUE>1.0)
idx = df.VALUEUOM.fillna('').apply(lambda s: 'torr' not in s.lower()) & (v>1.0)
v.ix[idx] = v[idx] / 100.
return v

Expand Down Expand Up @@ -212,13 +217,13 @@ def clean_height(df):
}
def clean_events(events):
global cleaning_fns
for var_name, clean_fn in clean_fns.iteritems():
for var_name, clean_fn in clean_fns.items():
idx = (events.VARIABLE == var_name)
try:
events.VALUE.ix[idx] = clean_fn(events.ix[idx])
except:
print "Exception in clean_events:", clean_fn.__name__
print "number of rows:", np.sum(idx)
print "values:", events.ix[idx]
except Exception as e:
print("Exception in clean_events:", clean_fn.__name__, e)
print("number of rows:", np.sum(idx))
print("values:", events.ix[idx])
exit()
return events.ix[events.VALUE.notnull()]
20 changes: 11 additions & 9 deletions scripts/create_decompensation.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import print_function

import os
import argparse
import numpy as np
Expand All @@ -24,10 +26,10 @@ def process_partition(partition, sample_rate=1.0, shortest_length=4.0,
os.mkdir(output_dir)

xty_triples = []
patients = filter(str.isdigit, os.listdir(os.path.join(args.root_path, partition)))
patients = list(filter(str.isdigit, os.listdir(os.path.join(args.root_path, partition))))
for (patient_index, patient) in enumerate(patients):
patient_folder = os.path.join(args.root_path, partition, patient)
patient_ts_files = filter(lambda x: x.find("timeseries") != -1, os.listdir(patient_folder))
patient_ts_files = list(filter(lambda x: x.find("timeseries") != -1, os.listdir(patient_folder)))
stays_df = pd.read_csv(os.path.join(patient_folder, "stays.csv"))

for ts_filename in patient_ts_files:
Expand All @@ -43,7 +45,7 @@ def process_partition(partition, sample_rate=1.0, shortest_length=4.0,

los = 24.0 * label_df.iloc[0]['Length of Stay'] # in hours
if (pd.isnull(los)):
print "(length of stay is missing)", patient, ts_filename
print("(length of stay is missing)", patient, ts_filename)
continue

stay = stays_df[stays_df.ICUSTAY_ID == label_df.iloc[0]['Icustay']]
Expand All @@ -52,7 +54,7 @@ def process_partition(partition, sample_rate=1.0, shortest_length=4.0,
if (pd.isnull(deathtime)):
lived_time = 1e18
else:
lived_time = (datetime.strptime(deathtime, "%Y-%m-%d %H:%M:%S") -\
lived_time = (datetime.strptime(deathtime, "%Y-%m-%d %H:%M:%S") -
datetime.strptime(intime, "%Y-%m-%d %H:%M:%S")).total_seconds() / 3600.0

ts_lines = tsfile.readlines()
Expand All @@ -67,15 +69,15 @@ def process_partition(partition, sample_rate=1.0, shortest_length=4.0,

# no measurements in ICU
if (len(ts_lines) == 0):
print "(no events in ICU) ", patient, ts_filename
print("(no events in ICU) ", patient, ts_filename)
continue

sample_times = np.arange(0.0, min(los, lived_time) + eps, sample_rate)

sample_times = filter(lambda x: x > shortest_length, sample_times)
sample_times = list(filter(lambda x: x > shortest_length, sample_times))

# At least one measurement
sample_times = filter(lambda x: x > event_times[0], sample_times)
sample_times = list(filter(lambda x: x > event_times[0], sample_times))

output_ts_filename = patient + "_" + ts_filename
with open(os.path.join(output_dir, output_ts_filename), "w") as outfile:
Expand All @@ -91,9 +93,9 @@ def process_partition(partition, sample_rate=1.0, shortest_length=4.0,
xty_triples.append((output_ts_filename, t, cur_mortality))

if ((patient_index + 1) % 100 == 0):
print "\rprocessed %d / %d patients" % (patient_index + 1, len(patients)),
print("\rprocessed {} / {} patients".format(patient_index + 1, len(patients)))

print len(xty_triples)
print(len(xty_triples))
if partition == "train":
random.shuffle(xty_triples)
if partition == "test":
Expand Down
14 changes: 8 additions & 6 deletions scripts/create_in_hospital_mortality.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import print_function

import os
import argparse
import pandas as pd
Expand All @@ -20,10 +22,10 @@ def process_partition(partition, eps=1e-6, n_hours=48):
os.mkdir(output_dir)

xy_pairs = []
patients = filter(str.isdigit, os.listdir(os.path.join(args.root_path, partition)))
patients = list(filter(str.isdigit, os.listdir(os.path.join(args.root_path, partition))))
for (patient_index, patient) in enumerate(patients):
patient_folder = os.path.join(args.root_path, partition, patient)
patient_ts_files = filter(lambda x: x.find("timeseries") != -1, os.listdir(patient_folder))
patient_ts_files = list(filter(lambda x: x.find("timeseries") != -1, os.listdir(patient_folder)))

for ts_filename in patient_ts_files:
with open(os.path.join(patient_folder, ts_filename)) as tsfile:
Expand All @@ -37,7 +39,7 @@ def process_partition(partition, eps=1e-6, n_hours=48):
mortality = int(label_df.iloc[0]["Mortality"])
los = 24.0 * label_df.iloc[0]['Length of Stay'] # in hours
if (pd.isnull(los)):
print "\n\t(length of stay is missing)", patient, ts_filename
print("\n\t(length of stay is missing)", patient, ts_filename)
continue

if (los < n_hours - eps):
Expand All @@ -55,7 +57,7 @@ def process_partition(partition, eps=1e-6, n_hours=48):

# no measurements in ICU
if (len(ts_lines) == 0):
print "\n\t(no events in ICU) ", patient, ts_filename
print("\n\t(no events in ICU) ", patient, ts_filename)
continue

output_ts_filename = patient + "_" + ts_filename
Expand All @@ -67,9 +69,9 @@ def process_partition(partition, eps=1e-6, n_hours=48):
xy_pairs.append((output_ts_filename, mortality))

if ((patient_index + 1) % 100 == 0):
print "\rprocessed %d / %d patients" % (patient_index + 1, len(patients)),
print("\rprocessed {} / {} patients".format(patient_index + 1, len(patients)))

print "\n", len(xy_pairs)
print("\n", len(xy_pairs))
if partition == "train":
random.shuffle(xy_pairs)
if partition == "test":
Expand Down
20 changes: 11 additions & 9 deletions scripts/create_length_of_stay.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import print_function

import os
import argparse
import numpy as np
Expand All @@ -21,10 +23,10 @@ def process_partition(partition, sample_rate=1.0, shortest_length=4.0, eps=1e-6)
os.mkdir(output_dir)

xty_triples = []
patients = filter(str.isdigit, os.listdir(os.path.join(args.root_path, partition)))
patients = list(filter(str.isdigit, os.listdir(os.path.join(args.root_path, partition))))
for (patient_index, patient) in enumerate(patients):
patient_folder = os.path.join(args.root_path, partition, patient)
patient_ts_files = filter(lambda x: x.find("timeseries") != -1, os.listdir(patient_folder))
patient_ts_files = list(filter(lambda x: x.find("timeseries") != -1, os.listdir(patient_folder)))

for ts_filename in patient_ts_files:
with open(os.path.join(patient_folder, ts_filename)) as tsfile:
Expand All @@ -33,12 +35,12 @@ def process_partition(partition, sample_rate=1.0, shortest_length=4.0, eps=1e-6)

# empty label file
if (label_df.shape[0] == 0):
print "\n\t(empty label file)", patient, ts_filename
print("\n\t(empty label file)", patient, ts_filename)
continue

los = 24.0 * label_df.iloc[0]['Length of Stay'] # in hours
if (pd.isnull(los)):
print "\n\t(length of stay is missing)", patient, ts_filename
print("\n\t(length of stay is missing)", patient, ts_filename)
continue

ts_lines = tsfile.readlines()
Expand All @@ -53,15 +55,15 @@ def process_partition(partition, sample_rate=1.0, shortest_length=4.0, eps=1e-6)

# no measurements in ICU
if (len(ts_lines) == 0):
print "\n\t(no events in ICU) ", patient, ts_filename
print("\n\t(no events in ICU) ", patient, ts_filename)
continue

sample_times = np.arange(0.0, los + eps, sample_rate)

sample_times = filter(lambda x: x > shortest_length, sample_times)
sample_times = list(filter(lambda x: x > shortest_length, sample_times))

# At least one measurement
sample_times = filter(lambda x: x > event_times[0], sample_times)
sample_times = list(filter(lambda x: x > event_times[0], sample_times))

output_ts_filename = patient + "_" + ts_filename
with open(os.path.join(output_dir, output_ts_filename), "w") as outfile:
Expand All @@ -73,9 +75,9 @@ def process_partition(partition, sample_rate=1.0, shortest_length=4.0, eps=1e-6)
xty_triples.append((output_ts_filename, t, los - t))

if ((patient_index + 1) % 100 == 0):
print "\rprocessed %d / %d patients" % (patient_index + 1, len(patients)),
print("\rprocessed {} / {} patients".format(patient_index + 1, len(patients)))

print len(xty_triples)
print(len(xty_triples))
if partition == "train":
random.shuffle(xty_triples)
if partition == "test":
Expand Down
18 changes: 10 additions & 8 deletions scripts/create_multitask.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import print_function

import os
import argparse
import numpy as np
Expand All @@ -11,7 +13,7 @@
parser = argparse.ArgumentParser(description="Create data for multitask prediction.")
parser.add_argument('root_path', type=str, help="Path to root folder containing train and test sets.")
parser.add_argument('output_path', type=str, help="Directory where the created data should be stored.")
parser.add_argument('--phenotype_definitions', '-p', type=unicode, default='resources/hcup_ccs_2015_definitions.yaml',
parser.add_argument('--phenotype_definitions', '-p', type=str, default='resources/hcup_ccs_2015_definitions.yaml',
help='YAML file with phenotype definitions.')
args, _ = parser.parse_known_args()

Expand Down Expand Up @@ -57,11 +59,11 @@ def process_partition(partition, sample_rate=1.0, shortest_length=4,
swat_masks = []
swat_labels = []

patients = filter(str.isdigit, os.listdir(os.path.join(args.root_path, partition)))
patients = list(filter(str.isdigit, os.listdir(os.path.join(args.root_path, partition))))

for (patient_index, patient) in enumerate(patients):
patient_folder = os.path.join(args.root_path, partition, patient)
patient_ts_files = filter(lambda x: x.find("timeseries") != -1, os.listdir(patient_folder))
patient_ts_files = list(filter(lambda x: x.find("timeseries") != -1, os.listdir(patient_folder)))
stays_df = pd.read_csv(os.path.join(patient_folder, "stays.csv"))

for ts_filename in patient_ts_files:
Expand All @@ -71,13 +73,13 @@ def process_partition(partition, sample_rate=1.0, shortest_length=4,

# empty label file, skip globally
if (label_df.shape[0] == 0):
print "\n\t(empty label file)", patient, ts_filename
print("\n\t(empty label file)", patient, ts_filename)
continue

# find length of stay, skip globally if it is missing
los = 24.0 * label_df.iloc[0]['Length of Stay'] # in hours
if (pd.isnull(los)):
print "\n\t(length of stay is missing)", patient, ts_filename
print("\n\t(length of stay is missing)", patient, ts_filename)
continue

# find all event in ICU, skip globally if there is no event in ICU
Expand All @@ -91,7 +93,7 @@ def process_partition(partition, sample_rate=1.0, shortest_length=4,
if t > -eps and t < los - eps]

if (len(ts_lines) == 0):
print "\n\t(no events in ICU) ", patient, ts_filename
print("\n\t(no events in ICU) ", patient, ts_filename)
continue

# add length of stay
Expand Down Expand Up @@ -171,13 +173,13 @@ def process_partition(partition, sample_rate=1.0, shortest_length=4,
swat_labels.append(cur_swat_labels)

if ((patient_index + 1) % 100 == 0):
print "\rprocessed %d / %d patients" % (patient_index + 1, len(patients)),
print("\rprocessed {} / {} patients" .format(patient_index + 1, len(patients)))

def permute(arr, p):
return [arr[index] for index in p]

if partition == "train":
perm = range(len(filenames))
perm = list(range(len(filenames)))
random.shuffle(perm)
if partition == "test":
perm = list(np.argsort(filenames))
Expand Down

0 comments on commit d1345ff

Please sign in to comment.