Skip to content

Commit

Permalink
Python3 support
Browse files (browse the repository at this point in the history)
  • Branch information:
jgc128 committed Oct 31, 2017
1 parent 0ce15bb commit d1345ff
Show file tree
Hide file tree
Showing 12 changed files with 116 additions and 93 deletions.
6 changes: 3 additions & 3 deletions mimic3benchmark/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
import mimic3csv
import subject
import preprocessing
import mimic3benchmark.mimic3csv
import mimic3benchmark.subject
import mimic3benchmark.preprocessing
50 changes: 27 additions & 23 deletions mimic3benchmark/mimic3csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,17 +134,21 @@ def read_events_table_and_break_up_by_subject(mimic3_path, table, output_path, i
if subjects_to_keep is not None:
subjects_to_keep = set([ str(s) for s in subjects_to_keep ])

class nonlocal: pass
nonlocal.curr_subject_id = ''
nonlocal.last_write_no = 0
nonlocal.last_write_nb_rows = 0
nonlocal.last_write_subject_id = ''
nonlocal.curr_obs = []
class DataStats(object):
def __init__(self):
self.curr_subject_id = ''
self.last_write_no = 0
self.last_write_nb_rows = 0
self.last_write_subject_id = ''
self.curr_obs = []

data_stats = DataStats()

def write_current_observations():
nonlocal.last_write_no += 1
nonlocal.last_write_nb_rows = len(nonlocal.curr_obs)
nonlocal.last_write_subject_id = nonlocal.curr_subject_id
dn = os.path.join(output_path, str(nonlocal.curr_subject_id))
data_stats.last_write_no += 1
data_stats.last_write_nb_rows = len(data_stats.curr_obs)
data_stats.last_write_subject_id = data_stats.curr_subject_id
dn = os.path.join(output_path, str(data_stats.curr_subject_id))
try:
os.makedirs(dn)
except:
Expand All @@ -155,17 +159,17 @@ def write_current_observations():
f.write(','.join(obs_header) + '\n')
f.close()
w = csv.DictWriter(open(fn, 'a'), fieldnames=obs_header, quoting=csv.QUOTE_MINIMAL)
w.writerows(nonlocal.curr_obs)
nonlocal.curr_obs = []
w.writerows(data_stats.curr_obs)
data_stats.curr_obs = []

for row, row_no, nb_rows in read_events_table_by_row(mimic3_path, table):
if verbose and (row_no % 100000 == 0):
if nonlocal.last_write_no != '':
if data_stats.last_write_no != '':
sys.stdout.write('\rprocessing {0}: ROW {1} of {2}...last write '
'({3}) {4} rows for subject {5}'.format(table, row_no, nb_rows,
nonlocal.last_write_no,
nonlocal.last_write_nb_rows,
nonlocal.last_write_subject_id))
data_stats.last_write_no,
data_stats.last_write_nb_rows,
data_stats.last_write_subject_id))
else:
sys.stdout.write('\rprocessing {0}: ROW {1} of {2}...'.format(table, row_no, nb_rows))

Expand All @@ -181,17 +185,17 @@ def write_current_observations():
'ITEMID': row['ITEMID'],
'VALUE': row['VALUE'],
'VALUEUOM': row['VALUEUOM'] }
if nonlocal.curr_subject_id != '' and nonlocal.curr_subject_id != row['SUBJECT_ID']:
if data_stats.curr_subject_id != '' and data_stats.curr_subject_id != row['SUBJECT_ID']:
write_current_observations()
nonlocal.curr_obs.append(row_out)
nonlocal.curr_subject_id = row['SUBJECT_ID']
data_stats.curr_obs.append(row_out)
data_stats.curr_subject_id = row['SUBJECT_ID']

if nonlocal.curr_subject_id != '':
if data_stats.curr_subject_id != '':
write_current_observations()

if verbose and (row_no % 100000 == 0):
sys.stdout.write('\rprocessing {0}: ROW {1} of {2}...last write '
'({3}) {4} rows for subject {5}...DONE!\n'.format(table, row_no, nb_rows,
nonlocal.last_write_no,
nonlocal.last_write_nb_rows,
nonlocal.last_write_subject_id))
data_stats.last_write_no,
data_stats.last_write_nb_rows,
data_stats.last_write_subject_id))
17 changes: 11 additions & 6 deletions mimic3benchmark/preprocessing.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import print_function

import numpy as np
import re

Expand Down Expand Up @@ -77,6 +79,9 @@ def make_phenotype_label_matrix(phenotypes, stays=None):

def read_itemid_to_variable_map(fn, variable_column='LEVEL2'):
var_map = DataFrame.from_csv(fn, index_col=None).fillna('').astype(str)

var_map.COUNT = var_map.COUNT.astype(int)

var_map = var_map.ix[(var_map[variable_column] != '') & (var_map.COUNT>0)]
var_map = var_map.ix[(var_map.STATUS == 'ready')]
var_map.ITEMID = var_map.ITEMID.astype(int)
Expand Down Expand Up @@ -138,7 +143,7 @@ def clean_crr(df):
# FIO2: many 0s, some 0<x<0.2 or 1<x<20
def clean_fio2(df):
v = df.VALUE.astype(float)
idx = df.VALUEUOM.fillna('').apply(lambda s: 'torr' not in s.lower()) & (df.VALUE>1.0)
idx = df.VALUEUOM.fillna('').apply(lambda s: 'torr' not in s.lower()) & (v>1.0)
v.ix[idx] = v[idx] / 100.
return v

Expand Down Expand Up @@ -212,13 +217,13 @@ def clean_height(df):
}
def clean_events(events):
global cleaning_fns
for var_name, clean_fn in clean_fns.iteritems():
for var_name, clean_fn in clean_fns.items():
idx = (events.VARIABLE == var_name)
try:
events.VALUE.ix[idx] = clean_fn(events.ix[idx])
except:
print "Exception in clean_events:", clean_fn.__name__
print "number of rows:", np.sum(idx)
print "values:", events.ix[idx]
except Exception as e:
print("Exception in clean_events:", clean_fn.__name__, e)
print("number of rows:", np.sum(idx))
print("values:", events.ix[idx])
exit()
return events.ix[events.VALUE.notnull()]
20 changes: 11 additions & 9 deletions scripts/create_decompensation.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import print_function

import os
import argparse
import numpy as np
Expand All @@ -24,10 +26,10 @@ def process_partition(partition, sample_rate=1.0, shortest_length=4.0,
os.mkdir(output_dir)

xty_triples = []
patients = filter(str.isdigit, os.listdir(os.path.join(args.root_path, partition)))
patients = list(filter(str.isdigit, os.listdir(os.path.join(args.root_path, partition))))
for (patient_index, patient) in enumerate(patients):
patient_folder = os.path.join(args.root_path, partition, patient)
patient_ts_files = filter(lambda x: x.find("timeseries") != -1, os.listdir(patient_folder))
patient_ts_files = list(filter(lambda x: x.find("timeseries") != -1, os.listdir(patient_folder)))
stays_df = pd.read_csv(os.path.join(patient_folder, "stays.csv"))

for ts_filename in patient_ts_files:
Expand All @@ -43,7 +45,7 @@ def process_partition(partition, sample_rate=1.0, shortest_length=4.0,

los = 24.0 * label_df.iloc[0]['Length of Stay'] # in hours
if (pd.isnull(los)):
print "(length of stay is missing)", patient, ts_filename
print("(length of stay is missing)", patient, ts_filename)
continue

stay = stays_df[stays_df.ICUSTAY_ID == label_df.iloc[0]['Icustay']]
Expand All @@ -52,7 +54,7 @@ def process_partition(partition, sample_rate=1.0, shortest_length=4.0,
if (pd.isnull(deathtime)):
lived_time = 1e18
else:
lived_time = (datetime.strptime(deathtime, "%Y-%m-%d %H:%M:%S") -\
lived_time = (datetime.strptime(deathtime, "%Y-%m-%d %H:%M:%S") -
datetime.strptime(intime, "%Y-%m-%d %H:%M:%S")).total_seconds() / 3600.0

ts_lines = tsfile.readlines()
Expand All @@ -67,15 +69,15 @@ def process_partition(partition, sample_rate=1.0, shortest_length=4.0,

# no measurements in ICU
if (len(ts_lines) == 0):
print "(no events in ICU) ", patient, ts_filename
print("(no events in ICU) ", patient, ts_filename)
continue

sample_times = np.arange(0.0, min(los, lived_time) + eps, sample_rate)

sample_times = filter(lambda x: x > shortest_length, sample_times)
sample_times = list(filter(lambda x: x > shortest_length, sample_times))

# At least one measurement
sample_times = filter(lambda x: x > event_times[0], sample_times)
sample_times = list(filter(lambda x: x > event_times[0], sample_times))

output_ts_filename = patient + "_" + ts_filename
with open(os.path.join(output_dir, output_ts_filename), "w") as outfile:
Expand All @@ -91,9 +93,9 @@ def process_partition(partition, sample_rate=1.0, shortest_length=4.0,
xty_triples.append((output_ts_filename, t, cur_mortality))

if ((patient_index + 1) % 100 == 0):
print "\rprocessed %d / %d patients" % (patient_index + 1, len(patients)),
print("\rprocessed {} / {} patients".format(patient_index + 1, len(patients)))

print len(xty_triples)
print(len(xty_triples))
if partition == "train":
random.shuffle(xty_triples)
if partition == "test":
Expand Down
14 changes: 8 additions & 6 deletions scripts/create_in_hospital_mortality.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import print_function

import os
import argparse
import pandas as pd
Expand All @@ -20,10 +22,10 @@ def process_partition(partition, eps=1e-6, n_hours=48):
os.mkdir(output_dir)

xy_pairs = []
patients = filter(str.isdigit, os.listdir(os.path.join(args.root_path, partition)))
patients = list(filter(str.isdigit, os.listdir(os.path.join(args.root_path, partition))))
for (patient_index, patient) in enumerate(patients):
patient_folder = os.path.join(args.root_path, partition, patient)
patient_ts_files = filter(lambda x: x.find("timeseries") != -1, os.listdir(patient_folder))
patient_ts_files = list(filter(lambda x: x.find("timeseries") != -1, os.listdir(patient_folder)))

for ts_filename in patient_ts_files:
with open(os.path.join(patient_folder, ts_filename)) as tsfile:
Expand All @@ -37,7 +39,7 @@ def process_partition(partition, eps=1e-6, n_hours=48):
mortality = int(label_df.iloc[0]["Mortality"])
los = 24.0 * label_df.iloc[0]['Length of Stay'] # in hours
if (pd.isnull(los)):
print "\n\t(length of stay is missing)", patient, ts_filename
print("\n\t(length of stay is missing)", patient, ts_filename)
continue

if (los < n_hours - eps):
Expand All @@ -55,7 +57,7 @@ def process_partition(partition, eps=1e-6, n_hours=48):

# no measurements in ICU
if (len(ts_lines) == 0):
print "\n\t(no events in ICU) ", patient, ts_filename
print("\n\t(no events in ICU) ", patient, ts_filename)
continue

output_ts_filename = patient + "_" + ts_filename
Expand All @@ -67,9 +69,9 @@ def process_partition(partition, eps=1e-6, n_hours=48):
xy_pairs.append((output_ts_filename, mortality))

if ((patient_index + 1) % 100 == 0):
print "\rprocessed %d / %d patients" % (patient_index + 1, len(patients)),
print("\rprocessed {} / {} patients".format(patient_index + 1, len(patients)))

print "\n", len(xy_pairs)
print("\n", len(xy_pairs))
if partition == "train":
random.shuffle(xy_pairs)
if partition == "test":
Expand Down
20 changes: 11 additions & 9 deletions scripts/create_length_of_stay.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import print_function

import os
import argparse
import numpy as np
Expand All @@ -21,10 +23,10 @@ def process_partition(partition, sample_rate=1.0, shortest_length=4.0, eps=1e-6)
os.mkdir(output_dir)

xty_triples = []
patients = filter(str.isdigit, os.listdir(os.path.join(args.root_path, partition)))
patients = list(filter(str.isdigit, os.listdir(os.path.join(args.root_path, partition))))
for (patient_index, patient) in enumerate(patients):
patient_folder = os.path.join(args.root_path, partition, patient)
patient_ts_files = filter(lambda x: x.find("timeseries") != -1, os.listdir(patient_folder))
patient_ts_files = list(filter(lambda x: x.find("timeseries") != -1, os.listdir(patient_folder)))

for ts_filename in patient_ts_files:
with open(os.path.join(patient_folder, ts_filename)) as tsfile:
Expand All @@ -33,12 +35,12 @@ def process_partition(partition, sample_rate=1.0, shortest_length=4.0, eps=1e-6)

# empty label file
if (label_df.shape[0] == 0):
print "\n\t(empty label file)", patient, ts_filename
print("\n\t(empty label file)", patient, ts_filename)
continue

los = 24.0 * label_df.iloc[0]['Length of Stay'] # in hours
if (pd.isnull(los)):
print "\n\t(length of stay is missing)", patient, ts_filename
print("\n\t(length of stay is missing)", patient, ts_filename)
continue

ts_lines = tsfile.readlines()
Expand All @@ -53,15 +55,15 @@ def process_partition(partition, sample_rate=1.0, shortest_length=4.0, eps=1e-6)

# no measurements in ICU
if (len(ts_lines) == 0):
print "\n\t(no events in ICU) ", patient, ts_filename
print("\n\t(no events in ICU) ", patient, ts_filename)
continue

sample_times = np.arange(0.0, los + eps, sample_rate)

sample_times = filter(lambda x: x > shortest_length, sample_times)
sample_times = list(filter(lambda x: x > shortest_length, sample_times))

# At least one measurement
sample_times = filter(lambda x: x > event_times[0], sample_times)
sample_times = list(filter(lambda x: x > event_times[0], sample_times))

output_ts_filename = patient + "_" + ts_filename
with open(os.path.join(output_dir, output_ts_filename), "w") as outfile:
Expand All @@ -73,9 +75,9 @@ def process_partition(partition, sample_rate=1.0, shortest_length=4.0, eps=1e-6)
xty_triples.append((output_ts_filename, t, los - t))

if ((patient_index + 1) % 100 == 0):
print "\rprocessed %d / %d patients" % (patient_index + 1, len(patients)),
print("\rprocessed {} / {} patients".format(patient_index + 1, len(patients)))

print len(xty_triples)
print(len(xty_triples))
if partition == "train":
random.shuffle(xty_triples)
if partition == "test":
Expand Down
18 changes: 10 additions & 8 deletions scripts/create_multitask.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import print_function

import os
import argparse
import numpy as np
Expand All @@ -11,7 +13,7 @@
parser = argparse.ArgumentParser(description="Create data for multitask prediction.")
parser.add_argument('root_path', type=str, help="Path to root folder containing train and test sets.")
parser.add_argument('output_path', type=str, help="Directory where the created data should be stored.")
parser.add_argument('--phenotype_definitions', '-p', type=unicode, default='resources/hcup_ccs_2015_definitions.yaml',
parser.add_argument('--phenotype_definitions', '-p', type=str, default='resources/hcup_ccs_2015_definitions.yaml',
help='YAML file with phenotype definitions.')
args, _ = parser.parse_known_args()

Expand Down Expand Up @@ -57,11 +59,11 @@ def process_partition(partition, sample_rate=1.0, shortest_length=4,
swat_masks = []
swat_labels = []

patients = filter(str.isdigit, os.listdir(os.path.join(args.root_path, partition)))
patients = list(filter(str.isdigit, os.listdir(os.path.join(args.root_path, partition))))

for (patient_index, patient) in enumerate(patients):
patient_folder = os.path.join(args.root_path, partition, patient)
patient_ts_files = filter(lambda x: x.find("timeseries") != -1, os.listdir(patient_folder))
patient_ts_files = list(filter(lambda x: x.find("timeseries") != -1, os.listdir(patient_folder)))
stays_df = pd.read_csv(os.path.join(patient_folder, "stays.csv"))

for ts_filename in patient_ts_files:
Expand All @@ -71,13 +73,13 @@ def process_partition(partition, sample_rate=1.0, shortest_length=4,

# empty label file, skip globally
if (label_df.shape[0] == 0):
print "\n\t(empty label file)", patient, ts_filename
print("\n\t(empty label file)", patient, ts_filename)
continue

# find length of stay, skip globally if it is missing
los = 24.0 * label_df.iloc[0]['Length of Stay'] # in hours
if (pd.isnull(los)):
print "\n\t(length of stay is missing)", patient, ts_filename
print("\n\t(length of stay is missing)", patient, ts_filename)
continue

# find all event in ICU, skip globally if there is no event in ICU
Expand All @@ -91,7 +93,7 @@ def process_partition(partition, sample_rate=1.0, shortest_length=4,
if t > -eps and t < los - eps]

if (len(ts_lines) == 0):
print "\n\t(no events in ICU) ", patient, ts_filename
print("\n\t(no events in ICU) ", patient, ts_filename)
continue

# add length of stay
Expand Down Expand Up @@ -171,13 +173,13 @@ def process_partition(partition, sample_rate=1.0, shortest_length=4,
swat_labels.append(cur_swat_labels)

if ((patient_index + 1) % 100 == 0):
print "\rprocessed %d / %d patients" % (patient_index + 1, len(patients)),
print("\rprocessed {} / {} patients" .format(patient_index + 1, len(patients)))

def permute(arr, p):
return [arr[index] for index in p]

if partition == "train":
perm = range(len(filenames))
perm = list(range(len(filenames)))
random.shuffle(perm)
if partition == "test":
perm = list(np.argsort(filenames))
Expand Down

0 comments on commit d1345ff

Please sign in to comment.