In [None]:
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

In [1]:
import pandas as pd
import numpy as np
import pickle
%matplotlib inline
import matplotlib.pyplot as plt
from collections import Counter
import sys
sys.path.append("general")

orders = ['Hours', 'Capillary refill rate', 'Diastolic blood pressure',
       'Fraction inspired oxygen', 'Glascow coma scale eye opening',
       'Glascow coma scale motor response', 'Glascow coma scale total',
       'Glascow coma scale verbal response', 'Glucose', 'Heart Rate', 'Height',
       'Mean blood pressure', 'Oxygen saturation', 'Respiratory rate',
       'Systolic blood pressure', 'Temperature', 'Weight', 'pH']

count = pd.read_csv('../mimic3-benchmarks/mimic3benchmark/resources/variable_ranges.csv', sep=',')
limit = []
cat_name = ["Glascow coma scale eye opening",
"Glascow coma scale motor response",
#"Glascow coma scale total",
"Glascow coma scale verbal response"]
for col in orders:
    if len(count[(count['LEVEL2']==col)])==1 and col not in cat_name:
        x = float(count[(count['LEVEL2']==col)]['OUTLIER LOW'])
        y = float(count[(count['LEVEL2']==col)]['OUTLIER HIGH'])
        limit.append((col, x, y))
        print(col, x, y)

refine_eye = {
     "None": "None",
     "1 No Response": "1 No Response",
     "2 To pain": "To Pain", 
     "To Pain": "To Pain",
     "3 To speech": "To Speech", 
     "To Speech": "To Speech",
     "4 Spontaneously": "Spontaneously",
     "Spontaneously": "Spontaneously"
}
refine_motor = {
    "1 No Response": "No response",
    "No response": "No response",
    "2 Abnorm extensn": "Abnormal extension",
    "Abnormal extension": "Abnormal extension",
    "3 Abnorm flexion": "Abnormal Flexion",
    "Abnormal Flexion": "Abnormal Flexion",
    "4 Flex-withdraws": "Flex-withdraws",
    "Flex-withdraws": "Flex-withdraws",
    "5 Localizes Pain": "Localizes Pain",
    "Localizes Pain": "Localizes Pain",
    "6 Obeys Commands": "Obeys Commands",
    "Obeys Commands": "Obeys Commands"
}
refine_verb = {
    "No Response-ETT": "No Response",
    "No Response": "No Response",
    "1 No Response": "No Response",
    "1.0 ET/Trach": "No Response",
    "2 Incomp sounds": "Incomprehensible sounds",
    "Incomprehensible sounds": "Incomprehensible sounds",
    "3 Inapprop words": "Inappropriate Words",
    "Inappropriate Words": "Inappropriate Words",
    "4 Confused": "Confused",
    "Confused": "Confused",
    "5 Oriented": "Oriented",
    "Oriented": "Oriented"
}

Capillary refill rate 0.0 1.0
Diastolic blood pressure 0.0 375.0
Fraction inspired oxygen 0.2 1.1
Glascow coma scale total 3.0 15.0
Glucose 0.0 2200.0
Height 0.0 275.0
Mean blood pressure 0.0 375.0
Oxygen saturation 0.0 150.0
Respiratory rate 0.0 330.0
Systolic blood pressure 0.0 375.0
Temperature 14.2 47.0
Weight 0.0 250.0
pH 6.3 10.0


In [2]:
def get_dataset(use_path, name_lis):
    data = []
    h = Counter()
    w = Counter()
    l = Counter()
    height = []
    drop_lis=[]
    for i,(fi,label) in enumerate(zip(name_lis["stay"],name_lis["y_true"])):
        df = pd.read_csv("{}/{}".format(use_path, fi), sep=',')
        if len(df)>150:
            l[label]+=1
            drop_lis.append(i)
            continue
            
        h[df["Height"].count()] += 1
        w[df["Weight"].count()] += 1
        """ 
        y = float("nan")
        for x in df["Height"].unique():
            if x==x:
                y = x
                break
        height.append(y)
        """
        if not df["Glascow coma scale eye opening"].isna().all():
            df["Glascow coma scale eye opening"].replace(refine_eye, inplace=True)
        if not df["Glascow coma scale motor response"].isna().all():
            df["Glascow coma scale motor response"].replace(refine_motor, inplace=True)
        if not df["Glascow coma scale verbal response"].isna().all():
            df["Glascow coma scale verbal response"].replace(refine_verb, inplace=True)
        df.drop(columns = ["Capillary refill rate"], inplace=True)
        data.append(df)
        
    name_lis.drop(labels=drop_lis, inplace=True)
    #name_lis["Height"] = np.array(height, dtype=object)
    print(len(name_lis))
    print(l)
    return data


In [3]:
raw = pickle.load(open("./data/inhospital/fullhos.pkl", "rb"))
#d_P = raw["dynamic_processor"] 
from stdprocessor import StdProcessor

def transform_dataset(data, name_lis):
    y = name_lis["y_true"].values.reshape(-1, 1)
    d_lis=[d_P.transform(ds) for ds in data]
    seq_len = [len(ds) for ds in data]
    d = [x[0].tolist() for x in d_lis]
    lag = [x[1].tolist() for x in d_lis]
    mask = [x[2].tolist() for x in d_lis]
    times = [x[-1].tolist() for x in d_lis]
    priv = [x[3].tolist() for x in d_lis]
    nex = [x[4].tolist() for x in d_lis]

    from fastNLP import DataSet
    dataset = DataSet({"seq_len": seq_len, 
                       "dyn": d, "lag":lag, "mask": mask,
                       "label": y, "times":times, "priv":priv, "nex": nex
                      })
    dataset.set_input("dyn", "mask", "label", "times", "lag", "seq_len","priv", "nex")
    return dataset

In [4]:
task = "fullhos2"
if task != "":
    use_path = '../inhospital_result/{}/train'.format(task)
    name_lis = pd.read_csv("{}/listfile.csv".format(use_path), sep=',')
else:
    use_path = '../mimic3-benchmarks/data/in-hospital-mortality/train'
    name_lis = pd.read_csv("{}/listfile.csv".format(use_path), sep=',')

data = get_dataset(use_path, name_lis)

def clamp(data):
    seq_len = [len(x) for x in data]
    print(max(seq_len))
    dynamics = pd.concat(data)

    for col, x, y in limit:
        if col == "Capillary refill rate": continue
        if x==x:
            dynamics[col][dynamics[col]<x] = np.nan
        if y==y:
            dynamics[col][dynamics[col]>y] = np.nan

    data = []
    st = 0
    for i in seq_len:
        data.append(dynamics.iloc[st:st+i])
        st+=i
    return data

if task == "":
    data=clamp(data)
    
dynamics_types = ["categorical" if str(x) in cat_name else "continuous" for x in data[0].columns]
dynamics_types[5] = 'int'
d_P = StdProcessor(dynamics_types, use_pri='Hours')
d_P.fit(pd.concat(data))
dataset = transform_dataset(data, name_lis)

17737
Counter()
Hours 1 None continuous None
Diastolic blood pressure 1 0.3026073125822528 continuous 55.14953231811523
Fraction inspired oxygen 1 0.9821948132684658 continuous 0.5188183784484863
Glascow coma scale eye opening 5 0.8054537982073892 categorical Spontaneously
Glascow coma scale motor response 6 0.8085821875431202 categorical Obeys Commands
Glascow coma scale total 1 0.8857956138750039 int 6.0
Glascow coma scale verbal response 5 0.8080124453069709 categorical Confused
Glucose 1 0.8229563093759691 continuous 177.68959045410156
Heart Rate 1 0.3068284497224732 continuous 76.982177734375
all values are integer
Height 1 0.9998531133297428 int 176.0
Mean blood pressure 1 0.32732433318644666 continuous 74.05464172363281
Oxygen saturation 1 0.2772478482586511 continuous 98.96013641357422
Respiratory rate 1 0.3018721373790967 continuous 21.3247013092041
all values are integer
Systolic blood pressure 1 0.3017074462639598 int 112.0
Temperature 1 0.7641623231239679 continuous 36.7475

In [5]:
test_path = '../mimic3-benchmarks/data/in-hospital-mortality/test'
test_lis = pd.read_csv("{}/listfile.csv".format(test_path), sep=',')
testdata = get_dataset(test_path, test_lis)
testdata = clamp(testdata)
test_set = transform_dataset(testdata, test_lis)

3209
Counter({0: 21, 1: 6})
150


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dynamics[col][dynamics[col]<x] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dynamics[col][dynamics[col]>y] = np.nan


In [6]:
if task == "":
    out = './data/inhospital/train1.pkl'
else:
    out = '../inhospital_result/{}/mimic1.pkl'.format(task)
with open(out, "wb") as f:
    pickle.dump({"train_set":dataset,"dynamic_processor":d_P, "test_set":test_set}, f)

In [7]:
dynamics = pd.concat(data)
for col in dynamics:
    if col not in cat_name:
        print("{}\t{}\t{}".format(col, dynamics[col].mean(), dynamics[col].std()))

Hours	21.72055305013985	14.563245291210707
Diastolic blood pressure	61.06673692018296	11.294080646814745
Fraction inspired oxygen	0.5103383442843081	0.14712084112572896
Glascow coma scale total	11.296872259573224	3.8669039504754057
Glucose	151.9883507471631	42.643960872857654
Heart Rate	85.70490040869849	9.378434954162941
Height	171.8939393939394	7.272356647224481
Mean blood pressure	80.01414389199014	13.084530372823366
Oxygen saturation	97.76064189557185	1.8592325307765045
Respiratory rate	18.75494863295268	3.262115085316025
Systolic blood pressure	121.0598364147658	19.365231654132227
Temperature	37.00235537285953	0.38318506736242897
Weight	81.53598092719915	30.365245245318746
pH	7.321998108021118	0.1428644032565164
