In [1]:
import pandas as pd
import numpy as np
import pickle
%matplotlib inline
import matplotlib.pyplot as plt
from collections import Counter
import sys
sys.path.append("general")

orders = ['Hours', 'Capillary refill rate', 'Diastolic blood pressure',
       'Fraction inspired oxygen', 'Glascow coma scale eye opening',
       'Glascow coma scale motor response', 'Glascow coma scale total',
       'Glascow coma scale verbal response', 'Glucose', 'Heart Rate', 'Height',
       'Mean blood pressure', 'Oxygen saturation', 'Respiratory rate',
       'Systolic blood pressure', 'Temperature', 'Weight', 'pH']

count = pd.read_csv('../mimic3-benchmarks/mimic3benchmark/resources/variable_ranges.csv', sep=',')
limit = []
cat_name = ["Glascow coma scale eye opening",
"Glascow coma scale motor response",
#"Glascow coma scale total",
"Glascow coma scale verbal response"]
for col in orders:
    if len(count[(count['LEVEL2']==col)])==1 and col not in cat_name:
        x = float(count[(count['LEVEL2']==col)]['OUTLIER LOW'])
        y = float(count[(count['LEVEL2']==col)]['OUTLIER HIGH'])
        limit.append((col, x, y))
        print(col, x, y)

refine_eye = {
     "None": "None",
     "1 No Response": "1 No Response",
     "2 To pain": "To Pain", 
     "To Pain": "To Pain",
     "3 To speech": "To Speech", 
     "To Speech": "To Speech",
     "4 Spontaneously": "Spontaneously",
     "Spontaneously": "Spontaneously"
}
refine_motor = {
    "1 No Response": "No response",
    "No response": "No response",
    "2 Abnorm extensn": "Abnormal extension",
    "Abnormal extension": "Abnormal extension",
    "3 Abnorm flexion": "Abnormal Flexion",
    "Abnormal Flexion": "Abnormal Flexion",
    "4 Flex-withdraws": "Flex-withdraws",
    "Flex-withdraws": "Flex-withdraws",
    "5 Localizes Pain": "Localizes Pain",
    "Localizes Pain": "Localizes Pain",
    "6 Obeys Commands": "Obeys Commands",
    "Obeys Commands": "Obeys Commands"
}
refine_verb = {
    "No Response-ETT": "No Response",
    "No Response": "No Response",
    "1 No Response": "No Response",
    "1.0 ET/Trach": "No Response",
    "2 Incomp sounds": "Incomprehensible sounds",
    "Incomprehensible sounds": "Incomprehensible sounds",
    "3 Inapprop words": "Inappropriate Words",
    "Inappropriate Words": "Inappropriate Words",
    "4 Confused": "Confused",
    "Confused": "Confused",
    "5 Oriented": "Oriented",
    "Oriented": "Oriented"
}

Capillary refill rate 0.0 1.0
Diastolic blood pressure 0.0 375.0
Fraction inspired oxygen 0.2 1.1
Glascow coma scale total 3.0 15.0
Glucose 0.0 2200.0
Height 0.0 275.0
Mean blood pressure 0.0 375.0
Oxygen saturation 0.0 150.0
Respiratory rate 0.0 330.0
Systolic blood pressure 0.0 375.0
Temperature 14.2 47.0
Weight 0.0 250.0
pH 6.3 10.0


In [2]:
def get_dataset(use_path, name_lis):
    data = []
    h = Counter()
    w = Counter()
    l = Counter()
    height = []
    drop_lis=[]
    for i,(fi,label) in enumerate(zip(name_lis["stay"],name_lis["y_true"])):
        df = pd.read_csv("{}/{}".format(use_path, fi), sep=',')
        if len(df)>150:
            l[label]+=1
            drop_lis.append(i)
            continue
            
        h[df["Height"].count()] += 1
        w[df["Weight"].count()] += 1
        """ 
        y = float("nan")
        for x in df["Height"].unique():
            if x==x:
                y = x
                break
        height.append(y)
        """
        if not df["Glascow coma scale eye opening"].isna().all():
            df["Glascow coma scale eye opening"].replace(refine_eye, inplace=True)
        if not df["Glascow coma scale motor response"].isna().all():
            df["Glascow coma scale motor response"].replace(refine_motor, inplace=True)
        if not df["Glascow coma scale verbal response"].isna().all():
            df["Glascow coma scale verbal response"].replace(refine_verb, inplace=True)
        #df.drop(columns = ["Height"], inplace=True)
        data.append(df)
        
    name_lis.drop(labels=drop_lis, inplace=True)
    #name_lis["Height"] = np.array(height, dtype=object)
    print(len(name_lis))
    print(l)
    return data


In [3]:
raw = pickle.load(open("./data/inhospital/fullhos.pkl", "rb"))
d_P = raw["dynamic_processor"] 

def transform_dataset(data, name_lis):
    y = name_lis["y_true"].values.reshape(-1, 1)
    d_lis=[d_P.transform(ds) for ds in data]
    seq_len = [len(ds) for ds in data]
    d = [x[0].tolist() for x in d_lis]
    lag = [x[1].tolist() for x in d_lis]
    mask = [x[2].tolist() for x in d_lis]
    times = [x[-1].tolist() for x in d_lis]
    priv = [x[3].tolist() for x in d_lis]
    nex = [x[4].tolist() for x in d_lis]

    from fastNLP import DataSet
    dataset = DataSet({"seq_len": seq_len, 
                       "dyn": d, "lag":lag, "mask": mask,
                       "label": y, "times":times, "priv":priv, "nex": nex
                      })
    dataset.set_input("dyn", "mask", "label", "times", "lag", "seq_len","priv", "nex")
    return dataset

def clamp(data):
    seq_len = [len(x) for x in data]
    print(max(seq_len))
    dynamics = pd.concat(data)

    for col, x, y in limit:
        if col not in dynamics.columns:
            print("Not", col)
            continue
        """
        if col == "Height":
            name_lis[col][name_lis[col]<x] = x
            name_lis[col][name_lis[col]>y] = y
            continue
        """
        if x==x:
            dynamics[col][dynamics[col]<x] = x
        if y==y:
            dynamics[col][dynamics[col]>y] = y

    data = []
    st = 0
    for i in seq_len:
        data.append(dynamics.iloc[st:st+i])
        st+=i
    return data

In [4]:
test_path = '../mimic3-benchmarks/data/in-hospital-mortality/test'
test_lis = pd.read_csv("{}/listfile.csv".format(test_path), sep=',')
testdata = get_dataset(test_path, test_lis)
testdata = clamp(testdata)
test_set = transform_dataset(testdata, test_lis)
with open('./data/inhospital/test_clamp.pkl', "wb") as f:
    pickle.dump(test_set, f)

3209
Counter({0: 21, 1: 6})
150


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dynamics[col][dynamics[col]<x] = x
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dynamics[col][dynamics[col]>y] = y


In [5]:
train_path = '../mimic3-benchmarks/data/in-hospital-mortality/train'
train_lis = pd.read_csv("{}/listfile.csv".format(train_path), sep=',')
traindata = get_dataset(train_path, train_lis)
traindata = clamp(traindata)
train_set = transform_dataset(traindata, train_lis)
with open('./data/inhospital/train_clamp.pkl', "wb") as f:
    pickle.dump(train_set, f)

17737
Counter({0: 112, 1: 54})
150


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dynamics[col][dynamics[col]<x] = x
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dynamics[col][dynamics[col]>y] = y


In [6]:
task = ""
if task != "":
    use_path = '../inhospital_result/{}/train'.format(task)
    name_lis = pd.read_csv("{}/listfile.csv".format(use_path), sep=',')
    data = get_dataset(use_path, name_lis)
    dataset = transform_dataset(data, name_lis)
    with open('../inhospital_result/{}/mimic.pkl'.format(task), "wb") as f:
        pickle.dump(dataset, f)

In [7]:
raw_test = pd.concat(testdata)
for x in raw_test.columns:
    if x not in cat_name:
        print(x, raw_test[x].mean(), raw_test[x].std(), raw_test[x].min(), raw_test[x].max())
raw_train = pd.concat(traindata)
print()
for x in raw_test.columns:
    if x not in cat_name:
        print(x, raw_train[x].mean(), raw_train[x].std(), raw_train[x].min(), raw_train[x].max())

Hours 21.781130644740283 14.252372650561108 0.0 48.0
Capillary refill rate 0.14 0.34721858772760034 0.0 1.0
Diastolic blood pressure 60.18739280568959 14.607196681321296 0.0 375.0
Fraction inspired oxygen 0.5362834117308325 0.19448746857845922 0.2 1.0
Glascow coma scale total 11.66776108976598 3.8787003910854994 3.0 15.0
Glucose 144.60115940430225 68.90660649922293 0.0 1350.0
Heart Rate 87.00516834205564 18.247449997703093 0.0 224.0
Height 169.42019543973942 13.114478355589632 0.0 275.0
Mean blood pressure 78.13228708935897 16.299776179985543 0.0 375.0
Oxygen saturation 96.77808472463715 4.853223815412985 0.0 145.0
Respiratory rate 19.114454560752954 6.3456206300288 0.0 330.0
Systolic blood pressure 119.30580843866426 22.883304530202164 0.0 290.0
Temperature 37.00945097829877 0.8945723127786097 14.2 47.0
Weight 82.4782016974383 23.35586091730419 0.0 250.0
pH 7.313906697404685 0.2612172071373977 6.3 10.0

Hours 21.80471646142322 14.230958100593101 0.0 48.0
Capillary refill rate 0.119259