In [1]:
import os
import numpy as np
import pandas as pd

# --- Notebook-wide settings ---
seed = 124        # value fed to NumPy's global random generator below
test_size = 0.2   # not used in the cells shown here; presumably a hold-out fraction -- TODO confirm

# Show up to 14 digits when NumPy prints floating point values.
np.set_printoptions(precision=14)
# Seed the global NumPy RNG so random draws are repeatable from a fresh kernel.
np.random.seed(seed)

In [2]:
def get_data(data_dir="./data/"):
    """Load the training features, training labels and test features from CSV.

    Parameters
    ----------
    data_dir : str or path-like, optional
        Directory that contains ``train_features.csv``, ``train_labels.csv``
        and ``test_features.csv``. A trailing path separator is optional.
        Defaults to ``"./data/"``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_features, train_labels, test_features)``, each read with
        ``pd.read_csv`` using its default options.

    Raises
    ------
    FileNotFoundError
        Raised by ``pd.read_csv`` when one of the three files is missing.
    """
    file_names = ("train_features.csv", "train_labels.csv", "test_features.csv")

    # os.path.join copes with a directory given with or without a trailing
    # separator, which plain string concatenation does not.
    train_features, train_labels, test_features = (
        pd.read_csv(os.path.join(data_dir, file_name)) for file_name in file_names
    )

    return train_features, train_labels, test_features

In [3]:
# Read the three CSV tables once; the later cells work on these in-memory frames.
train_features, train_labels, test_features = get_data()

In [4]:
# Quick look at the raw training features: first rows, the column index,
# then the info() summary.
print(train_features.head())
print(train_features.columns)
train_features.info()

   pid  Time   Age  EtCO2  PTT   BUN  Lactate  Temp  Hgb  HCO3  ...  \
0    1     3  34.0    NaN  NaN  12.0      NaN  36.0  8.7  24.0  ...   
1    1     4  34.0    NaN  NaN   NaN      NaN  36.0  NaN   NaN  ...   
2    1     5  34.0    NaN  NaN   NaN      NaN  36.0  NaN   NaN  ...   
3    1     6  34.0    NaN  NaN   NaN      NaN  37.0  NaN   NaN  ...   
4    1     7  34.0    NaN  NaN   NaN      NaN   NaN  NaN   NaN  ...   

   Alkalinephos   SpO2  Bilirubin_direct  Chloride   Hct  Heartrate  \
0           NaN  100.0               NaN     114.0  24.6       94.0   
1           NaN  100.0               NaN       NaN   NaN       99.0   
2           NaN  100.0               NaN       NaN   NaN       92.0   
3           NaN  100.0               NaN       NaN   NaN       88.0   
4           NaN  100.0               NaN       NaN  22.4       81.0   

   Bilirubin_total  TroponinI   ABPs    pH  
0              NaN        NaN  142.0  7.33  
1              NaN        NaN  125.0  7.33  
2          

In [5]:
from tqdm import tqdm
from sklearn.impute import SimpleImputer
from sklearn import preprocessing

def reshape(train_features):
    """Flatten the per-patient time series into one wide row per ``pid``.

    Rows are grouped by ``pid``. Inside each group, missing values are replaced
    by that group's column mean; a column that is entirely missing for the
    group stays NaN. Every measurement column (every column other than
    ``pid``, ``Age`` and ``Time``) is then emitted once per row under the name
    ``"<column>_<offset>"``, where ``offset`` is the row's ``Time`` minus the
    smallest ``Time`` of the group, truncated to an integer.

    Parameters
    ----------
    train_features : pandas.DataFrame
        Long-format frame of numeric columns that includes ``pid``, ``Age``
        and ``Time``. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per ``pid`` in group-iteration order, with a fresh ``0..n-1``
        index. Columns appear in order of first occurrence: the suffixed
        measurement columns row by row, followed by ``pid`` and ``Age``.
        All values are floats; ``Age`` is read from the last row of the
        (mean-filled) group. An input without rows yields an empty frame.

    Raises
    ------
    ValueError
        If two rows of the same ``pid`` map to the same time offset, because
        their suffixed column names would collide.
    """
    key_columns = ("pid", "Age", "Time")
    measurement_columns = [
        column for column in train_features.columns if column not in key_columns
    ]

    records = []
    for pid, group in tqdm(train_features.groupby("pid")):
        # Fill into a new frame instead of mutating the groupby chunk in place.
        filled = group.fillna(group.mean())

        offsets = (filled["Time"] - group["Time"].min()).astype(int).tolist()
        if len(set(offsets)) != len(offsets):
            raise ValueError(
                f"pid {pid!r} has several rows with the same time offset; "
                "cannot build unique column names"
            )

        # One plain dict per patient is far cheaper than per-row Series
        # drop/add_suffix calls followed by a pd.concat for every pid.
        values = filled[measurement_columns].to_numpy(dtype=float)
        record = {}
        for offset, row_values in zip(offsets, values):
            for column, value in zip(measurement_columns, row_values):
                record[f"{column}_{offset}"] = value

        # pid is stored as a float so the column dtype matches the
        # measurement columns, exactly as the concat-based version produced.
        record["pid"] = float(pid)
        record["Age"] = float(filled["Age"].iloc[-1])
        records.append(record)

    # Building the frame once keeps columns in first-seen order and fills
    # offsets a patient does not have with NaN.
    return pd.DataFrame(records)


In [6]:

# Lift pandas' column display limit so wide frames print every column.
pd.set_option("display.max_columns", None)

# Flatten both splits to one row per pid.
preprocessed_train_features = reshape(train_features)
preprocessed_test_features = reshape(test_features)

# Write both wide tables to CSV without the positional index.
for frame, csv_path in (
    (preprocessed_train_features, "./data/preprocessed_train_features.csv"),
    (preprocessed_test_features, "./data/preprocessed_test_features.csv"),
):
    frame.to_csv(csv_path, index=False)

# Preview the first rows of the training table with columns sorted by label.
print(preprocessed_train_features.sort_index(axis=1).head())

100%|██████████| 18995/18995 [10:05<00:00, 31.39it/s]
100%|██████████| 12664/12664 [5:01:28<00:00,  1.43s/it]      


       ABPd_0  ABPd_1  ABPd_10  ABPd_11  ABPd_2  ABPd_3  ABPd_4  ABPd_5  \
0   61.000000    62.5     49.0     39.0    59.0    49.5    48.0    51.0   
1   70.090909    77.0     71.0     76.0    58.0    66.0    79.0    62.0   
2   49.600000    50.0     44.0     44.0    52.0    49.6    70.0    48.0   
3   46.000000    45.0     50.0     51.0    42.0    50.5    55.5    47.5   
4  108.954545   107.5    142.0    118.0   100.0    98.0   111.0   100.0   

   ABPd_6  ABPd_7  ABPd_8  ABPd_9      ABPm_0  ABPm_1  ABPm_10  ABPm_11  \
0    46.0    47.0    47.0    44.0   84.000000    81.0     69.0     53.0   
1    72.0    72.0    65.0    73.0   94.636364   106.0     94.0     99.0   
2    56.0    50.0    42.0    40.0   80.909091    74.0     60.0     62.0   
3    50.5    51.0    49.0    48.0   61.000000    60.0     68.0     67.0   
4   115.0   100.5   116.0    90.5  143.900000   138.0    169.0    154.0   

   ABPm_2  ABPm_3  ABPm_4  ABPm_5  ABPm_6  ABPm_7  ABPm_8  ABPm_9      ABPs_0  \
0    74.0    66.0