# Nazwy kolumn w ramce danych zapisanej w pliku *X48.npy*

In [1]:
import numpy as np
import pandas as pd
import pickle

In [2]:
features = np.load("features", allow_pickle = True)
labels = np.load("labels", allow_pickle = True)

Jedna z funkcji do preprocessingu w artykule:

`
def preprocess(features, labels, demographics):
    '''pre: features and labels
    post: X = [[x1, ... xT]_1, ...], y= [(mort, readm, los, dx)] '''
    from sklearn.preprocessing import MinMaxScaler
    subj = list(set(labels.keys()))   
    hadm = list(set(features.keys()))
    col_dict = dict ([(v,k) for k,v in enumerate(features[hadm[0]][list(features[hadm[0]].keys())[0]].keys())])
    cols = sorted(col_dict.keys())
    items = []
    for i in progressbar.progressbar(range(len( subj ) ) ):
        s = subj[i]
        h = labels[s]['hadm_id']
        if h in hadm:
            x = np.zeros((len(features[h].keys()), len(col_dict)))
            for index in range(len(sorted(features[h].keys()))):
                t = sorted(features[h].keys())[index]
                x[index, [col_dict[k] for k in cols]] = [features[h][t][k] for k in cols]
            mort = labels[s]['mort']
            los = list(one_hot([labels[s]['los_bin']], 9)[0])
            readmit = labels[s]['readmit']
            dx = labels[s]['dx']
            y = (mort, readmit, los, dx)
            z = demographics[s]
            #auxiliary features
            x48 = np.concatenate((np.min(x, axis=0), np.max(x, axis=0), np.mean(x,axis=0), np.std(x,axis=0)),axis=-1)            
            sentence = labels[s]['dx_lst']
            items.append((x, y, z, x48, sentence))
    X, y, Z, X48, sentences = zip(*items)
    X, y, Z, X48, sentences = np.array(list(X)), list(y), np.array(list(Z)), np.array(list(X48)), list(sentences)
    #normalize each feature to [0,1]
    words = [[] for i in range(len(X))]
    for i in range(len(X[0,0,:])):
        #add to visit words
        mean, std, minimum, maximum = np.mean(X[:,:,i]), np.std(X[:,:,i]), np.min(X[:,:,i],axis=1), np.max(X[:,:,i], axis=1) 
        arr_min, arr_max = minimum < (mean - std), maximum > (mean + std)
        for j in range(len(arr_min)):
            if arr_min[j]: words[j].append(str(i) + '_low')
            if arr_max[j]: words[j].append(str(i) + '_high')
        #scale X
        scaler = MinMaxScaler()
        x_row = scaler.fit_transform(X[:,:,i])
        X[:,:,i] = x_row
    #transform X48
    scaler = MinMaxScaler()
    X48 = scaler.fit_transform(X48)
    return X, y, Z, X48, sentences, words `
    

### Odzyskanie nazw kolumn:

In [3]:
subj = list(set(labels.keys()))   
hadm = list(set(features.keys()))
col_dict = dict ([(v,k) for k,v in enumerate(features[hadm[0]][list(features[hadm[0]].keys())[0]].keys())])
cols = sorted(col_dict.keys())

print(cols)

['albumin', 'bicarbonate', 'bun', 'calcium', 'creatinine', 'diasbp', 'glucose', 'heartrate', 'inr', 'lactate', 'pco2', 'ph', 'platelet', 'potassium', 'resprate', 'sodium', 'spo2', 'sysbp', 'tempc']


Spójrzmy jednak na linijkę:

` x48 = np.concatenate((np.min(x, axis=0), np.max(x, axis=0), np.mean(x,axis=0), np.std(x,axis=0)),axis=-1) `

Oznacza to, że dla każdej z wartości policzono wartość minimalną, maksymalną, średnią oraz odchylenie standardowe. Liczba kolumn wynosi:

In [4]:
n_cols = len(cols)
print(f"Liczba kolumn wynosi: {n_cols * 4}")

Liczba kolumn wynosi: 76


In [5]:
X48 = np.load("X48.npy")
print(f"Liczba kolumn w ramce X48 wynosi: {X48.shape[1]}")

Liczba kolumn w ramce X48 wynosi: 76


A zatem są to wszystkie kolumny.

In [6]:
column_names = []

for col in cols:
    column_names.append(col + "_min")
    column_names.append(col + "_max")
    column_names.append(col + "_mean")
    column_names.append(col + "_std")

In [9]:
print(column_names)

['albumin_min', 'albumin_max', 'albumin_mean', 'albumin_std', 'bicarbonate_min', 'bicarbonate_max', 'bicarbonate_mean', 'bicarbonate_std', 'bun_min', 'bun_max', 'bun_mean', 'bun_std', 'calcium_min', 'calcium_max', 'calcium_mean', 'calcium_std', 'creatinine_min', 'creatinine_max', 'creatinine_mean', 'creatinine_std', 'diasbp_min', 'diasbp_max', 'diasbp_mean', 'diasbp_std', 'glucose_min', 'glucose_max', 'glucose_mean', 'glucose_std', 'heartrate_min', 'heartrate_max', 'heartrate_mean', 'heartrate_std', 'inr_min', 'inr_max', 'inr_mean', 'inr_std', 'lactate_min', 'lactate_max', 'lactate_mean', 'lactate_std', 'pco2_min', 'pco2_max', 'pco2_mean', 'pco2_std', 'ph_min', 'ph_max', 'ph_mean', 'ph_std', 'platelet_min', 'platelet_max', 'platelet_mean', 'platelet_std', 'potassium_min', 'potassium_max', 'potassium_mean', 'potassium_std', 'resprate_min', 'resprate_max', 'resprate_mean', 'resprate_std', 'sodium_min', 'sodium_max', 'sodium_mean', 'sodium_std', 'spo2_min', 'spo2_max', 'spo2_mean', 'spo2_

In [7]:
with open('column_names.npy', 'wb') as f:
    pickle.dump(column_names, f)