In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
import os
import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler

  from .autonotebook import tqdm as notebook_tqdm


# N-CMAPSS Dataloaders

In [10]:
class NCMAPSSTrainDataset(Dataset):
    """Eager N-CMAPSS development-set dataset of fixed-length windows.

    The whole development split of one N-CMAPSS HDF5 file is loaded into
    memory, every column is min-max scaled, and one window of ``timesteps``
    consecutive rows is built per sample. Windows never cross a unit
    boundary; missing history is filled with zero rows on the left.

    NOTE: a later cell of this notebook defines another class with the same
    name. Running that cell shadows this definition.

    Attributes:
        features: column index of the concatenated feature frame.
        scaler: the fitted ``MinMaxScaler`` (kept so other splits can reuse it).
        X: float32 tensor of shape ``(n_samples, timesteps, n_features)``.
        y: float32 tensor of shape ``(n_samples, 1)`` holding the ``Y_dev`` values.
    """

    def __init__(self, ds_no, timesteps=10):
        """Load, scale and window the development split of dataset ``ds_no``.

        Args:
            ds_no: integer key understood by ``get_fileloc`` (1-9).
            timesteps: number of rows per window.
        """
        fileloc = self.get_fileloc(ds_no)
        X_train, y_train = self.load_dataframes(fileloc)

        self.features = X_train.columns

        # Every column is scaled, including the auxiliary ones. Scaling maps
        # equal values to equal values, so grouping by the (scaled) unit
        # column in make_data_seq still separates the units correctly.
        self.scaler = MinMaxScaler()
        X_scaled = pd.DataFrame(self.scaler.fit_transform(X_train), columns=self.features)
        X_seq = self.make_data_seq(X_scaled, timesteps)

        self.X = torch.as_tensor(X_seq, dtype=torch.float32)
        self.y = torch.as_tensor(y_train.to_numpy(), dtype=torch.float32)

    def __getitem__(self, index):
        """Return the ``(window, target)`` pair at ``index``."""
        return self.X[index], self.y[index]

    def __len__(self):
        """Return the number of samples (one per row of the development split)."""
        return self.X.shape[0]

    def get_fileloc(self, ds_no):
        """Map a dataset number to the path of its HDF5 file.

        Raises:
            KeyError: if ``ds_no`` is not one of the known dataset numbers.
        """
        locations = {
            1: '/data/courseac/N-CMAPSS/data_set/N-CMAPSS_DS01-005.h5',
            2: '/data/courseac/N-CMAPSS/data_set/N-CMAPSS_DS02-006.h5',
            3: '/data/courseac/N-CMAPSS/data_set/N-CMAPSS_DS03-012.h5',
            4: '/data/courseac/N-CMAPSS/data_set/N-CMAPSS_DS04.h5',
            5: '/data/courseac/N-CMAPSS/data_set/N-CMAPSS_DS05.h5',
            6: '/data/courseac/N-CMAPSS/data_set/N-CMAPSS_DS06.h5',
            7: '/data/courseac/N-CMAPSS/data_set/N-CMAPSS_DS07.h5',
            8: '/data/courseac/N-CMAPSS/data_set/N-CMAPSS_DS08a-009.h5',
            9: '/data/courseac/N-CMAPSS/data_set/N-CMAPSS_DS08c-008.h5',
        }

        return locations[ds_no]

    def load_dataframes(self, fileloc):
        """Read the development split of ``fileloc`` into two DataFrames.

        Returns:
            ``(X_train, y_train)``: ``X_train`` holds the W, X_s, X_v, T and A
            groups side by side (in that order), with column names taken from
            the matching ``*_var`` datasets; ``y_train`` has a single ``RUL``
            column built from ``Y_dev``.
        """
        groups = ('W', 'X_s', 'X_v', 'T', 'A')

        with h5py.File(fileloc, 'r') as hdf:
            data = [np.array(hdf.get(group + '_dev')) for group in groups]
            # Variable names are converted to fixed-width unicode ('U20')
            # so they can be used as DataFrame column labels.
            names = [list(np.array(hdf.get(group + '_var'), dtype='U20')) for group in groups]
            Y_dev = np.array(hdf.get('Y_dev'))

        X_train = pd.DataFrame(columns=np.concatenate(names, axis=0),
                               data=np.concatenate(data, axis=1))
        y_train = pd.DataFrame(columns=['RUL'], data=Y_dev)

        return X_train, y_train

    def make_data_seq(self, df, n_timesteps, unit_col=None):
        """Build one left-zero-padded window of ``n_timesteps`` rows per row of ``df``.

        For the row at position ``t`` inside its unit, the window holds rows
        ``t - n_timesteps + 1 .. t`` of that same unit; positions before the
        unit's first row are all-zero rows. Windows are returned in the
        original row order of ``df`` so they stay aligned with the targets.

        Args:
            df: feature frame containing a unit-identifier column.
            n_timesteps: window length (must be >= 1).
            unit_col: name of the unit-identifier column. When ``None`` the
                method uses ``'units'`` if present, otherwise ``'unit'``
                (the N-CMAPSS files appear to use ``'unit'`` - TODO confirm).

        Returns:
            ``numpy.ndarray`` of shape ``(len(df), n_timesteps, n_columns)``.

        Raises:
            ValueError: if ``n_timesteps`` is smaller than 1.
            KeyError: if no unit column can be found.
        """
        if n_timesteps < 1:
            raise ValueError('n_timesteps must be >= 1')

        if unit_col is None:
            for candidate in ('units', 'unit'):
                if candidate in df.columns:
                    unit_col = candidate
                    break
            else:
                raise KeyError("no unit column ('units' or 'unit') found in df")

        values = df.to_numpy(dtype=float)
        unit_ids = df[unit_col].to_numpy()
        seq_data = np.zeros((values.shape[0], n_timesteps, values.shape[1]))

        for unit in pd.unique(unit_ids):
            rows = np.flatnonzero(unit_ids == unit)
            unit_values = values[rows]
            n_rows = rows.shape[0]

            # Slot n_timesteps-1 is the current row, slot n_timesteps-1-lag is
            # the row `lag` steps earlier. Rows without that much history keep
            # the zero padding they were initialised with.
            for lag in range(min(n_timesteps, n_rows)):
                seq_data[rows[lag:], n_timesteps - 1 - lag] = unit_values[:n_rows - lag]

        return seq_data

In [186]:
# This version reads only the rows needed for each sample from the file, instead of loading the whole split up front

class NCMAPSSTrainDataset(Dataset):
    """Lazy N-CMAPSS development-set dataset of fixed-length windows.

    Nothing but two shapes is read at construction time. Each ``__getitem__``
    call opens the HDF5 file and reads only the rows of the requested window.
    A sample is the window of up to ``timesteps`` rows ending at ``index``;
    rows belonging to a different unit than the last row are zeroed, and
    missing history is filled with zero rows on the left.
    """

    # Feature groups concatenated (in this order) along the feature axis.
    # The unit identifier is the first column of the last group, A_dev.
    FEATURE_KEYS = ('W_dev', 'X_s_dev', 'X_v_dev', 'T_dev', 'A_dev')

    def __init__(self, ds_no, timesteps=10):
        """Record the file location and read the dataset dimensions.

        Args:
            ds_no: integer key understood by ``get_fileloc`` (1-9).
            timesteps: number of rows per window (must be >= 1).

        Raises:
            ValueError: if ``timesteps`` is smaller than 1.
        """
        if timesteps < 1:
            raise ValueError('timesteps must be >= 1')

        self.fileloc = self.get_fileloc(ds_no)
        self.timesteps = timesteps

        with h5py.File(self.fileloc, 'r') as hdf:
            self.n_samples = hdf['Y_dev'].shape[0]
            # Column of the unit id inside the concatenated feature matrix:
            # it follows every column of the groups placed before A_dev.
            self.unit_col = sum(hdf[key].shape[1] for key in self.FEATURE_KEYS[:-1])

    def __getitem__(self, index):
        """Return ``(window, target)`` for the sample ending at row ``index``.

        Returns:
            ``window`` is a ``(timesteps, n_features)`` numpy array and
            ``target`` is the ``Y_dev`` entry at ``index``.

        Raises:
            IndexError: if ``index`` is outside ``[-len(self), len(self))``.
        """
        index = int(index)
        if index < 0:
            index += self.n_samples
        if not 0 <= index < self.n_samples:
            raise IndexError('index out of range')

        start = max(index - self.timesteps + 1, 0)

        # The file is opened per call, so no open handle is stored on the dataset.
        with h5py.File(self.fileloc, 'r') as hdf:
            # A contiguous slice reads the same rows as an explicit index list.
            parts = [np.array(hdf[key][start:index + 1]) for key in self.FEATURE_KEYS]
            Y_dev = np.array(hdf['Y_dev'][index])

        X_train = np.concatenate(parts, axis=1)

        # Rows that belong to a previous unit must not leak into this window.
        unit = X_train[-1, self.unit_col]
        X_train[X_train[:, self.unit_col] != unit] = 0

        n_pad = self.timesteps - X_train.shape[0]
        X_train = np.pad(X_train, ((n_pad, 0), (0, 0)), mode='constant')

        return X_train, Y_dev

    def __len__(self):
        """Return the number of rows in the development split."""
        return self.n_samples

    def get_fileloc(self, ds_no):
        """Map a dataset number to the path of its HDF5 file.

        Raises:
            KeyError: if ``ds_no`` is not one of the known dataset numbers.
        """
        locations = {
            1: '/data/courseac/N-CMAPSS/data_set/N-CMAPSS_DS01-005.h5',
            2: '/data/courseac/N-CMAPSS/data_set/N-CMAPSS_DS02-006.h5',
            3: '/data/courseac/N-CMAPSS/data_set/N-CMAPSS_DS03-012.h5',
            4: '/data/courseac/N-CMAPSS/data_set/N-CMAPSS_DS04.h5',
            5: '/data/courseac/N-CMAPSS/data_set/N-CMAPSS_DS05.h5',
            6: '/data/courseac/N-CMAPSS/data_set/N-CMAPSS_DS06.h5',
            7: '/data/courseac/N-CMAPSS/data_set/N-CMAPSS_DS07.h5',
            8: '/data/courseac/N-CMAPSS/data_set/N-CMAPSS_DS08a-009.h5',
            9: '/data/courseac/N-CMAPSS/data_set/N-CMAPSS_DS08c-008.h5',
        }

        return locations[ds_no]

In [187]:
# Dataset 1 with 10-step windows, served in order in batches of 32.
traindata = NCMAPSSTrainDataset(ds_no=1, timesteps=10)
trainloader = DataLoader(dataset=traindata, shuffle=False, batch_size=32)

In [188]:
# First (batch_index, batch) pair of the loader; the index of the first batch is 0.
example = (0, next(iter(trainloader)))

In [189]:
# example is (batch_index, batch); the batch holds the windows first, then the targets.
ex_X, ex_y = example[1][0], example[1][1]

In [190]:
# Dimensions of the batched windows.
ex_X.size()

torch.Size([32, 10, 46])

In [191]:
# Dimensions of the batched targets.
ex_y.size()

torch.Size([32, 1])

In [192]:
# Show the Y_dev targets of the first batch.
ex_y

tensor([[99],
        [99],
        [99],
        [99],
        [99],
        [99],
        [99],
        [99],
        [99],
        [99],
        [99],
        [99],
        [99],
        [99],
        [99],
        [99],
        [99],
        [99],
        [99],
        [99],
        [99],
        [99],
        [99],
        [99],
        [99],
        [99],
        [99],
        [99],
        [99],
        [99],
        [99],
        [99]])