In [1]:
import os
import numpy as np
import pandas as pd
import h5py
import matplotlib.pyplot as plt
plt.style.use('default')
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

## Train set

In [2]:
## Per the competition FAQ, every file is used except
## N-CMAPSS_DS02-006 and N-CMAPSS_DS08d-010.
filenames = [
    'N-CMAPSS_DS01-005.h5',
    'N-CMAPSS_DS03-012.h5',
    'N-CMAPSS_DS04.h5',
    'N-CMAPSS_DS05.h5',
    'N-CMAPSS_DS06.h5',
    'N-CMAPSS_DS07.h5',
    'N-CMAPSS_DS08a-009.h5',
    'N-CMAPSS_DS08c-008.h5',
]

In [3]:
# Column names, in array-column order, for the three arrays read from the files:
# auxiliary data (A), sensor values (X_s) and flight settings (W).
A_var = 'unit cycle Fc hs'.split()
X_s_var = [
    'T24', 'T30', 'T48', 'T50',
    'P15', 'P2', 'P21', 'P24', 'Ps30', 'P40', 'P50',
    'Nf', 'Nc', 'Wf',
]
W_var = 'alt Mach TRA T2'.split()

In [4]:
# Offset added to the 'unit' column of each file (one entry per item of
# `filenames`, same order) so unit ids do not collide in the combined frame.
# NOTE(review): presumably every file numbers its units from 1 -- confirm.
unit_numbers = [0,10,25,35,45,55,65,80]
if len(unit_numbers) != len(filenames):
    raise ValueError('unit_numbers must provide exactly one offset per entry of filenames')

# Collect one frame per file and concatenate once at the end; concatenating
# inside the loop re-copies everything loaded so far on every iteration.
frames = []

for filename, unit_offset in zip(filenames, unit_numbers):

    print('Loading file', filename)

    with h5py.File(filename, 'r') as hdf:
        # Every array is stored as a development ('_dev') and a "test"
        # ('_test') split; both are used here, stacked row-wise (dev first).
        # hdf[key] raises KeyError when a dataset is missing, whereas
        # np.array(hdf.get(key)) would silently produce a 0-d object array.
        W, X_s, Y, A = (
            np.concatenate((hdf[name + '_dev'][()], hdf[name + '_test'][()]), axis=0)
            for name in ('W', 'X_s', 'Y', 'A')
        )

    # Auxiliary columns first, then sensor values, then flight settings.
    df = pd.DataFrame(data=A, columns=A_var)
    for i, varname in enumerate(X_s_var):
        df[varname] = X_s[:, i]
    for i, varname in enumerate(W_var):
        df[varname] = W[:, i]

    df['unit'] += unit_offset
    df['RUL'] = Y

    frames.append(df)

# The per-file row labels are kept (no ignore_index), so index labels repeat
# across files exactly as they did with the incremental concatenation.
train_df = pd.concat(frames, axis=0) if frames else pd.DataFrame()


Loading file N-CMAPSS_DS01-005.h5
Loading file N-CMAPSS_DS03-012.h5
Loading file N-CMAPSS_DS04.h5
Loading file N-CMAPSS_DS05.h5
Loading file N-CMAPSS_DS06.h5
Loading file N-CMAPSS_DS07.h5
Loading file N-CMAPSS_DS08a-009.h5
Loading file N-CMAPSS_DS08c-008.h5


In [5]:
# Show the distinct unit ids present in the combined training frame.
train_df['unit'].unique()

array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13.,
       14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
       27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39.,
       40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., 51., 52.,
       53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
       66., 67., 68., 69., 70., 71., 72., 73., 74., 75., 76., 77., 78.,
       79., 80., 81., 82., 83., 84., 85., 86., 87., 88., 89., 90.])

In [6]:
## This mainly reduces file size by changing sensor values from float64 to float16 -> significant reduction in size.
def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` to smaller dtypes to reduce memory usage.

    The frame is modified in place (columns are reassigned) and also returned.
    Memory usage before and after, and the relative decrease, are printed.

    Per column:
      * object columns are converted to ``category``;
      * signed / unsigned NumPy integer columns are cast to the smallest
        integer type of the same signedness whose range contains the column's
        min and max (bounds inclusive);
      * NumPy float columns are cast to the smallest float type whose range
        strictly contains the column's min and max, falling back to float64;
      * every other dtype (bool, datetime, category, extension dtypes) is left
        untouched.

    Only the value *range* is checked for floats, not precision: float16
    carries roughly three significant decimal digits, so the cast is lossy.
    Pass ``use_float16=False`` to make float32 the smallest float type.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; mutated in place.
    use_float16 : bool, default True
        Whether float16 is an allowed target dtype for float columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate dtypes, smallest first.
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns are downcast. Deciding by the
        # dtype kind (rather than the first letters of its name) keeps bool and
        # unsigned columns from being turned into floats, and skips datetime /
        # category columns whose min/max cannot be compared with numeric limits.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No match (e.g. an empty column, whose min/max are NaN): keep dtype.
        else:
            for candidate in float_types:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of range for the smaller types, infinite, or all-NaN.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [9]:
# Downcast the training frame's columns to smaller dtypes; reduce_mem_usage
# mutates the frame in place, returns it, and prints the before/after memory usage.
train_df = reduce_mem_usage(train_df)

Memory usage of dataframe is 11605.79 MB
Memory usage after optimization is: 3203.68 MB
Decreased by 72.4%


In [10]:
## Downsample by factor 100 (!!): keep every 100th row of each unit, counted
## from that unit's first row.
downsample_factor = 100

unit_values = train_df['unit'].to_numpy()
keep = np.zeros(len(train_df), dtype=bool)
for unit in train_df.unit.unique():
    # Row positions (not index labels) of this unit, in their current order.
    unit_rows = np.flatnonzero(unit_values == unit)
    keep[unit_rows[::downsample_factor]] = True

# Selecting with a positional mask leaves every column's dtype untouched.
# Overwriting the unwanted rows with NaN and dropping them afterwards would
# upcast integer columns to float and rewrite the whole frame once per unit.
# dropna() is kept so rows that already contain missing values are still removed.
train_df = train_df[keep].dropna()

## Test set

In [11]:
filename = 'Data_Challenge_2021_N-CMAPSS_DS_Validation_f.h5'
with h5py.File(filename, 'r') as hdf:
    # Validation arrays: flight settings (W), sensor values (X_s), auxiliary data (A)
    W_val, X_s_val, A_val = (
        np.array(hdf.get(key)) for key in ('W_val', 'X_s_val', 'A_val')
    )

# Start the frame from the auxiliary array ...
test_df = pd.DataFrame(data=A_val, columns=A_var)

# ... then append the sensor columns followed by the flight-setting columns,
# one array column per name.
for names, values in ((X_s_var, X_s_val), (W_var, W_val)):
    for i, varname in enumerate(names):
        test_df[varname] = values[:, i]

In [12]:
# Downcast the test frame's columns to smaller dtypes; reduce_mem_usage
# mutates the frame in place, returns it, and prints the before/after memory usage.
test_df = reduce_mem_usage(test_df)

Memory usage of dataframe is 2807.42 MB
Memory usage after optimization is: 701.85 MB
Decreased by 75.0%


In [13]:
# Downsample by factor 100: keep every 100th row of each unit, counted from
# that unit's first row.
downsample_factor = 100

unit_values = test_df['unit'].to_numpy()
keep = np.zeros(len(test_df), dtype=bool)
for unit in test_df.unit.unique():
    # Row positions (not index labels) of this unit, in their current order.
    unit_rows = np.flatnonzero(unit_values == unit)
    keep[unit_rows[::downsample_factor]] = True

# Selecting with a positional mask leaves every column's dtype untouched and
# avoids rewriting the whole frame once per unit. dropna() is kept so rows
# that already contain missing values are still removed.
test_df = test_df[keep].dropna()

## Save

In [14]:
# Write both processed frames as pickle files; the relative paths resolve
# against the current working directory.
train_df.to_pickle('train_df.pkl')
test_df.to_pickle('test_df.pkl')