# Data Preparation for Engine Predictive Maintenance Examples

WARNING: The parameter names used here are taken from a MATLAB predictive maintenance webinar and have not been cross-checked against another source (the NASA data source only labels them "sensor measurement x").

WARNING: The four datasets provided are not homogeneous. This demo was developed using only the FD001 data (train_FD001.txt and test_FD001.txt).

In [1]:
import os
import requests
import zipfile
import pandas as pd

In [2]:
# Labels applied, in order, to the columns of the raw engine tables.
# NOTE(review): the sensor names are unofficial (see the warning at the top of
# this notebook) -- confirm before relying on their physical meaning.
COLUMNS = [
    # Identification and operating point
    'Unit', 'Cycle',
    'OpSet1', 'OpSet2', 'OpSet3',
    # Temperatures
    'FanInletTemp', 'LPCOutletTemp', 'HPCOutletTemp', 'LPTOutletTemp',
    # Pressures
    'FanInletPres', 'BypassDuctPres', 'TotalHPCOutletPres',
    # Speeds and ratios
    'PhysFanSpeed', 'PhysCoreSpeed', 'EnginePresRatio',
    'StaticHPCOutletPres', 'FuelFlowRatio',
    'CorrFanSpeed', 'CorrCoreSpeed',
    'BypassRatio', 'BurnerFuelAirRatio',
    'BleedEnthalpy', 'DemandFanSpeed',
    'DemandCorrFanSpeed', 'HPTCoolantBleed', 'LPTCoolantBleed',
]

# Same labels followed by the Remaining Useful Life target column.
COLUMNS_W_RUL = [*COLUMNS, 'RUL']

# File-name suffixes of the datasets to process.
SUFFIXES = ['FD001.txt']  # Select only 1 dataset
#SUFFIXES = ['FD001.txt', 'FD002.txt', 'FD003.txt', 'FD004.txt'] # Select full data (not recommended)

# Directory layout: the downloaded archive and its extracted contents live in
# original_dir, the prepared CSV files in dataset_dir.
data_root = 'data/'
original_dir = f'{data_root}original/'
dataset_dir = f'{data_root}dataset/'

# Archive name, its local path, and where it is downloaded from.
filename = 'CMAPSSData.zip'
original_file = f'{original_dir}{filename}'
url = 'http://ti.arc.nasa.gov/c/6'

# Make sure the download directory exists. makedirs also creates the parent
# data_root directory and is a no-op when the directories are already there.
os.makedirs(original_dir, exist_ok=True)

# Download the archive only once; later runs reuse the local copy.
if not os.path.exists(original_file):
    print('Downloading data...')
    # The timeout keeps the cell from hanging forever on a dead connection.
    r = requests.get(url, allow_redirects=True, timeout=60)
    # Fail here on an HTTP error instead of saving the error page as the zip,
    # which would then be mistaken for a valid download on the next run.
    r.raise_for_status()
    with open(original_file, 'wb') as f:
        f.write(r.content)
    print('Done.')
else:
    print('Found original data file.')

Found original data file.


In [3]:
# Unpack the downloaded archive into the directory that holds it.
# Only failures that mean "the archive is missing or unreadable" are reported
# as a bad file; anything else (KeyboardInterrupt, programming errors)
# propagates instead of being silently swallowed.
try:
    with zipfile.ZipFile(original_file) as z:
        z.extractall(original_dir)
except (zipfile.BadZipFile, OSError) as err:
    print('Bad file')
    # Show the underlying reason so the failure can be diagnosed.
    print(err)
else:
    print('Extracted all files')

Extracted all files


In [4]:
# Load the FD001 training table exactly as stored (space-delimited, no header
# row) and display it. The last two columns appear empty in the preview; they
# are discarded later by format_engine_data.
data = pd.read_csv(f'{original_dir}train_FD001.txt', sep=' ', header=None)
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.70,1400.60,14.62,...,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.4190,,
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,8131.49,8.4318,0.03,392,2388,100.0,39.00,23.4236,,
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.20,14.62,...,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,,
3,1,4,0.0007,0.0000,100.0,518.67,642.35,1582.79,1401.87,14.62,...,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,,
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,8133.80,8.4294,0.03,393,2388,100.0,38.90,23.4044,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20626,100,196,-0.0004,-0.0003,100.0,518.67,643.49,1597.98,1428.63,14.62,...,8137.60,8.4956,0.03,397,2388,100.0,38.49,22.9735,,
20627,100,197,-0.0016,-0.0005,100.0,518.67,643.54,1604.50,1433.58,14.62,...,8136.50,8.5139,0.03,395,2388,100.0,38.30,23.1594,,
20628,100,198,0.0004,0.0000,100.0,518.67,643.42,1602.46,1428.18,14.62,...,8141.05,8.5646,0.03,398,2388,100.0,38.44,22.9333,,
20629,100,199,-0.0011,0.0003,100.0,518.67,643.23,1605.26,1426.53,14.62,...,8139.29,8.5389,0.03,395,2388,100.0,38.29,23.0640,,


In [5]:
def format_engine_data(df, columns=None):
    """Label a raw engine table and add a Remaining Useful Life column.

    The input frame is not modified; a new frame is returned.

    Args:
        df: Raw table as read from a train/test file with ``header=None``.
            Columns that hold no values at all are discarded before the
            labels are applied.
        columns: Labels for the remaining columns, in order. Defaults to the
            module-level ``COLUMNS``. Must contain 'Unit' and 'Cycle'.

    Returns:
        A new DataFrame with the labelled columns followed by 'RUL', where
        RUL is the highest cycle number recorded for the row's unit minus the
        row's own cycle number. Row order is kept and the index is a fresh
        0..n-1 range.

    Raises:
        ValueError: If the number of labels differs from the number of
            non-empty columns.
    """
    if columns is None:
        columns = COLUMNS
    columns = list(columns)

    # Drop only columns that are entirely empty. Using how='any' here would
    # also throw away a real sensor column as soon as it has one missing
    # value. A zero-row frame is copied as-is because dropna(how='all') would
    # consider every one of its columns empty.
    if len(df):
        cleaned = df.dropna(axis=1, how='all')
    else:
        cleaned = df.copy()

    if cleaned.shape[1] != len(columns):
        raise ValueError(
            'Expected %d non-empty columns, found %d'
            % (len(columns), cleaned.shape[1])
        )

    # reset_index returns a new object, so the caller's frame keeps its
    # original columns and index.
    cleaned = cleaned.reset_index(drop=True)
    cleaned.columns = columns

    # Remaining Useful Life: distance from each row to its unit's last cycle.
    last_cycle = cleaned.groupby('Unit', sort=False)['Cycle'].transform('max')
    cleaned['RUL'] = last_cycle - cleaned['Cycle']

    return cleaned

# Label the raw training table and attach the RUL column, then display the
# result (the bare expression renders the frame in the notebook).
data = format_engine_data(data)
data

Unnamed: 0,Unit,Cycle,OpSet1,OpSet2,OpSet3,FanInletTemp,LPCOutletTemp,HPCOutletTemp,LPTOutletTemp,FanInletPres,...,CorrFanSpeed,CorrCoreSpeed,BypassRatio,BurnerFuelAirRatio,BleedEnthalpy,DemandFanSpeed,DemandCorrFanSpeed,HPTCoolantBleed,LPTCoolantBleed,RUL
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.70,1400.60,14.62,...,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.4190,191
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.00,23.4236,190
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.20,14.62,...,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,189
3,1,4,0.0007,0.0000,100.0,518.67,642.35,1582.79,1401.87,14.62,...,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,188
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,2388.04,8133.80,8.4294,0.03,393,2388,100.0,38.90,23.4044,187
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20626,100,196,-0.0004,-0.0003,100.0,518.67,643.49,1597.98,1428.63,14.62,...,2388.26,8137.60,8.4956,0.03,397,2388,100.0,38.49,22.9735,4
20627,100,197,-0.0016,-0.0005,100.0,518.67,643.54,1604.50,1433.58,14.62,...,2388.22,8136.50,8.5139,0.03,395,2388,100.0,38.30,23.1594,3
20628,100,198,0.0004,0.0000,100.0,518.67,643.42,1602.46,1428.18,14.62,...,2388.24,8141.05,8.5646,0.03,398,2388,100.0,38.44,22.9333,2
20629,100,199,-0.0011,0.0003,100.0,518.67,643.23,1605.26,1426.53,14.62,...,2388.23,8139.29,8.5389,0.03,395,2388,100.0,38.29,23.0640,1


In [6]:
# Build the training CSV: format every selected train_* file and stack them.
os.makedirs(dataset_dir, exist_ok=True)

# Collect the per-file frames and concatenate once. DataFrame.append was
# removed in pandas 2.0, and it copied the accumulated data on every call.
train_frames = []
for suffix in SUFFIXES:
    df = pd.read_csv(original_dir + 'train_' + suffix, sep=' ', header=None)
    df = format_engine_data(df)
    train_frames.append(df)

train_data = pd.concat(train_frames)
train_data.columns = COLUMNS_W_RUL

# The row index carries no information, so it is left out of the file.
train_data.to_csv(dataset_dir + 'train_data.csv', index=False)
print('Training data ready')

Training data ready


In [7]:
# Build the test CSV: one row per unit (its last recorded cycle), with the
# RUL value taken from the matching RUL_* file.
os.makedirs(dataset_dir, exist_ok=True)  # do not rely on an earlier cell

# Collect the per-file frames and concatenate once. DataFrame.append was
# removed in pandas 2.0.
test_frames = []
for suffix in SUFFIXES:
    df = pd.read_csv(original_dir + 'test_' + suffix, sep=' ', header=None)
    df = format_engine_data(df)
    # Keep only the last row of each unit; 'Unit' becomes the index.
    df = df.groupby('Unit').last()

    rul = pd.read_csv(original_dir + 'RUL_' + suffix, header=None)
    rul.columns = ['RUL']
    # Shift the 0-based row index to 1-based so that the assignment below
    # aligns each RUL row with the unit of the same number.
    rul.index = rul.index + 1
    df['RUL'] = rul['RUL']

    test_frames.append(df)

test_data = pd.concat(test_frames)

# 'Unit' is the index now, so the labels are everything after it. Slicing
# (instead of popping from COLUMNS_W_RUL) leaves the shared list intact, so
# this cell and the training cell can be re-run safely.
test_data.columns = COLUMNS_W_RUL[1:]
test_data.to_csv(dataset_dir + 'test_data.csv')

print('Test data ready')

Test data ready
