# Data Preparation for Engine Predictive Maintenance Examples

WARNING: The parameter (column) names are taken from a MATLAB predictive maintenance webinar and have not been cross-checked against another source; the NASA data description only labels these columns "sensor measurement x".

WARNING: The four datasets provided are not homogeneous. This demo is developed using only the data in train_FD001.txt and test_FD001.txt.

In [None]:
import os
import requests
import zipfile
import pandas as pd

In [None]:
# Labels for the 26 fields of every raw record, in file order.
# The sensor names come from a third-party source and are not cross-checked.
COLUMNS = [
    # Engine identifier and cycle counter
    'Unit', 'Cycle',
    # Operational settings
    'OpSet1', 'OpSet2', 'OpSet3',
    # Sensor measurements
    'FanInletTemp', 'LPCOutletTemp', 'HPCOutletTemp', 'LPTOutletTemp',
    'FanInletPres', 'BypassDuctPres', 'TotalHPCOutletPres',
    'PhysFanSpeed', 'PhysCoreSpeed', 'EnginePresRatio',
    'StaticHPCOutletPres', 'FuelFlowRatio',
    'CorrFanSpeed', 'CorrCoreSpeed',
    'BypassRatio', 'BurnerFuelAirRatio',
    'BleedEnthalpy', 'DemandFanSpeed',
    'DemandCorrFanSpeed', 'HPTCoolantBleed', 'LPTCoolantBleed',
]

# Same labels followed by the Remaining Useful Life column (a separate list object).
COLUMNS_W_RUL = [*COLUMNS, 'RUL']

# File-name suffixes of the datasets to process. Only one dataset is selected;
# the full (not recommended) selection would be:
#   ['FD001.txt', 'FD002.txt', 'FD003.txt', 'FD004.txt']
SUFFIXES = ['FD001.txt']

# Locations of the downloaded archive and of the generated CSV files.
data_root = 'data/'
original_dir = f'{data_root}original/'
dataset_dir = f'{data_root}dataset/'
filename = 'CMAPSSData.zip'
original_file = f'{original_dir}{filename}'
url = 'http://ti.arc.nasa.gov/c/6'

# Make sure the download directory exists. makedirs also creates data_root on
# the way, and exist_ok=True avoids the exists()/mkdir() check-then-create race.
os.makedirs(original_dir, exist_ok=True)

# Download the archive only once; later runs reuse the local copy.
if not os.path.exists(original_file):
    print('Downloading data...')
    # A timeout keeps the cell from hanging forever on an unresponsive server.
    r = requests.get(url, allow_redirects=True, timeout=60)
    # Stop on an HTTP error *before* anything is written: otherwise an error
    # page would be saved under the archive's name and then be "found" (and
    # never re-downloaded) on every later run.
    r.raise_for_status()
    with open(original_file, 'wb') as f:
        f.write(r.content)
    print('Done.')
else:
    print('Found original data file.')

In [None]:
# Unpack the downloaded archive into the same directory it was saved in.
try:
    with zipfile.ZipFile(original_file) as z:
        z.extractall(original_dir)
except (zipfile.BadZipFile, OSError) as err:
    # BadZipFile: the file is not a valid zip archive.
    # OSError: the file is missing/unreadable or a member could not be written.
    # Anything else (e.g. KeyboardInterrupt) is deliberately left to propagate.
    print('Bad file')
    print(f'{type(err).__name__}: {err}')
else:
    print('Extracted all files')

In [None]:
# Load the raw FD001 training file: single-space separated, no header row.
data = pd.read_csv(
    original_dir + 'train_FD001.txt',
    sep=' ',
    header=None,
)
# Last expression of the cell: displays the raw, still unlabelled frame.
data

In [None]:
def format_engine_data(df):
    """Label raw engine readings and add a Remaining Useful Life column.

    Args:
        df: Raw frame as read from a train_/test_ data file. Once the columns
            containing missing values are removed it must have exactly
            ``len(COLUMNS)`` columns.

    Returns:
        A new DataFrame with the ``COLUMNS`` labels plus an ``'RUL'`` column
        holding, for every row, the highest cycle recorded for that unit minus
        the row's own cycle. Rows keep their order and get a fresh 0..n-1
        index. The frame passed in is left unmodified.

    Raises:
        ValueError: If the number of remaining columns does not match
            ``COLUMNS`` (raised by pandas when the labels are assigned).
    """
    # Cleanup and label the columns. dropna without inplace returns a new
    # object, so the caller's frame is neither shrunk nor relabelled.
    labelled = df.dropna(axis=1, how='any')
    labelled.columns = COLUMNS

    # Highest cycle of each unit, broadcast back onto every row of that unit.
    last_cycle = labelled.groupby('Unit', sort=False)['Cycle'].transform('max')

    # assign() returns a copy with RUL appended as the last column.
    labelled = labelled.assign(RUL=last_cycle - labelled['Cycle'])

    # Hand back a plain positional index regardless of the caller's index.
    return labelled.reset_index(drop=True)

# Replace the raw frame loaded above with its labelled version (COLUMNS + 'RUL').
# NOTE: running this cell a second time passes the already formatted frame back
# in, which no longer matches the 26 labels in COLUMNS; re-run the load cell first.
data = format_engine_data(data)
# Last expression of the cell: displays the formatted frame.
data

In [None]:
# Build the training set: format every selected train_ file, stack the results
# and write them to a single CSV file.
os.makedirs(dataset_dir, exist_ok=True)

# DataFrame.append was removed in pandas 2.0; collect the frames and
# concatenate them once instead of growing a frame inside the loop.
# NOTE(review): with several SUFFIXES the frames are stacked as-is, so 'Unit'
# ids presumably repeat across datasets - confirm before enabling more files.
train_frames = []

for suffix in SUFFIXES:
    df = pd.read_csv(original_dir + 'train_' + suffix, sep=' ', header=None)
    train_frames.append(format_engine_data(df))

train_data = pd.concat(train_frames)

# Built from COLUMNS rather than COLUMNS_W_RUL so the labels stay correct even
# if that shared list has been modified by another cell.
train_data.columns = COLUMNS + ['RUL']

# index=False is the documented way to leave the row index out of the file.
train_data.to_csv(dataset_dir + 'train_data.csv', index=False)
print('Training data ready')

In [None]:
# Build the test set: one row per unit (its last recorded cycle) with the RUL
# value taken from the matching RUL_ file, written to a single CSV file.
os.makedirs(dataset_dir, exist_ok=True)

# DataFrame.append was removed in pandas 2.0; collect the frames and
# concatenate them once after the loop.
test_frames = []

for suffix in SUFFIXES:
    df = pd.read_csv(original_dir + 'test_' + suffix, sep=' ', header=None)
    df = format_engine_data(df)
    # Keep only the last row of every unit; 'Unit' becomes the index.
    df = df.groupby('Unit').last()

    rul = pd.read_csv(original_dir + 'RUL_' + suffix, header=None)
    # A list (not a set) gives the single column a well-defined label.
    rul.columns = ['RUL']
    # Shift the 0-based row index to 1-based so it lines up with the 'Unit'
    # index - assumes units are numbered from 1 in file order; TODO confirm.
    rul.index = rul.index + 1
    # Overwrite the RUL computed by format_engine_data with the provided value.
    df['RUL'] = rul['RUL']

    test_frames.append(df)

test_data = pd.concat(test_frames)

# 'Unit' is now the index, so the labels are COLUMNS without their first
# entry. Build a new list instead of popping from the shared COLUMNS_W_RUL:
# the pop broke any re-run of this cell and of the training cell.
test_data.columns = COLUMNS[1:] + ['RUL']
test_data.to_csv(dataset_dir + 'test_data.csv')

print('Test data ready')