### Defining some basic information about the dataset

In [312]:
import pandas as pd
import glob
from random import randint
import numpy as np
import boto3

# Sensor columns whose standard deviation is at or below this value are removed
MINIMUM_STD = 1e-5

# Baseline window lengths (in cycles) that the feature engineering iterates over
SENSOR_BASELINE_THRESHOLDS = [50]

# Column names applied to the raw, header-less data files:
# two identifier columns, three operational settings, then the sensor readings
DF_COLUMNS = (
    ["ENGINE_NUMBER", "TIME_IN_CYCLES"]
    + ["OPERATIONAL_SETTING_{}".format(setting) for setting in range(1, 4)]
    + ["SENSOR_MEASUREMENT_{}".format(sensor) for sensor in range(1, 24)]
)

# Input directory, output directory, and the input file-name template
# (first placeholder: split name, second placeholder: dataset number)
DATA_PATH = "/home/ec2-user/SageMaker/aws-sagemaker-test/data/"
OUTDATA_PATH = "/home/ec2-user/SageMaker/aws-sagemaker-test/engine_data/"
DS_FILENAME = DATA_PATH + "{}_FD00{}.txt"


### Defining functions to load data and run feature engineering

In [313]:
# Carolyn's sensor baseline features
def get_sensor_baseline(df, thresholds=None, rolling_window=10):
    """
    Add baseline-normalised and rolling-mean features for every sensor column.

    For each threshold ``T`` and each column whose name contains
    'SENSOR_MEASUREMENT', three columns are added to ``df``:

    * ``BASELINE_{T}TS_{column}``: (reading - baseline mean) / baseline std.
      While TIME_IN_CYCLES < T the baseline is the expanding mean/std of the
      engine's readings so far; from then on it is the mean/std of the
      engine's first T rows (rows are assumed to be ordered by cycle within
      each engine).
    * ``ROLLINGMEAN_{T}TS_{column}``: rolling mean of the raw reading.
    * ``BASE_ROLL_{T}TS_{column}``: rolling mean of the BASELINE column.

    Every statistic is computed separately per ENGINE_NUMBER, so the history
    of one engine never feeds the baseline or rolling window of another.
    Consequently the first ``rolling_window - 1`` rows of each engine hold NaN
    in the two rolling columns. If ``df`` has no ENGINE_NUMBER column the whole
    frame is treated as a single engine.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain TIME_IN_CYCLES and the sensor columns.
    thresholds : iterable of int, optional
        Baseline window lengths; defaults to SENSOR_BASELINE_THRESHOLDS.
    rolling_window : int, optional
        Window length of the rolling means (default 10).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns added in place.
    """
    if thresholds is None:
        thresholds = SENSOR_BASELINE_THRESHOLDS

    # Snapshot the raw sensor columns once: the derived column names also
    # contain 'SENSOR_MEASUREMENT' and must not be re-processed when more than
    # one threshold is configured.
    sensor_columns = [col for col in df.columns if 'SENSOR_MEASUREMENT' in col]

    # Group key as a plain array so grouping is positional, not index-aligned
    if 'ENGINE_NUMBER' in df.columns:
        engine_key = df['ENGINE_NUMBER'].values
    else:
        engine_key = np.zeros(len(df), dtype=int)

    def _rolling_mean(series):
        # Rolling mean that restarts at every engine boundary
        return series.groupby(engine_key).transform(
            lambda s: s.rolling(rolling_window).mean()
        )

    for thres in thresholds:
        # Rows that are still inside their engine's warm-up window
        warming_up = df['TIME_IN_CYCLES'] < thres

        for column in sensor_columns:
            per_engine = df[column].groupby(engine_key)

            # Baseline mean: running average during warm-up, then the average
            # of the engine's first `thres` rows
            running_avg = per_engine.transform(lambda s: s.expanding(min_periods=0).mean())
            fixed_avg = per_engine.transform(lambda s: s.iloc[:thres].mean())
            baseline_avg = running_avg.where(warming_up, fixed_avg)

            # Baseline std with the same logic. The fixed window keeps the
            # population std (ddof=0) that np.std produced originally, the
            # expanding window keeps pandas' sample std (ddof=1).
            running_std = per_engine.transform(lambda s: s.expanding(min_periods=0).std())
            fixed_std = per_engine.transform(lambda s: s.iloc[:thres].std(ddof=0))
            baseline_std = running_std.where(warming_up, fixed_std)

            # An undefined (NaN) or zero spread cannot be used as a divisor;
            # fall back to unit scale instead of producing NaN / +-inf
            safe_std = baseline_std.fillna(1).replace(0, 1)

            # Define new column: deviation from baseline in baseline-std units
            baseline_name = 'BASELINE_{}TS_{}'.format(thres, column)
            df[baseline_name] = ((df[column] - baseline_avg) / safe_std).fillna(0)

            # Get a rolling avg of the sensor values, too
            df['ROLLINGMEAN_{}TS_{}'.format(thres, column)] = _rolling_mean(df[column])

            # And a rolling avg of the deviation from baseline
            df['BASE_ROLL_{}TS_{}'.format(thres, column)] = _rolling_mean(df[baseline_name])

    return df

# Read the RUL file and number its rows as engines
def load_rul(data_path):
    """
    Load a header-less, single-column RUL file.

    The single column is named 'RUL'; an 'ENGINE_NUMBER' column counting the
    rows from 1 upwards is appended so the values can be joined per engine.
    """
    rul = pd.read_csv(data_path, names=['RUL'], header=None)
    engine_ids = np.arange(1, len(rul) + 1)
    return rul.assign(ENGINE_NUMBER=engine_ids)

# Function to drop sensor columns that carry (almost) no signal
def drop_bad_columns(dataframe, minimum_std=None):
    """
    Remove sensor columns whose standard deviation is at or below a threshold.

    Only numeric columns whose name contains 'SENSOR' are candidates; settings
    and identifier columns are always kept. Columns whose standard deviation is
    undefined (NaN) are kept as well.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to filter. It is not modified.
    minimum_std : float, optional
        Threshold to compare against; defaults to MINIMUM_STD.

    Returns
    -------
    pandas.DataFrame
        A new frame without the flat sensor columns and with a fresh 0..n-1
        index.
    """
    if minimum_std is None:
        minimum_std = MINIMUM_STD

    # Only the per-column std is needed, so skip the full describe() summary
    column_std = dataframe.select_dtypes(include='number').std()

    flat_sensors = [
        name for name, std in column_std.items()
        if 'SENSOR' in str(name) and std <= minimum_std
    ]

    # drop() returns a new frame, so the caller's frame is left untouched
    return dataframe.drop(columns=flat_sensors).reset_index(drop=True)

# Read one raw data file and optionally filter / feature-engineer it
def load_data(data_path, filter_data=False, feature_engineer=False):
    """
    Load a space-separated, header-less data file into a DataFrame.

    The columns are named from DF_COLUMNS and the last two of those names are
    dropped again right after reading (presumably placeholder columns produced
    by trailing separators in the raw files -- confirm against the data).

    Parameters
    ----------
    data_path : path of the file to read.
    filter_data : when True, pass the frame through drop_bad_columns.
    feature_engineer : when True, pass the frame through get_sensor_baseline
        (after the optional filtering).
    """
    raw = pd.read_csv(data_path, names=DF_COLUMNS, header=None, sep=' ')
    frame = raw.drop(columns=DF_COLUMNS[-2:])

    # Optional stages, applied in this fixed order
    stages = (
        (filter_data, drop_bad_columns),
        (feature_engineer, get_sensor_baseline),
    )
    for enabled, stage in stages:
        if enabled:
            frame = stage(frame)
    return frame

### Load train, test, RUL data for dataset 1 and 2

In [314]:
# Load dataset 1
train001 = load_data(DS_FILENAME.format('train', '1'), filter_data=True, feature_engineer=True)
# The low-variance filter is fitted on the training split only. The test split
# is loaded unfiltered and then restricted to the training columns, so both
# frames are guaranteed to share the same columns in the same order (the CSVs
# are later written without a header row, so a mismatch would go unnoticed).
test001 = load_data(DS_FILENAME.format('test', '1'), filter_data=False, feature_engineer=True)
test001 = test001[train001.columns]
rul001 = load_rul(DS_FILENAME.format('RUL', '1'))

# Load dataset 2 (same procedure)
train002 = load_data(DS_FILENAME.format('train', '2'), filter_data=True, feature_engineer=True)
test002 = load_data(DS_FILENAME.format('test', '2'), filter_data=False, feature_engineer=True)
test002 = test002[train002.columns]
rul002 = load_rul(DS_FILENAME.format('RUL', '2'))

### Feature engineering on training
Define a function that finds the maximum cycle count for each engine and then, for every cycle, computes the remaining useful life (RUL) as the number of cycles left until that maximum.

We delete ENGINE_NUMBER because we do not want it as an input variable.

In [315]:
def compute_rul(df):
    """
    Attach a remaining-useful-life target to every row.

    RUL is the engine's largest TIME_IN_CYCLES minus the row's own
    TIME_IN_CYCLES. The returned frame has RUL as its first column and no
    longer contains ENGINE_NUMBER; the input frame is not modified.
    """
    # Largest observed cycle of each engine
    last_cycle = (
        df.groupby('ENGINE_NUMBER')['TIME_IN_CYCLES']
        .max()
        .rename('MAX_CYCLES')
        .reset_index()
    )

    # Bring the per-engine maximum onto every row and derive the target
    merged = df.merge(last_cycle, on='ENGINE_NUMBER')
    merged['RUL'] = merged['MAX_CYCLES'] - merged['TIME_IN_CYCLES']

    # Target first, then every remaining column except the helper and the id
    excluded = ('RUL', 'MAX_CYCLES', 'ENGINE_NUMBER')
    feature_columns = [name for name in merged.columns if name not in excluded]
    return merged[['RUL'] + feature_columns]

In [316]:
# Replace each training frame by its RUL-labelled version (RUL first, ENGINE_NUMBER removed)
train001, train002 = (compute_rul(frame) for frame in (train001, train002))

### Feature engineering on testing
Define function to combine the test file with the RUL data, and select the last cycle per engine

In [317]:
def combine_test_rul(test, rul):
    """
    Reduce the test data to each engine's last cycle and attach its RUL.

    Parameters
    ----------
    test : pandas.DataFrame
        Test data with ENGINE_NUMBER and TIME_IN_CYCLES columns.
    rul : pandas.DataFrame
        One row per engine with ENGINE_NUMBER and RUL columns.

    Returns
    -------
    pandas.DataFrame
        The rows of ``test`` whose TIME_IN_CYCLES equals the maximum of their
        engine, joined with ``rul`` on ENGINE_NUMBER, with RUL as the first
        column and ENGINE_NUMBER removed. Engines missing from either input
        are dropped by the (inner) join.
    """
    # Only keep last row per engine number. The aggregation is named by string:
    # passing the builtin `max` is deprecated in recent pandas versions.
    last_cycle = test.groupby('ENGINE_NUMBER')['TIME_IN_CYCLES'].transform('max')
    df = test[last_cycle == test['TIME_IN_CYCLES']]

    df = pd.merge(df, rul, on='ENGINE_NUMBER')

    # Target first, then every feature; the engine id is not a model input
    feature_columns = [col for col in df.columns if col not in ('RUL', 'ENGINE_NUMBER')]
    return df[['RUL'] + feature_columns]

In [318]:
# Reduce each test frame to the last cycle per engine and attach the matching RUL values
test001, test002 = (
    combine_test_rul(test_frame, rul_frame)
    for test_frame, rul_frame in ((test001, rul001), (test002, rul002))
)

### Write out new data to s3

In [319]:
# S3 bucket that receives the generated files
bucket = 'bryan-predictive-maintenance'

def write_to_csv(df, fname):
    """
    Save ``df`` as a CSV file and upload that file to S3.

    The frame is written to ``fname`` without index and without header row.
    The file is then uploaded to the module-level ``bucket`` under the key
    'sagemaker/<file name>', where <file name> is the part of ``fname`` after
    the last '/'.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to write.
    fname : str
        Local destination path of the CSV file.
    """
    # Save the file locally
    df.to_csv(fname, index=False, header=False)

    # Create connection
    s3conn = boto3.client('s3')

    # Write file. It is opened in binary mode so the exact bytes on disk are
    # uploaded, and inside a context manager so the handle is always closed,
    # even when the upload fails.
    outfile = 'sagemaker/{}'.format(fname.split('/')[-1])
    with open(fname, 'rb') as body:
        s3conn.put_object(
            Body=body,
            Bucket=bucket,
            Key=outfile
        )

In [320]:
# Save every frame under OUTDATA_PATH and upload it (training files first, then test files)
for frame, csv_name in (
    (train001, 'train001.csv'),
    (train002, 'train002.csv'),
    (test001, 'test001.csv'),
    (test002, 'test002.csv'),
):
    write_to_csv(frame, OUTDATA_PATH + csv_name)